Merge pull request #51702 from thaJeztah/bump_wazero

vendor: github.com/tetratelabs/wazero v1.10.1
This commit is contained in:
Rob Murray
2025-12-12 12:18:09 +00:00
committed by GitHub
52 changed files with 1511 additions and 811 deletions

2
go.mod
View File

@@ -217,7 +217,7 @@ require (
github.com/shibumi/go-pathspec v1.3.0 // indirect
github.com/spdx/tools-golang v0.5.5 // indirect
github.com/stretchr/testify v1.11.1 // indirect
github.com/tetratelabs/wazero v1.9.0 // indirect
github.com/tetratelabs/wazero v1.10.1 // indirect
github.com/tinylib/msgp v1.3.0 // indirect
github.com/tonistiigi/dchapes-mode v0.0.0-20250318174251-73d941a28323 // indirect
github.com/tonistiigi/fsutil v0.0.0-20250605211040-586307ad452f // indirect

4
go.sum
View File

@@ -602,8 +602,8 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tedsuo/ifrit v0.0.0-20230516164442-7862c310ad26 h1:mWCRvpoEMVlslxEvvptKgIUb35va9yj9Oq5wGw/er5I=
github.com/tedsuo/ifrit v0.0.0-20230516164442-7862c310ad26/go.mod h1:0uD3VMXkZ7Bw0ojGCwDzebBBzPBXtzEZeXai+56BLX4=
github.com/tetratelabs/wazero v1.9.0 h1:IcZ56OuxrtaEz8UYNRHBrUa9bYeX9oVY93KspZZBf/I=
github.com/tetratelabs/wazero v1.9.0/go.mod h1:TSbcXCfFP0L2FGkRPxHphadXPjo1T6W+CseNNY7EkjM=
github.com/tetratelabs/wazero v1.10.1 h1:2DugeJf6VVk58KTPszlNfeeN8AhhpwcZqkJj2wwFuH8=
github.com/tetratelabs/wazero v1.10.1/go.mod h1:DRm5twOQ5Gr1AoEdSi0CLjDQF1J9ZAuyqFIjl1KKfQU=
github.com/tinylib/msgp v1.3.0 h1:ULuf7GPooDaIlbyvgAxBV/FI7ynli6LZ1/nVUNu+0ww=
github.com/tinylib/msgp v1.3.0/go.mod h1:ykjzy2wzgrlvpDCRc4LA8UXy6D8bzMSuAF3WD57Gok0=
github.com/tonistiigi/dchapes-mode v0.0.0-20250318174251-73d941a28323 h1:r0p7fK56l8WPequOaR3i9LBqfPtEdXIQbUTzT55iqT4=

View File

@@ -113,6 +113,7 @@ spectest_v1_testdata_dir := $(spectest_v1_dir)/testdata
spec_version_v1 := wg-1.0
spectest_v2_dir := $(spectest_base_dir)/v2
spectest_v2_testdata_dir := $(spectest_v2_dir)/testdata
# Latest draft state as of March 12, 2024.
spec_version_v2 := 1c5e5d178bd75c79b7a12881c529098beaee2a05
spectest_threads_dir := $(spectest_base_dir)/threads
@@ -121,6 +122,10 @@ spectest_threads_testdata_dir := $(spectest_threads_dir)/testdata
# It will likely be renamed to main in the future - https://github.com/WebAssembly/threads/issues/216.
spec_version_threads := 3635ca51a17e57e106988846c5b0e0cc48ac04fc
spectest_tail_call_dir := $(spectest_base_dir)/tail-call
spectest_tail_call_testdata_dir := $(spectest_tail_call_dir)/testdata
spec_version_tail_call := 4fd2339b5e9709e74b326797f69a88b13eac4d47
.PHONY: build.spectest
build.spectest:
@$(MAKE) build.spectest.v1
@@ -175,6 +180,15 @@ build.spectest.threads:
wast2json --enable-threads --debug-names $$f; \
done
.PHONY: build.spectest.tail_call
build.spectest.tail_call:
mkdir -p $(spectest_tail_call_testdata_dir)
cd $(spectest_tail_call_testdata_dir) \
&& curl -sSL 'https://api.github.com/repos/WebAssembly/testsuite/contents/proposals/tail-call?ref=$(spec_version_tail_call)' | jq -r '.[]| .download_url' | grep -E ".wast" | xargs -Iurl curl -sJL url -O
cd $(spectest_tail_call_testdata_dir) && for f in `find . -name '*.wast'`; do \
wast2json --enable-tail-call --debug-names $$f; \
done
.PHONY: test
test:
@go test $(go_test_options) ./...
@@ -220,13 +234,10 @@ check:
@GOARCH=wasm GOOS=wasip1 go build ./...
# Ensure we build on aix. See #1723
@GOARCH=ppc64 GOOS=aix go build ./...
# Ensure we build on windows:
@GOARCH=amd64 GOOS=windows go build ./...
# Ensure we build on an arbitrary operating system:
@GOARCH=amd64 GOOS=dragonfly go build ./...
# Ensure we build on solaris/illumos:
@GOARCH=amd64 GOOS=illumos go build ./...
@GOARCH=amd64 GOOS=solaris go build ./...
# Ensure we build on linux s390x. See #2412
@GOARCH=s390x GOOS=linux go build ./...
# Ensure we build on linux ppc64le. See #2412
@GOARCH=ppc64le GOOS=linux go build ./...
# Ensure we build on linux arm for Dapr:
# gh release view -R dapr/dapr --json assets --jq 'first(.assets[] | select(.name = "daprd_linux_arm.tar.gz") | {url, downloadCount})'
@GOARCH=arm GOOS=linux go build ./...
@@ -274,22 +285,15 @@ libsodium:
#### CLI release related ####
VERSION ?= dev
# Default to a dummy version 0.0.1.1, which is always lower than a real release.
# Legal version values should look like 'x.x.x.x' where x is an integer from 0 to 65534.
# https://learn.microsoft.com/en-us/windows/win32/msi/productversion?redirectedfrom=MSDN
# https://stackoverflow.com/questions/9312221/msi-version-numbers
MSI_VERSION ?= 0.0.1.1
non_windows_platforms := darwin_amd64 darwin_arm64 linux_amd64 linux_arm64
non_windows_archives := $(non_windows_platforms:%=dist/wazero_$(VERSION)_%.tar.gz)
windows_platforms := windows_amd64 # TODO: add arm64 windows once we start testing on it.
windows_archives := $(windows_platforms:%=dist/wazero_$(VERSION)_%.zip) $(windows_platforms:%=dist/wazero_$(VERSION)_%.msi)
windows_archives := $(windows_platforms:%=dist/wazero_$(VERSION)_%.zip)
checksum_txt := dist/wazero_$(VERSION)_checksums.txt
# define macros for multi-platform builds. these parse the filename being built
go-arch = $(if $(findstring amd64,$1),amd64,arm64)
go-os = $(if $(findstring .exe,$1),windows,$(if $(findstring linux,$1),linux,darwin))
# msi-arch is a macro so we can detect it based on the file naming convention
msi-arch = $(if $(findstring amd64,$1),x64,arm64)
build/wazero_%/wazero:
$(call go-build,$@,$<)
@@ -314,51 +318,15 @@ define go-build
@echo build "ok"
endef
# this makes a marker file ending in .signed to avoid repeatedly calling codesign
%.signed: %
$(call codesign,$<)
@touch $@
# This requires osslsigncode package (apt or brew) or latest windows release from mtrojnar/osslsigncode
#
# Default is self-signed while production should be a Digicert signing key
#
# Ex.
# ```bash
# keytool -genkey -alias wazero -storetype PKCS12 -keyalg RSA -keysize 2048 -storepass wazero-bunch \
# -keystore wazero.p12 -dname "O=wazero,CN=wazero.io" -validity 3650
# ```
WINDOWS_CODESIGN_P12 ?= packaging/msi/wazero.p12
WINDOWS_CODESIGN_PASSWORD ?= wazero-bunch
define codesign
@printf "$(ansi_format_dark)" codesign "signing $1"
@osslsigncode sign -h sha256 -pkcs12 ${WINDOWS_CODESIGN_P12} -pass "${WINDOWS_CODESIGN_PASSWORD}" \
-n "wazero is the zero dependency WebAssembly runtime for Go developers" -i https://wazero.io -t http://timestamp.digicert.com \
$(if $(findstring msi,$(1)),-add-msi-dse) -in $1 -out $1-signed
@mv $1-signed $1
@printf "$(ansi_format_bright)" codesign "ok"
endef
# This task is only supported on Windows, where we use candle.exe (compile wxs to wixobj) and light.exe (link to msi)
dist/wazero_$(VERSION)_%.msi: build/wazero_%/wazero.exe.signed
ifeq ($(OS),Windows_NT)
@echo msi "building $@"
@mkdir -p $(@D)
@candle -nologo -arch $(call msi-arch,$@) -dVersion=$(MSI_VERSION) -dBin=$(<:.signed=) -o build/wazero.wixobj packaging/msi/wazero.wxs
@light -nologo -o $@ build/wazero.wixobj -spdb
$(call codesign,$@)
@echo msi "ok"
endif
dist/wazero_$(VERSION)_%.zip: build/wazero_%/wazero.exe.signed
dist/wazero_$(VERSION)_%.zip: build/wazero_%/wazero.exe
@echo zip "zipping $@"
@mkdir -p $(@D)
@zip -qj $@ $(<:.signed=)
@zip -qj $@ $<
@echo zip "ok"
# Darwin doesn't have sha256sum. See https://github.com/actions/virtual-environments/issues/90
sha256sum := $(if $(findstring darwin,$(shell go env GOOS)),shasum -a 256,sha256sum)
$(checksum_txt):
@cd $(@D); touch $(@F); $(sha256sum) * >> $(@F)
$(checksum_txt): $(non_windows_archives) $(windows_archives)
@cd $(@D); touch $(@F); $(sha256sum) * > $(@F)
dist: $(non_windows_archives) $(if $(findstring Windows_NT,$(OS)),$(windows_archives),) $(checksum_txt)
dist: $(non_windows_archives) $(windows_archives) $(checksum_txt)

View File

@@ -507,7 +507,7 @@ inserted after exit: https://github.com/emscripten-core/emscripten/issues/12322
## WASI
Unfortunately, (WASI Snapshot Preview 1)[https://github.com/WebAssembly/WASI/blob/snapshot-01/phases/snapshot/docs.md] is not formally defined enough, and has APIs with ambiguous semantics.
Unfortunately, [WASI Snapshot Preview 1](https://github.com/WebAssembly/WASI/blob/snapshot-01/phases/snapshot/docs.md) is not formally defined enough, and has APIs with ambiguous semantics.
This section describes how Wazero interprets and implements the semantics of several WASI APIs that may be interpreted differently by different wasm runtimes.
Those APIs may affect the portability of a WASI application.

View File

@@ -43,7 +43,7 @@ magnitude (10x) or more. This is done without host-specific dependencies.
### Conformance
Both runtimes pass WebAssembly Core [1.0][7] and [2.0][14] specification tests
Both runtimes pass WebAssembly Core [1.0][3] and [2.0][4] specification tests
on supported platforms:
| Runtime | Usage | amd64 | arm64 | others |
@@ -58,7 +58,7 @@ wazero into their Go applications.
### wazero
wazero's [1.0 release][15] happened in March 2023, and is [in use][16] by many
wazero's [1.0 release][8] happened in March 2023, and is [in use][9] by many
projects and production sites.
We offer an API stability promise with semantic versioning. In other words, we
@@ -72,14 +72,14 @@ You can get the latest version of wazero like this.
go get github.com/tetratelabs/wazero@latest
```
Please give us a [star][17] if you end up using wazero!
Please give us a [star][10] if you end up using wazero!
### Go
wazero has no dependencies except Go, so the only source of conflict in your
project's use of wazero is the Go version.
wazero follows the same version policy as Go's [Release Policy][10]: two
wazero follows the same version policy as Go's [Release Policy][5]: two
versions. wazero will ensure these versions work and bugs are valid if there's
an issue with a current Go version.
@@ -96,18 +96,18 @@ systems are ones we test, but that doesn't necessarily mean other operating
system versions won't work.
We currently test Linux (Ubuntu and scratch), MacOS and Windows as packaged by
[GitHub Actions][11], as well as nested VMs running on Linux for FreeBSD, NetBSD,
[GitHub Actions][6], as well as nested VMs running on Linux for FreeBSD, NetBSD,
OpenBSD, DragonFly BSD, illumos and Solaris.
We also test cross compilation for many `GOOS` and `GOARCH` combinations.
* Interpreter
* Linux is tested on amd64 (native) as well arm64 and riscv64 via emulation.
* Linux is tested on amd64 and arm64 (native) as well as riscv64 via emulation.
* Windows, FreeBSD, NetBSD, OpenBSD, DragonFly BSD, illumos and Solaris are
tested only on amd64.
* macOS is tested only on arm64.
* Compiler
* Linux is tested on amd64 (native) as well arm64 via emulation.
* Linux is tested on amd64 and arm64.
* Windows, FreeBSD, NetBSD, DragonFly BSD, illumos and Solaris are
tested only on amd64.
* macOS is tested only on arm64.
@@ -116,24 +116,25 @@ wazero has no dependencies and doesn't require CGO. This means it can also be
embedded in an application that doesn't use an operating system. This is a main
differentiator between wazero and alternatives.
We verify zero dependencies by running tests in Docker's [scratch image][12].
We verify zero dependencies by running tests in Docker's [scratch image][7].
This approach ensures compatibility with any parent image.
### macOS code-signing entitlements
If you're developing for macOS and need to code-sign your application,
please read issue [#2393][11].
-----
wazero is a registered trademark of Tetrate.io, Inc. in the United States and/or other countries
[1]: https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/
[2]: https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/
[4]: https://github.com/WebAssembly/meetings/blob/main/process/subgroups.md
[5]: https://github.com/WebAssembly/WASI
[6]: https://pkg.go.dev/golang.org/x/sys/unix
[7]: https://github.com/WebAssembly/spec/tree/wg-1.0/test/core
[9]: https://github.com/tetratelabs/wazero/issues/506
[10]: https://go.dev/doc/devel/release
[11]: https://github.com/actions/virtual-environments
[12]: https://docs.docker.com/develop/develop-images/baseimages/#create-a-simple-parent-image-using-scratch
[13]: https://github.com/WebAssembly/WASI/blob/snapshot-01/phases/snapshot/docs.md
[14]: https://github.com/WebAssembly/spec/tree/d39195773112a22b245ffbe864bab6d1182ccb06/test/core
[15]: https://tetrate.io/blog/introducing-wazero-from-tetrate/
[16]: https://wazero.io/community/users/
[17]: https://github.com/tetratelabs/wazero/stargazers
[3]: https://github.com/WebAssembly/spec/tree/wg-1.0/test/core
[4]: https://github.com/WebAssembly/spec/tree/d39195773112a22b245ffbe864bab6d1182ccb06/test/core
[5]: https://go.dev/doc/devel/release
[6]: https://github.com/actions/virtual-environments
[7]: https://docs.docker.com/develop/develop-images/baseimages/#create-a-simple-parent-image-using-scratch
[8]: https://tetrate.io/blog/introducing-wazero-from-tetrate/
[9]: https://wazero.io/community/users/
[10]: https://github.com/wazero/wazero/stargazers
[11]: https://github.com/wazero/wazero/issues/2393

View File

@@ -0,0 +1,19 @@
package experimental
import (
"context"
"github.com/tetratelabs/wazero/internal/expctxkeys"
)
// WithCompilationWorkers sets the desired number of compilation workers.
func WithCompilationWorkers(ctx context.Context, workers int) context.Context {
return context.WithValue(ctx, expctxkeys.CompilationWorkers{}, workers)
}
// GetCompilationWorkers returns the desired number of compilation workers.
// The minimum value returned is 1.
func GetCompilationWorkers(ctx context.Context) int {
workers, _ := ctx.Value(expctxkeys.CompilationWorkers{}).(int)
return max(workers, 1)
}
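The new experimental knob above is consumed by the compiler engine later in this diff (see the `GetCompilationWorkers` call in the wazevo engine). A hedged usage sketch from the caller's side (not part of this diff; the runtime calls are the standard wazero API, the module path is hypothetical):

```go
package main

import (
	"context"
	"log"
	"os"
	"runtime"

	"github.com/tetratelabs/wazero"
	"github.com/tetratelabs/wazero/experimental"
)

func main() {
	// Attach the desired worker count to the context used for compilation.
	ctx := experimental.WithCompilationWorkers(context.Background(), runtime.NumCPU())

	r := wazero.NewRuntime(ctx)
	defer r.Close(ctx)

	wasmBin, err := os.ReadFile("module.wasm") // hypothetical module path
	if err != nil {
		log.Fatal(err)
	}
	// With more than one worker requested, CompileModule may fan function
	// compilation out across goroutines; with workers <= 1 it compiles serially.
	if _, err := r.CompileModule(ctx, wasmBin); err != nil {
		log.Fatal(err)
	}
}
```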

View File

@@ -13,3 +13,6 @@ import "github.com/tetratelabs/wazero/api"
binaries will use a theoretical maximum like 4GB, so if using such a binary on a system
// without mmap, consider editing the binary to reduce the max size setting of memory.
const CoreFeaturesThreads = api.CoreFeatureSIMD << 1
// CoreFeaturesTailCall enables tail call instructions ("tail-call").
const CoreFeaturesTailCall = api.CoreFeatureSIMD << 2
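Like `CoreFeaturesThreads` above, the new constant is meant to be OR'ed into the runtime's core-feature set. A minimal sketch of enabling it (not part of this diff; assumes wazero's existing `RuntimeConfig.WithCoreFeatures` and `api.CoreFeaturesV2`):

```go
package main

import (
	"context"

	"github.com/tetratelabs/wazero"
	"github.com/tetratelabs/wazero/api"
	"github.com/tetratelabs/wazero/experimental"
)

func main() {
	ctx := context.Background()

	// Enable the default 2.0 feature set plus the experimental tail-call proposal,
	// so modules using return_call / return_call_indirect can be compiled.
	cfg := wazero.NewRuntimeConfig().
		WithCoreFeatures(api.CoreFeaturesV2 | experimental.CoreFeaturesTailCall)

	r := wazero.NewRuntimeWithConfig(ctx, cfg)
	defer r.Close(ctx)
}
```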

View File

@@ -814,6 +814,7 @@ operatorSwitch:
c.emit(
newOperationCallIndirect(typeIndex, tableIndex),
)
case wasm.OpcodeDrop:
r := inclusiveRange{Start: 0, End: 0}
if peekValueType == unsignedTypeV128 {
@@ -3423,6 +3424,45 @@ operatorSwitch:
default:
return fmt.Errorf("unsupported atomic instruction in interpreterir: %s", wasm.AtomicInstructionName(atomicOp))
}
case wasm.OpcodeTailCallReturnCall:
fdef := c.module.FunctionDefinition(index)
functionFrame := c.controlFrames.functionFrame()
// We currently do not support tail calls to imported functions; we treat them as regular calls.
// For details, see internal/engine/RATIONALE.md
if _, _, isImport := fdef.Import(); isImport {
c.emit(newOperationCall(index))
dropOp := newOperationDrop(c.getFrameDropRange(functionFrame, false))
// Clean up the stack and then jump to the function frame's continuation (i.e., return).
c.emit(dropOp)
c.emit(newOperationBr(functionFrame.asLabel()))
} else {
c.emit(newOperationTailCallReturnCall(index))
}
// The return operation is stack-polymorphic, so mark the state as unreachable.
// That means subsequent instructions in the current control frame are "unreachable"
// and can be safely removed.
c.markUnreachable()
case wasm.OpcodeTailCallReturnCallIndirect:
typeIndex := index
tableIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:])
if err != nil {
return fmt.Errorf("read target for br_table: %w", err)
}
c.pc += n
functionFrame := c.controlFrames.functionFrame()
dropRange := c.getFrameDropRange(functionFrame, false)
c.emit(newOperationTailCallReturnCallIndirect(typeIndex, tableIndex, dropRange, functionFrame.asLabel()))
// The return operation is stack-polymorphic, so mark the state as unreachable.
// That means subsequent instructions in the current control frame are "unreachable"
// and can be safely removed.
c.markUnreachable()
default:
return fmt.Errorf("unsupported instruction in interpreterir: 0x%x", op)
}
@@ -3449,7 +3489,10 @@ func (c *compiler) applyToStack(opcode wasm.Opcode) (index uint32, err error) {
wasm.OpcodeLocalSet,
wasm.OpcodeLocalTee,
wasm.OpcodeGlobalGet,
wasm.OpcodeGlobalSet:
wasm.OpcodeGlobalSet,
// tail-call proposal
wasm.OpcodeTailCallReturnCall,
wasm.OpcodeTailCallReturnCallIndirect:
// Assumes that we are at the opcode now so skip it before read immediates.
v, num, err := leb128.LoadUint32(c.body[c.pc+1:])
if err != nil {

View File

@@ -7,6 +7,7 @@ import (
"fmt"
"math"
"math/bits"
"slices"
"sync"
"unsafe"
@@ -27,27 +28,37 @@ import (
// The default value should suffice for most use cases. Those wishing to change this can do so via `go build -ldflags`.
var callStackCeiling = 2000
type compiledFunctionWithCount struct {
funcs []compiledFunction
refCount int
}
// engine is an interpreter implementation of wasm.Engine
type engine struct {
enabledFeatures api.CoreFeatures
compiledFunctions map[wasm.ModuleID][]compiledFunction // guarded by mutex.
mux sync.RWMutex
compiledFunctions map[wasm.ModuleID]*compiledFunctionWithCount // guarded by mutex.
mux sync.Mutex
}
func NewEngine(_ context.Context, enabledFeatures api.CoreFeatures, _ filecache.Cache) wasm.Engine {
return &engine{
enabledFeatures: enabledFeatures,
compiledFunctions: map[wasm.ModuleID][]compiledFunction{},
compiledFunctions: map[wasm.ModuleID]*compiledFunctionWithCount{},
}
}
// Close implements the same method as documented on wasm.Engine.
func (e *engine) Close() (err error) {
e.mux.Lock()
defer e.mux.Unlock()
clear(e.compiledFunctions)
return
}
// CompiledModuleCount implements the same method as documented on wasm.Engine.
func (e *engine) CompiledModuleCount() uint32 {
e.mux.Lock()
defer e.mux.Unlock()
return uint32(len(e.compiledFunctions))
}
@@ -59,19 +70,33 @@ func (e *engine) DeleteCompiledModule(m *wasm.Module) {
func (e *engine) deleteCompiledFunctions(module *wasm.Module) {
e.mux.Lock()
defer e.mux.Unlock()
cf, ok := e.compiledFunctions[module.ID]
if !ok {
return
}
cf.refCount--
if cf.refCount > 0 {
return
}
delete(e.compiledFunctions, module.ID)
}
func (e *engine) addCompiledFunctions(module *wasm.Module, fs []compiledFunction) {
e.mux.Lock()
defer e.mux.Unlock()
e.compiledFunctions[module.ID] = fs
e.compiledFunctions[module.ID] = &compiledFunctionWithCount{funcs: fs, refCount: 1}
}
func (e *engine) getCompiledFunctions(module *wasm.Module) (fs []compiledFunction, ok bool) {
e.mux.RLock()
defer e.mux.RUnlock()
fs, ok = e.compiledFunctions[module.ID]
func (e *engine) getCompiledFunctions(module *wasm.Module, increaseRefCount bool) (fs []compiledFunction, ok bool) {
e.mux.Lock()
defer e.mux.Unlock()
cf, ok := e.compiledFunctions[module.ID]
if ok {
fs = cf.funcs
if increaseRefCount {
cf.refCount++
}
}
return
}
@@ -242,15 +267,9 @@ type snapshot struct {
// Snapshot implements the same method as documented on experimental.Snapshotter.
func (ce *callEngine) Snapshot() experimental.Snapshot {
stack := make([]uint64, len(ce.stack))
copy(stack, ce.stack)
frames := make([]*callFrame, len(ce.frames))
copy(frames, ce.frames)
return &snapshot{
stack: stack,
frames: frames,
stack: slices.Clone(ce.stack),
frames: slices.Clone(ce.frames),
ce: ce,
}
}
@@ -356,7 +375,7 @@ const callFrameStackSize = 0
// CompileModule implements the same method as documented on wasm.Engine.
func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) error {
if _, ok := e.getCompiledFunctions(module); ok { // cache hit!
if _, ok := e.getCompiledFunctions(module, true); ok { // cache hit!
return nil
}
@@ -405,7 +424,7 @@ func (e *engine) NewModuleEngine(module *wasm.Module, instance *wasm.ModuleInsta
functions: make([]function, len(module.FunctionSection)+int(module.ImportFunctionCount)),
}
codes, ok := e.getCompiledFunctions(module)
codes, ok := e.getCompiledFunctions(module, false)
if !ok {
return nil, errors.New("source module must be compiled before instantiation")
}
@@ -427,12 +446,10 @@ func (e *engine) NewModuleEngine(module *wasm.Module, instance *wasm.ModuleInsta
// lowerIR lowers the interpreterir operations to engine friendly struct.
func (e *engine) lowerIR(ir *compilationResult, ret *compiledFunction) error {
// Copy the body from the result.
ret.body = make([]unionOperation, len(ir.Operations))
copy(ret.body, ir.Operations)
ret.body = slices.Clone(ir.Operations)
// Also copy the offsets if necessary.
if offsets := ir.IROperationSourceOffsetsInWasmBinary; len(offsets) > 0 {
ret.offsetsInWasmBinary = make([]uint64, len(offsets))
copy(ret.offsetsInWasmBinary, offsets)
ret.offsetsInWasmBinary = slices.Clone(offsets)
}
labelAddressResolutions := [labelKindNum][]uint64{}
@@ -449,9 +466,7 @@ func (e *engine) lowerIR(ir *compilationResult, ret *compiledFunction) error {
frameToAddresses := labelAddressResolutions[label.Kind()]
// Expand the slice if necessary.
if diff := fid - len(frameToAddresses) + 1; diff > 0 {
for j := 0; j < diff; j++ {
frameToAddresses = append(frameToAddresses, 0)
}
frameToAddresses = append(frameToAddresses, make([]uint64, diff)...)
}
frameToAddresses[fid] = address
labelAddressResolutions[kind] = frameToAddresses
@@ -472,6 +487,8 @@ func (e *engine) lowerIR(ir *compilationResult, ret *compiledFunction) error {
target := op.Us[j]
e.setLabelAddress(&op.Us[j], label(target), labelAddressResolutions)
}
case operationKindTailCallReturnCallIndirect:
e.setLabelAddress(&op.Us[1], label(op.Us[1]), labelAddressResolutions)
}
}
return nil
@@ -761,18 +778,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance
case operationKindCallIndirect:
offset := ce.popValue()
table := tables[op.U2]
if offset >= uint64(len(table.References)) {
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
}
rawPtr := table.References[offset]
if rawPtr == 0 {
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
}
tf := functionFromUintptr(rawPtr)
if tf.typeID != typeIDs[op.U1] {
panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch)
}
tf := ce.functionForOffset(table, offset, typeIDs[op.U1])
ce.callFunction(ctx, f.moduleInstance, tf)
frame.pc++
@@ -1725,12 +1731,17 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance
if fillSize+offset > uint64(len(memoryInst.Buffer)) {
panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
} else if fillSize != 0 {
// Uses the copy trick for faster filling buffer.
// https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d
// Uses the copy trick to fill the buffer with the value faster.
// https://github.com/golang/go/blob/go1.24.0/src/bytes/bytes.go#L664-L673
buf := memoryInst.Buffer[offset : offset+fillSize]
buf[0] = value
for i := 1; i < len(buf); i *= 2 {
copy(buf[i:], buf[:i])
if value == 0 {
clear(buf)
} else {
buf[0] = value
for i := 1; i < len(buf); {
chunk := min(i, 8192)
i += copy(buf[i:], buf[:chunk])
}
}
}
frame.pc++
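The hunk above replaces the unbounded doubling copy with a `clear()` fast path for zero fills and a chunked doubling copy otherwise. As a standalone illustration of the same loop (a minimal sketch, not part of the diff):

```go
// fillBytes fills buf with value by doubling the already-initialized prefix,
// capping each copy at 8 KiB so the copy source stays cache-resident.
func fillBytes(buf []byte, value byte) {
	if len(buf) == 0 {
		return
	}
	if value == 0 {
		clear(buf) // zero fills map to the runtime's memclr, which is faster
		return
	}
	buf[0] = value
	for i := 1; i < len(buf); {
		chunk := min(i, 8192)
		i += copy(buf[i:], buf[:chunk])
	}
}
```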
@@ -1804,7 +1815,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
} else if num > 0 {
// Uses the copy trick for faster filling the region with the value.
// https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d
// https://github.com/golang/go/blob/go1.24.0/src/slices/slices.go#L514-L517
targetRegion := table.References[offset : offset+num]
targetRegion[0] = ref
for i := 1; i < len(targetRegion); i *= 2 {
@@ -4331,6 +4342,32 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance
memoryInst.Mux.Unlock()
ce.pushValue(uint64(old))
frame.pc++
case operationKindTailCallReturnCall:
f := &functions[op.U1]
ce.dropForTailCall(frame, f)
body, bodyLen = ce.resetPc(frame, f)
case operationKindTailCallReturnCallIndirect:
offset := ce.popValue()
table := tables[op.U2]
tf := ce.functionForOffset(table, offset, typeIDs[op.U1])
// We allow proper tail calls only across functions that belong to the same
// module; for indirect calls, this has to be enforced at run time.
// For details, see internal/engine/RATIONALE.md
if tf.moduleInstance != f.moduleInstance {
// Revert to a normal call.
ce.callFunction(ctx, f.moduleInstance, tf)
// Return
ce.drop(op.Us[0])
// Jump to the function frame (return)
frame.pc = op.Us[1]
continue
}
ce.dropForTailCall(frame, tf)
body, bodyLen = ce.resetPc(frame, tf)
default:
frame.pc++
}
@@ -4338,6 +4375,40 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance
ce.popFrame()
}
func (ce *callEngine) dropForTailCall(frame *callFrame, f *function) {
base := frame.base - frame.f.funcType.ParamNumInUint64
paramCount := f.funcType.ParamNumInUint64
ce.stack = append(ce.stack[:base], ce.stack[len(ce.stack)-paramCount:]...)
}
func (ce *callEngine) resetPc(frame *callFrame, f *function) (body []unionOperation, bodyLen uint64) {
// The compiler currently allows proper tail calls only across functions
// that belong to the same module; thus, we can overwrite the frame in place.
// For details, see internal/engine/RATIONALE.md
frame.f = f
frame.base = len(ce.stack)
frame.pc = 0
body = frame.f.parent.body
bodyLen = uint64(len(body))
return body, bodyLen
}
func (ce *callEngine) functionForOffset(table *wasm.TableInstance, offset uint64, expectedTypeID wasm.FunctionTypeID) *function {
if offset >= uint64(len(table.References)) {
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
}
rawPtr := table.References[offset]
if rawPtr == 0 {
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
}
tf := functionFromUintptr(rawPtr)
if tf.typeID != expectedTypeID {
panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch)
}
return tf
}
func wasmCompatMax32bits(v1, v2 uint32) uint64 {
return uint64(math.Float32bits(moremath.WasmCompatMax32(
math.Float32frombits(v1),
@@ -4564,9 +4635,7 @@ func (ce *callEngine) callGoFuncWithStack(ctx context.Context, m *wasm.ModuleIns
// In the interpreter engine, ce.stack may only have capacity to store
// parameters. Grow when there are more results than parameters.
if growLen := resultLen - paramLen; growLen > 0 {
for i := 0; i < growLen; i++ {
ce.stack = append(ce.stack, 0)
}
ce.stack = append(ce.stack, make([]uint64, growLen)...)
stackLen += growLen
}

View File

@@ -445,6 +445,10 @@ func (o operationKind) String() (ret string) {
ret = "operationKindAtomicRMW8Cmpxchg"
case operationKindAtomicRMW16Cmpxchg:
ret = "operationKindAtomicRMW16Cmpxchg"
case operationKindTailCallReturnCall:
ret = "operationKindTailCallReturnCall"
case operationKindTailCallReturnCallIndirect:
ret = "operationKindTailCallReturnCallIndirect"
default:
panic(fmt.Errorf("unknown operation %d", o))
}
@@ -768,6 +772,11 @@ const (
// operationKindAtomicRMW16Cmpxchg is the kind for NewOperationAtomicRMW16Cmpxchg.
operationKindAtomicRMW16Cmpxchg
// operationKindTailCallReturnCall is the kind for newOperationTailCallReturnCall.
operationKindTailCallReturnCall
// operationKindTailCallReturnCallIndirect is the kind for newOperationTailCallReturnCallIndirect.
operationKindTailCallReturnCallIndirect
// operationKindEnd is always placed at the bottom of this iota definition to be used in the test.
operationKindEnd
)
@@ -1097,6 +1106,12 @@ func (o unionOperation) String() string {
operationKindAtomicRMW16Cmpxchg:
return o.Kind.String()
case operationKindTailCallReturnCall:
return fmt.Sprintf("%s %d %s", o.Kind, o.U1, label(o.U2).String())
case operationKindTailCallReturnCallIndirect:
return fmt.Sprintf("%s %d %d", o.Kind, o.U1, o.U2)
default:
panic(fmt.Sprintf("TODO: %v", o.Kind))
}
@@ -2810,3 +2825,21 @@ func newOperationAtomicRMW8Cmpxchg(unsignedType unsignedType, arg memoryArg) uni
func newOperationAtomicRMW16Cmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation {
return unionOperation{Kind: operationKindAtomicRMW16Cmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)}
}
// newOperationTailCallReturnCall is a constructor for unionOperation with operationKindTailCallReturnCall.
//
// This corresponds to
//
// wasm.OpcodeTailCallReturnCall.
func newOperationTailCallReturnCall(functionIndex uint32) unionOperation {
return unionOperation{Kind: operationKindTailCallReturnCall, U1: uint64(functionIndex)}
}
// newOperationTailCallReturnCallIndirect is a constructor for unionOperation with operationKindTailCallReturnCallIndirect.
//
// This corresponds to
//
// wasm.OpcodeTailCallReturnCallIndirect.
func newOperationTailCallReturnCallIndirect(typeIndex, tableIndex uint32, dropDepth inclusiveRange, l label) unionOperation {
return unionOperation{Kind: operationKindTailCallReturnCallIndirect, U1: uint64(typeIndex), U2: uint64(tableIndex), Us: []uint64{dropDepth.AsU64(), uint64(l)}}
}

View File

@@ -272,9 +272,9 @@ func (c *compiler) wasmOpcodeSignature(op wasm.Opcode, index uint32) (*signature
return signature_I32_None, nil
case wasm.OpcodeReturn:
return signature_None_None, nil
case wasm.OpcodeCall:
case wasm.OpcodeCall, wasm.OpcodeTailCallReturnCall:
return c.funcTypeToSigs.get(c.funcs[index], false /* direct */), nil
case wasm.OpcodeCallIndirect:
case wasm.OpcodeCallIndirect, wasm.OpcodeTailCallReturnCallIndirect:
return c.funcTypeToSigs.get(index, true /* call_indirect */), nil
case wasm.OpcodeDrop:
return signature_Unknown_None, nil

View File

@@ -88,7 +88,7 @@ type Compiler interface {
MatchInstrOneOf(def SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode
// AddRelocationInfo appends the relocation information for the function reference at the current buffer offset.
AddRelocationInfo(funcRef ssa.FuncRef)
AddRelocationInfo(funcRef ssa.FuncRef, isTailCall bool)
// AddSourceOffsetInfo appends the source offset information for the given offset.
AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset)
@@ -115,6 +115,8 @@ type RelocationInfo struct {
Offset int64
// Target is the target function of the call instruction.
FuncRef ssa.FuncRef
// IsTailCall indicates whether the call instruction is a tail call.
IsTailCall bool
}
// compiler implements Compiler.
@@ -352,10 +354,11 @@ func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo {
}
// AddRelocationInfo implements Compiler.AddRelocationInfo.
func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) {
func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef, isTailCall bool) {
c.relocations = append(c.relocations, RelocationInfo{
Offset: int64(len(c.buf)),
FuncRef: funcRef,
Offset: int64(len(c.buf)),
FuncRef: funcRef,
IsTailCall: isTailCall,
})
}

View File

@@ -21,7 +21,9 @@ type instruction struct {
func (i *instruction) IsCall() bool { return i.kind == call }
// IsIndirectCall implements regalloc.Instr.
func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect }
func (i *instruction) IsIndirectCall() bool {
return i.kind == callIndirect
}
// IsReturn implements regalloc.Instr.
func (i *instruction) IsReturn() bool { return i.kind == ret }
@@ -288,6 +290,11 @@ func (i *instruction) String() string {
case nopUseReg:
return fmt.Sprintf("nop_use_reg %s", i.op1.format(true))
case tailCall:
return fmt.Sprintf("tailCall %s", ssa.FuncRef(i.u1))
case tailCallIndirect:
return fmt.Sprintf("tailCallIndirect %s", i.op1.format(true))
default:
panic(fmt.Sprintf("BUG: %d", int(i.kind)))
}
@@ -357,7 +364,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
default:
panic(fmt.Sprintf("BUG: invalid operand: %s", i))
}
case useKindCallInd:
case useKindCallInd, useKindTailCallInd:
op := i.op1
switch op.kind {
case operandKindReg:
@@ -428,13 +435,16 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
func (i *instruction) AssignUse(index int, v regalloc.VReg) {
switch uk := useKinds[i.kind]; uk {
case useKindNone:
case useKindCallInd:
case useKindCallInd, useKindTailCallInd:
if index != 0 {
panic("BUG")
}
op := &i.op1
switch op.kind {
case operandKindReg:
if uk == useKindTailCallInd && v != r11VReg {
panic("BUG")
}
op.setReg(v)
case operandKindMem:
op.addressMode().assignUses(index, v)
@@ -838,6 +848,12 @@ const (
// nopUseReg is a meta instruction that uses one register and does nothing.
nopUseReg
// tailCall represents a direct tail call, emitted as a jump to the target function.
tailCall
// tailCallIndirect represents an indirect tail call whose target is held in a register.
tailCallIndirect
instrMax
)
@@ -1079,6 +1095,10 @@ func (k instructionKind) String() string {
return "lockcmpxchg"
case lockxadd:
return "lockxadd"
case tailCall:
return "tailCall"
case tailCallIndirect:
return "tailCallIndirect"
default:
panic("BUG")
}
@@ -1173,6 +1193,27 @@ func (i *instruction) asCallIndirect(ptr operand, abi *backend.FunctionABI) *ins
return i
}
func (i *instruction) asTailCallReturnCall(ref ssa.FuncRef, abi *backend.FunctionABI) *instruction {
i.kind = tailCall
i.u1 = uint64(ref)
if abi != nil {
i.u2 = abi.ABIInfoAsUint64()
}
return i
}
func (i *instruction) asTailCallReturnCallIndirect(ptr operand, abi *backend.FunctionABI) *instruction {
if ptr.kind != operandKindReg && ptr.kind != operandKindMem {
panic("BUG")
}
i.kind = tailCallIndirect
i.op1 = ptr
if abi != nil {
i.u2 = abi.ABIInfoAsUint64()
}
return i
}
func (i *instruction) asRet() *instruction {
i.kind = ret
return i
@@ -2342,6 +2383,8 @@ var defKinds = [instrMax]defKind{
lockxadd: defKindNone,
neg: defKindNone,
nopUseReg: defKindNone,
tailCall: defKindCall,
tailCallIndirect: defKindCall,
}
// String implements fmt.Stringer.
@@ -2375,6 +2418,7 @@ const (
useKindBlendvpd
useKindCall
useKindCallInd
useKindTailCallInd
useKindFcvtToSintSequence
useKindFcvtToUintSequence
)
@@ -2425,6 +2469,8 @@ var useKinds = [instrMax]useKind{
lockxadd: useKindOp1RegOp2,
neg: useKindOp1,
nopUseReg: useKindOp1,
tailCall: useKindCall,
tailCallIndirect: useKindTailCallInd,
}
func (u useKind) String() string {
@@ -2441,6 +2487,8 @@ func (u useKind) String() string {
return "call"
case useKindCallInd:
return "callInd"
case useKindTailCallInd:
return "tailCallInd"
default:
return "invalid"
}

View File

@@ -1211,7 +1211,7 @@ func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) {
case call:
c.EmitByte(0xe8)
// Meaning that the call target is a function value, and requires relocation.
c.AddRelocationInfo(ssa.FuncRef(i.u1))
c.AddRelocationInfo(ssa.FuncRef(i.u1), false)
// Note that this is zero as a placeholder for the call target if it's a function value.
c.Emit4Bytes(uint32(i.u2))
@@ -1244,6 +1244,37 @@ func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) {
panic("BUG: invalid operand kind")
}
case tailCall:
// Encode as jmp.
c.EmitByte(0xe9)
// Meaning that the call target is a function value, and requires relocation.
c.AddRelocationInfo(ssa.FuncRef(i.u1), true)
// Note that this is zero as a placeholder for the call target if it's a function value.
c.Emit4Bytes(uint32(i.u2))
case tailCallIndirect:
op := i.op1
const opcodeNum = 1
const opcode = 0xff
const regMemSubOpcode = 4
rex := rexInfo(0).clearW()
switch op.kind {
// Indirect tail calls always take a register as the target.
// Note: the register must be a caller-saved (scratch) register that the
// epilogue does not restore (usually r11).
case operandKindReg:
dst := regEncodings[op.reg().RealReg()]
encodeRegReg(c,
legacyPrefixesNone,
opcode, opcodeNum,
regMemSubOpcode,
dst,
rex,
)
default:
panic("BUG: invalid operand kind")
}
case xchg:
src, dst := regEncodings[i.op1.reg().RealReg()], i.op2
size := i.u1

View File

@@ -17,7 +17,7 @@ import (
// NewBackend returns a new backend for amd64.
func NewBackend() backend.Machine {
m := &machine{
cpuFeatures: platform.CpuFeatures,
cpuFeatures: platform.CpuFeatures(),
regAlloc: regalloc.NewAllocator[*instruction, *labelPosition, *regAllocFn](regInfo),
spillSlots: map[regalloc.VRegID]int64{},
amodePool: wazevoapi.NewPool[amode](nil),
@@ -1109,6 +1109,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
atomicOp, size := instr.AtomicRmwData()
m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return())
case ssa.OpcodeTailCallReturnCall, ssa.OpcodeTailCallReturnCallIndirect:
m.lowerTailCall(instr)
default:
panic("TODO: lowering " + op.String())
}
@@ -1885,31 +1888,7 @@ func (m *machine) lowerStore(si *ssa.Instruction) {
func (m *machine) lowerCall(si *ssa.Instruction) {
isDirectCall := si.Opcode() == ssa.OpcodeCall
var indirectCalleePtr ssa.Value
var directCallee ssa.FuncRef
var sigID ssa.SignatureID
var args []ssa.Value
var isMemmove bool
if isDirectCall {
directCallee, sigID, args = si.CallData()
} else {
indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData()
}
calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID))
stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP.
}
// Note: See machine.SetupPrologue for the stack layout.
// The stack pointer decrease/increase will be inserted later in the compilation.
for i, arg := range args {
reg := m.c.VRegOf(arg)
def := m.c.ValueDefinition(arg)
m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
}
indirectCalleePtr, directCallee, isMemmove, calleeABI, stackSlotSize := m.prepareCall(si, isDirectCall)
if isMemmove {
// Go's memmove *might* use all xmm0-xmm15, so we need to release them.
@@ -1939,6 +1918,39 @@ func (m *machine) lowerCall(si *ssa.Instruction) {
m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[rdx]))
}
m.insertReturns(si, calleeABI, stackSlotSize)
}
func (m *machine) prepareCall(si *ssa.Instruction, isDirectCall bool) (ssa.Value, ssa.FuncRef, bool, *backend.FunctionABI, int64) {
var indirectCalleePtr ssa.Value
var directCallee ssa.FuncRef
var sigID ssa.SignatureID
var args []ssa.Value
var isMemmove bool
if isDirectCall {
directCallee, sigID, args = si.CallData()
} else {
indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData()
}
calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID))
stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP.
}
// Note: See machine.SetupPrologue for the stack layout.
// The stack pointer decrease/increase will be inserted later in the compilation.
for i, arg := range args {
reg := m.c.VRegOf(arg)
def := m.c.ValueDefinition(arg)
m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
}
return indirectCalleePtr, directCallee, isMemmove, calleeABI, stackSlotSize
}
func (m *machine) insertReturns(si *ssa.Instruction, calleeABI *backend.FunctionABI, stackSlotSize int64) {
var index int
r1, rs := si.Returns()
if r1.Valid() {
@@ -1952,6 +1964,43 @@ func (m *machine) lowerCall(si *ssa.Instruction) {
}
}
func (m *machine) lowerTailCall(si *ssa.Instruction) {
isDirectCall := si.Opcode() == ssa.OpcodeTailCallReturnCall
indirectCalleePtr, directCallee, isMemmove, calleeABI, stackSlotSize := m.prepareCall(si, isDirectCall)
if isMemmove {
panic("memmove not supported in tail calls")
}
isAllRegs := stackSlotSize == 0
switch {
case isDirectCall && isAllRegs:
call := m.allocateInstr().asTailCallReturnCall(directCallee, calleeABI)
m.insert(call)
case !isDirectCall && isAllRegs:
// In a tail call the epilogue is inserted before the jump instruction,
// so an arbitrary register might be overwritten while the stack is restored.
// Therefore, unlike a regular indirect call, we ensure the pointer is stored
// in a caller-saved register (r11) that the epilogue leaves untouched.
// For details, see internal/engine/RATIONALE.md
ptrOp := m.getOperand_Reg(m.c.ValueDefinition(indirectCalleePtr))
tmpJmp := r11VReg
m.InsertMove(tmpJmp, ptrOp.reg(), ssa.TypeI64)
callInd := m.allocateInstr().asTailCallReturnCallIndirect(newOperandReg(tmpJmp), calleeABI)
m.insert(callInd)
case isDirectCall && !isAllRegs:
call := m.allocateInstr().asCall(directCallee, calleeABI)
m.insert(call)
case !isDirectCall && !isAllRegs:
ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr))
callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI)
m.insert(callInd)
}
// If this is a proper tail call, returns will be cleared in the postRegAlloc phase.
m.insertReturns(si, calleeABI, stackSlotSize)
}
// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
// caller side of the function call.
func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def backend.SSAValueDefinition, stackSlotSize int64) {

View File

@@ -188,6 +188,23 @@ func (m *machine) postRegAlloc() {
linkInstr(inc, next)
}
continue
case tailCall, tailCallIndirect:
// At this point, reg alloc is done, so we can safely insert the RSP-decrement instruction
// right before the tail-call (jump) instruction. If this were done before reg alloc, the stack slot
// could point to the wrong location and therefore yield a wrong value.
tailCall := cur
_, _, _, _, size := backend.ABIInfoFromUint64(tailCall.u2)
if size > 0 {
dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true)
linkInstr(tailCall.prev, dec)
linkInstr(dec, tailCall)
}
// In a tail call, we insert the epilogue before the jump instruction.
m.setupEpilogueAfter(tailCall.prev)
// If this has been encoded as a proper tail call, we can remove the trailing instructions.
// For details, see internal/engine/RATIONALE.md
m.removeUntilRet(cur.next)
continue
}
// Removes the redundant copy instruction.
@@ -278,6 +295,20 @@ func (m *machine) setupEpilogueAfter(cur *instruction) {
linkInstr(cur, prevNext)
}
// removeUntilRet removes the instructions starting from `cur` until the first `ret` instruction.
func (m *machine) removeUntilRet(cur *instruction) {
for ; cur != nil; cur = cur.next {
prev, next := cur.prev, cur.next
prev.next = next
if next != nil {
next.prev = prev
}
if cur.kind == ret {
return
}
}
}
func (m *machine) addRSP(offset int32, cur *instruction) *instruction {
if offset == 0 {
return cur

View File

@@ -261,6 +261,23 @@ func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn reg
func (m *machine) lowerCall(si *ssa.Instruction) {
isDirectCall := si.Opcode() == ssa.OpcodeCall
indirectCalleePtr, directCallee, calleeABI, stackSlotSize := m.prepareCall(si, isDirectCall)
if isDirectCall {
call := m.allocateInstr()
call.asCall(directCallee, calleeABI)
m.insert(call)
} else {
ptr := m.compiler.VRegOf(indirectCalleePtr)
callInd := m.allocateInstr()
callInd.asCallIndirect(ptr, calleeABI)
m.insert(callInd)
}
m.insertReturns(si, calleeABI, stackSlotSize)
}
func (m *machine) prepareCall(si *ssa.Instruction, isDirectCall bool) (ssa.Value, ssa.FuncRef, *backend.FunctionABI, int64) {
var indirectCalleePtr ssa.Value
var directCallee ssa.FuncRef
var sigID ssa.SignatureID
@@ -282,18 +299,10 @@ func (m *machine) lowerCall(si *ssa.Instruction) {
def := m.compiler.ValueDefinition(arg)
m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
}
return indirectCalleePtr, directCallee, calleeABI, stackSlotSize
}
if isDirectCall {
call := m.allocateInstr()
call.asCall(directCallee, calleeABI)
m.insert(call)
} else {
ptr := m.compiler.VRegOf(indirectCalleePtr)
callInd := m.allocateInstr()
callInd.asCallIndirect(ptr, calleeABI)
m.insert(callInd)
}
func (m *machine) insertReturns(si *ssa.Instruction, calleeABI *backend.FunctionABI, stackSlotSize int64) {
var index int
r1, rs := si.Returns()
if r1.Valid() {
@@ -307,6 +316,40 @@ func (m *machine) lowerCall(si *ssa.Instruction) {
}
}
func (m *machine) lowerTailCall(si *ssa.Instruction) {
isDirectCall := si.Opcode() == ssa.OpcodeTailCallReturnCall
indirectCalleePtr, directCallee, calleeABI, stackSlotSize := m.prepareCall(si, isDirectCall)
// We currently support tail calls only when the args are passed via registers;
// otherwise we fall back to a plain call.
// For details, see internal/engine/RATIONALE.md
isAllRegs := stackSlotSize == 0
switch {
case isDirectCall && isAllRegs:
tailJump := m.allocateInstr()
tailJump.asTailCall(directCallee, calleeABI)
m.insert(tailJump)
case !isDirectCall && isAllRegs:
ptr := m.compiler.VRegOf(indirectCalleePtr)
callInd := m.allocateInstr()
callInd.asTailCallIndirect(ptr, calleeABI)
m.insert(callInd)
case isDirectCall && !isAllRegs:
tailJump := m.allocateInstr()
tailJump.asCall(directCallee, calleeABI)
m.insert(tailJump)
case !isDirectCall && !isAllRegs:
ptr := m.compiler.VRegOf(indirectCalleePtr)
callInd := m.allocateInstr()
callInd.asCallIndirect(ptr, calleeABI)
m.insert(callInd)
}
// If this is a proper tail call, returns will be cleared in the postRegAlloc phase.
m.insertReturns(si, calleeABI, stackSlotSize)
}
func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) {
if imm12Operand, ok := asImm12Operand(uint64(diff)); ok {
alu := m.allocateInstr()

View File

@@ -140,6 +140,8 @@ var defKinds = [numInstructionKinds]defKind{
atomicStore: defKindNone,
dmb: defKindNone,
loadConstBlockArg: defKindRD,
tailCall: defKindCall,
tailCallInd: defKindCall,
}
// Defs returns the list of regalloc.VReg that are defined by the instruction.
@@ -278,6 +280,8 @@ var useKinds = [numInstructionKinds]useKind{
atomicStore: useKindRNRM,
loadConstBlockArg: useKindNone,
dmb: useKindNone,
tailCall: useKindCall,
tailCallInd: useKindCallInd,
}
// Uses returns the list of regalloc.VReg that are used by the instruction.
@@ -1501,6 +1505,10 @@ func (i *instruction) String() (str string) {
str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
case dmb:
str = "dmb"
case tailCall:
str = fmt.Sprintf("b %s", ssa.FuncRef(i.u1))
case tailCallInd:
str = fmt.Sprintf("b %s", formatVRegSized(i.rn.nr(), 64))
case udf:
str = "udf"
case emitSourceOffsetInfo:
@@ -1550,6 +1558,22 @@ func (i *instruction) asDMB() {
i.kind = dmb
}
func (i *instruction) asTailCall(ref ssa.FuncRef, abi *backend.FunctionABI) {
i.kind = tailCall
i.u1 = uint64(ref)
if abi != nil {
i.u2 = abi.ABIInfoAsUint64()
}
}
func (i *instruction) asTailCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) {
i.kind = tailCallInd
i.rn = operandNR(ptr)
if abi != nil {
i.u2 = abi.ABIInfoAsUint64()
}
}
// TODO: delete unnecessary things.
const (
// nop0 represents a no-op of zero size.
@@ -1727,6 +1751,10 @@ const (
atomicStore
// dmb represents the data memory barrier instruction in inner-shareable (ish) mode.
dmb
// tailCall represents a tail call instruction.
tailCall
// tailCallInd represents an indirect tail call instruction.
tailCallInd
// UDF is the undefined instruction. For debugging only.
udf
// loadConstBlockArg represents a load of a constant block argument.

View File

@@ -39,7 +39,7 @@ func (i *instruction) encode(m *machine) {
c.Emit4Bytes(encodeUnconditionalBranch(false, imm))
case call:
// We still don't know the exact address of the function to call, so we emit a placeholder.
c.AddRelocationInfo(i.callFuncRef())
c.AddRelocationInfo(i.callFuncRef(), false)
c.Emit4Bytes(encodeUnconditionalBranch(true, 0)) // 0 = placeholder
case callInd:
c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true))
@@ -417,6 +417,12 @@ func (i *instruction) encode(m *machine) {
))
case dmb:
c.Emit4Bytes(encodeDMB())
case tailCall:
// We still don't know the exact address of the function to call, so we emit a placeholder.
c.AddRelocationInfo(i.callFuncRef(), true) // true = IsTailCall
c.Emit4Bytes(encodeUnconditionalBranch(false, 0)) // 0 = placeholder
case tailCallInd:
c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], false))
default:
panic(i.String())
}

View File

@@ -788,6 +788,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
instr.asDMB()
m.insert(instr)
case ssa.OpcodeTailCallReturnCall, ssa.OpcodeTailCallReturnCallIndirect:
m.lowerTailCall(instr)
default:
panic("TODO: lowering " + op.String())
}

View File

@@ -198,6 +198,11 @@ func (m *machine) postRegAlloc() {
switch cur.kind {
case ret:
m.setupEpilogueAfter(cur.prev)
case tailCall, tailCallInd:
m.setupEpilogueAfter(cur.prev)
// If this has been encoded as a proper tail call, we can remove the trailing instructions.
// For details, see internal/engine/RATIONALE.md
m.removeUntilRet(cur.next)
case loadConstBlockArg:
lc := cur
next := lc.next
@@ -325,6 +330,20 @@ func (m *machine) setupEpilogueAfter(cur *instruction) {
linkInstr(cur, prevNext)
}
// removeUntilRet removes the instructions starting from `cur` until the first `ret` instruction.
func (m *machine) removeUntilRet(cur *instruction) {
for ; cur != nil; cur = cur.next {
prev, next := cur.prev, cur.next
prev.next = next
if next != nil {
next.prev = prev
}
if cur.kind == ret {
return
}
}
}
// saveRequiredRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.

View File

@@ -59,13 +59,19 @@ func (m *machine) ResolveRelocations(
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
// Find the near trampoline island from callTrampolineIslandOffsets.
islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset))
islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef)
// Imported functions don't need trampolines, so we ignore them when we compute the offset
// (see also encodeCallTrampolineIsland)
funcOffset := int(r.FuncRef) - importedFns
islandTargetOffset := islandOffset + trampolineCallSize*funcOffset
diff = int64(islandTargetOffset) - (instrOffset)
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
panic("BUG in trampoline placement")
}
}
binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff))
// The unconditional branch instruction is usually encoded as a branch-and-link (BL),
// because it is a function call. However, if the instruction is a tail call,
// we encode it as a plain unconditional branch (B), so we won't overwrite the link register.
binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(!r.IsTailCall, diff))
}
}

View File

@@ -6,8 +6,10 @@ import (
"errors"
"fmt"
"runtime"
"slices"
"sort"
"sync"
"sync/atomic"
"unsafe"
"github.com/tetratelabs/wazero/api"
@@ -23,11 +25,15 @@ import (
)
type (
compiledModuleWithCount struct {
*compiledModule
refCount int
}
// engine implements wasm.Engine.
engine struct {
wazeroVersion string
fileCache filecache.Cache
compiledModules map[wasm.ModuleID]*compiledModule
compiledModules map[wasm.ModuleID]*compiledModuleWithCount
// sortedCompiledModules is a list of compiled modules sorted by the initial address of the executable.
sortedCompiledModules []*compiledModule
mux sync.RWMutex
@@ -42,25 +48,32 @@ type (
}
sharedFunctions struct {
// memoryGrowExecutable is a compiled trampoline executable for memory.grow builtin function.
memoryGrowExecutable []byte
// checkModuleExitCode is a compiled trampoline executable for checking module instance exit code. This
// is used when ensureTermination is true.
checkModuleExitCode []byte
// stackGrowExecutable is a compiled executable for growing stack builtin function.
stackGrowExecutable []byte
// tableGrowExecutable is a compiled trampoline executable for table.grow builtin function.
tableGrowExecutable []byte
// refFuncExecutable is a compiled trampoline executable for ref.func builtin function.
refFuncExecutable []byte
// memoryWait32Executable is a compiled trampoline executable for memory.wait32 builtin function
memoryWait32Executable []byte
// memoryWait64Executable is a compiled trampoline executable for memory.wait64 builtin function
memoryWait64Executable []byte
// memoryNotifyExecutable is a compiled trampoline executable for memory.notify builtin function
memoryNotifyExecutable []byte
listenerBeforeTrampolines map[*wasm.FunctionType][]byte
listenerAfterTrampolines map[*wasm.FunctionType][]byte
// The compiled trampolines executable.
executable []byte
// memoryGrowAddress is the address of memory.grow builtin function.
memoryGrowAddress *byte
// checkModuleExitCodeAddress is the address of checking module instance exit code.
// This is used when ensureTermination is true.
checkModuleExitCodeAddress *byte
// stackGrowAddress is the address of growing stack builtin function.
stackGrowAddress *byte
// tableGrowAddress is the address of table.grow builtin function.
tableGrowAddress *byte
// refFuncAddress is the address of ref.func builtin function.
refFuncAddress *byte
// memoryWait32Address is the address of memory.wait32 builtin function
memoryWait32Address *byte
// memoryWait64Address is the address of memory.wait64 builtin function
memoryWait64Address *byte
// memoryNotifyAddress is the address of memory.notify builtin function
memoryNotifyAddress *byte
listenerTrampolines listenerTrampolines
}
listenerTrampolines = map[*wasm.FunctionType]struct {
executable []byte
before *byte
after *byte
}
// compiledModule is a compiled variant of a wasm.Module and ready to be used for instantiation.
@@ -83,8 +96,9 @@ type (
}
executables struct {
executable []byte
entryPreambles [][]byte
executable []byte
entryPreambles []byte
entryPreamblesPtrs []*byte
}
)
@@ -105,7 +119,7 @@ func NewEngine(ctx context.Context, _ api.CoreFeatures, fc filecache.Cache) wasm
machine := newMachine()
be := backend.NewCompiler(ctx, machine, ssa.NewBuilder())
e := &engine{
compiledModules: make(map[wasm.ModuleID]*compiledModule),
compiledModules: make(map[wasm.ModuleID]*compiledModuleWithCount),
setFinalizer: runtime.SetFinalizer,
machine: machine,
be: be,
@@ -164,23 +178,46 @@ func (e *engine) CompileModule(ctx context.Context, module *wasm.Module, listene
}
func (exec *executables) compileEntryPreambles(m *wasm.Module, machine backend.Machine, be backend.Compiler) {
exec.entryPreambles = make([][]byte, len(m.TypeSection))
for i := range m.TypeSection {
if len(m.TypeSection) == 0 {
return
}
var preambles []byte
sizes := make([]int, len(m.TypeSection))
for i := range sizes {
typ := &m.TypeSection[i]
sig := frontend.SignatureForWasmFunctionType(typ)
be.Init()
buf := machine.CompileEntryPreamble(&sig)
executable := mmapExecutable(buf)
exec.entryPreambles[i] = executable
preambles = append(preambles, buf...)
align := 15 & -len(preambles) // Align 16-bytes boundary.
preambles = append(preambles, make([]byte, align)...)
sizes[i] = len(buf) + align
}
exec.entryPreambles = mmapExecutable(preambles)
exec.entryPreamblesPtrs = make([]*byte, len(sizes))
offset := 0
for i, size := range sizes {
ptr := &exec.entryPreambles[offset]
exec.entryPreamblesPtrs[i] = ptr
offset += size
if wazevoapi.PerfMapEnabled {
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&executable[0])),
uint64(len(executable)), fmt.Sprintf("entry_preamble::type=%s", typ.String()))
typ := &m.TypeSection[i]
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(ptr)),
uint64(size), fmt.Sprintf("entry_preamble::type=%s", typ.String()))
}
}
}
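The preamble packing above rounds the accumulated preamble buffer up to a 16-byte boundary after each preamble using `15 & -len(preambles)`. A quick illustration of that two's-complement trick (not part of the diff):

```go
// pad16 returns how many bytes must be appended to n bytes to reach the next
// multiple of 16; in two's complement, 15 & -n equals (16 - n%16) % 16.
func pad16(n int) int { return 15 & -n }

// pad16(35) == 13, and 35+13 == 48; pad16(48) == 0.
```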
func (e *engine) compileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (*compiledModule, error) {
if module.IsHostModule {
return e.compileHostModule(ctx, module, listeners)
}
withListener := len(listeners) > 0
cm := &compiledModule{
offsets: wazevoapi.NewModuleContextOffsetData(module, withListener), parent: e, module: module,
@@ -188,116 +225,137 @@ func (e *engine) compileModule(ctx context.Context, module *wasm.Module, listene
executables: &executables{},
}
if module.IsHostModule {
return e.compileHostModule(ctx, module, listeners)
}
importedFns, localFns := int(module.ImportFunctionCount), len(module.FunctionSection)
if localFns == 0 {
return cm, nil
}
rels := make([]backend.RelocationInfo, 0)
refToBinaryOffset := make([]int, importedFns+localFns)
if wazevoapi.DeterministicCompilationVerifierEnabled {
// The compilation must be deterministic regardless of the order of functions being compiled.
wazevoapi.DeterministicCompilationVerifierRandomizeIndexes(ctx)
machine := newMachine()
relocator, err := newEngineRelocator(machine, importedFns, localFns)
if err != nil {
return nil, err
}
needSourceInfo := module.DWARFLines != nil
// Creates new compiler instances which are reused for each function.
ssaBuilder := ssa.NewBuilder()
fe := frontend.NewFrontendCompiler(module, ssaBuilder, &cm.offsets, ensureTermination, withListener, needSourceInfo)
machine := newMachine()
be := backend.NewCompiler(ctx, machine, ssaBuilder)
cm.executables.compileEntryPreambles(module, machine, be)
totalSize := 0 // Total binary size of the executable.
cm.functionOffsets = make([]int, localFns)
bodies := make([][]byte, localFns)
// Trampoline relocation related variables.
trampolineInterval, callTrampolineIslandSize, err := machine.CallTrampolineIslandInfo(localFns)
if err != nil {
return nil, err
var indexes []int
if wazevoapi.DeterministicCompilationVerifierEnabled {
// The compilation must be deterministic regardless of the order of functions being compiled.
indexes = wazevoapi.DeterministicCompilationVerifierRandomizeIndexes(ctx)
}
needCallTrampoline := callTrampolineIslandSize > 0
var callTrampolineIslandOffsets []int // Holds the offsets of trampoline islands.
for i := range module.CodeSection {
if wazevoapi.DeterministicCompilationVerifierEnabled {
i = wazevoapi.DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx, i)
}
if workers := experimental.GetCompilationWorkers(ctx); workers <= 1 {
// Compile with a single goroutine.
fe := frontend.NewFrontendCompiler(module, ssaBuilder, &cm.offsets, ensureTermination, withListener, needSourceInfo)
fidx := wasm.Index(i + importedFns)
if wazevoapi.NeedFunctionNameInContext {
def := module.FunctionDefinition(fidx)
name := def.DebugName()
if len(def.ExportNames()) > 0 {
name = def.ExportNames()[0]
for i := range module.CodeSection {
if wazevoapi.DeterministicCompilationVerifierEnabled {
i = indexes[i]
}
ctx = wazevoapi.SetCurrentFunctionName(ctx, i, fmt.Sprintf("[%d/%d]%s", i, len(module.CodeSection)-1, name))
}
needListener := len(listeners) > 0 && listeners[i] != nil
body, relsPerFunc, err := e.compileLocalWasmFunction(ctx, module, wasm.Index(i), fe, ssaBuilder, be, needListener)
if err != nil {
return nil, fmt.Errorf("compile function %d/%d: %v", i, len(module.CodeSection)-1, err)
}
fidx := wasm.Index(i + importedFns)
fctx := functionContext(ctx, module, i, fidx)
// Align 16-bytes boundary.
totalSize = (totalSize + 15) &^ 15
cm.functionOffsets[i] = totalSize
if needSourceInfo {
// At the beginning of the function, we add the offset of the function body so that
// we can resolve the source location of the call site of the before-listener call.
cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize))
cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, module.CodeSection[i].BodyOffsetInCodeSection)
for _, info := range be.SourceOffsetInfo() {
cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize)+uintptr(info.ExecutableOffset))
cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, uint64(info.SourceOffset))
needListener := len(listeners) > i && listeners[i] != nil
body, relsPerFunc, err := e.compileLocalWasmFunction(fctx, module, wasm.Index(i), fe, ssaBuilder, be, needListener)
if err != nil {
return nil, fmt.Errorf("compile function %d/%d: %v", i, len(module.CodeSection)-1, err)
}
relocator.appendFunction(fctx, module, cm, i, fidx, body, relsPerFunc, be.SourceOffsetInfo())
}
} else {
// Compile with N worker goroutines.
// Collect compiled functions across workers in a slice,
// to be added to the relocator in-order and resolved serially at the end.
// This uses more memory and CPU (across cores), but can be significantly faster.
type compiledFunc struct {
fctx context.Context
fnum int
fidx wasm.Index
body []byte
relsPerFunc []backend.RelocationInfo
offsPerFunc []backend.SourceOffsetInfo
}
fref := frontend.FunctionIndexToFuncRef(fidx)
refToBinaryOffset[fref] = totalSize
compiledFuncs := make([]compiledFunc, len(module.CodeSection))
ctx, cancel := context.WithCancelCause(ctx)
defer cancel(nil)
// At this point, relocation offsets are relative to the start of the function body,
// so we adjust them to be relative to the start of the executable.
for _, r := range relsPerFunc {
r.Offset += int64(totalSize)
rels = append(rels, r)
var count atomic.Uint32
var wg sync.WaitGroup
wg.Add(workers)
for range workers {
go func() {
defer wg.Done()
// Creates new compiler instances which are reused for each function.
machine := newMachine()
ssaBuilder := ssa.NewBuilder()
be := backend.NewCompiler(ctx, machine, ssaBuilder)
fe := frontend.NewFrontendCompiler(module, ssaBuilder, &cm.offsets, ensureTermination, withListener, needSourceInfo)
for {
if err := ctx.Err(); err != nil {
// Compilation canceled!
return
}
i := int(count.Add(1)) - 1
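// Each worker atomically claims the next function index from the shared counter,
// so every function is compiled exactly once across the worker goroutines.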
if i >= len(module.CodeSection) {
return
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
i = indexes[i]
}
fidx := wasm.Index(i + importedFns)
fctx := functionContext(ctx, module, i, fidx)
needListener := len(listeners) > i && listeners[i] != nil
body, relsPerFunc, err := e.compileLocalWasmFunction(fctx, module, wasm.Index(i), fe, ssaBuilder, be, needListener)
if err != nil {
cancel(fmt.Errorf("compile function %d/%d: %v", i, len(module.CodeSection)-1, err))
return
}
compiledFuncs[i] = compiledFunc{
fctx, i, fidx, body,
// These slices are internal to the backend compiler; since we buffer them instead
// of processing them immediately, we need to copy the memory.
slices.Clone(relsPerFunc),
slices.Clone(be.SourceOffsetInfo()),
}
}
}()
}
bodies[i] = body
totalSize += len(body)
if wazevoapi.PrintMachineCodeHexPerFunction {
fmt.Printf("[[[machine code for %s]]]\n%s\n\n", wazevoapi.GetCurrentFunctionName(ctx), hex.EncodeToString(body))
wg.Wait()
if err := context.Cause(ctx); err != nil {
return nil, err
}
if needCallTrampoline {
// If the total size exceeds the trampoline interval, we need to add a trampoline island.
if totalSize/trampolineInterval > len(callTrampolineIslandOffsets) {
callTrampolineIslandOffsets = append(callTrampolineIslandOffsets, totalSize)
totalSize += callTrampolineIslandSize
}
for i := range compiledFuncs {
fn := &compiledFuncs[i]
relocator.appendFunction(fn.fctx, module, cm, fn.fnum, fn.fidx, fn.body, fn.relsPerFunc, fn.offsPerFunc)
}
}
// Allocate executable memory and then copy the generated machine code.
executable, err := platform.MmapCodeSegment(totalSize)
executable, err := platform.MmapCodeSegment(relocator.totalSize)
if err != nil {
panic(err)
}
cm.executable = executable
for i, b := range bodies {
for i, b := range relocator.bodies {
offset := cm.functionOffsets[i]
copy(executable[offset:], b)
}
@@ -312,22 +370,108 @@ func (e *engine) compileModule(ctx context.Context, module *wasm.Module, listene
}
}
// Resolve relocations for local function calls.
if len(rels) > 0 {
machine.ResolveRelocations(refToBinaryOffset, importedFns, executable, rels, callTrampolineIslandOffsets)
}
relocator.resolveRelocations(machine, executable, importedFns)
if runtime.GOARCH == "arm64" {
// On arm64, we cannot give all of rwx at the same time, so we change it to exec.
if err = platform.MprotectRX(executable); err != nil {
return nil, err
}
if err = platform.MprotectRX(executable); err != nil {
return nil, err
}
cm.sharedFunctions = e.sharedFunctions
e.setFinalizer(cm.executables, executablesFinalizer)
return cm, nil
}
func functionContext(ctx context.Context, module *wasm.Module, fnum int, fidx wasm.Index) context.Context {
if wazevoapi.NeedFunctionNameInContext {
def := module.FunctionDefinition(fidx)
name := def.DebugName()
if len(def.ExportNames()) > 0 {
name = def.ExportNames()[0]
}
ctx = wazevoapi.SetCurrentFunctionName(ctx, fnum, fmt.Sprintf("[%d/%d]%s", fnum, len(module.CodeSection)-1, name))
}
return ctx
}
type engineRelocator struct {
bodies [][]byte
refToBinaryOffset []int
rels []backend.RelocationInfo
totalSize int // Total binary size of the executable.
trampolineInterval int
callTrampolineIslandSize int
callTrampolineIslandOffsets []int // Holds the offsets of trampoline islands.
}
func newEngineRelocator(
machine backend.Machine,
importedFns, localFns int,
) (r engineRelocator, err error) {
// Trampoline relocation related variables.
r.trampolineInterval, r.callTrampolineIslandSize, err = machine.CallTrampolineIslandInfo(localFns)
r.refToBinaryOffset = make([]int, importedFns+localFns)
r.bodies = make([][]byte, 0, localFns)
return
}
func (r *engineRelocator) resolveRelocations(machine backend.Machine, executable []byte, importedFns int) {
// Resolve relocations for local function calls.
if len(r.rels) > 0 {
machine.ResolveRelocations(r.refToBinaryOffset, importedFns, executable, r.rels, r.callTrampolineIslandOffsets)
}
}
func (r *engineRelocator) appendFunction(
ctx context.Context,
module *wasm.Module,
cm *compiledModule,
fnum int, fidx wasm.Index,
body []byte,
relsPerFunc []backend.RelocationInfo,
offsPerFunc []backend.SourceOffsetInfo,
) {
// Align 16-bytes boundary.
r.totalSize = (r.totalSize + 15) &^ 15
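// ((x + 15) &^ 15 rounds x up to the next multiple of 16, e.g. 17 becomes 32.)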
cm.functionOffsets[fnum] = r.totalSize
needSourceInfo := module.DWARFLines != nil
if needSourceInfo {
// At the beginning of the function, we add the offset of the function body so that
// we can resolve the source location of the call site of the before-listener call.
cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(r.totalSize))
cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, module.CodeSection[fnum].BodyOffsetInCodeSection)
for _, info := range offsPerFunc {
cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(r.totalSize)+uintptr(info.ExecutableOffset))
cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, uint64(info.SourceOffset))
}
}
fref := frontend.FunctionIndexToFuncRef(fidx)
r.refToBinaryOffset[fref] = r.totalSize
// At this point, relocation offsets are relative to the start of the function body,
// so we adjust them to be relative to the start of the executable.
r.rels = slices.Grow(r.rels, len(relsPerFunc))
for _, rel := range relsPerFunc {
rel.Offset += int64(r.totalSize)
r.rels = append(r.rels, rel)
}
r.totalSize += len(body)
r.bodies = append(r.bodies, body)
if wazevoapi.PrintMachineCodeHexPerFunction {
fmt.Printf("[[[machine code for %s]]]\n%s\n\n", wazevoapi.GetCurrentFunctionName(ctx), hex.EncodeToString(body))
}
if r.callTrampolineIslandSize > 0 {
// If the total size exceeds the trampoline interval, we need to add a trampoline island.
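// (Islands provide nearby call targets once the executable grows beyond the
// ISA's direct-branch reach, e.g. the +/-128MiB range of arm64 BL.)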
if r.totalSize/r.trampolineInterval > len(r.callTrampolineIslandOffsets) {
r.callTrampolineIslandOffsets = append(r.callTrampolineIslandOffsets, r.totalSize)
r.totalSize += r.callTrampolineIslandSize
}
}
}
func (e *engine) compileLocalWasmFunction(
ctx context.Context,
module *wasm.Module,
@@ -374,9 +518,7 @@ func (e *engine) compileLocalWasmFunction(
}
// TODO: optimize as zero copy.
copied := make([]byte, len(original))
copy(copied, original)
return copied, rels, nil
return slices.Clone(original), rels, nil
}
func (e *engine) compileHostModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener) (*compiledModule, error) {
@@ -448,9 +590,7 @@ func (e *engine) compileHostModule(ctx context.Context, module *wasm.Module, lis
}
// TODO: optimize as zero copy.
copied := make([]byte, len(body))
copy(copied, body)
bodies[i] = copied
bodies[i] = slices.Clone(body)
totalSize += len(body)
}
@@ -475,11 +615,8 @@ func (e *engine) compileHostModule(ctx context.Context, module *wasm.Module, lis
wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets)
}
if runtime.GOARCH == "arm64" {
// On arm64, we cannot give all of rwx at the same time, so we change it to exec.
if err = platform.MprotectRX(executable); err != nil {
return nil, err
}
if err = platform.MprotectRX(executable); err != nil {
return nil, err
}
e.setFinalizer(cm.executables, executablesFinalizer)
return cm, nil
@@ -507,12 +644,17 @@ func (e *engine) DeleteCompiledModule(m *wasm.Module) {
e.mux.Lock()
defer e.mux.Unlock()
cm, ok := e.compiledModules[m.ID]
if ok {
if len(cm.executable) > 0 {
e.deleteCompiledModuleFromSortedList(cm)
}
delete(e.compiledModules, m.ID)
if !ok {
return
}
cm.refCount--
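// The count starts at 1 when the module is added and is incremented on each
// later lookup, so the executable is only released by the last deleter.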
if cm.refCount > 0 {
return
}
if len(cm.executable) > 0 {
e.deleteCompiledModuleFromSortedList(cm.compiledModule)
}
delete(e.compiledModules, m.ID)
}
func (e *engine) addCompiledModuleToSortedList(cm *compiledModule) {
@@ -569,7 +711,7 @@ func (e *engine) NewModuleEngine(m *wasm.Module, mi *wasm.ModuleInstance) (wasm.
// Note: imported functions are resolved in moduleEngine.ResolveImportedFunction.
me.importedFunctions = make([]importedFunction, m.ImportFunctionCount)
compiled, ok := e.getCompiledModuleFromMemory(m)
compiled, ok := e.getCompiledModuleFromMemory(m, false)
if !ok {
return nil, errors.New("source module must be compiled before instantiation")
}
@@ -591,167 +733,123 @@ func (e *engine) NewModuleEngine(m *wasm.Module, mi *wasm.ModuleInstance) (wasm.
}
func (e *engine) compileSharedFunctions() {
e.sharedFunctions = &sharedFunctions{
listenerBeforeTrampolines: make(map[*wasm.FunctionType][]byte),
listenerAfterTrampolines: make(map[*wasm.FunctionType][]byte),
var sizes [8]int
var trampolines []byte
addTrampoline := func(i int, buf []byte) {
trampolines = append(trampolines, buf...)
align := 15 & -len(trampolines) // Align 16-bytes boundary.
trampolines = append(trampolines, make([]byte, align)...)
sizes[i] = len(buf) + align
}
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeGrowMemory, &ssa.Signature{
addTrampoline(0,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeGrowMemory, &ssa.Signature{
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32},
Results: []ssa.Type{ssa.TypeI32},
}, false)
e.sharedFunctions.memoryGrowExecutable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.memoryGrowExecutable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_grow_trampoline")
}
}
}, false))
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeTableGrow, &ssa.Signature{
addTrampoline(1,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeTableGrow, &ssa.Signature{
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */},
Results: []ssa.Type{ssa.TypeI32},
}, false)
e.sharedFunctions.tableGrowExecutable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.tableGrowExecutable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "table_grow_trampoline")
}
}
}, false))
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCheckModuleExitCode, &ssa.Signature{
addTrampoline(2,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCheckModuleExitCode, &ssa.Signature{
Params: []ssa.Type{ssa.TypeI32 /* exec context */},
Results: []ssa.Type{ssa.TypeI32},
}, false)
e.sharedFunctions.checkModuleExitCode = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.checkModuleExitCode
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "check_module_exit_code_trampoline")
}
}
}, false))
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeRefFunc, &ssa.Signature{
addTrampoline(3,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeRefFunc, &ssa.Signature{
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* function index */},
Results: []ssa.Type{ssa.TypeI64}, // returns the function reference.
}, false)
e.sharedFunctions.refFuncExecutable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.refFuncExecutable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "ref_func_trampoline")
}
}
}, false))
e.be.Init()
{
src := e.machine.CompileStackGrowCallSequence()
e.sharedFunctions.stackGrowExecutable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.stackGrowExecutable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "stack_grow_trampoline")
}
}
addTrampoline(4, e.machine.CompileStackGrowCallSequence())
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait32, &ssa.Signature{
addTrampoline(5,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait32, &ssa.Signature{
// exec context, timeout, expected, addr
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64},
// Returns the status.
Results: []ssa.Type{ssa.TypeI32},
}, false)
e.sharedFunctions.memoryWait32Executable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.memoryWait32Executable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait32_trampoline")
}
}
}, false))
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait64, &ssa.Signature{
addTrampoline(6,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait64, &ssa.Signature{
// exec context, timeout, expected, addr
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64},
// Returns the status.
Results: []ssa.Type{ssa.TypeI32},
}, false)
e.sharedFunctions.memoryWait64Executable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.memoryWait64Executable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait64_trampoline")
}
}
}, false))
e.be.Init()
{
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryNotify, &ssa.Signature{
addTrampoline(7,
e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryNotify, &ssa.Signature{
// exec context, count, addr
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64},
// Returns the number notified.
Results: []ssa.Type{ssa.TypeI32},
}, false)
e.sharedFunctions.memoryNotifyExecutable = mmapExecutable(src)
if wazevoapi.PerfMapEnabled {
exe := e.sharedFunctions.memoryNotifyExecutable
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_notify_trampoline")
}
}, false))
fns := &sharedFunctions{
executable: mmapExecutable(trampolines),
listenerTrampolines: make(listenerTrampolines),
}
e.setFinalizer(fns, sharedFunctionsFinalizer)
offset := 0
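// The trampolines were appended back-to-back (16-byte aligned) above; walking
// the size table recovers each entry point inside the single mapping.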
fns.memoryGrowAddress = &fns.executable[offset]
offset += sizes[0]
fns.tableGrowAddress = &fns.executable[offset]
offset += sizes[1]
fns.checkModuleExitCodeAddress = &fns.executable[offset]
offset += sizes[2]
fns.refFuncAddress = &fns.executable[offset]
offset += sizes[3]
fns.stackGrowAddress = &fns.executable[offset]
offset += sizes[4]
fns.memoryWait32Address = &fns.executable[offset]
offset += sizes[5]
fns.memoryWait64Address = &fns.executable[offset]
offset += sizes[6]
fns.memoryNotifyAddress = &fns.executable[offset]
if wazevoapi.PerfMapEnabled {
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.memoryGrowAddress)), uint64(sizes[0]), "memory_grow_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.tableGrowAddress)), uint64(sizes[1]), "table_grow_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.checkModuleExitCodeAddress)), uint64(sizes[2]), "check_module_exit_code_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.refFuncAddress)), uint64(sizes[3]), "ref_func_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.stackGrowAddress)), uint64(sizes[4]), "stack_grow_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.memoryWait32Address)), uint64(sizes[5]), "memory_wait32_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.memoryWait64Address)), uint64(sizes[6]), "memory_wait64_trampoline")
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(fns.memoryNotifyAddress)), uint64(sizes[7]), "memory_notify_trampoline")
}
e.setFinalizer(e.sharedFunctions, sharedFunctionsFinalizer)
e.sharedFunctions = fns
}
func sharedFunctionsFinalizer(sf *sharedFunctions) {
if err := platform.MunmapCodeSegment(sf.memoryGrowExecutable); err != nil {
if err := platform.MunmapCodeSegment(sf.executable); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.checkModuleExitCode); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.stackGrowExecutable); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.tableGrowExecutable); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.refFuncExecutable); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.memoryWait32Executable); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.memoryWait64Executable); err != nil {
panic(err)
}
if err := platform.MunmapCodeSegment(sf.memoryNotifyExecutable); err != nil {
panic(err)
}
for _, f := range sf.listenerBeforeTrampolines {
if err := platform.MunmapCodeSegment(f); err != nil {
panic(err)
}
}
for _, f := range sf.listenerAfterTrampolines {
if err := platform.MunmapCodeSegment(f); err != nil {
for _, f := range sf.listenerTrampolines {
if err := platform.MunmapCodeSegment(f.executable); err != nil {
panic(err)
}
}
sf.memoryGrowExecutable = nil
sf.checkModuleExitCode = nil
sf.stackGrowExecutable = nil
sf.tableGrowExecutable = nil
sf.refFuncExecutable = nil
sf.memoryWait32Executable = nil
sf.memoryWait64Executable = nil
sf.memoryNotifyExecutable = nil
sf.listenerBeforeTrampolines = nil
sf.listenerAfterTrampolines = nil
sf.executable = nil
sf.listenerTrampolines = nil
}
func executablesFinalizer(exec *executables) {
@@ -762,12 +860,13 @@ func executablesFinalizer(exec *executables) {
}
exec.executable = nil
for _, f := range exec.entryPreambles {
if err := platform.MunmapCodeSegment(f); err != nil {
if len(exec.entryPreambles) > 0 {
if err := platform.MunmapCodeSegment(exec.entryPreambles); err != nil {
panic(err)
}
}
exec.entryPreambles = nil
exec.entryPreamblesPtrs = nil
}
func mmapExecutable(src []byte) []byte {
@@ -778,11 +877,8 @@ func mmapExecutable(src []byte) []byte {
copy(executable, src)
if runtime.GOARCH == "arm64" {
// On arm64, we cannot give all of rwx at the same time, so we change it to exec.
if err = platform.MprotectRX(executable); err != nil {
panic(err)
}
if err = platform.MprotectRX(executable); err != nil {
panic(err)
}
return executable
}
@@ -804,25 +900,30 @@ func (e *engine) getListenerTrampolineForType(functionType *wasm.FunctionType) (
e.mux.Lock()
defer e.mux.Unlock()
beforeBuf, ok := e.sharedFunctions.listenerBeforeTrampolines[functionType]
afterBuf := e.sharedFunctions.listenerAfterTrampolines[functionType]
if ok {
return &beforeBuf[0], &afterBuf[0]
trampoline, ok := e.sharedFunctions.listenerTrampolines[functionType]
if !ok {
var executable []byte
beforeSig, afterSig := frontend.SignatureForListener(functionType)
e.be.Init()
buf := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerBefore, beforeSig, false)
executable = append(executable, buf...)
align := 15 & -len(executable) // Align 16-bytes boundary.
executable = append(executable, make([]byte, align)...)
offset := len(executable)
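// offset marks where the "after" trampoline begins inside the shared mapping.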
e.be.Init()
buf = e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerAfter, afterSig, false)
executable = append(executable, buf...)
trampoline.executable = mmapExecutable(executable)
trampoline.before = &trampoline.executable[0]
trampoline.after = &trampoline.executable[offset]
e.sharedFunctions.listenerTrampolines[functionType] = trampoline
}
beforeSig, afterSig := frontend.SignatureForListener(functionType)
e.be.Init()
buf := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerBefore, beforeSig, false)
beforeBuf = mmapExecutable(buf)
e.be.Init()
buf = e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerAfter, afterSig, false)
afterBuf = mmapExecutable(buf)
e.sharedFunctions.listenerBeforeTrampolines[functionType] = beforeBuf
e.sharedFunctions.listenerAfterTrampolines[functionType] = afterBuf
return &beforeBuf[0], &afterBuf[0]
return trampoline.before, trampoline.after
}
func (cm *compiledModule) getSourceOffset(pc uintptr) uint64 {


@@ -8,7 +8,6 @@ import (
"fmt"
"hash/crc32"
"io"
"runtime"
"unsafe"
"github.com/tetratelabs/wazero/experimental"
@@ -33,7 +32,7 @@ func fileCacheKey(m *wasm.Module) (ret filecache.Key) {
s.Write(magic)
// Write the CPU features so that we can cache the compiled module for the same CPU.
// This prevents incompatible CPU features from being used.
cpu := platform.CpuFeatures.Raw()
cpu := platform.CpuFeatures().Raw()
// Reuse the `ret` buffer to write the first 8 bytes of the CPU features so that we can avoid the allocation.
binary.LittleEndian.PutUint64(ret[:8], cpu)
s.Write(ret[:8])
@@ -51,7 +50,7 @@ func (e *engine) addCompiledModule(module *wasm.Module, cm *compiledModule) (err
}
func (e *engine) getCompiledModule(module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (cm *compiledModule, ok bool, err error) {
cm, ok = e.getCompiledModuleFromMemory(module)
cm, ok = e.getCompiledModuleFromMemory(module, true)
if ok {
return
}
@@ -88,16 +87,23 @@ func (e *engine) getCompiledModule(module *wasm.Module, listeners []experimental
func (e *engine) addCompiledModuleToMemory(m *wasm.Module, cm *compiledModule) {
e.mux.Lock()
defer e.mux.Unlock()
e.compiledModules[m.ID] = cm
e.compiledModules[m.ID] = &compiledModuleWithCount{compiledModule: cm, refCount: 1}
if len(cm.executable) > 0 {
e.addCompiledModuleToSortedList(cm)
}
}
func (e *engine) getCompiledModuleFromMemory(module *wasm.Module) (cm *compiledModule, ok bool) {
e.mux.RLock()
defer e.mux.RUnlock()
cm, ok = e.compiledModules[module.ID]
func (e *engine) getCompiledModuleFromMemory(module *wasm.Module, increaseRefCount bool) (cm *compiledModule, ok bool) {
e.mux.Lock()
defer e.mux.Unlock()
cmWithCount, ok := e.compiledModules[module.ID]
if ok {
cm = cmWithCount.compiledModule
if increaseRefCount {
cmWithCount.refCount++
}
}
return
}
@@ -246,11 +252,8 @@ func deserializeCompiledModule(wazeroVersion string, reader io.ReadCloser) (cm *
return nil, false, fmt.Errorf("compilationcache: checksum mismatch (expected %d, got %d)", expected, checksum)
}
if runtime.GOARCH == "arm64" {
// On arm64, we cannot give all of rwx at the same time, so we change it to exec.
if err = platform.MprotectRX(executable); err != nil {
return nil, false, err
}
if err = platform.MprotectRX(executable); err != nil {
return nil, false, err
}
cm.executable = executable
}


@@ -469,7 +469,7 @@ func (c *Compiler) allocateVarLengthValues(_cap int, vs ...ssa.Value) ssa.Values
builder := c.ssaBuilder
pool := builder.VarLengthPool()
args := pool.Allocate(_cap)
args = args.Append(builder.VarLengthPool(), vs...)
args = args.Append(pool, vs...)
return args
}


@@ -123,8 +123,7 @@ func (c *Compiler) nPeekDup(n int) ssa.Values {
l := c.state()
tail := len(l.values)
args := c.allocateVarLengthValues(n)
args = args.Append(c.ssaBuilder.VarLengthPool(), l.values[tail-n:tail]...)
args := c.allocateVarLengthValues(n, l.values[tail-n:tail]...)
return args
}
@@ -665,19 +664,22 @@ func (c *Compiler) lowerCurrentOpcode() {
tableBaseAddr := c.loadTableBaseAddr(tableInstancePtr)
addr := builder.AllocateInstruction().AsIadd(tableBaseAddr, offsetInBytes).Insert(builder).Return()
// Prepare the loop and following block.
beforeLoop := builder.AllocateBasicBlock()
loopBlk := builder.AllocateBasicBlock()
loopVar := loopBlk.AddParam(builder, ssa.TypeI64)
followingBlk := builder.AllocateBasicBlock()
// Uses the same copy trick as memory.fill to fill the buffer faster, but in this case we copy 8 bytes at a time.
// Tables are rarely huge, so ignore the 8KB maximum.
// https://github.com/golang/go/blob/go1.24.0/src/slices/slices.go#L514-L517
//
// buf := memoryInst.Buffer[offset : offset+fillSize]
// buf[0:8] = value
// for i := 8; i < fillSize; i *= 2 { // begin with 8 bytes
// copy(buf[i:], buf[:i])
// }
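// e.g. a 40-byte fill writes 8 bytes, copies them into [8:16], copies 16 bytes
// into [16:32], and finally copies only the remaining 8 bytes into [32:40].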
// Prepare the loop and following block.
beforeLoop := builder.AllocateBasicBlock()
loopBlk := builder.AllocateBasicBlock()
loopVar := loopBlk.AddParam(builder, ssa.TypeI64)
followingBlk := builder.AllocateBasicBlock()
// Insert the jump to the beforeLoop block; if fillSize is zero, jump to the following block to skip the fill logic entirely.
zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return()
ifFillSizeZero := builder.AllocateInstruction().AsIcmp(fillSizeExt, zero, ssa.IntegerCmpCondEqual).
@@ -688,32 +690,24 @@ func (c *Compiler) lowerCurrentOpcode() {
// buf[0:8] = value
builder.SetCurrentBlock(beforeLoop)
builder.AllocateInstruction().AsStore(ssa.OpcodeStore, value, addr, 0).Insert(builder)
initValue := builder.AllocateInstruction().AsIconst64(8).Insert(builder).Return()
c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk)
eight := builder.AllocateInstruction().AsIconst64(8).Insert(builder).Return()
c.insertJumpToBlock(c.allocateVarLengthValues(1, eight), loopBlk)
builder.SetCurrentBlock(loopBlk)
dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return()
// If loopVar*2 > fillSizeInBytes, then count must be fillSizeInBytes-loopVar.
var count ssa.Value
{
loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return()
loopVarDoubledLargerThanFillSize := builder.
AllocateInstruction().AsIcmp(loopVarDoubled, fillSizeInBytes, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual).
Insert(builder).Return()
diff := builder.AllocateInstruction().AsIsub(fillSizeInBytes, loopVar).Insert(builder).Return()
count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return()
}
newLoopVar := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return()
newLoopVarLessThanFillSize := builder.AllocateInstruction().
AsIcmp(newLoopVar, fillSizeInBytes, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return()
// On the last iteration, count must be fillSizeInBytes-loopVar.
diff := builder.AllocateInstruction().AsIsub(fillSizeInBytes, loopVar).Insert(builder).Return()
count := builder.AllocateInstruction().AsSelect(newLoopVarLessThanFillSize, loopVar, diff).Insert(builder).Return()
c.callMemmove(dstAddr, addr, count)
shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return()
newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return()
loopVarLessThanFillSize := builder.AllocateInstruction().
AsIcmp(newLoopVar, fillSizeInBytes, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return()
builder.AllocateInstruction().
AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk).
AsBrnz(newLoopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk).
Insert(builder)
c.insertJumpToBlock(ssa.ValuesNil, followingBlk)
@@ -741,11 +735,15 @@ func (c *Compiler) lowerCurrentOpcode() {
// Calculate the base address:
addr := builder.AllocateInstruction().AsIadd(c.getMemoryBaseValue(false), offset).Insert(builder).Return()
// Uses the copy trick for faster filling buffer: https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d
// Uses the copy trick to fill the buffer faster, with a maximum chunk size of 8KB.
// https://github.com/golang/go/blob/go1.24.0/src/bytes/bytes.go#L664-L673
//
// buf := memoryInst.Buffer[offset : offset+fillSize]
// buf[0] = value
// for i := 1; i < fillSize; i *= 2 {
// copy(buf[i:], buf[:i])
// for i := 1; i < fillSize; {
// chunk := ((i - 1) & 8191) + 1
// copy(buf[i:], buf[:chunk])
// i += chunk
// }
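// Along the loop's actual sequence (i = 1, 2, 4, ..., 8192, 16384, ...) the mask
// makes chunk = min(i, 8192), i.e. doubling copies capped at 8KiB per memmove.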
// Prepare the loop and following block.
@@ -764,32 +762,31 @@ func (c *Compiler) lowerCurrentOpcode() {
// buf[0] = value
builder.SetCurrentBlock(beforeLoop)
builder.AllocateInstruction().AsStore(ssa.OpcodeIstore8, value, addr, 0).Insert(builder)
initValue := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return()
c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk)
one := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return()
c.insertJumpToBlock(c.allocateVarLengthValues(1, one), loopBlk)
builder.SetCurrentBlock(loopBlk)
dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return()
// If loopVar*2 > fillSizeExt, then count must be fillSizeExt-loopVar.
var count ssa.Value
{
loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return()
loopVarDoubledLargerThanFillSize := builder.
AllocateInstruction().AsIcmp(loopVarDoubled, fillSize, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual).
Insert(builder).Return()
diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return()
count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return()
}
// chunk := ((i - 1) & 8191) + 1
mask := builder.AllocateInstruction().AsIconst64(8191).Insert(builder).Return()
tmp1 := builder.AllocateInstruction().AsIsub(loopVar, one).Insert(builder).Return()
tmp2 := builder.AllocateInstruction().AsBand(tmp1, mask).Insert(builder).Return()
chunk := builder.AllocateInstruction().AsIadd(tmp2, one).Insert(builder).Return()
// i += chunk
newLoopVar := builder.AllocateInstruction().AsIadd(loopVar, chunk).Insert(builder).Return()
newLoopVarLessThanFillSize := builder.AllocateInstruction().
AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return()
// count = min(chunk, fillSize-loopVar)
diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return()
count := builder.AllocateInstruction().AsSelect(newLoopVarLessThanFillSize, chunk, diff).Insert(builder).Return()
c.callMemmove(dstAddr, addr, count)
shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return()
newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return()
loopVarLessThanFillSize := builder.AllocateInstruction().
AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return()
builder.AllocateInstruction().
AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk).
AsBrnz(newLoopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk).
Insert(builder)
c.insertJumpToBlock(ssa.ValuesNil, followingBlk)
@@ -1173,7 +1170,7 @@ func (c *Compiler) lowerCurrentOpcode() {
ssa.TypeI64,
).Insert(builder).Return()
args := c.allocateVarLengthValues(1, c.execCtxPtrValue, pages)
args := c.allocateVarLengthValues(2, c.execCtxPtrValue, pages)
callGrowRet := builder.
AllocateInstruction().
AsCallIndirect(memoryGrowPtr, &c.memoryGrowSig, args).
@@ -1343,8 +1340,7 @@ func (c *Compiler) lowerCurrentOpcode() {
blockType: bt,
})
args := c.allocateVarLengthValues(originalLen)
args = args.Append(builder.VarLengthPool(), state.values[originalLen:]...)
args := c.allocateVarLengthValues(len(bt.Params), state.values[originalLen:]...)
// Insert the jump to the header of loop.
br := builder.AllocateInstruction()
@@ -1383,8 +1379,7 @@ func (c *Compiler) lowerCurrentOpcode() {
// multiple definitions (one in Then and another in Else blocks).
c.addBlockParamsFromWasmTypes(bt.Results, followingBlk)
args := c.allocateVarLengthValues(len(bt.Params))
args = args.Append(builder.VarLengthPool(), state.values[len(state.values)-len(bt.Params):]...)
args := c.allocateVarLengthValues(len(bt.Params), state.values[len(state.values)-len(bt.Params):]...)
// Insert the conditional jump to the Else block.
brz := builder.AllocateInstruction()
@@ -1568,11 +1563,7 @@ func (c *Compiler) lowerCurrentOpcode() {
c.callListenerAfter()
}
results := c.nPeekDup(c.results())
instr := builder.AllocateInstruction()
instr.AsReturn(results)
builder.InsertInstruction(instr)
c.lowerReturn(builder)
state.unreachable = true
case wasm.OpcodeUnreachable:
@@ -1597,66 +1588,7 @@ func (c *Compiler) lowerCurrentOpcode() {
if state.unreachable {
break
}
var typIndex wasm.Index
if fnIndex < c.m.ImportFunctionCount {
// Before transferring control to the callee, we have to store the current module's moduleContextPtr
// into execContext.callerModuleContextPtr in case the callee is a Go function.
c.storeCallerModuleContext()
var fi int
for i := range c.m.ImportSection {
imp := &c.m.ImportSection[i]
if imp.Type == wasm.ExternTypeFunc {
if fi == int(fnIndex) {
typIndex = imp.DescFunc
break
}
fi++
}
}
} else {
typIndex = c.m.FunctionSection[fnIndex-c.m.ImportFunctionCount]
}
typ := &c.m.TypeSection[typIndex]
argN := len(typ.Params)
tail := len(state.values) - argN
vs := state.values[tail:]
state.values = state.values[:tail]
args := c.allocateVarLengthValues(2+len(vs), c.execCtxPtrValue)
sig := c.signatures[typ]
call := builder.AllocateInstruction()
if fnIndex >= c.m.ImportFunctionCount {
args = args.Append(builder.VarLengthPool(), c.moduleCtxPtrValue) // In this case the callee module is the module itself.
args = args.Append(builder.VarLengthPool(), vs...)
call.AsCall(FunctionIndexToFuncRef(fnIndex), sig, args)
builder.InsertInstruction(call)
} else {
// In this case we have to read the address of the imported function from the module context.
moduleCtx := c.moduleCtxPtrValue
loadFuncPtr, loadModuleCtxPtr := builder.AllocateInstruction(), builder.AllocateInstruction()
funcPtrOffset, moduleCtxPtrOffset, _ := c.offset.ImportedFunctionOffset(fnIndex)
loadFuncPtr.AsLoad(moduleCtx, funcPtrOffset.U32(), ssa.TypeI64)
loadModuleCtxPtr.AsLoad(moduleCtx, moduleCtxPtrOffset.U32(), ssa.TypeI64)
builder.InsertInstruction(loadFuncPtr)
builder.InsertInstruction(loadModuleCtxPtr)
args = args.Append(builder.VarLengthPool(), loadModuleCtxPtr.Return())
args = args.Append(builder.VarLengthPool(), vs...)
call.AsCallIndirect(loadFuncPtr.Return(), sig, args)
builder.InsertInstruction(call)
}
first, rest := call.Returns()
if first.Valid() {
state.push(first)
}
for _, v := range rest {
state.push(v)
}
c.reloadAfterCall()
c.lowerCall(fnIndex)
case wasm.OpcodeDrop:
if state.unreachable {
@@ -3190,7 +3122,7 @@ func (c *Compiler) lowerCurrentOpcode() {
ssa.TypeI64,
).Insert(builder).Return()
args := c.allocateVarLengthValues(3, c.execCtxPtrValue, timeout, exp, addr)
args := c.allocateVarLengthValues(4, c.execCtxPtrValue, timeout, exp, addr)
memoryWaitRet := builder.AllocateInstruction().
AsCallIndirect(memoryWaitPtr, sig, args).
Insert(builder).Return()
@@ -3211,7 +3143,7 @@ func (c *Compiler) lowerCurrentOpcode() {
wazevoapi.ExecutionContextOffsetMemoryNotifyTrampolineAddress.U32(),
ssa.TypeI64,
).Insert(builder).Return()
args := c.allocateVarLengthValues(2, c.execCtxPtrValue, count, addr)
args := c.allocateVarLengthValues(3, c.execCtxPtrValue, count, addr)
memoryNotifyRet := builder.AllocateInstruction().
AsCallIndirect(memoryNotifyPtr, &c.memoryNotifySig, args).
Insert(builder).Return()
@@ -3460,6 +3392,25 @@ func (c *Compiler) lowerCurrentOpcode() {
elementAddr := c.lowerAccessTableWithBoundsCheck(tableIndex, targetOffsetInTable)
loaded := builder.AllocateInstruction().AsLoad(elementAddr, 0, ssa.TypeI64).Insert(builder).Return()
state.push(loaded)
case wasm.OpcodeTailCallReturnCallIndirect:
typeIndex := c.readI32u()
tableIndex := c.readI32u()
if state.unreachable {
break
}
c.lowerTailCallReturnCallIndirect(typeIndex, tableIndex)
state.unreachable = true
case wasm.OpcodeTailCallReturnCall:
fnIndex := c.readI32u()
if state.unreachable {
break
}
c.lowerTailCallReturnCall(fnIndex)
state.unreachable = true
default:
panic("TODO: unsupported in wazevo yet: " + wasm.InstructionName(op))
}
@@ -3473,6 +3424,14 @@ func (c *Compiler) lowerCurrentOpcode() {
c.loweringState.pc++
}
func (c *Compiler) lowerReturn(builder ssa.Builder) {
results := c.nPeekDup(c.results())
instr := builder.AllocateInstruction()
instr.AsReturn(results)
builder.InsertInstruction(instr)
}
func (c *Compiler) lowerExtMul(v1, v2 ssa.Value, from, to ssa.VecLane, signed, low bool) ssa.Value {
// TODO: The sequence `Widen; Widen; VIMul` can be substituted for a single instruction on some ISAs.
builder := c.ssaBuilder
@@ -3533,7 +3492,83 @@ func (c *Compiler) lowerAccessTableWithBoundsCheck(tableIndex uint32, elementOff
return calcElementAddressInTable.Return()
}
func (c *Compiler) lowerCallIndirect(typeIndex, tableIndex uint32) {
func (c *Compiler) prepareCall(fnIndex uint32) (isIndirect bool, sig *ssa.Signature, args ssa.Values, funcRefOrPtrValue uint64) {
builder := c.ssaBuilder
state := c.state()
var typIndex wasm.Index
if fnIndex < c.m.ImportFunctionCount {
// Before transferring control to the callee, we have to store the current module's moduleContextPtr
// into execContext.callerModuleContextPtr in case the callee is a Go function.
c.storeCallerModuleContext()
var fi int
for i := range c.m.ImportSection {
imp := &c.m.ImportSection[i]
if imp.Type == wasm.ExternTypeFunc {
if fi == int(fnIndex) {
typIndex = imp.DescFunc
break
}
fi++
}
}
} else {
typIndex = c.m.FunctionSection[fnIndex-c.m.ImportFunctionCount]
}
typ := &c.m.TypeSection[typIndex]
argN := len(typ.Params)
tail := len(state.values) - argN
vs := state.values[tail:]
state.values = state.values[:tail]
args = c.allocateVarLengthValues(2+len(vs), c.execCtxPtrValue)
sig = c.signatures[typ]
if fnIndex >= c.m.ImportFunctionCount {
args = args.Append(builder.VarLengthPool(), c.moduleCtxPtrValue) // In this case the callee module is the module itself.
args = args.Append(builder.VarLengthPool(), vs...)
return false, sig, args, uint64(FunctionIndexToFuncRef(fnIndex))
} else {
// In this case we have to read the address of the imported function from the module context.
moduleCtx := c.moduleCtxPtrValue
loadFuncPtr, loadModuleCtxPtr := builder.AllocateInstruction(), builder.AllocateInstruction()
funcPtrOffset, moduleCtxPtrOffset, _ := c.offset.ImportedFunctionOffset(fnIndex)
loadFuncPtr.AsLoad(moduleCtx, funcPtrOffset.U32(), ssa.TypeI64)
loadModuleCtxPtr.AsLoad(moduleCtx, moduleCtxPtrOffset.U32(), ssa.TypeI64)
builder.InsertInstruction(loadFuncPtr)
builder.InsertInstruction(loadModuleCtxPtr)
args = args.Append(builder.VarLengthPool(), loadModuleCtxPtr.Return())
args = args.Append(builder.VarLengthPool(), vs...)
return true, sig, args, uint64(loadFuncPtr.Return())
}
}
func (c *Compiler) lowerCall(fnIndex uint32) {
builder := c.ssaBuilder
state := c.state()
isIndirect, sig, args, funcRefOrPtrValue := c.prepareCall(fnIndex)
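// funcRefOrPtrValue holds either the callee's ssa.FuncRef (direct call) or the
// ssa.Value of the loaded function pointer (imported callee), per isIndirect.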
call := builder.AllocateInstruction()
if isIndirect {
call.AsCallIndirect(ssa.Value(funcRefOrPtrValue), sig, args)
} else {
call.AsCall(ssa.FuncRef(funcRefOrPtrValue), sig, args)
}
builder.InsertInstruction(call)
first, rest := call.Returns()
if first.Valid() {
state.push(first)
}
for _, v := range rest {
state.push(v)
}
c.reloadAfterCall()
}
func (c *Compiler) prepareCallIndirect(typeIndex, tableIndex uint32) (ssa.Value, *wasm.FunctionType, ssa.Values) {
builder := c.ssaBuilder
state := c.state()
@@ -3601,6 +3636,14 @@ func (c *Compiler) lowerCallIndirect(typeIndex, tableIndex uint32) {
// into execContext.callerModuleContextPtr in case when the callee is a Go function.
c.storeCallerModuleContext()
return executablePtr, typ, args
}
func (c *Compiler) lowerCallIndirect(typeIndex, tableIndex uint32) {
builder := c.ssaBuilder
state := c.state()
executablePtr, typ, args := c.prepareCallIndirect(typeIndex, tableIndex)
call := builder.AllocateInstruction()
call.AsCallIndirect(executablePtr, c.signatures[typ], args)
builder.InsertInstruction(call)
@@ -3616,6 +3659,62 @@ func (c *Compiler) lowerCallIndirect(typeIndex, tableIndex uint32) {
c.reloadAfterCall()
}
func (c *Compiler) lowerTailCallReturnCall(fnIndex uint32) {
isIndirect, sig, args, funcRefOrPtrValue := c.prepareCall(fnIndex)
builder := c.ssaBuilder
state := c.state()
call := builder.AllocateInstruction()
if isIndirect {
call.AsTailCallReturnCallIndirect(ssa.Value(funcRefOrPtrValue), sig, args)
} else {
call.AsTailCallReturnCall(ssa.FuncRef(funcRefOrPtrValue), sig, args)
}
builder.InsertInstruction(call)
// In a proper tail call, the following code is unreachable since execution
// transfers to the callee. However, sometimes the backend might need to fall back to
// a regular call, so we include return handling and let the backend delete it
// when redundant.
// For details, see internal/engine/RATIONALE.md
first, rest := call.Returns()
if first.Valid() {
state.push(first)
}
for _, v := range rest {
state.push(v)
}
c.reloadAfterCall()
c.lowerReturn(builder)
}
func (c *Compiler) lowerTailCallReturnCallIndirect(typeIndex, tableIndex uint32) {
builder := c.ssaBuilder
state := c.state()
executablePtr, typ, args := c.prepareCallIndirect(typeIndex, tableIndex)
call := builder.AllocateInstruction()
call.AsTailCallReturnCallIndirect(executablePtr, c.signatures[typ], args)
builder.InsertInstruction(call)
// In a proper tail call, the following code is unreachable since execution
// transfers to the callee. However, sometimes the backend might need to fall back to
// a regular call, so we include return handling and let the backend delete it
// when redundant.
// For details, see internal/engine/RATIONALE.md
first, rest := call.Returns()
if first.Valid() {
state.push(first)
}
for _, v := range rest {
state.push(v)
}
c.reloadAfterCall()
c.lowerReturn(builder)
}
// memOpSetup inserts the bounds check and calculates the address of the memory operation (loads/stores).
func (c *Compiler) memOpSetup(baseAddr ssa.Value, constOffset, operationSizeInBytes uint64) (address ssa.Value) {
address = ssa.ValueInvalid


@@ -174,20 +174,21 @@ func (m *moduleEngine) NewFunction(index wasm.Index) api.Function {
indexInModule: index,
executable: &p.executable[offset],
parent: m,
preambleExecutable: &m.parent.entryPreambles[typIndex][0],
preambleExecutable: p.entryPreamblesPtrs[typIndex],
sizeOfParamResultSlice: sizeOfParamResultSlice,
requiredParams: typ.ParamNumInUint64,
numberOfResults: typ.ResultNumInUint64,
}
ce.execCtx.memoryGrowTrampolineAddress = &m.parent.sharedFunctions.memoryGrowExecutable[0]
ce.execCtx.stackGrowCallTrampolineAddress = &m.parent.sharedFunctions.stackGrowExecutable[0]
ce.execCtx.checkModuleExitCodeTrampolineAddress = &m.parent.sharedFunctions.checkModuleExitCode[0]
ce.execCtx.tableGrowTrampolineAddress = &m.parent.sharedFunctions.tableGrowExecutable[0]
ce.execCtx.refFuncTrampolineAddress = &m.parent.sharedFunctions.refFuncExecutable[0]
ce.execCtx.memoryWait32TrampolineAddress = &m.parent.sharedFunctions.memoryWait32Executable[0]
ce.execCtx.memoryWait64TrampolineAddress = &m.parent.sharedFunctions.memoryWait64Executable[0]
ce.execCtx.memoryNotifyTrampolineAddress = &m.parent.sharedFunctions.memoryNotifyExecutable[0]
sharedFunctions := p.sharedFunctions
ce.execCtx.memoryGrowTrampolineAddress = sharedFunctions.memoryGrowAddress
ce.execCtx.stackGrowCallTrampolineAddress = sharedFunctions.stackGrowAddress
ce.execCtx.checkModuleExitCodeTrampolineAddress = sharedFunctions.checkModuleExitCodeAddress
ce.execCtx.tableGrowTrampolineAddress = sharedFunctions.tableGrowAddress
ce.execCtx.refFuncTrampolineAddress = sharedFunctions.refFuncAddress
ce.execCtx.memoryWait32TrampolineAddress = sharedFunctions.memoryWait32Address
ce.execCtx.memoryWait64TrampolineAddress = sharedFunctions.memoryWait64Address
ce.execCtx.memoryNotifyTrampolineAddress = sharedFunctions.memoryNotifyAddress
ce.execCtx.memmoveAddress = memmovPtr
ce.init()
return ce


@@ -633,6 +633,14 @@ const (
// OpcodeFence is a memory fence operation.
OpcodeFence
// OpcodeTailCallReturnCall is the equivalent of OpcodeCall (a "near" call)
// for tail calls. Semantically, it combines Call + Return into a single operation.
OpcodeTailCallReturnCall
// OpcodeTailCallReturnCallIndirect is the equivalent of OpcodeCallIndirect (a call to a function address)
// for tail calls. Semantically, it combines CallIndirect + Return into a single operation.
OpcodeTailCallReturnCallIndirect
// opcodeEnd marks the end of the opcode list.
opcodeEnd
)
@@ -679,12 +687,44 @@ func (op AtomicRmwOp) String() string {
type returnTypesFn func(b *builder, instr *Instruction) (t1 Type, ts []Type)
var (
returnTypesFnNoReturns returnTypesFn = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return typeInvalid, nil }
returnTypesFnSingle = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return instr.typ, nil }
returnTypesFnI32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeI32, nil }
returnTypesFnF32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF32, nil }
returnTypesFnF64 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF64, nil }
returnTypesFnV128 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeV128, nil }
returnTypesFnNoReturns returnTypesFn = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return typeInvalid, nil }
returnTypesFnSingle = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return instr.typ, nil }
returnTypesFnI32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeI32, nil }
returnTypesFnF32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF32, nil }
returnTypesFnF64 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF64, nil }
returnTypesFnV128 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeV128, nil }
returnTypesFnCallIndirect = func(b *builder, instr *Instruction) (t1 Type, ts []Type) {
sigID := SignatureID(instr.u1)
sig, ok := b.signatures[sigID]
if !ok {
panic("BUG")
}
switch len(sig.Results) {
case 0:
t1 = typeInvalid
case 1:
t1 = sig.Results[0]
default:
t1, ts = sig.Results[0], sig.Results[1:]
}
return
}
returnTypesFnCall = func(b *builder, instr *Instruction) (t1 Type, ts []Type) {
sigID := SignatureID(instr.u2)
sig, ok := b.signatures[sigID]
if !ok {
panic("BUG")
}
switch len(sig.Results) {
case 0:
t1 = typeInvalid
case 1:
t1 = sig.Results[0]
default:
t1, ts = sig.Results[0], sig.Results[1:]
}
return
}
)
// sideEffect provides the info to determine if an instruction has side effects which
@@ -846,6 +886,8 @@ var instructionSideEffects = [opcodeEnd]sideEffect{
OpcodeAtomicStore: sideEffectStrict,
OpcodeAtomicCas: sideEffectStrict,
OpcodeFence: sideEffectStrict,
OpcodeTailCallReturnCall: sideEffectStrict,
OpcodeTailCallReturnCallIndirect: sideEffectStrict,
OpcodeWideningPairwiseDotProductS: sideEffectNone,
}
@@ -860,105 +902,75 @@ func (i *Instruction) sideEffect() sideEffect {
// instructionReturnTypes provides the function to determine the return types of an instruction.
var instructionReturnTypes = [opcodeEnd]returnTypesFn{
OpcodeExtIaddPairwise: returnTypesFnV128,
OpcodeVbor: returnTypesFnV128,
OpcodeVbxor: returnTypesFnV128,
OpcodeVband: returnTypesFnV128,
OpcodeVbnot: returnTypesFnV128,
OpcodeVbandnot: returnTypesFnV128,
OpcodeVbitselect: returnTypesFnV128,
OpcodeVanyTrue: returnTypesFnI32,
OpcodeVallTrue: returnTypesFnI32,
OpcodeVhighBits: returnTypesFnI32,
OpcodeVIadd: returnTypesFnV128,
OpcodeVSaddSat: returnTypesFnV128,
OpcodeVUaddSat: returnTypesFnV128,
OpcodeVIsub: returnTypesFnV128,
OpcodeVSsubSat: returnTypesFnV128,
OpcodeVUsubSat: returnTypesFnV128,
OpcodeVIcmp: returnTypesFnV128,
OpcodeVImin: returnTypesFnV128,
OpcodeVUmin: returnTypesFnV128,
OpcodeVImax: returnTypesFnV128,
OpcodeVUmax: returnTypesFnV128,
OpcodeVImul: returnTypesFnV128,
OpcodeVAvgRound: returnTypesFnV128,
OpcodeVIabs: returnTypesFnV128,
OpcodeVIneg: returnTypesFnV128,
OpcodeVIpopcnt: returnTypesFnV128,
OpcodeVIshl: returnTypesFnV128,
OpcodeVSshr: returnTypesFnV128,
OpcodeVUshr: returnTypesFnV128,
OpcodeExtractlane: returnTypesFnSingle,
OpcodeInsertlane: returnTypesFnV128,
OpcodeBand: returnTypesFnSingle,
OpcodeFcopysign: returnTypesFnSingle,
OpcodeBitcast: returnTypesFnSingle,
OpcodeBor: returnTypesFnSingle,
OpcodeBxor: returnTypesFnSingle,
OpcodeRotl: returnTypesFnSingle,
OpcodeRotr: returnTypesFnSingle,
OpcodeIshl: returnTypesFnSingle,
OpcodeSshr: returnTypesFnSingle,
OpcodeSdiv: returnTypesFnSingle,
OpcodeSrem: returnTypesFnSingle,
OpcodeUdiv: returnTypesFnSingle,
OpcodeUrem: returnTypesFnSingle,
OpcodeUshr: returnTypesFnSingle,
OpcodeJump: returnTypesFnNoReturns,
OpcodeUndefined: returnTypesFnNoReturns,
OpcodeIconst: returnTypesFnSingle,
OpcodeSelect: returnTypesFnSingle,
OpcodeSExtend: returnTypesFnSingle,
OpcodeUExtend: returnTypesFnSingle,
OpcodeSwidenLow: returnTypesFnV128,
OpcodeUwidenLow: returnTypesFnV128,
OpcodeSwidenHigh: returnTypesFnV128,
OpcodeUwidenHigh: returnTypesFnV128,
OpcodeSnarrow: returnTypesFnV128,
OpcodeUnarrow: returnTypesFnV128,
OpcodeSwizzle: returnTypesFnSingle,
OpcodeShuffle: returnTypesFnV128,
OpcodeSplat: returnTypesFnV128,
OpcodeIreduce: returnTypesFnSingle,
OpcodeFabs: returnTypesFnSingle,
OpcodeSqrt: returnTypesFnSingle,
OpcodeCeil: returnTypesFnSingle,
OpcodeFloor: returnTypesFnSingle,
OpcodeTrunc: returnTypesFnSingle,
OpcodeNearest: returnTypesFnSingle,
OpcodeCallIndirect: func(b *builder, instr *Instruction) (t1 Type, ts []Type) {
sigID := SignatureID(instr.u1)
sig, ok := b.signatures[sigID]
if !ok {
panic("BUG")
}
switch len(sig.Results) {
case 0:
t1 = typeInvalid
case 1:
t1 = sig.Results[0]
default:
t1, ts = sig.Results[0], sig.Results[1:]
}
return
},
OpcodeCall: func(b *builder, instr *Instruction) (t1 Type, ts []Type) {
sigID := SignatureID(instr.u2)
sig, ok := b.signatures[sigID]
if !ok {
panic("BUG")
}
switch len(sig.Results) {
case 0:
t1 = typeInvalid
case 1:
t1 = sig.Results[0]
default:
t1, ts = sig.Results[0], sig.Results[1:]
}
return
},
OpcodeExtIaddPairwise: returnTypesFnV128,
OpcodeVbor: returnTypesFnV128,
OpcodeVbxor: returnTypesFnV128,
OpcodeVband: returnTypesFnV128,
OpcodeVbnot: returnTypesFnV128,
OpcodeVbandnot: returnTypesFnV128,
OpcodeVbitselect: returnTypesFnV128,
OpcodeVanyTrue: returnTypesFnI32,
OpcodeVallTrue: returnTypesFnI32,
OpcodeVhighBits: returnTypesFnI32,
OpcodeVIadd: returnTypesFnV128,
OpcodeVSaddSat: returnTypesFnV128,
OpcodeVUaddSat: returnTypesFnV128,
OpcodeVIsub: returnTypesFnV128,
OpcodeVSsubSat: returnTypesFnV128,
OpcodeVUsubSat: returnTypesFnV128,
OpcodeVIcmp: returnTypesFnV128,
OpcodeVImin: returnTypesFnV128,
OpcodeVUmin: returnTypesFnV128,
OpcodeVImax: returnTypesFnV128,
OpcodeVUmax: returnTypesFnV128,
OpcodeVImul: returnTypesFnV128,
OpcodeVAvgRound: returnTypesFnV128,
OpcodeVIabs: returnTypesFnV128,
OpcodeVIneg: returnTypesFnV128,
OpcodeVIpopcnt: returnTypesFnV128,
OpcodeVIshl: returnTypesFnV128,
OpcodeVSshr: returnTypesFnV128,
OpcodeVUshr: returnTypesFnV128,
OpcodeExtractlane: returnTypesFnSingle,
OpcodeInsertlane: returnTypesFnV128,
OpcodeBand: returnTypesFnSingle,
OpcodeFcopysign: returnTypesFnSingle,
OpcodeBitcast: returnTypesFnSingle,
OpcodeBor: returnTypesFnSingle,
OpcodeBxor: returnTypesFnSingle,
OpcodeRotl: returnTypesFnSingle,
OpcodeRotr: returnTypesFnSingle,
OpcodeIshl: returnTypesFnSingle,
OpcodeSshr: returnTypesFnSingle,
OpcodeSdiv: returnTypesFnSingle,
OpcodeSrem: returnTypesFnSingle,
OpcodeUdiv: returnTypesFnSingle,
OpcodeUrem: returnTypesFnSingle,
OpcodeUshr: returnTypesFnSingle,
OpcodeJump: returnTypesFnNoReturns,
OpcodeUndefined: returnTypesFnNoReturns,
OpcodeIconst: returnTypesFnSingle,
OpcodeSelect: returnTypesFnSingle,
OpcodeSExtend: returnTypesFnSingle,
OpcodeUExtend: returnTypesFnSingle,
OpcodeSwidenLow: returnTypesFnV128,
OpcodeUwidenLow: returnTypesFnV128,
OpcodeSwidenHigh: returnTypesFnV128,
OpcodeUwidenHigh: returnTypesFnV128,
OpcodeSnarrow: returnTypesFnV128,
OpcodeUnarrow: returnTypesFnV128,
OpcodeSwizzle: returnTypesFnSingle,
OpcodeShuffle: returnTypesFnV128,
OpcodeSplat: returnTypesFnV128,
OpcodeIreduce: returnTypesFnSingle,
OpcodeFabs: returnTypesFnSingle,
OpcodeSqrt: returnTypesFnSingle,
OpcodeCeil: returnTypesFnSingle,
OpcodeFloor: returnTypesFnSingle,
OpcodeTrunc: returnTypesFnSingle,
OpcodeNearest: returnTypesFnSingle,
OpcodeCallIndirect: returnTypesFnCallIndirect,
OpcodeCall: returnTypesFnCall,
OpcodeLoad: returnTypesFnSingle,
OpcodeVZeroExtLoad: returnTypesFnV128,
OpcodeLoadSplat: returnTypesFnV128,
@@ -1032,6 +1044,8 @@ var instructionReturnTypes = [opcodeEnd]returnTypesFn{
OpcodeAtomicStore: returnTypesFnNoReturns,
OpcodeAtomicCas: returnTypesFnSingle,
OpcodeFence: returnTypesFnNoReturns,
OpcodeTailCallReturnCallIndirect: returnTypesFnCallIndirect,
OpcodeTailCallReturnCall: returnTypesFnCall,
OpcodeWideningPairwiseDotProductS: returnTypesFnV128,
}
@@ -2038,6 +2052,25 @@ func (i *Instruction) AtomicTargetSize() (size uint64) {
return i.u1
}
// AsTailCallReturnCall initializes this instruction as a call instruction with OpcodeTailCallReturnCall.
func (i *Instruction) AsTailCallReturnCall(ref FuncRef, sig *Signature, args Values) {
i.opcode = OpcodeTailCallReturnCall
i.u1 = uint64(ref)
i.vs = args
i.u2 = uint64(sig.ID)
sig.used = true
}
// AsTailCallReturnCallIndirect initializes this instruction as a call-indirect instruction with OpcodeTailCallReturnCallIndirect.
func (i *Instruction) AsTailCallReturnCallIndirect(funcPtr Value, sig *Signature, args Values) *Instruction {
i.opcode = OpcodeTailCallReturnCallIndirect
i.vs = args
i.v = funcPtr
i.u1 = uint64(sig.ID)
sig.used = true
return i
}
// ReturnVals returns the return values of OpcodeReturn.
func (i *Instruction) ReturnVals() []Value {
return i.vs.View()
@@ -2166,7 +2199,7 @@ func (i *Instruction) AsCall(ref FuncRef, sig *Signature, args Values) {
// CallData returns the call data for this instruction necessary for backends.
func (i *Instruction) CallData() (ref FuncRef, sigID SignatureID, args []Value) {
if i.opcode != OpcodeCall {
if i.opcode != OpcodeCall && i.opcode != OpcodeTailCallReturnCall {
panic("BUG: CallData only available for OpcodeCall")
}
ref = FuncRef(i.u1)
@@ -2195,8 +2228,8 @@ func (i *Instruction) AsCallGoRuntimeMemmove(funcPtr Value, sig *Signature, args
// CallIndirectData returns the call indirect data for this instruction necessary for backends.
func (i *Instruction) CallIndirectData() (funcPtr Value, sigID SignatureID, args []Value, isGoMemmove bool) {
if i.opcode != OpcodeCallIndirect {
panic("BUG: CallIndirectData only available for OpcodeCallIndirect")
if i.opcode != OpcodeCallIndirect && i.opcode != OpcodeTailCallReturnCallIndirect {
panic("BUG: CallIndirectData only available for OpcodeCallIndirect and OpcodeTailCallReturnCallIndirect")
}
funcPtr = i.v
sigID = SignatureID(i.u1)
@@ -2620,6 +2653,17 @@ func (i *Instruction) Format(b Builder) string {
instSuffix = fmt.Sprintf("_%d, %s, %s, %s", 8*i.u1, i.v.Format(b), i.v2.Format(b), i.v3.Format(b))
case OpcodeFence:
instSuffix = fmt.Sprintf(" %d", i.u1)
case OpcodeTailCallReturnCall, OpcodeTailCallReturnCallIndirect:
view := i.vs.View()
vs := make([]string, len(view))
for idx := range vs {
vs[idx] = view[idx].Format(b)
}
if i.opcode == OpcodeTailCallReturnCallIndirect {
instSuffix = fmt.Sprintf(" %s:%s, %s", i.v.Format(b), SignatureID(i.u1), strings.Join(vs, ", "))
} else {
instSuffix = fmt.Sprintf(" %s:%s, %s", FuncRef(i.u1), SignatureID(i.u2), strings.Join(vs, ", "))
}
case OpcodeWideningPairwiseDotProductS:
instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b))
default:
@@ -2879,6 +2923,10 @@ func (o Opcode) String() (ret string) {
return "AtomicStore"
case OpcodeFence:
return "Fence"
case OpcodeTailCallReturnCall:
return "ReturnCall"
case OpcodeTailCallReturnCallIndirect:
return "ReturnCallIndirect"
case OpcodeVbor:
return "Vbor"
case OpcodeVbxor:

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"math/rand"
"os"
"sync"
"time"
)
@@ -91,7 +92,7 @@ type (
initialCompilationDone bool
maybeRandomizedIndexes []int
r *rand.Rand
values map[string]string
values sync.Map
}
verifierStateContextKey struct{}
currentFunctionNameKey struct{}
@@ -106,31 +107,24 @@ func NewDeterministicCompilationVerifierContext(ctx context.Context, localFuncti
}
r := rand.New(rand.NewSource(time.Now().UnixNano()))
return context.WithValue(ctx, verifierStateContextKey{}, &verifierState{
r: r, maybeRandomizedIndexes: maybeRandomizedIndexes, values: map[string]string{},
r: r, maybeRandomizedIndexes: maybeRandomizedIndexes, values: sync.Map{},
})
}
// DeterministicCompilationVerifierRandomizeIndexes randomizes the indexes for the deterministic compilation verifier.
// To get the randomized index, use DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex.
func DeterministicCompilationVerifierRandomizeIndexes(ctx context.Context) {
// Returns a slice that maps an index to the randomized index.
func DeterministicCompilationVerifierRandomizeIndexes(ctx context.Context) []int {
state := ctx.Value(verifierStateContextKey{}).(*verifierState)
if !state.initialCompilationDone {
// If this is the first attempt, use the indexes in their original order.
state.initialCompilationDone = true
return
return state.maybeRandomizedIndexes
}
r := state.r
r.Shuffle(len(state.maybeRandomizedIndexes), func(i, j int) {
state.maybeRandomizedIndexes[i], state.maybeRandomizedIndexes[j] = state.maybeRandomizedIndexes[j], state.maybeRandomizedIndexes[i]
})
}
// DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex returns the randomized index for the given `index`
// which is assigned by DeterministicCompilationVerifierRandomizeIndexes.
func DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx context.Context, index int) int {
state := ctx.Value(verifierStateContextKey{}).(*verifierState)
ret := state.maybeRandomizedIndexes[index]
return ret
return state.maybeRandomizedIndexes
}
// VerifyOrSetDeterministicCompilationContextValue verifies that the `newValue` is the same as the previous value for the given `scope`
@@ -141,9 +135,8 @@ func VerifyOrSetDeterministicCompilationContextValue(ctx context.Context, scope
fn := ctx.Value(currentFunctionNameKey{}).(string)
key := fn + ": " + scope
verifierCtx := ctx.Value(verifierStateContextKey{}).(*verifierState)
oldValue, ok := verifierCtx.values[key]
if !ok {
verifierCtx.values[key] = newValue
oldValue, loaded := verifierCtx.values.LoadOrStore(key, newValue)
if !loaded {
return
}
if oldValue != newValue {

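The move from a plain map to sync.Map makes this verify-or-set check safe when functions are compiled concurrently; a small self-contained illustration of the LoadOrStore semantics it relies on (the key and value strings here are made up):

package main

import (
	"fmt"
	"sync"
)

func main() {
	var values sync.Map

	verifyOrSet := func(key, newValue string) error {
		oldValue, loaded := values.LoadOrStore(key, newValue)
		if !loaded {
			return nil // first observation: store and move on.
		}
		if oldValue != newValue {
			return fmt.Errorf("compilation was not deterministic: %v vs %v", oldValue, newValue)
		}
		return nil
	}

	fmt.Println(verifyOrSet("f[0]: after regalloc", "v1 -> x0")) // <nil>
	fmt.Println(verifyOrSet("f[0]: after regalloc", "v1 -> x0")) // <nil>
	fmt.Println(verifyOrSet("f[0]: after regalloc", "v1 -> x1")) // error
}
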
View File

@@ -69,7 +69,7 @@ type IDedPool[T any] struct {
// NewIDedPool returns a new IDedPool.
func NewIDedPool[T any](resetFn func(*T)) IDedPool[T] {
return IDedPool[T]{pool: NewPool[T](resetFn), maxIDEncountered: -1}
return IDedPool[T]{pool: NewPool(resetFn), maxIDEncountered: -1}
}
// GetOrAllocate returns the T with the given id.
@@ -134,10 +134,10 @@ type VarLength[T any] struct {
// NewVarLengthPool returns a new VarLengthPool.
func NewVarLengthPool[T any]() VarLengthPool[T] {
return VarLengthPool[T]{
arrayPool: NewPool[varLengthPoolArray[T]](func(v *varLengthPoolArray[T]) {
arrayPool: NewPool(func(v *varLengthPoolArray[T]) {
v.next = 0
}),
slicePool: NewPool[[]T](func(i *[]T) {
slicePool: NewPool(func(i *[]T) {
*i = (*i)[:0]
}),
}
@@ -155,6 +155,9 @@ func (p *VarLengthPool[T]) Allocate(knownMin int) VarLength[T] {
return VarLength[T]{arr: arr}
}
slc := p.slicePool.Allocate()
if cap(*slc) < knownMin {
*slc = make([]T, 0, knownMin)
}
return VarLength[T]{slc: slc}
}
@@ -166,39 +169,36 @@ func (p *VarLengthPool[T]) Reset() {
// Append appends items to the backing slice just like the `append` builtin function in Go.
func (i VarLength[T]) Append(p *VarLengthPool[T], items ...T) VarLength[T] {
if i.slc != nil {
*i.slc = append(*i.slc, items...)
slc := i.slc
if slc != nil {
*slc = append(*slc, items...)
return i
}
if i.arr == nil {
i.arr = p.arrayPool.Allocate()
arr := i.arr
if arr == nil {
arr = p.arrayPool.Allocate()
i.arr = arr
}
arr := i.arr
if arr.next+len(items) <= arraySize {
for _, item := range items {
arr.arr[arr.next] = item
arr.next++
}
arr.next += copy(arr.arr[arr.next:], items)
} else {
slc := p.slicePool.Allocate()
slc = p.slicePool.Allocate()
// Copy the array to the slice.
for ptr := 0; ptr < arr.next; ptr++ {
*slc = append(*slc, arr.arr[ptr])
}
*slc = append(*slc, arr.arr[:arr.next]...)
*slc = append(*slc, items...)
i.slc = slc
*i.slc = append(*i.slc, items...)
}
return i
}
// View returns the backing slice.
func (i VarLength[T]) View() []T {
if i.slc != nil {
if slc := i.slc; slc != nil {
return *i.slc
} else if i.arr != nil {
arr := i.arr
}
if arr := i.arr; arr != nil {
return arr.arr[:arr.next]
}
return nil
@@ -207,9 +207,9 @@ func (i VarLength[T]) View() []T {
// Cut cuts the backing slice to the given length.
// Precondition: n <= len(i.backing).
func (i VarLength[T]) Cut(n int) {
if i.slc != nil {
*i.slc = (*i.slc)[:n]
} else if i.arr != nil {
i.arr.next = n
if slc := i.slc; slc != nil {
*slc = (*slc)[:n]
} else if arr := i.arr; arr != nil {
arr.next = n
}
}
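
Taken together, the VarLength API is used roughly as below. This is a usage sketch based only on the signatures visible in this hunk (NewVarLengthPool, Allocate, Append, View, Cut, Reset), with int chosen arbitrarily as the element type; it is not runnable outside the wazevoapi package.

pool := NewVarLengthPool[int]()

vl := pool.Allocate(4)         // capacity hint; small runs typically stay in the fixed backing array
vl = vl.Append(&pool, 1, 2, 3) // Append returns the (possibly re-backed) value, like the built-in append
fmt.Println(vl.View())         // [1 2 3]

vl.Cut(2)                      // truncate to the first two elements
fmt.Println(vl.View())         // [1 2]

pool.Reset()                   // recycle arrays and slices for the next compilation
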

View File

@@ -0,0 +1,6 @@
package expctxkeys
// CompilationWorkers is a context.Context Value key.
// Its associated value should be an int representing the number of workers
// we want to spawn to compile a given wasm input.
type CompilationWorkers struct{}
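
A hedged sketch of how this key is meant to be consumed; expctxkeys is an internal package, so real callers would go through whatever helper the experimental package exposes (not shown in this diff), and the key is set directly here only to make the "value should be an int" contract concrete.

// Illustration only; assumes imports of context, runtime and the internal expctxkeys package.
ctx := context.WithValue(context.Background(), expctxkeys.CompilationWorkers{}, runtime.NumCPU())

// The engine side would read it back along these lines:
if n, ok := ctx.Value(expctxkeys.CompilationWorkers{}).(int); ok && n > 1 {
	// compile functions with n parallel workers (sketch).
	_ = n
}
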

View File

@@ -2,8 +2,10 @@
package platform
import "sync"
// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods.
var CpuFeatures = loadCpuFeatureFlags()
var CpuFeatures = sync.OnceValue(loadCpuFeatureFlags)
// cpuFeatureFlags implements CpuFeatureFlags interface.
type cpuFeatureFlags struct {

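CpuFeatures changes from an eagerly-initialized variable to a sync.OnceValue-wrapped function, so feature detection runs lazily, exactly once, and is safe for concurrent callers; call sites become CpuFeatures(), as seen later in this diff. A standalone illustration of the stdlib primitive:

package main

import (
	"fmt"
	"sync"
)

func main() {
	detect := func() string {
		fmt.Println("probing CPU features...") // runs at most once
		return "sse4.1"
	}
	features := sync.OnceValue(detect)

	fmt.Println(features()) // probes, then prints "sse4.1"
	fmt.Println(features()) // prints the cached "sse4.1" without probing again
}
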
View File

@@ -2,10 +2,13 @@
package platform
import "runtime"
import (
"runtime"
"sync"
)
// CpuFeatures exposes the capabilities for this CPU, queried via the Has, HasExtra methods.
var CpuFeatures = loadCpuFeatureFlags()
var CpuFeatures = sync.OnceValue(loadCpuFeatureFlags)
// cpuFeatureFlags implements CpuFeatureFlags interface.
type cpuFeatureFlags struct {

View File

@@ -2,7 +2,7 @@
package platform
var CpuFeatures CpuFeatureFlags = &cpuFeatureFlags{}
var CpuFeatures = func() CpuFeatureFlags { return &cpuFeatureFlags{} }
// cpuFeatureFlags implements CpuFeatureFlags for unsupported platforms.
type cpuFeatureFlags struct{}

View File

@@ -59,12 +59,16 @@ func init() {
})
}
func mmapCodeSegment(size, prot int) ([]byte, error) {
flags := syscall.MAP_ANON | syscall.MAP_PRIVATE
func mmapCodeSegment(size int) ([]byte, error) {
flag := syscall.MAP_ANON | syscall.MAP_PRIVATE
prot := syscall.PROT_READ | syscall.PROT_WRITE
if noopMprotectRX {
prot = syscall.PROT_READ | syscall.PROT_WRITE | syscall.PROT_EXEC
}
for _, hugePagesConfig := range hugePagesConfigs {
if hugePagesConfig.match(size) {
b, err := syscall.Mmap(-1, 0, size, prot, flags|hugePagesConfig.flag)
b, err := syscall.Mmap(-1, 0, size, prot, flag|hugePagesConfig.flag)
if err != nil {
continue
}
@@ -72,5 +76,5 @@ func mmapCodeSegment(size, prot int) ([]byte, error) {
}
}
return syscall.Mmap(-1, 0, size, prot, flags)
return syscall.Mmap(-1, 0, size, prot, flag)
}

View File

@@ -5,7 +5,11 @@ package platform
import "syscall"
func mmapCodeSegment(size, prot int) ([]byte, error) {
func mmapCodeSegment(size int) ([]byte, error) {
prot := syscall.PROT_READ | syscall.PROT_WRITE
if noopMprotectRX {
prot = syscall.PROT_READ | syscall.PROT_WRITE | syscall.PROT_EXEC
}
return syscall.Mmap(
-1,
0,

View File

@@ -2,31 +2,8 @@
package platform
import (
"syscall"
)
const (
mmapProtAMD64 = syscall.PROT_READ | syscall.PROT_WRITE | syscall.PROT_EXEC
mmapProtARM64 = syscall.PROT_READ | syscall.PROT_WRITE
)
import "syscall"
func munmapCodeSegment(code []byte) error {
return syscall.Munmap(code)
}
// mmapCodeSegmentAMD64 gives all read-write-exec permission to the mmap region
// to enter the function. Otherwise, segmentation fault exception is raised.
func mmapCodeSegmentAMD64(size int) ([]byte, error) {
// The region must be RWX: RW for writing native codes, X for executing the region.
return mmapCodeSegment(size, mmapProtAMD64)
}
// mmapCodeSegmentARM64 cannot give all read-write-exec permission to the mmap region.
// Otherwise, the mmap systemcall would raise an error. Here we give read-write
// to the region so that we can write contents at call-sites. Callers are responsible to
// execute MprotectRX on the returned buffer.
func mmapCodeSegmentARM64(size int) ([]byte, error) {
// The region must be RW: RW for writing native codes.
return mmapCodeSegment(size, mmapProtARM64)
}

View File

@@ -13,11 +13,7 @@ func munmapCodeSegment(code []byte) error {
panic(errUnsupported)
}
func mmapCodeSegmentAMD64(size int) ([]byte, error) {
panic(errUnsupported)
}
func mmapCodeSegmentARM64(size int) ([]byte, error) {
func mmapCodeSegment(size int) ([]byte, error) {
panic(errUnsupported)
}

View File

@@ -56,16 +56,7 @@ func virtualProtect(address, size, newprotect uintptr, oldprotect *uint32) error
return nil
}
func mmapCodeSegmentAMD64(size int) ([]byte, error) {
p, err := allocateMemory(uintptr(size), windows_PAGE_EXECUTE_READWRITE)
if err != nil {
return nil, err
}
return unsafe.Slice((*byte)(unsafe.Pointer(p)), size), nil
}
func mmapCodeSegmentARM64(size int) ([]byte, error) {
func mmapCodeSegment(size int) ([]byte, error) {
p, err := allocateMemory(uintptr(size), windows_PAGE_READWRITE)
if err != nil {
return nil, err

View File

@@ -7,6 +7,8 @@ import (
"unsafe"
)
const noopMprotectRX = false
// MprotectRX is like syscall.Mprotect with RX permission, defined locally so that BSD compiles.
func MprotectRX(b []byte) (err error) {
var _p0 unsafe.Pointer

View File

@@ -4,6 +4,8 @@ package platform
import "syscall"
const noopMprotectRX = false
// MprotectRX is like syscall.Mprotect with RX permission.
func MprotectRX(b []byte) (err error) {
return syscall.Mprotect(b, syscall.PROT_READ|syscall.PROT_EXEC)

View File

@@ -2,8 +2,9 @@
package platform
import "syscall"
const noopMprotectRX = true
func MprotectRX(b []byte) error {
return syscall.ENOTSUP
// Assume we already called mmap with at least RX.
return nil
}

View File

@@ -21,13 +21,13 @@ func CompilerSupports(features api.CoreFeatures) bool {
case "linux", "darwin", "freebsd", "netbsd", "dragonfly", "windows":
if runtime.GOARCH == "arm64" {
if features.IsEnabled(experimental.CoreFeaturesThreads) {
return CpuFeatures.Has(CpuFeatureArm64Atomic)
return CpuFeatures().Has(CpuFeatureArm64Atomic)
}
return true
}
fallthrough
case "solaris", "illumos":
return runtime.GOARCH == "amd64" && CpuFeatures.Has(CpuFeatureAmd64SSE4_1)
return runtime.GOARCH == "amd64" && CpuFeatures().Has(CpuFeatureAmd64SSE4_1)
default:
return false
}
@@ -40,11 +40,7 @@ func MmapCodeSegment(size int) ([]byte, error) {
if size == 0 {
panic("BUG: MmapCodeSegment with zero length")
}
if runtime.GOARCH == "amd64" {
return mmapCodeSegmentAMD64(size)
} else {
return mmapCodeSegmentARM64(size)
}
return mmapCodeSegment(size)
}
// MunmapCodeSegment unmaps the given memory region.

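With a single mmapCodeSegment per platform, the read/write versus execute split is now decided by noopMprotectRX. The overall flow, sketched with only the exported helpers that appear in this diff (MmapCodeSegment, MprotectRX, MunmapCodeSegment) and a hypothetical machineCode byte slice:

// Sketch of the intended W^X flow; error handling trimmed, machineCode is a placeholder.
buf, err := platform.MmapCodeSegment(len(machineCode)) // RW, or RWX where noopMprotectRX is true
if err != nil {
	return err
}
copy(buf, machineCode)                           // write the generated native code
if err := platform.MprotectRX(buf); err != nil { // flip to RX; a no-op where the mapping is already RWX
	return err
}
defer platform.MunmapCodeSegment(buf)
// ... jump into buf ...
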
View File

@@ -36,7 +36,9 @@ func openFile(path string, oflag sys.Oflag, perm fs.FileMode) (*os.File, sys.Err
// To match expectations of WASI, e.g. TinyGo TestStatBadDir, return
// ENOENT, not ENOTDIR.
case sys.ENOTDIR:
errno = sys.ENOENT
if !strings.HasSuffix(path, "/") {
errno = sys.ENOENT
}
case sys.ENOENT:
if isSymlink(path) {
// Either symlink or hard link not found. We change the returned

View File

@@ -1,4 +1,4 @@
//go:build (amd64 || arm64 || riscv64) && linux
//go:build (amd64 || arm64 || ppc64le || riscv64 || s390x) && linux
// Note: This expression is not the same as compiler support, even if it looks
// similar. Platform functions here are used in interpreter mode as well.

View File

@@ -1,4 +1,4 @@
//go:build (!((amd64 || arm64 || riscv64) && linux) && !((amd64 || arm64) && (darwin || freebsd)) && !((amd64 || arm64) && windows)) || js
//go:build (!((amd64 || arm64 || ppc64le || riscv64 || s390x) && linux) && !((amd64 || arm64) && (darwin || freebsd)) && !((amd64 || arm64) && windows)) || js
package sysfs

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"errors"
"fmt"
"slices"
"strconv"
"strings"
@@ -480,11 +481,9 @@ func (m *Module) validateFunctionWithMaxStackValues(
// function type might result in invalid value types if the block is the outermost label
// which equals the function's type.
if lnLabel.op != OpcodeLoop { // Loop operation doesn't require results since the continuation is the beginning of the loop.
defaultLabelType = make([]ValueType, len(lnLabel.blockType.Results))
copy(defaultLabelType, lnLabel.blockType.Results)
defaultLabelType = slices.Clone(lnLabel.blockType.Results)
} else {
defaultLabelType = make([]ValueType, len(lnLabel.blockType.Params))
copy(defaultLabelType, lnLabel.blockType.Params)
defaultLabelType = slices.Clone(lnLabel.blockType.Params)
}
if enabledFeatures.IsEnabled(api.CoreFeatureReferenceTypes) {
@@ -534,7 +533,7 @@ func (m *Module) validateFunctionWithMaxStackValues(
// br_table instruction is stack-polymorphic.
valueTypeStack.unreachable()
} else if op == OpcodeCall {
} else if op == OpcodeCall || op == OpcodeTailCallReturnCall {
pc++
index, num, err := leb128.LoadUint32(body[pc:])
if err != nil {
@@ -544,16 +543,35 @@ func (m *Module) validateFunctionWithMaxStackValues(
if int(index) >= len(functions) {
return fmt.Errorf("invalid function index")
}
var opcodeName string
if op == OpcodeCall {
opcodeName = OpcodeCallName
} else {
opcodeName = OpcodeTailCallReturnCallName
}
funcType := &m.TypeSection[functions[index]]
for i := 0; i < len(funcType.Params); i++ {
if err := valueTypeStack.popAndVerifyType(funcType.Params[len(funcType.Params)-1-i]); err != nil {
return fmt.Errorf("type mismatch on %s operation param type: %v", OpcodeCallName, err)
return fmt.Errorf("type mismatch on %s operation param type: %v", opcodeName, err)
}
}
for _, exp := range funcType.Results {
valueTypeStack.push(exp)
}
} else if op == OpcodeCallIndirect {
if op == OpcodeTailCallReturnCall {
if err := enabledFeatures.RequireEnabled(experimental.CoreFeaturesTailCall); err != nil {
return fmt.Errorf("%s invalid as %v", OpcodeTailCallReturnCallName, err)
}
// Same formatting as OpcodeEnd on the outer-most block
if err := valueTypeStack.requireStackValues(false, "", functionType.Results, false); err != nil {
return err
}
// behaves as a jump.
valueTypeStack.unreachable()
}
} else if op == OpcodeCallIndirect || op == OpcodeTailCallReturnCallIndirect {
pc++
typeIndex, num, err := leb128.LoadUint32(body[pc:])
if err != nil {
@@ -561,8 +579,15 @@ func (m *Module) validateFunctionWithMaxStackValues(
}
pc += num
var opcodeName string
if op == OpcodeCallIndirect {
opcodeName = OpcodeCallIndirectName
} else {
opcodeName = OpcodeTailCallReturnCallIndirectName
}
if int(typeIndex) >= len(m.TypeSection) {
return fmt.Errorf("invalid type index at %s: %d", OpcodeCallIndirectName, typeIndex)
return fmt.Errorf("invalid type index at %s: %d", opcodeName, typeIndex)
}
tableIndex, num, err := leb128.LoadUint32(body[pc:])
@@ -582,21 +607,33 @@ func (m *Module) validateFunctionWithMaxStackValues(
table := tables[tableIndex]
if table.Type != RefTypeFuncref {
return fmt.Errorf("table is not funcref type but was %s for %s", RefTypeName(table.Type), OpcodeCallIndirectName)
return fmt.Errorf("table is not funcref type but was %s for %s", RefTypeName(table.Type), opcodeName)
}
if err = valueTypeStack.popAndVerifyType(ValueTypeI32); err != nil {
return fmt.Errorf("cannot pop the offset in table for %s", OpcodeCallIndirectName)
return fmt.Errorf("cannot pop the offset in table for %s", opcodeName)
}
funcType := &m.TypeSection[typeIndex]
for i := 0; i < len(funcType.Params); i++ {
if err = valueTypeStack.popAndVerifyType(funcType.Params[len(funcType.Params)-1-i]); err != nil {
return fmt.Errorf("type mismatch on %s operation input type", OpcodeCallIndirectName)
return fmt.Errorf("type mismatch on %s operation input type", opcodeName)
}
}
for _, exp := range funcType.Results {
valueTypeStack.push(exp)
}
if op == OpcodeTailCallReturnCallIndirect {
if err := enabledFeatures.RequireEnabled(experimental.CoreFeaturesTailCall); err != nil {
return fmt.Errorf("%s invalid as %v", OpcodeTailCallReturnCallIndirectName, err)
}
// Same formatting as OpcodeEnd on the outer-most block
if err := valueTypeStack.requireStackValues(false, "", functionType.Results, false); err != nil {
return err
}
// behaves as a jump.
valueTypeStack.unreachable()
}
} else if OpcodeI32Eqz <= op && op <= OpcodeI64Extend32S {
switch op {
case OpcodeI32Eqz:

View File

@@ -777,6 +777,16 @@ const (
OpcodeAtomicI64Rmw32CmpxchgU OpcodeAtomic = 0x4e
)
// OpcodeTailCall represents an opcode of a tail call instruction.
//
// These opcodes are toggled with CoreFeaturesTailCall.
type OpcodeTailCall = byte
const (
OpcodeTailCallReturnCall OpcodeTailCall = 0x12
OpcodeTailCallReturnCallIndirect OpcodeTailCall = 0x13
)
const (
OpcodeUnreachableName = "unreachable"
OpcodeNopName = "nop"
@@ -1864,3 +1874,18 @@ var atomicInstructionName = map[OpcodeAtomic]string{
func AtomicInstructionName(oc OpcodeAtomic) (ret string) {
return atomicInstructionName[oc]
}
const (
OpcodeTailCallReturnCallName = "return_call"
OpcodeTailCallReturnCallIndirectName = "return_call_indirect"
)
var tailCallInstructionName = map[OpcodeTailCall]string{
OpcodeTailCallReturnCall: OpcodeTailCallReturnCallName,
OpcodeTailCallReturnCallIndirect: OpcodeTailCallReturnCallIndirectName,
}
// TailCallInstructionName returns the instruction name corresponding to the tail call Opcode.
func TailCallInstructionName(oc OpcodeTailCall) (ret string) {
return tailCallInstructionName[oc]
}
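
For symmetry with AtomicInstructionName, a quick illustration of the new helper using the 0x12/0x13 opcodes defined above (fmt import assumed):

fmt.Println(TailCallInstructionName(OpcodeTailCallReturnCall))         // return_call
fmt.Println(TailCallInstructionName(OpcodeTailCallReturnCallIndirect)) // return_call_indirect
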

View File

@@ -326,10 +326,14 @@ func (t *TableInstance) Grow(delta uint32, initialRef Reference) (currentLen uin
newLen >= math.MaxUint32 || (t.Max != nil && newLen > int64(*t.Max)) {
return 0xffffffff // = -1 in signed 32-bit integer.
}
t.References = append(t.References, make([]uintptr, delta)...)
if initialRef == 0 {
return
}
// Uses the copy trick to quickly fill the new region with the initial value.
// https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d
// https://github.com/golang/go/blob/go1.24.0/src/slices/slices.go#L514-L517
newRegion := t.References[currentLen:]
newRegion[0] = initialRef
for i := 1; i < len(newRegion); i *= 2 {

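The doubling copy referenced in the comments fills the new region with a handful of memmove-backed copy calls instead of an element-by-element loop; a standalone version of the same trick:

package main

import "fmt"

// fill sets every element of s to v using the doubling-copy trick:
// write the first element, then repeatedly copy the already-filled
// prefix onto the rest, doubling the filled length each iteration.
func fill[T any](s []T, v T) {
	if len(s) == 0 {
		return
	}
	s[0] = v
	for i := 1; i < len(s); i *= 2 {
		copy(s[i:], s[:i])
	}
}

func main() {
	refs := make([]uintptr, 10)
	fill(refs, 0xdeadbeef)
	fmt.Println(refs)
}
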
View File

@@ -1,4 +1,4 @@
//go:build (amd64 || arm64 || riscv64) && linux
//go:build (amd64 || arm64 || ppc64le || riscv64 || s390x) && linux
// Note: This expression is not the same as compiler support, even if it looks
// similar. Platform functions here are used in interpreter mode as well.

View File

@@ -1,4 +1,4 @@
//go:build (!((amd64 || arm64 || riscv64) && linux) && !((amd64 || arm64) && (darwin || freebsd)) && !((amd64 || arm64) && windows)) || js
//go:build (!((amd64 || arm64 || ppc64le || riscv64 || s390x) && linux) && !((amd64 || arm64) && (darwin || freebsd)) && !((amd64 || arm64) && windows)) || js
package sys

4
vendor/modules.txt vendored
View File

@@ -1326,8 +1326,8 @@ github.com/stretchr/testify/assert
github.com/stretchr/testify/assert/yaml
github.com/stretchr/testify/require
github.com/stretchr/testify/suite
# github.com/tetratelabs/wazero v1.9.0
## explicit; go 1.22.0
# github.com/tetratelabs/wazero v1.10.1
## explicit; go 1.23.0
github.com/tetratelabs/wazero
github.com/tetratelabs/wazero/api
github.com/tetratelabs/wazero/experimental