VERSION: bump for 24.2.0

meson: Additionally probe -mtls-dialect=desc for TLSDESC support
Previously only `-mtls-dialect=gnu2` was probed, which was appropriate for arm, x86 and x86_64, but not for newer architectures such as aarch64, loongarch64 and riscv64, which all use `-mtls-dialect=desc` instead. Because the driver option is not consistent across architectures (and probably never will be), try both variants and choose the first one that works. While at it, rename "gnu2_*" variables to "tlsdesc_*" respectively, for clarity. Cc: mesa-stable Reviewed-by: Icenowy Zheng <uwu@icenowy.me> Reviewed-by: Yukari Chiba <i@0x7f.cc> Reviewed-by: David Heidelberg <david@ixit.cz> Signed-off-by: WANG Xuerui <git@xen0n.name> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30599> (cherry picked from commit cc2dbb8ea5)
2024-08-14 18:37:13 +02:00 · 2024-08-14 17:45:45 +02:00 · 2024-08-14 17:45:45 +02:00 · 2024-08-14 17:45:41 +02:00 · 2024-08-14 17:45:37 +02:00 · 2024-08-14 16:07:01 +02:00
55 changed files with 4195 additions and 198 deletions
--- a/.gitlab-ci/common/generate-env.sh
+++ b/.gitlab-ci/common/generate-env.sh
@@ -84,6 +84,7 @@ VARS=(
    MESA_IMAGE_PATH
    MESA_IMAGE_TAG
    MESA_LOADER_DRIVER_OVERRIDE
+    MESA_SPIRV_LOG_LEVEL
    MESA_TEMPLATES_COMMIT
    MESA_VK_ABORT_ON_DEVICE_LOSS
    MESA_VK_IGNORE_CONFORMANCE_WARNING
--- a/.gitlab-ci/container/alpine/x86_64_build.sh
+++ b/.gitlab-ci/container/alpine/x86_64_build.sh
@@ -18,7 +18,7 @@ DEPS=(
    bash
    bison
    ccache
-    clang16-dev
+    clang${LLVM_VERSION}-dev
    cmake
    clang-dev
    coreutils
@@ -31,8 +31,8 @@ DEPS=(
    glslang
    graphviz
    linux-headers
-    llvm16-static
-    llvm16-dev
+    llvm${LLVM_VERSION}-static
+    llvm${LLVM_VERSION}-dev
    meson
    mold
    musl-dev
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/2
+++ b/2
@@ -1 +1 @@
-24.2.0-rc4
+24.2.0
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -157,9 +157,9 @@ endif
 endef

 ifneq ($(strip $(BOARD_MESA3D_GALLIUM_DRIVERS)),)
-# Module 'libgallium_dri', produces '/vendor/lib{64}/dri/libgallium_dri.so'
+# Module 'libgallium_dri', produces '/vendor/lib{64}/libgallium_dri.so'
 # This module also trigger DRI symlinks creation process
-$(eval $(call mesa3d-lib,libgallium_dri,dri,MESA3D_GALLIUM_DRI_BIN))
+$(eval $(call mesa3d-lib,libgallium_dri,,MESA3D_GALLIUM_BIN))
 # Module 'libglapi', produces '/vendor/lib{64}/libglapi.so'
 $(eval $(call mesa3d-lib,libglapi,,MESA3D_LIBGLAPI_BIN))

--- a/android/mesa3d_cross.mk
+++ b/android/mesa3d_cross.mk
@@ -63,8 +63,8 @@ MESON_OUT_DIR                            := $($(M_TARGET_PREFIX)TARGET_OUT_INTER
 MESON_GEN_DIR                            := $(MESON_OUT_DIR)_GEN
 MESON_GEN_FILES_TARGET                   := $(MESON_GEN_DIR)/.timestamp

-MESA3D_GALLIUM_DRI_DIR                   := $(MESON_OUT_DIR)/install/usr/local/lib/dri
-$(M_TARGET_PREFIX)MESA3D_GALLIUM_DRI_BIN := $(MESON_OUT_DIR)/install/usr/local/lib/libgallium_dri.so
+MESA3D_GALLIUM_DIR                       := $(MESON_OUT_DIR)/install/usr/local/lib
+$(M_TARGET_PREFIX)MESA3D_GALLIUM_BIN     := $(MESON_OUT_DIR)/install/usr/local/lib/libgallium_dri.so
 $(M_TARGET_PREFIX)MESA3D_LIBEGL_BIN      := $(MESON_OUT_DIR)/install/usr/local/lib/libEGL.so
 $(M_TARGET_PREFIX)MESA3D_LIBGLESV1_BIN   := $(MESON_OUT_DIR)/install/usr/local/lib/libGLESv1_CM.so
 $(M_TARGET_PREFIX)MESA3D_LIBGLESV2_BIN   := $(MESON_OUT_DIR)/install/usr/local/lib/libGLESv2.so
@@ -73,6 +73,7 @@ $(M_TARGET_PREFIX)MESA3D_LIBGBM_BIN      := $(MESON_OUT_DIR)/install/usr/local/l


 MESA3D_GLES_BINS := \
+    $($(M_TARGET_PREFIX)MESA3D_GALLIUM_BIN) \
    $($(M_TARGET_PREFIX)MESA3D_LIBEGL_BIN)    \
    $($(M_TARGET_PREFIX)MESA3D_LIBGLESV1_BIN) \
    $($(M_TARGET_PREFIX)MESA3D_LIBGLESV2_BIN) \
@@ -284,16 +285,11 @@ endif
 	$(MESON_BUILD)
 	touch $@

-MESON_COPY_LIBGALLIUM := \
-	cp `ls -1 $(MESA3D_GALLIUM_DRI_DIR)/* | head -1` $($(M_TARGET_PREFIX)MESA3D_GALLIUM_DRI_BIN)
-
-$(MESON_OUT_DIR)/install/.install.timestamp: MESON_COPY_LIBGALLIUM:=$(MESON_COPY_LIBGALLIUM)
 $(MESON_OUT_DIR)/install/.install.timestamp: MESON_BUILD:=$(MESON_BUILD)
 $(MESON_OUT_DIR)/install/.install.timestamp: $(MESON_OUT_DIR)/.build.timestamp
 	rm -rf $(dir $@)
 	mkdir -p $(dir $@)
 	DESTDIR=$(call relative-to-absolute,$(dir $@)) $(MESON_BUILD) install
-	$(if $(BOARD_MESA3D_GALLIUM_DRIVERS),$(MESON_COPY_LIBGALLIUM))
 	touch $@

 $($(M_TARGET_PREFIX)MESA3D_LIBGBM_BIN) $(MESA3D_GLES_BINS): $(MESON_OUT_DIR)/install/.install.timestamp
@@ -308,14 +304,3 @@ $(MESON_OUT_DIR)/install/usr/local/lib/libvulkan_$(MESA_VK_LIB_SUFFIX_$1).so: $(
 endef

 $(foreach driver,$(BOARD_MESA3D_VULKAN_DRIVERS), $(eval $(call vulkan_target,$(driver))))
-
-$($(M_TARGET_PREFIX)TARGET_OUT_VENDOR_SHARED_LIBRARIES)/dri/.symlinks.timestamp: MESA3D_GALLIUM_DRI_DIR:=$(MESA3D_GALLIUM_DRI_DIR)
-$($(M_TARGET_PREFIX)TARGET_OUT_VENDOR_SHARED_LIBRARIES)/dri/.symlinks.timestamp: $(MESON_OUT_DIR)/install/.install.timestamp
-	# Create Symlinks
-	mkdir -p $(dir $@)
-	ls -1 $(MESA3D_GALLIUM_DRI_DIR)/ | PATH=/usr/bin:$$PATH xargs -I{} ln -s -f libgallium_dri.so $(dir $@)/{}
-	touch $@
-
-$($(M_TARGET_PREFIX)MESA3D_GALLIUM_DRI_BIN): $(TARGET_OUT_VENDOR)/$(MESA3D_LIB_DIR)/dri/.symlinks.timestamp
-	echo "Build $@"
-	touch $@
--- a/meson.build
+++ b/meson.build
@@ -501,22 +501,28 @@ if not have_mtls_dialect
  if meson.is_cross_build() and not meson.can_run_host_binaries()
    warning('cannot auto-detect -mtls-dialect when cross-compiling, using compiler default')
  else
-    # -fpic to force dynamic tls, otherwise TLS relaxation defeats check
-    gnu2_test = cc.run('int __thread x; int main() { return x; }',
-                       args: ['-mtls-dialect=gnu2', '-fpic'],
-                       name: '-mtls-dialect=gnu2')
-    if gnu2_test.returncode() == 0 and (
-          # check for lld 13 bug: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5665
-          host_machine.cpu_family() != 'x86_64' or
-          # get_linker_id misses LDFLAGS=-fuse-ld=lld: https://github.com/mesonbuild/meson/issues/6377
-          #cc.get_linker_id() != 'ld.lld' or
-          cc.links('''int __thread x; int y; int main() { __asm__(
-                "leaq x@TLSDESC(%rip), %rax\n"
-                "movq y@GOTPCREL(%rip), %rdx\n"
-                "call *x@TLSCALL(%rax)\n"); }''', name: 'split TLSDESC')
-          )
-      c_cpp_args += '-mtls-dialect=gnu2'
-    endif
+    # The way to specify the TLSDESC dialect is architecture-specific.
+    # We probe both because there is not a fallback guaranteed to work for all
+    # future architectures.
+    foreach tlsdesc_arg : ['-mtls-dialect=gnu2', '-mtls-dialect=desc']
+      # -fpic to force dynamic tls, otherwise TLS relaxation defeats check
+      tlsdesc_test = cc.run('int __thread x; int main() { return x; }',
+                            args: [tlsdesc_arg, '-fpic'],
+                            name: tlsdesc_arg)
+      if tlsdesc_test.returncode() == 0 and (
+            # check for lld 13 bug: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5665
+            host_machine.cpu_family() != 'x86_64' or
+            # get_linker_id misses LDFLAGS=-fuse-ld=lld: https://github.com/mesonbuild/meson/issues/6377
+            #cc.get_linker_id() != 'ld.lld' or
+            cc.links('''int __thread x; int y; int main() { __asm__(
+                  "leaq x@TLSDESC(%rip), %rax\n"
+                  "movq y@GOTPCREL(%rip), %rdx\n"
+                  "call *x@TLSCALL(%rax)\n"); }''', name: 'split TLSDESC')
+            )
+        c_cpp_args += tlsdesc_arg
+        break
+      endif
+    endforeach
  endif
 endif

@@ -1013,6 +1019,7 @@ endforeach
 _attributes = [
  'const', 'flatten', 'malloc', 'pure', 'unused', 'warn_unused_result',
  'weak', 'format', 'packed', 'returns_nonnull', 'alias', 'noreturn',
+  'optimize',
 ]
 foreach a : cc.get_supported_function_attributes(_attributes)
  pre_args += '-DHAVE_FUNC_ATTRIBUTE_@0@'.format(a.to_upper())
@@ -1753,7 +1760,6 @@ if with_clc
  llvm_optional_modules += ['all-targets', 'windowsdriver', 'frontendhlsl', 'frontenddriver']
 endif
 draw_with_llvm = get_option('draw-use-llvm')
-llvm_with_orcjit = get_option('llvm-orcjit')
 if draw_with_llvm
  llvm_modules += 'native'
  # lto is needded with LLVM>=15, but we don't know what LLVM verrsion we are using yet
@@ -1761,6 +1767,12 @@ if draw_with_llvm
 endif
 amd_with_llvm = get_option('amd-use-llvm')

+# MCJIT is deprecated in LLVM and will not accept new architecture ports,
+# so any architecture not in the exhaustive list will have to rely on LLVM
+# ORCJIT for llvmpipe functionality.
+llvm_has_mcjit = host_machine.cpu_family() in ['aarch64', 'arm', 'ppc', 'ppc64', 's390x', 'x86', 'x86_64']
+llvm_with_orcjit = get_option('llvm-orcjit') or not llvm_has_mcjit
+
 if with_amd_vk or with_gallium_radeonsi or with_clc or llvm_with_orcjit
  _llvm_version = '>= 15.0.0'
 elif with_gallium_clover
@@ -1797,8 +1809,8 @@ if with_llvm
  pre_args += '-DMESA_LLVM_VERSION_STRING="@0@"'.format(dep_llvm.version())
  pre_args += '-DLLVM_IS_SHARED=@0@'.format(_shared_llvm.to_int())

-  if with_swrast_vk and not draw_with_llvm
-    error('Lavapipe requires LLVM draw support.')
+  if (with_swrast_vk or with_gallium_llvmpipe) and not draw_with_llvm
+    error('Lavapipe and llvmpipe require LLVM draw support.')
  endif

  if with_gallium_r600 and not amd_with_llvm
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -65,6 +65,14 @@ option(
  description : 'Location to install dri drivers. Default: $libdir/dri.'
 )

+option(
+  'unversion-libgallium',
+  type : 'boolean',
+  value : false,
+  description : 'Do not include mesa version in libgallium DSO filename. ' +
+                'Do not enable unless you know what you are doing. Default: false'
+)
+
 option(
  'dri-search-path',
  type : 'string',
@@ -436,7 +444,10 @@ option (
  'llvm-orcjit',
  type : 'boolean',
  value : false,
-  description: 'Build llvmpipe with LLVM ORCJIT support.'
+  description: 'Build llvmpipe with LLVM ORCJIT support. Has no effect when ' +
+               'building for architectures without LLVM MCJIT support -- ' +
+               'ORCJIT is the only choice on such architectures and will ' +
+               'always be enabled.'
 )

 option(
--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -3015,7 +3015,7 @@ static bool gfx12_compute_hiz_his_info(struct ac_addrlib *addrlib, const struct
 {
   assert(surf_in->flags.depth != surf_in->flags.stencil);

-   if (surf->flags & RADEON_SURF_NO_HTILE)
+   if (surf->flags & RADEON_SURF_NO_HTILE || (info->gfx_level == GFX12 && info->chip_rev == 0))
      return true;

   ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -199,6 +199,21 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
         }
      }

+      if (ctx.program->gfx_level >= GFX10 && insn->isVALU() &&
+          insn->definitions.back().regClass() == s2) {
+         /* RDNA2 ISA doc, 6.2.4. Wave64 Destination Restrictions:
+          * The first pass of a wave64 VALU instruction may not overwrite a scalar value used by
+          * the second half.
+          */
+         bool carry_in = insn->opcode == aco_opcode::v_addc_co_u32 ||
+                         insn->opcode == aco_opcode::v_subb_co_u32 ||
+                         insn->opcode == aco_opcode::v_subbrev_co_u32;
+         for (unsigned op_idx = 0; op_idx < (carry_in ? 2 : insn->operands.size()); op_idx++) {
+            if (insn->operands[op_idx].isOfType(RegType::sgpr))
+               insn->operands[op_idx].setLateKill(true);
+         }
+      }
+
      /* we need to do this in a separate loop because the next one can
       * setKill() for several operands at once and we don't want to
       * overwrite that in a later iteration */
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -2398,6 +2398,24 @@ ast_function_expression::hir(exec_list *instructions,

         ir_rvalue *result = convert_component(ir, desired_type);

+         /* If the bindless packing constructors are used directly as function
+          * params to builtin functions the compiler doesn't know what to do
+          * with them. To avoid this make sure we always copy the results from
+          * the pack to a temp first.
+          */
+         if (result->as_expression() &&
+             result->as_expression()->operation == ir_unop_pack_sampler_2x32) {
+            ir_variable *var =
+               new(ctx) ir_variable(desired_type, "sampler_ctor",
+                                    ir_var_temporary);
+            instructions->push_tail(var);
+
+            ir_dereference *lhs = new(ctx) ir_dereference_variable(var);
+            ir_instruction *assignment = new(ctx) ir_assignment(lhs, result);
+            instructions->push_tail(assignment);
+            result = lhs;
+         }
+
         /* Attempt to convert the parameter to a constant valued expression.
          * After doing so, track whether or not all the parameters to the
          * constructor are trivially constant valued expressions.
--- a/src/compiler/nir/nir_lower_int64.c
+++ b/src/compiler/nir/nir_lower_int64.c
@@ -683,24 +683,13 @@ lower_ufind_msb64(nir_builder *b, nir_def *x)
   nir_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_def *hi_count = nir_ufind_msb(b, x_hi);

-   if (b->shader->options->lower_uadd_sat) {
-      nir_def *valid_hi_bits = nir_ine_imm(b, x_hi, 0);
-      nir_def *hi_res = nir_iadd_imm(b, hi_count, 32);
-      return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
-   } else {
-      /* If hi_count was -1, it will still be -1 after this uadd_sat. As a
-       * result, hi_count is either -1 or the correct return value for 64-bit
-       * ufind_msb.
-       */
-      nir_def *hi_res = nir_uadd_sat(b, nir_imm_intN_t(b, 32, 32), hi_count);
-
-      /* hi_res is either -1 or a value in the range [63, 32]. lo_count is
-       * either -1 or a value in the range [31, 0]. The imax will pick
-       * lo_count only when hi_res is -1. In those cases, lo_count is
-       * guaranteed to be the correct answer.
-       */
-      return nir_imax(b, hi_res, lo_count);
-   }
+   /* hi_count is either -1 or a value in the range [31, 0]. lo_count is
+    * the same. The imax will pick lo_count only when hi_count is -1. In those
+    * cases, lo_count is guaranteed to be the correct answer.
+    * The ior 32 is always safe here as with -1 the value won't change,
+    * otherwise it adds 32, which is what we want anyway.
+    */
+   return nir_imax(b, lo_count, nir_ior_imm(b, hi_count, 32));
 }

 static nir_def *
@@ -713,11 +702,9 @@ lower_find_lsb64(nir_builder *b, nir_def *x)

   /* Use umin so that -1 (no bits found) becomes larger (0xFFFFFFFF)
    * than any actual bit position, so we return a found bit instead.
-    * This is similar to the ufind_msb lowering. If you need this lowering
-    * without uadd_sat, add code like in lower_ufind_msb64.
+    * This is similar to the ufind_msb lowering.
    */
-   assert(!b->shader->options->lower_uadd_sat);
-   return nir_umin(b, lo_lsb, nir_uadd_sat(b, hi_lsb, nir_imm_int(b, 32)));
+   return nir_umin(b, lo_lsb, nir_ior_imm(b, hi_lsb, 32));
 }

 static nir_def *
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -2976,8 +2976,10 @@ dri2_initialize_wayland_swrast(_EGLDisplay *disp)
                          dri2_dpy->formats.num_formats))
      goto cleanup;

-   if (disp->Options.Zink)
-      dri2_initialize_wayland_drm_extensions(dri2_dpy);
+   if (disp->Options.Zink) {
+      if (!dri2_initialize_wayland_drm_extensions(dri2_dpy) && !disp->Options.ForceSoftware)
+         goto cleanup;
+   }

   dri2_dpy->driver_name = strdup(disp->Options.Zink ? "zink" : "swrast");
   if (!dri2_load_driver_swrast(disp))
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1778,7 +1778,7 @@ dri2_initialize_x11_swrast(_EGLDisplay *disp)
   if (disp->Options.Zink &&
       !debug_get_bool_option("LIBGL_DRI3_DISABLE", false) &&
       !debug_get_bool_option("LIBGL_KOPPER_DRI2", false))
-      dri3_x11_connect(dri2_dpy, disp->Options.ForceSoftware);
+      dri3_x11_connect(dri2_dpy, disp->Options.Zink, disp->Options.ForceSoftware);
 #endif
   if (!dri2_load_driver_swrast(disp))
      goto cleanup;
@@ -1863,7 +1863,7 @@ dri2_initialize_x11_dri3(_EGLDisplay *disp)
   if (!dri2_get_xcb_connection(disp, dri2_dpy))
      goto cleanup;

-   status = dri3_x11_connect(dri2_dpy, disp->Options.ForceSoftware);
+   status = dri3_x11_connect(dri2_dpy, disp->Options.Zink, disp->Options.ForceSoftware);
   if (status != DRI2_EGL_DRIVER_LOADED)
      goto cleanup;

--- a/src/egl/drivers/dri2/platform_x11_dri3.c
+++ b/src/egl/drivers/dri2/platform_x11_dri3.c
@@ -527,7 +527,7 @@ struct dri2_egl_display_vtbl dri3_x11_display_vtbl = {
 };

 enum dri2_egl_driver_fail
-dri3_x11_connect(struct dri2_egl_display *dri2_dpy, bool swrast)
+dri3_x11_connect(struct dri2_egl_display *dri2_dpy, bool zink, bool swrast)
 {
   dri2_dpy->fd_render_gpu =
      loader_dri3_open(dri2_dpy->conn, dri2_dpy->screen->root, 0);
@@ -549,15 +549,16 @@ dri3_x11_connect(struct dri2_egl_display *dri2_dpy, bool swrast)
   if (!dri2_dpy->driver_name)
      dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd_render_gpu);

-   if (!strcmp(dri2_dpy->driver_name, "zink") &&
-       !debug_get_bool_option("LIBGL_KOPPER_DISABLE", false)) {
+   if (!zink && !strcmp(dri2_dpy->driver_name, "zink")) {
      close(dri2_dpy->fd_render_gpu);
+      dri2_dpy->fd_render_gpu = -1;
      return DRI2_EGL_DRIVER_PREFER_ZINK;
   }

   if (!dri2_dpy->driver_name) {
      _eglLog(_EGL_WARNING, "DRI3: No driver found");
      close(dri2_dpy->fd_render_gpu);
+      dri2_dpy->fd_render_gpu = -1;
      return DRI2_EGL_DRIVER_FAILED;
   }

--- a/src/egl/drivers/dri2/platform_x11_dri3.h
+++ b/src/egl/drivers/dri2/platform_x11_dri3.h
@@ -36,6 +36,6 @@ extern const __DRIimageLoaderExtension dri3_image_loader_extension;
 extern struct dri2_egl_display_vtbl dri3_x11_display_vtbl;

 enum dri2_egl_driver_fail
-dri3_x11_connect(struct dri2_egl_display *dri2_dpy, bool swrast);
+dri3_x11_connect(struct dri2_egl_display *dri2_dpy, bool zink, bool swrast);

 #endif
--- a/src/freedreno/vulkan/tu_wsi.cc
+++ b/src/freedreno/vulkan/tu_wsi.cc
@@ -24,9 +24,12 @@ tu_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName)
 static bool
 tu_wsi_can_present_on_device(VkPhysicalDevice physicalDevice, int fd)
 {
+#ifdef HAVE_LIBDRM
   VK_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
-
   return wsi_common_drm_devices_equal(fd, pdevice->local_fd);
+#else
+   return true;
+#endif
 }

 VkResult
--- a/src/gallium/auxiliary/gallivm/lp_bld_init_orc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init_orc.cpp
@@ -10,6 +10,7 @@
 #include <string>
 #include <vector>
 #include <mutex>
+#include <cstdlib>
 #include "lp_bld.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_init.h"
@@ -57,7 +58,7 @@
 /* conflict with ObjectLinkingLayer.h */
 #include "util/u_memory.h"

-#if DETECT_ARCH_RISCV64 == 1 || DETECT_ARCH_RISCV32 == 1 || (defined(_WIN32) && LLVM_VERSION_MAJOR >= 15)
+#if DETECT_ARCH_RISCV64 == 1 || DETECT_ARCH_RISCV32 == 1 || DETECT_ARCH_LOONGARCH64 == 1 || (defined(_WIN32) && LLVM_VERSION_MAJOR >= 15)
 /* use ObjectLinkingLayer (JITLINK backend) */
 #define USE_JITLINK
 #endif
@@ -102,6 +103,8 @@ public:

 class LPJit;

+void lpjit_exit();
+
 class LLVMEnsureMultithreaded {
 public:
   LLVMEnsureMultithreaded()
@@ -270,15 +273,19 @@ private:
   LPJit(const LPJit&) = delete;
   LPJit& operator=(const LPJit&) = delete;

+   friend void lpjit_exit();
+
   static void init_native_targets();
   llvm::orc::JITTargetMachineBuilder create_jtdb();

   static void init_lpjit() {
      jit = new LPJit;
+      std::atexit(lpjit_exit);
   }
   static LPJit* jit;

   std::unique_ptr<llvm::orc::LLJIT> lljit;
+   std::unique_ptr<llvm::TargetMachine> tm_unique;
   /* avoid name conflict */
   unsigned jit_dylib_count;

@@ -292,6 +299,11 @@ private:

 LPJit* LPJit::jit = NULL;

+void lpjit_exit()
+{
+   delete LPJit::jit;
+}
+
 LLVMErrorRef module_transform(void *Ctx, LLVMModuleRef mod) {
   struct lp_passmgr *mgr;

@@ -318,7 +330,8 @@ LPJit::LPJit() :jit_dylib_count(0) {

   init_native_targets();
   JITTargetMachineBuilder JTMB = create_jtdb();
-   tm = wrap(ExitOnErr(JTMB.createTargetMachine()).release());
+   tm_unique = ExitOnErr(JTMB.createTargetMachine());
+   tm = wrap(tm_unique.get());

   /* Create an LLJIT instance with an ObjectLinkingLayer (JITLINK)
    * or RuntimeDyld as the base layer.
@@ -410,6 +423,14 @@ llvm::orc::JITTargetMachineBuilder LPJit::create_jtdb() {
 #else
 #error "GALLIVM: unknown target riscv float abi"
 #endif
+#endif
+
+#if DETECT_ARCH_LOONGARCH64 == 1
+#if defined(__loongarch_lp64) && defined(__loongarch_double_float)
+   options.MCOptions.ABIName = "lp64d";
+#else
+#error "GALLIVM: unknown target loongarch float abi"
+#endif
 #endif

   JTMB.setOptions(options);
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -414,6 +414,24 @@ lp_build_fill_mattrs(std::vector<std::string> &MAttrs)
    */
   MAttrs = {"+m","+c","+a","+d","+f"};
 #endif
+
+#if DETECT_ARCH_LOONGARCH64 == 1
+   /*
+    * TODO: Implement util_get_cpu_caps()
+    *
+    * No FPU-less LoongArch64 systems are ever shipped yet, and LP64D is
+    * the default ABI, so FPU is enabled here.
+    *
+    * The Software development convention defaults to have "128-bit
+    * vector", so LSX is enabled here, see
+    * https://github.com/loongson/la-softdev-convention/releases/download/v0.1/la-softdev-convention.pdf
+    */
+   MAttrs = {"+f","+d"};
+#if LLVM_VERSION_MAJOR == 17
+   /* LLVM 17's LSX support is incomplete, so explicitly mask it */
+   MAttrs.push_back("-lsx");
+#endif
+#endif
 }

 void
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -3848,7 +3848,7 @@ atomic_emit(
      LLVMValueRef atom_res = lp_build_alloca(gallivm,
                                              uint_bld->vec_type, "");

-      LLVMValueRef ssbo_limit;
+      LLVMValueRef ssbo_limit = NULL;
      if (!is_shared) {
         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
--- a/src/gallium/drivers/d3d12/d3d12_video_enc_h264.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_video_enc_h264.cpp
@@ -1090,6 +1090,7 @@ d3d12_video_encoder_convert_profile_to_d3d12_enc_profile_h264(enum pipe_video_pr
 {
   switch (profile) {
      case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE:
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
      case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
      {
         return D3D12_VIDEO_ENCODER_PROFILE_H264_MAIN;
--- a/src/gallium/drivers/d3d12/d3d12_video_screen.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_video_screen.cpp
@@ -873,6 +873,7 @@ d3d12_has_video_encode_support(struct pipe_screen *pscreen,
   switch (profile) {
 #if VIDEO_CODEC_H264ENC
      case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE:
+      case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
      case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
      case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
      case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH10:
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -47,6 +47,7 @@
 #include "pan_cmdstream.h"
 #include "pan_context.h"
 #include "pan_csf.h"
+#include "pan_format.h"
 #include "pan_indirect_dispatch.h"
 #include "pan_jm.h"
 #include "pan_job.h"
@@ -195,7 +196,12 @@ panfrost_create_sampler_state(struct pipe_context *pctx,
    * swizzle derived from the format, to allow more formats than the
    * hardware otherwise supports. When packing border colours, we need to
    * undo this bijection, by swizzling with its inverse.
+    * On v10+, watch out for depth+stencil formats, because those have a
+    * swizzle that doesn't really apply to the border color
    */
+#if PAN_ARCH >= 10
+   if (!util_format_is_depth_and_stencil(cso->border_color_format)) {
+#endif
   unsigned mali_format =
      GENX(panfrost_format_from_pipe_format)(cso->border_color_format)->hw;
   enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12);
@@ -207,6 +213,10 @@ panfrost_create_sampler_state(struct pipe_context *pctx,
   util_format_apply_color_swizzle(&so->base.border_color, &cso->border_color,
                                   inverted_swizzle,
                                   false /* is_integer (irrelevant) */);
+#if PAN_ARCH >= 10
+   }
+#endif
+
 #endif

   bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;
@@ -378,6 +388,17 @@ panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
               panfrost_dithered_format_from_pipe_format)(format, dithered);
            cfg.fixed_function.rt = i;

+#if PAN_ARCH >= 7
+            if (cfg.mode == MALI_BLEND_MODE_FIXED_FUNCTION &&
+                (cfg.fixed_function.conversion.memory_format & 0xff) ==
+                   MALI_RGB_COMPONENT_ORDER_RGB1) {
+               /* fixed function does not like RGB1 as the component order */
+               /* force this field to be the default 0 (RGBA) */
+               cfg.fixed_function.conversion.memory_format &= ~0xff;
+               cfg.fixed_function.conversion.memory_format |=
+                  MALI_RGB_COMPONENT_ORDER_RGBA;
+            }
+#endif
 #if PAN_ARCH <= 7
            if (!info.opaque) {
               cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
--- a/src/gallium/drivers/panfrost/pan_screen.c
+++ b/src/gallium/drivers/panfrost/pan_screen.c
@@ -208,7 +208,7 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param)
    * handles this but we need to fix up the border colour.
    */
   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
-      if (dev->arch == 7)
+      if (dev->arch == 7 || dev->arch >= 10)
         return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_FREEDRENO;
      else
         return 0;
--- a/src/gallium/drivers/r300/compiler/radeon_optimize.c
+++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c
@@ -627,7 +627,10 @@ static int peephole_mad_presub_bias(
 		if (rc_inline_to_float(src1_reg.Index) != 2.0f)
 			 return 0;
 	} else {
-	        struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
+		if (src1_reg.File != RC_FILE_CONSTANT)
+			return 0;
+
+		struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
 		if (constant->Type != RC_CONSTANT_IMMEDIATE)
 			return 0;
 	        for (i = 0; i < 4; i++) {
--- a/src/gallium/drivers/zink/zink_context.c
+++ b/src/gallium/drivers/zink/zink_context.c
@@ -2924,8 +2924,14 @@ begin_rendering(struct zink_context *ctx, bool check_msaa_expand)
   if (has_swapchain) {
      ASSERTED struct zink_resource *res = zink_resource(ctx->fb_state.cbufs[0]->texture);
      zink_render_fixup_swapchain(ctx);
-      if (res->use_damage)
+      if (res->use_damage) {
         ctx->dynamic_fb.info.renderArea = res->damage;
+      } else {
+         ctx->dynamic_fb.info.renderArea.offset.x = 0;
+         ctx->dynamic_fb.info.renderArea.offset.y = 0;
+         ctx->dynamic_fb.info.renderArea.extent.width = ctx->fb_state.width;
+         ctx->dynamic_fb.info.renderArea.extent.height = ctx->fb_state.height;
+      }
      /* clamp for late swapchain resize */
      if (res->base.b.width0 < ctx->dynamic_fb.info.renderArea.extent.width)
         ctx->dynamic_fb.info.renderArea.extent.width = res->base.b.width0;
--- a/src/gallium/drivers/zink/zink_descriptors.c
+++ b/src/gallium/drivers/zink/zink_descriptors.c
@@ -416,7 +416,7 @@ init_program_db(struct zink_screen *screen, struct zink_program *pg, enum zink_d
 {
   VkDeviceSize val;
   VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, dsl, &val);
-   pg->dd.db_size[type] = val;
+   pg->dd.db_size[type] = align64(val, screen->info.db_props.descriptorBufferOffsetAlignment);
   pg->dd.db_offset[type] = rzalloc_array(pg, uint32_t, num_bindings);
   for (unsigned i = 0; i < num_bindings; i++) {
      VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, dsl, bindings[i].binding, &val);
@@ -740,7 +740,7 @@ zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shad
      shader->precompile.num_bindings = num_bindings;
      VkDeviceSize val;
      VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, shader->precompile.dsl, &val);
-      shader->precompile.db_size = val;
+      shader->precompile.db_size = align64(val, screen->info.db_props.descriptorBufferOffsetAlignment);
      shader->precompile.db_offset = rzalloc_array(shader, uint32_t, num_bindings);
      for (unsigned i = 0; i < num_bindings; i++) {
         VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, shader->precompile.dsl, bindings[i].binding, &val);
@@ -1146,6 +1146,7 @@ update_separable(struct zink_context *ctx, struct zink_program *pg)
      }
      bs->dd.cur_db_offset[use_buffer] = bs->dd.db_offset;
      bs->dd.db_offset += zs->precompile.db_size;
+
      /* TODO: maybe compile multiple variants for different set counts for compact mode? */
      int set_idx = screen->info.have_EXT_shader_object ? j : j == MESA_SHADER_FRAGMENT;
      VKCTX(CmdSetDescriptorBufferOffsetsEXT)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, set_idx, 1, &use_buffer, &offset);
@@ -1633,7 +1634,7 @@ zink_descriptors_init(struct zink_context *ctx)
      VkDeviceSize val;
      for (unsigned i = 0; i < 2; i++) {
         VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, ctx->dd.push_dsl[i]->layout, &val);
-         ctx->dd.db_size[i] = val;
+         ctx->dd.db_size[i] = align64(val, screen->info.db_props.descriptorBufferOffsetAlignment);
      }
      for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) {
         VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, ctx->dd.push_dsl[0]->layout, i, &val);
@@ -1709,7 +1710,7 @@ zink_descriptor_util_init_fbfetch(struct zink_context *ctx)
   if (zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB) {
      VkDeviceSize val;
      VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, ctx->dd.push_dsl[0]->layout, &val);
-      ctx->dd.db_size[0] = val;
+      ctx->dd.db_size[0] = align64(val, screen->info.db_props.descriptorBufferOffsetAlignment);
      for (unsigned i = 0; i < ARRAY_SIZE(ctx->dd.db_offset); i++) {
         VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, ctx->dd.push_dsl[0]->layout, i, &val);
         ctx->dd.db_offset[i] = val;
--- a/src/gallium/drivers/zink/zink_kopper.c
+++ b/src/gallium/drivers/zink/zink_kopper.c
@@ -887,6 +887,8 @@ zink_kopper_present_queue(struct zink_screen *screen, struct zink_resource *res,
      kopper_present(cpi, screen, -1);
   }
   res->obj->indefinite_acquire = false;
+   res->use_damage = false;
+   memset(&res->damage, 0, sizeof(res->damage));
   cdt->swapchain->images[res->obj->dt_idx].acquired = NULL;
   res->obj->dt_idx = UINT32_MAX;
 }
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -1542,10 +1542,25 @@ zink_set_damage_region(struct pipe_screen *pscreen, struct pipe_resource *pres,

   for (unsigned i = 0; i < nrects; i++) {
      int y = pres->height0 - rects[i].y - rects[i].height;
-      res->damage.extent.width = MAX2(res->damage.extent.width, rects[i].x + rects[i].width);
-      res->damage.extent.height = MAX2(res->damage.extent.height, y + rects[i].height);
-      res->damage.offset.x = MIN2(res->damage.offset.x, rects[i].x);
-      res->damage.offset.y = MIN2(res->damage.offset.y, y);
+      /* convert back to coord-based rects to use coordinate calcs */
+      struct u_rect currect = {
+         .x0 = res->damage.offset.x,
+         .y0 = res->damage.offset.y,
+         .x1 = res->damage.offset.x + res->damage.extent.width,
+         .y1 = res->damage.offset.y + res->damage.extent.height,
+      };
+      struct u_rect newrect = {
+         .x0 = rects[i].x,
+         .y0 = y,
+         .x1 = rects[i].x + rects[i].width,
+         .y1 = y + rects[i].height,
+      };
+      struct u_rect u;
+      u_rect_union(&u, &currect, &newrect);
+      res->damage.extent.width = u.x1 - u.x0;
+      res->damage.extent.height = u.y1 - u.y0;
+      res->damage.offset.x = u.x0;
+      res->damage.offset.y = u.y0;
   }

   res->use_damage = nrects > 0;
--- a/src/gallium/frontends/dri/dri2.c
+++ b/src/gallium/frontends/dri/dri2.c
@@ -2346,7 +2346,7 @@ dri_swrast_kms_init_screen(struct dri_screen *screen, bool driver_name_is_inferr
 #endif

   if (!pscreen)
-       goto fail;
+       return NULL;

   dri_init_options(screen);
   dri2_init_screen_extensions(screen, pscreen, true);
@@ -2364,7 +2364,7 @@ dri_swrast_kms_init_screen(struct dri_screen *screen, bool driver_name_is_inferr
   return configs;

 fail:
-   dri_release_screen(screen);
+   pipe_loader_release(&screen->dev, 1);

 #endif // HAVE_SWRAST
   return NULL;
--- a/src/gallium/frontends/rusticl/api/icd.rs
+++ b/src/gallium/frontends/rusticl/api/icd.rs
@@ -35,7 +35,7 @@ pub static DISPATCH: cl_icd_dispatch = cl_icd_dispatch {
    clRetainCommandQueue: Some(clRetainCommandQueue),
    clReleaseCommandQueue: Some(clReleaseCommandQueue),
    clGetCommandQueueInfo: Some(clGetCommandQueueInfo),
-    clSetCommandQueueProperty: None,
+    clSetCommandQueueProperty: Some(clSetCommandQueueProperty),
    clCreateBuffer: Some(clCreateBuffer),
    clCreateImage2D: Some(clCreateImage2D),
    clCreateImage3D: Some(clCreateImage3D),
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@@ -367,7 +367,14 @@ fn set_kernel_arg(
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
-            _ => {
+
+            KernelArgType::Sampler => {
+                if arg_size != std::mem::size_of::<cl_sampler>() {
+                    return Err(CL_INVALID_ARG_SIZE);
+                }
+            }
+
+            KernelArgType::Constant => {
                if arg.size != arg_size {
                    return Err(CL_INVALID_ARG_SIZE);
                }
--- a/src/gallium/frontends/rusticl/api/memory.rs
+++ b/src/gallium/frontends/rusticl/api/memory.rs
@@ -2192,13 +2192,20 @@ fn enqueue_unmap_mem_object(

    // SAFETY: it's required that applications do not cause data races
    let mapped_ptr = unsafe { MutMemoryPtr::from_ptr(mapped_ptr) };
+    let needs_sync = m.unmap(mapped_ptr)?;
    create_and_queue(
        q,
        CL_COMMAND_UNMAP_MEM_OBJECT,
        evs,
        event,
        false,
-        Box::new(move |q, ctx| m.unmap(q, ctx, mapped_ptr)),
+        Box::new(move |q, ctx| {
+            if needs_sync {
+                m.sync_unmap(q, ctx, mapped_ptr)
+            } else {
+                Ok(())
+            }
+        }),
    )
 }

--- a/src/gallium/frontends/rusticl/api/queue.rs
+++ b/src/gallium/frontends/rusticl/api/queue.rs
@@ -41,6 +41,22 @@ impl CLInfo<cl_command_queue_info> for cl_command_queue {
    }
 }

+#[cl_entrypoint(clSetCommandQueueProperty)]
+fn set_command_queue_property(
+    _command_queue: cl_command_queue,
+    _properties: cl_command_queue_properties,
+    _enable: cl_bool,
+    _old_properties: *mut cl_command_queue_properties,
+) -> CLResult<()> {
+    // clSetCommandQueueProperty may unconditionally return an error if no devices in the context
+    // associated with command_queue support modifying the properties of a command-queue. Support
+    // for modifying the properties of a command-queue is required only for OpenCL 1.0 devices.
+    //
+    // CL_INVALID_OPERATION if no devices in the context associated with command_queue support
+    // modifying the properties of a command-queue.
+    Err(CL_INVALID_OPERATION)
+}
+
 fn valid_command_queue_properties(properties: cl_command_queue_properties) -> bool {
    let valid_flags = cl_bitfield::from(
        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
--- a/src/gallium/frontends/rusticl/core/memory.rs
+++ b/src/gallium/frontends/rusticl/core/memory.rs
@@ -39,6 +39,8 @@ struct Mapping<T> {
    layout: Layout,
    writes: bool,
    ptr: Option<MutMemoryPtr>,
+    /// reference count from the API perspective. Once it reaches 0, we need to write back the
+    /// mapping's content to the GPU resource.
    count: u32,
    inner: T,
 }
@@ -152,10 +154,17 @@ impl Mem {
        }
    }

-    pub fn unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
+    pub fn sync_unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
        match self {
-            Self::Buffer(b) => b.unmap(q, ctx, ptr),
-            Self::Image(i) => i.unmap(q, ctx, ptr),
+            Self::Buffer(b) => b.sync_unmap(q, ctx, ptr),
+            Self::Image(i) => i.sync_unmap(q, ctx, ptr),
+        }
+    }
+
+    pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
+        match self {
+            Self::Buffer(b) => b.unmap(ptr),
+            Self::Image(i) => i.unmap(ptr),
        }
    }
 }
@@ -712,7 +721,9 @@ impl MemBase {

    fn is_pure_user_memory(&self, d: &Device) -> CLResult<bool> {
        let r = self.get_res_of_dev(d)?;
-        Ok(r.is_user())
+        // 1Dbuffer objects are weird. The parent memory object can be a host_ptr thing, but we are
+        // not allowed to actually return a pointer based on the host_ptr when mapping.
+        Ok(r.is_user() && !self.host_ptr().is_null())
    }

    fn map<T>(
@@ -912,7 +923,9 @@ impl Buffer {
    }

    fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
-        self.maps.lock().unwrap().contains_key(ptr as usize)
+        let mut maps = self.maps.lock().unwrap();
+        let entry = maps.entry(ptr as usize);
+        matches!(entry, Entry::Occupied(entry) if entry.get().count > 0)
    }

    pub fn map(&self, size: usize, offset: usize, writes: bool) -> CLResult<MutMemoryPtr> {
@@ -993,6 +1006,31 @@ impl Buffer {
        self.read(q, ctx, mapping.offset, ptr, mapping.size())
    }

+    pub fn sync_unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
+        // no need to update
+        if self.is_pure_user_memory(q.device)? {
+            return Ok(());
+        }
+
+        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
+            Entry::Occupied(entry) => {
+                let mapping = entry.get();
+
+                if mapping.writes {
+                    self.write(q, ctx, mapping.offset, ptr.into(), mapping.size())?;
+                }
+
+                // only remove if the mapping wasn't reused in the meantime
+                if mapping.count == 0 {
+                    entry.remove();
+                }
+
+                Ok(())
+            }
+        }
+    }
+
    fn tx<'a>(
        &self,
        q: &Queue,
@@ -1014,22 +1052,16 @@ impl Buffer {
    }

    // TODO: only sync on unmap when the memory is not mapped for writing
-    pub fn unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
-        let mapping = match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
-            Entry::Vacant(_) => return Err(CL_INVALID_VALUE),
+    pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
+        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
            Entry::Occupied(mut entry) => {
-                entry.get_mut().count -= 1;
-                (entry.get().count == 0).then(|| entry.remove())
+                let entry = entry.get_mut();
+                debug_assert!(entry.count > 0);
+                entry.count -= 1;
+                Ok(entry.count == 0)
            }
-        };
-
-        if let Some(mapping) = mapping {
-            if mapping.writes && !self.is_pure_user_memory(q.device)? {
-                self.write(q, ctx, mapping.offset, ptr.into(), mapping.size())?;
-            }
-        };
-
-        Ok(())
+        }
    }

    pub fn write(
@@ -1289,7 +1321,9 @@ impl Image {
    }

    fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
-        self.maps.lock().unwrap().contains_key(ptr as usize)
+        let mut maps = self.maps.lock().unwrap();
+        let entry = maps.entry(ptr as usize);
+        matches!(entry, Entry::Occupied(entry) if entry.get().count > 0)
    }

    pub fn is_parent_buffer(&self) -> bool {
@@ -1309,8 +1343,33 @@ impl Image {
        *row_pitch = self.image_desc.row_pitch()? as usize;
        *slice_pitch = self.image_desc.slice_pitch();

-        let (offset, size) =
-            CLVec::calc_offset_size(origin, region, [pixel_size, *row_pitch, *slice_pitch]);
+        let offset = CLVec::calc_offset(origin, [pixel_size, *row_pitch, *slice_pitch]);
+
+        // From the CL Spec:
+        //
+        //   The pointer returned maps a 1D, 2D or 3D region starting at origin and is at least
+        //   region[0] pixels in size for a 1D image, 1D image buffer or 1D image array,
+        //   (image_row_pitch × region[1]) pixels in size for a 2D image or 2D image array, and
+        //   (image_slice_pitch × region[2]) pixels in size for a 3D image. The result of a memory
+        //   access outside this region is undefined.
+        //
+        // It's not guaranteed that the row_pitch is taken into account for 1D images, but the CL
+        // CTS relies on this behavior.
+        //
+        // Also note that the spec wording is wrong with regard to arrays, which need to take the
+        // image_slice_pitch into account.
+        let size = if self.image_desc.is_array() || self.image_desc.dims() == 3 {
+            debug_assert_ne!(*slice_pitch, 0);
+            // the slice count is in region[1] for 1D array images
+            if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
+                region[1] * *slice_pitch
+            } else {
+                region[2] * *slice_pitch
+            }
+        } else {
+            debug_assert_ne!(*row_pitch, 0);
+            region[1] * *row_pitch
+        };

        let layout;
        unsafe {
@@ -1418,6 +1477,41 @@ impl Image {
        )
    }

+    pub fn sync_unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
+        // no need to update
+        if self.is_pure_user_memory(q.device)? {
+            return Ok(());
+        }
+
+        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
+            Entry::Occupied(entry) => {
+                let mapping = entry.get();
+                let row_pitch = self.image_desc.row_pitch()? as usize;
+                let slice_pitch = self.image_desc.slice_pitch();
+
+                if mapping.writes {
+                    self.write(
+                        ptr.into(),
+                        q,
+                        ctx,
+                        &mapping.region,
+                        row_pitch,
+                        slice_pitch,
+                        &mapping.origin,
+                    )?;
+                }
+
+                // only remove if the mapping wasn't reused in the meantime
+                if mapping.count == 0 {
+                    entry.remove();
+                }
+
+                Ok(())
+            }
+        }
+    }
+
    fn tx_image<'a>(
        &self,
        q: &Queue,
@@ -1430,33 +1524,16 @@ impl Image {
    }

    // TODO: only sync on unmap when the memory is not mapped for writing
-    pub fn unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
-        let mapping = match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
-            Entry::Vacant(_) => return Err(CL_INVALID_VALUE),
+    pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
+        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
            Entry::Occupied(mut entry) => {
-                entry.get_mut().count -= 1;
-                (entry.get().count == 0).then(|| entry.remove())
-            }
-        };
-
-        let row_pitch = self.image_desc.row_pitch()? as usize;
-        let slice_pitch = self.image_desc.slice_pitch();
-
-        if let Some(mapping) = mapping {
-            if mapping.writes && !self.is_pure_user_memory(q.device)? {
-                self.write(
-                    ptr.into(),
-                    q,
-                    ctx,
-                    &mapping.region,
-                    row_pitch,
-                    slice_pitch,
-                    &mapping.origin,
-                )?;
+                let entry = entry.get_mut();
+                debug_assert!(entry.count > 0);
+                entry.count -= 1;
+                Ok(entry.count == 0)
            }
        }
-
-        Ok(())
    }

    pub fn write(
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -22,8 +22,14 @@ if with_ld_dynamic_list
  gallium_dri_link_depends += files('../dri.dyn')
 endif

+if get_option('unversion-libgallium') or with_platform_android
+  libgallium_name = 'gallium_dri'
+else
+  libgallium_name = 'gallium-@0@'.format(meson.project_version())
+endif
+
 libgallium_dri = shared_library(
-  'gallium-@0@'.format(meson.project_version()),
+  libgallium_name,
  files('dri_target.c'),
  include_directories : [
    inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_util, inc_gallium_drivers,
--- a/src/intel/blorp/blorp_genX_exec_brw.h
+++ b/src/intel/blorp/blorp_genX_exec_brw.h
@@ -593,7 +593,10 @@ blorp_emit_cc_viewport(struct blorp_batch *batch)
 {
   uint32_t cc_vp_offset;

-   if (batch->blorp->config.use_cached_dynamic_states) {
+   /* Somehow reusing CC_VIEWPORT on Gfx9 is causing issues :
+    *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
+    */
+   if (GFX_VER != 9 && batch->blorp->config.use_cached_dynamic_states) {
      cc_vp_offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_CC_VIEWPORT);
   } else {
      blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
--- a/src/intel/compiler/brw_fs_saturate_propagation.cpp
+++ b/src/intel/compiler/brw_fs_saturate_propagation.cpp
@@ -75,6 +75,9 @@ opt_saturate_propagation_local(fs_visitor &s, bblock_t *block)
                 !scan_inst->can_change_types()))
               break;

+            if (scan_inst->flags_written(s.devinfo) != 0)
+               break;
+
            if (scan_inst->saturate) {
               inst->saturate = false;
               progress = true;
--- a/src/intel/compiler/brw_nir_lower_rt_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_rt_intrinsics.c
@@ -24,6 +24,24 @@
 #include "brw_nir_rt.h"
 #include "brw_nir_rt_builder.h"

+static nir_def *
+nir_build_vec3_mat_mult_col_major(nir_builder *b, nir_def *vec,
+                                  nir_def *matrix[], bool translation)
+{
+   nir_def *result_components[3] = {
+      nir_channel(b, matrix[3], 0),
+      nir_channel(b, matrix[3], 1),
+      nir_channel(b, matrix[3], 2),
+   };
+   for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 3; ++j) {
+         nir_def *v = nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[j], 1 << i));
+         result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v;
+      }
+   }
+   return nir_vec(b, result_components, 3);
+}
+
 static nir_def *
 build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
 {
@@ -163,11 +181,27 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
            break;

         case nir_intrinsic_load_ray_object_origin:
-            sysval = object_ray_in.orig;
+            if (stage == MESA_SHADER_CLOSEST_HIT) {
+               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+
+               sysval = nir_build_vec3_mat_mult_col_major(
+                  b, world_ray_in.orig, leaf.world_to_object, true);
+            } else {
+               sysval = object_ray_in.orig;
+            }
            break;

         case nir_intrinsic_load_ray_object_direction:
-            sysval = object_ray_in.dir;
+            if (stage == MESA_SHADER_CLOSEST_HIT) {
+               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+
+               sysval = nir_build_vec3_mat_mult_col_major(
+                  b, world_ray_in.dir, leaf.world_to_object, false);
+            } else {
+               sysval = object_ray_in.dir;
+            }
            break;

         case nir_intrinsic_load_ray_t_min:
--- a/src/intel/compiler/elk/elk_fs_saturate_propagation.cpp
+++ b/src/intel/compiler/elk/elk_fs_saturate_propagation.cpp
@@ -45,7 +45,8 @@ using namespace elk;
 */

 static bool
-opt_saturate_propagation_local(const fs_live_variables &live, elk_bblock_t *block)
+opt_saturate_propagation_local(const intel_device_info *devinfo,
+                               const fs_live_variables &live, elk_bblock_t *block)
 {
   bool progress = false;
   int ip = block->end_ip + 1;
@@ -74,6 +75,16 @@ opt_saturate_propagation_local(const fs_live_variables &live, elk_bblock_t *bloc
                 !scan_inst->can_change_types()))
               break;

+            /* min and max pseudo ops modify the flags on Gfx4 and Gfx5, but
+             * it's not based on the result of the operation. This is the one
+             * case where it is always safe to propagate a saturate to an
+             * instruction that writes the flags.
+             */
+            if (scan_inst->flags_written(devinfo) != 0 &&
+                scan_inst->opcode != ELK_OPCODE_SEL) {
+               break;
+            }
+
            if (scan_inst->saturate) {
               inst->saturate = false;
               progress = true;
@@ -156,7 +167,7 @@ elk_fs_visitor::opt_saturate_propagation()
   bool progress = false;

   foreach_block (block, cfg) {
-      progress = opt_saturate_propagation_local(live, block) || progress;
+      progress = opt_saturate_propagation_local(devinfo, live, block) || progress;
   }

   /* Live intervals are still valid. */
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -2023,15 +2023,15 @@ intel_device_info_wa_stepping(struct intel_device_info *devinfo)
 uint32_t
 intel_device_info_get_max_slm_size(const struct intel_device_info *devinfo)
 {
-   uint32_t k_bytes = 0;
+   uint32_t bytes = 0;

   if (devinfo->verx10 >= 200) {
-      k_bytes = intel_device_info_get_max_preferred_slm_size(devinfo);
+      bytes = intel_device_info_get_max_preferred_slm_size(devinfo);
   } else {
-      k_bytes = 64;
+      bytes = 64 * 1024;
   }

-   return k_bytes * 1024;
+   return bytes;
 }

 uint32_t
--- a/src/intel/vulkan/genX_gfx_state.c
+++ b/src/intel/vulkan/genX_gfx_state.c
@@ -1825,7 +1825,13 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
      }
   }

-   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC)) {
+   /* Force CC_VIEWPORT reallocation on Gfx9 when reprogramming
+    * 3DSTATE_VIEWPORT_STATE_POINTERS_CC :
+    *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
+    */
+   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
+       (GFX_VER == 9 &&
+        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR))) {
      hw_state->vp_cc.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->vp_cc.count * 8, 32);
--- a/src/intel/vulkan/genX_simple_shader.c
+++ b/src/intel/vulkan/genX_simple_shader.c
@@ -110,7 +110,8 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state)

   genX(emit_l3_config)(batch, device, state->l3_config);

-   state->cmd_buffer->state.current_l3_config = state->l3_config;
+   if (state->cmd_buffer)
+      state->cmd_buffer->state.current_l3_config = state->l3_config;

   enum intel_urb_deref_block_size deref_block_size;
   genX(emit_urb_setup)(device, batch, state->l3_config,
--- a/src/nouveau/vulkan/nvk_buffer_view.c
+++ b/src/nouveau/vulkan/nvk_buffer_view.c
@@ -13,6 +13,8 @@

 #include "vk_format.h"

+#include "clb097.h"
+
 VkFormatFeatureFlags2
 nvk_get_buffer_format_features(struct nvk_physical_device *pdev,
                               VkFormat vk_format)
@@ -29,6 +31,8 @@ nvk_get_buffer_format_features(struct nvk_physical_device *pdev,
      if (nil_format_supports_storage(&pdev->info, p_format)) {
         features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT |
                     VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+         if (pdev->info.cls_eng3d >= MAXWELL_A)
+            features |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
      }

      if (p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT)
--- a/src/nouveau/vulkan/nvk_image.c
+++ b/src/nouveau/vulkan/nvk_image.c
@@ -267,6 +267,9 @@ vk_image_usage_to_format_features(VkImageUsageFlagBits usage_flag)
      return VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT;
   case VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT:
      return VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT;
+   case VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT:
+      return VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+             VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT;
   default:
      return 0;
   }
--- a/src/panfrost/lib/pan_format.c
+++ b/src/panfrost/lib/pan_format.c
@@ -30,18 +30,26 @@

 /* Convenience */

-#define MALI_BLEND_AU_R8G8B8A8    (MALI_RGBA8_TB << 12)
-#define MALI_BLEND_PU_R8G8B8A8    (MALI_RGBA8_TB << 12)
-#define MALI_BLEND_AU_R10G10B10A2 (MALI_RGB10_A2_TB << 12)
-#define MALI_BLEND_PU_R10G10B10A2 (MALI_RGB10_A2_TB << 12)
-#define MALI_BLEND_AU_R8G8B8A2    (MALI_RGB8_A2_AU << 12)
-#define MALI_BLEND_PU_R8G8B8A2    (MALI_RGB8_A2_PU << 12)
-#define MALI_BLEND_AU_R4G4B4A4    (MALI_RGBA4_AU << 12)
-#define MALI_BLEND_PU_R4G4B4A4    (MALI_RGBA4_PU << 12)
-#define MALI_BLEND_AU_R5G6B5A0    (MALI_R5G6B5_AU << 12)
-#define MALI_BLEND_PU_R5G6B5A0    (MALI_R5G6B5_PU << 12)
-#define MALI_BLEND_AU_R5G5B5A1    (MALI_RGB5_A1_AU << 12)
-#define MALI_BLEND_PU_R5G5B5A1    (MALI_RGB5_A1_PU << 12)
+#if PAN_ARCH == 6
+#define MALI_RGBA_SWIZZLE         PAN_V6_SWIZZLE(R, G, B, A)
+#define MALI_RGB1_SWIZZLE         PAN_V6_SWIZZLE(R, G, B, A)
+#else
+#define MALI_RGBA_SWIZZLE         MALI_RGB_COMPONENT_ORDER_RGBA
+#define MALI_RGB1_SWIZZLE         MALI_RGB_COMPONENT_ORDER_RGB1
+#endif
+
+#define MALI_BLEND_AU_R8G8B8A8    (MALI_RGBA8_TB << 12)    | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_PU_R8G8B8A8    (MALI_RGBA8_TB << 12)    | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_AU_R10G10B10A2 (MALI_RGB10_A2_TB << 12) | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_PU_R10G10B10A2 (MALI_RGB10_A2_TB << 12) | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_AU_R8G8B8A2    (MALI_RGB8_A2_AU << 12)  | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_PU_R8G8B8A2    (MALI_RGB8_A2_PU << 12)  | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_AU_R4G4B4A4    (MALI_RGBA4_AU << 12)    | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_PU_R4G4B4A4    (MALI_RGBA4_PU << 12)    | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_AU_R5G6B5A0    (MALI_R5G6B5_AU << 12)   | MALI_RGB1_SWIZZLE
+#define MALI_BLEND_PU_R5G6B5A0    (MALI_R5G6B5_PU << 12)   | MALI_RGB1_SWIZZLE
+#define MALI_BLEND_AU_R5G5B5A1    (MALI_RGB5_A1_AU << 12)  | MALI_RGBA_SWIZZLE
+#define MALI_BLEND_PU_R5G5B5A1    (MALI_RGB5_A1_PU << 12)  | MALI_RGBA_SWIZZLE

 #if PAN_ARCH <= 5
 #define BFMT2(pipe, internal, writeback, srgb)                                 \
@@ -50,18 +58,6 @@
      MALI_COLOR_FORMAT_##writeback,                                           \
      { 0, 0 },                                                                \
   }
-#elif PAN_ARCH == 6
-#define BFMT2(pipe, internal, writeback, srgb)                                 \
-   [PIPE_FORMAT_##pipe] = {                                                    \
-      MALI_COLOR_BUFFER_INTERNAL_FORMAT_##internal,                            \
-      MALI_COLOR_FORMAT_##writeback,                                           \
-      {                                                                        \
-         MALI_BLEND_PU_##internal | (srgb ? (1 << 20) : 0) |                   \
-            PAN_V6_SWIZZLE(R, G, B, A),                                        \
-         MALI_BLEND_AU_##internal | (srgb ? (1 << 20) : 0) |                   \
-            PAN_V6_SWIZZLE(R, G, B, A),                                        \
-      },                                                                       \
-   }
 #else
 #define BFMT2(pipe, internal, writeback, srgb)                                 \
   [PIPE_FORMAT_##pipe] = {                                                    \
--- a/src/util/detect_arch.h
+++ b/src/util/detect_arch.h
@@ -112,6 +112,14 @@
 #endif
 #endif

+#if defined(__loongarch__)
+#ifdef __loongarch_lp64
+#define DETECT_ARCH_LOONGARCH64 1
+#else
+#error "detect_arch: unknown target loongarch base ABI type"
+#endif
+#endif
+
 #ifndef DETECT_ARCH_X86
 #define DETECT_ARCH_X86 0
 #endif
@@ -168,4 +176,8 @@
 #define DETECT_ARCH_RISCV64 0
 #endif

+#ifndef DETECT_ARCH_LOONGARCH64
+#define DETECT_ARCH_LOONGARCH64 0
+#endif
+
 #endif /* UTIL_DETECT_ARCH_H_ */
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -240,6 +240,12 @@ do {                       \
 #  endif
 #endif

+#ifdef HAVE_FUNC_ATTRIBUTE_OPTIMIZE
+#define ATTRIBUTE_OPTIMIZE(flags) __attribute__((__optimize__((flags))))
+#else
+#define ATTRIBUTE_OPTIMIZE(flags)
+#endif
+
 #ifdef __cplusplus
 /**
 * Macro function that evaluates to true if T is a trivially
--- a/src/util/tests/u_debug_stack_test.cpp
+++ b/src/util/tests/u_debug_stack_test.cpp
@@ -49,7 +49,8 @@ func_b(void)
   debug_backtrace_dump(backtrace, 16);
 }

-static void ATTRIBUTE_NOINLINE
+/* This function must emit a stack frame for the unit test to work */
+static void ATTRIBUTE_NOINLINE ATTRIBUTE_OPTIMIZE("no-omit-frame-pointer")
 func_c(struct debug_stack_frame *frames)
 {
   debug_backtrace_capture(frames, 0, 16);
--- a/src/util/u_printf.c
+++ b/src/util/u_printf.c
@@ -166,10 +166,9 @@ u_printf_impl(FILE *out, const char *buffer, size_t buffer_size,
         int arg_size = fmt->arg_sizes[i];
         size_t spec_pos = util_printf_next_spec_pos(format, 0);

-         if (spec_pos == -1) {
-            u_printf_plain(out, format);
-            continue;
-         }
+         /* If we hit an unused argument we skip all remaining ones */
+         if (spec_pos == -1)
+            break;

         const char *token = util_printf_prev_tok(&format[spec_pos]);
         const char *next_format = &format[spec_pos + 1];
--- a/src/vulkan/wsi/wsi_common.c
+++ b/src/vulkan/wsi/wsi_common.c
@@ -58,6 +58,10 @@ static const struct debug_control debug_control[] = {
   { NULL, },
 };

+static bool present_false(VkPhysicalDevice pdevice, int fd) {
+   return false;
+}
+
 VkResult
 wsi_device_init(struct wsi_device *wsi,
                VkPhysicalDevice pdevice,
@@ -270,6 +274,21 @@ wsi_device_init(struct wsi_device *wsi,
      }
   }

+   /* can_present_on_device is a function pointer used to determine if images
+    * can be presented directly on a given device file descriptor (fd).
+    * If HAVE_LIBDRM is defined, it will be initialized to a platform-specific
+    * function (wsi_device_matches_drm_fd). Otherwise, it is initialized to
+    * present_false to ensure that it always returns false, preventing potential
+    * segmentation faults from unchecked calls.
+    * Drivers for non-PCI based GPUs are expected to override this after calling
+    * wsi_device_init().
+    */
+#ifdef HAVE_LIBDRM
+   wsi->can_present_on_device = wsi_device_matches_drm_fd;
+#else
+   wsi->can_present_on_device = present_false;
+#endif
+
   return VK_SUCCESS;
 fail:
   wsi_device_finish(wsi, alloc);
--- a/src/vulkan/wsi/wsi_common_display.c
+++ b/src/vulkan/wsi/wsi_common_display.c
@@ -1100,7 +1100,7 @@ wsi_display_surface_get_present_rectangles(VkIcdSurfaceBase *surface_base,
   wsi_display_mode *mode = wsi_display_mode_from_handle(surface->displayMode);
   VK_OUTARRAY_MAKE_TYPED(VkRect2D, out, pRects, pRectCount);

-   if (wsi_device_matches_drm_fd(wsi_device, mode->connector->wsi->fd)) {
+   if (wsi_device->can_present_on_device(wsi_device->pdevice, mode->connector->wsi->fd)) {
      vk_outarray_append_typed(VkRect2D, &out, rect) {
         *rect = (VkRect2D) {
            .offset = { 0, 0 },
@@ -3114,7 +3114,7 @@ wsi_AcquireDrmDisplayEXT(VkPhysicalDevice physicalDevice,
   VK_FROM_HANDLE(vk_physical_device, pdevice, physicalDevice);
   struct wsi_device *wsi_device = pdevice->wsi_device;

-   if (!wsi_device_matches_drm_fd(wsi_device, drmFd))
+   if (!wsi_device->can_present_on_device(wsi_device->pdevice, drmFd))
      return VK_ERROR_UNKNOWN;

   struct wsi_display *wsi =
@@ -3148,7 +3148,7 @@ wsi_GetDrmDisplayEXT(VkPhysicalDevice physicalDevice,
   VK_FROM_HANDLE(vk_physical_device, pdevice, physicalDevice);
   struct wsi_device *wsi_device = pdevice->wsi_device;

-   if (!wsi_device_matches_drm_fd(wsi_device, drmFd)) {
+   if (!wsi_device->can_present_on_device(wsi_device->pdevice, drmFd)) {
      *pDisplay = VK_NULL_HANDLE;
      return VK_ERROR_UNKNOWN;
   }
--- a/src/vulkan/wsi/wsi_common_drm.c
+++ b/src/vulkan/wsi/wsi_common_drm.c
@@ -440,10 +440,10 @@ wsi_common_drm_devices_equal(int fd_a, int fd_b)
 }

 bool
-wsi_device_matches_drm_fd(const struct wsi_device *wsi, int drm_fd)
+wsi_device_matches_drm_fd(VkPhysicalDevice physicalDevice, int drm_fd)
 {
-   if (wsi->can_present_on_device)
-      return wsi->can_present_on_device(wsi->pdevice, drm_fd);
+   VK_FROM_HANDLE(vk_physical_device, pdevice, physicalDevice);
+   const struct wsi_device *wsi = pdevice->wsi_device;

   drmDevicePtr fd_device;
   int ret = drmGetDevice2(drm_fd, 0, &fd_device);
--- a/src/vulkan/wsi/wsi_common_private.h
+++ b/src/vulkan/wsi/wsi_common_private.h
@@ -225,7 +225,7 @@ struct wsi_swapchain {
 };

 bool
-wsi_device_matches_drm_fd(const struct wsi_device *wsi, int drm_fd);
+wsi_device_matches_drm_fd(VkPhysicalDevice pdevice, int drm_fd);

 void
 wsi_wl_surface_destroy(VkIcdSurfaceBase *icd_surface, VkInstance _instance,
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@@ -160,7 +160,7 @@ wsi_x11_check_dri3_compatible(const struct wsi_device *wsi_dev,
   if (dri3_fd == -1)
      return true;

-   bool match = wsi_device_matches_drm_fd(wsi_dev, dri3_fd);
+   bool match = wsi_dev->can_present_on_device(wsi_dev->pdevice, dri3_fd);

   close(dri3_fd);

@@ -1071,9 +1071,11 @@ struct x11_image {
    * We need to keep track of them when considering present ID. */

   /* This is arbitrarily chosen. With IMMEDIATE on a 3 deep swapchain,
-    * we allow up to 48 outstanding presentations per vblank, which is more than enough
-    * for any reasonable application. */
-#define X11_SWAPCHAIN_MAX_PENDING_COMPLETIONS 16
+    * we allow over 300 outstanding presentations per vblank, which is more than enough
+    * for any reasonable application.
+    * This used to be 16, but it regressed benchmarks that did 15k+ FPS.
+    * This should allow over 25k FPS on a 60 Hz monitor. Any more than this is comical. */
+#define X11_SWAPCHAIN_MAX_PENDING_COMPLETIONS 128
   uint32_t                                  present_queued_count;
   struct x11_image_pending_completion       pending_completions[X11_SWAPCHAIN_MAX_PENDING_COMPLETIONS];
 #ifdef HAVE_DRI3_EXPLICIT_SYNC