Compare commits
32 Commits
mesa-25.1. ... mesa-24.3.

| Author | SHA1 | Date |
|---|---|---|
| | 4c41bb9bef | |
| | 1bc37bb465 | |
| | e86386df89 | |
| | e839ff344e | |
| | 3e45c3eec2 | |
| | e7ebb97fdf | |
| | 97d974a3ad | |
| | 6c9587db99 | |
| | dc8e19aede | |
| | d185a4658e | |
| | e3f3e315af | |
| | 27b2c2b869 | |
| | a9f1c10a10 | |
| | 739c3615ce | |
| | 4a71355172 | |
| | dd14b60b49 | |
| | 349687c73a | |
| | 67bd351553 | |
| | eb34c059be | |
| | 6965aff4d1 | |
| | ea9b3f928d | |
| | 7994534fe9 | |
| | 1e792b0933 | |
| | 08955d2ee8 | |
| | 8f53de4a5d | |
| | baba2805ca | |
| | 7cef55b993 | |
| | b856d0d3cc | |
| | 1ab129ba70 | |
| | 7dc84d1c96 | |
| | 93d5d587f5 | |
| | 85ba713d76 | |
@@ -54,7 +54,7 @@ workflow:
     # pipeline for direct pushes that bypassed the CI
     - if: &is-direct-push $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $GITLAB_USER_LOGIN != "marge-bot"
       variables:
-        JOB_PRIORITY: 40
+        JOB_PRIORITY: 70
 
 
   variables:
@@ -34,24 +34,6 @@ function get_current_minsec {
     printf "%02d:%02d" $((CURR_TIME/60)) $((CURR_TIME%60))
 }
 
-function error {
-    x_off 2>/dev/null
-    RED="\e[0;31m"
-    ENDCOLOR="\e[0m"
-    # we force the following to be not in a section
-    if [ -n "${CURRENT_SECTION:-}" ]; then
-        _section_end $CURRENT_SECTION
-    fi
-
-    CURR_MINSEC=$(get_current_minsec)
-    echo -e "\n${RED}[${CURR_MINSEC}] ERROR: $*${ENDCOLOR}\n"
-    x_restore
-}
-
-function trap_err {
-    error ${CURRENT_SECTION:-'unknown-section'}: ret code: $*
-}
-
 function _build_section_start {
     local section_params=$1
     shift
@@ -68,13 +50,13 @@ function _build_section_start {
 alias build_section_start="x_off; _build_section_start"
 
 function _section_start {
-    _build_section_start "[collapsed=true]" $*
+    build_section_start "[collapsed=true]" $*
     x_restore
 }
 alias section_start="x_off; _section_start"
 
 function _uncollapsed_section_start {
-    _build_section_start "" $*
+    build_section_start "" $*
     x_restore
 }
 alias uncollapsed_section_start="x_off; _uncollapsed_section_start"
@@ -87,7 +69,7 @@ function _build_section_end {
 alias build_section_end="x_off; _build_section_end"
 
 function _section_end {
-    _build_section_end $*
+    build_section_end $*
     x_restore
 }
 alias section_end="x_off; _section_end"
@@ -95,9 +77,10 @@ alias section_end="x_off; _section_end"
 function _section_switch {
     if [ -n "$CURRENT_SECTION" ]
     then
-        _build_section_end $CURRENT_SECTION
+        build_section_end $CURRENT_SECTION
+        x_off
     fi
-    _build_section_start "[collapsed=true]" $*
+    build_section_start "[collapsed=true]" $*
     x_restore
 }
 alias section_switch="x_off; _section_switch"
@@ -105,9 +88,10 @@ alias section_switch="x_off; _section_switch"
 function _uncollapsed_section_switch {
     if [ -n "$CURRENT_SECTION" ]
     then
-        _build_section_end $CURRENT_SECTION
+        build_section_end $CURRENT_SECTION
+        x_off
     fi
-    _build_section_start "" $*
+    build_section_start "" $*
     x_restore
 }
 alias uncollapsed_section_switch="x_off; _uncollapsed_section_switch"
@@ -116,8 +100,6 @@ export -f _x_store_state
 export -f _x_off
 export -f _x_restore
 export -f get_current_minsec
-export -f error
-export -f trap_err
 export -f _build_section_start
 export -f _section_start
 export -f _build_section_end
@@ -136,5 +118,27 @@ if [ -z "${RESULTS_DIR:-}" ]; then
     mkdir -p "${RESULTS_DIR}"
 fi
 
+function error {
+    x_off 2>/dev/null
+    RED="\e[0;31m"
+    ENDCOLOR="\e[0m"
+    # we force the following to be not in a section
+    if [ -n "${CURRENT_SECTION:-}" ]; then
+        section_end $CURRENT_SECTION
+        x_off
+    fi
+
+    CURR_MINSEC=$(get_current_minsec)
+    echo -e "\n${RED}[${CURR_MINSEC}] ERROR: $*${ENDCOLOR}\n"
+    x_restore
+}
+
+function trap_err {
+    error ${CURRENT_SECTION:-'unknown-section'}: ret code: $*
+}
+
+export -f error
+export -f trap_err
+
 set -E
 trap 'trap_err $?' ERR
@@ -370,7 +370,7 @@ yaml-toml-shell-test:
   after_script:
     # Keep the results path the same as baremetal and LAVA
     - mkdir -p "${JOB_FOLDER}"/results
-    - mv "${JOB_FOLDER}"/results results/
+    - mv "${JOB_FOLDER}"/results ./
     - !reference [default, after_script]
 
   artifacts:
.pick_status.json (new file, 2602 lines): file diff suppressed because it is too large.
@@ -1848,7 +1848,7 @@ endif
   dep_spirv_tools = dependency(
     'SPIRV-Tools',
     required : with_clover_spirv or with_clc,
-    version : '>= 2018.0'
+    version : '>= 2022.1'
   )
   if dep_spirv_tools.found()
     pre_args += '-DHAVE_SPIRV_TOOLS'
@@ -448,16 +448,17 @@ agx_nir_fs_epilog(nir_builder *b, const void *key_)
     * to the epilog, when sample shading is not used but blending is.
     */
    if (key->link.sample_shading) {
-      NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
-      NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);
-
       /* Lower the resulting discards. Done in agx_nir_lower_monolithic_msaa for
-       * the pixel shaded path.
+       * the pixel shaded path. Must be done before agx_nir_lower_to_per_sample
+       * to avoid duplicating tests.
        */
       if (key->blend.alpha_to_coverage) {
          NIR_PASS(_, b->shader, agx_nir_lower_sample_mask);
       }
 
+      NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
+      NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);
+
       /* Ensure the sample ID is preserved in register. We do this late since it
        * has to go in the last block, and the above passes might add control
        * flow when lowering.
@@ -1973,11 +1973,12 @@ emit_frag_end(struct v3d_compile *c)
          */
         if (c->output_position_index == -1 &&
             !(c->s->info.num_images || c->s->info.num_ssbos) &&
-            !c->s->info.fs.uses_discard &&
             !c->fs_key->sample_alpha_to_coverage &&
             c->output_sample_mask_index == -1 &&
             has_any_tlb_color_write) {
-                c->s->info.fs.early_fragment_tests = true;
+                c->s->info.fs.early_fragment_tests =
+                        !c->s->info.fs.uses_discard ||
+                        c->fs_key->can_earlyz_with_discard;
         }
 
         /* By default, Z buffer writes are implicit using the Z values produced
@@ -426,6 +426,7 @@ struct v3d_fs_key {
         bool msaa;
         bool sample_alpha_to_coverage;
         bool sample_alpha_to_one;
+        bool can_earlyz_with_discard;
         /* Mask of which color render targets are present. */
         uint8_t cbufs;
         uint8_t swap_color_rb;
@@ -584,8 +584,10 @@ nir_lower_mediump_vars(nir_shader *shader, nir_variable_mode modes)
             nir_variable *var = nir_deref_instr_get_variable(deref);
 
             /* If we have atomic derefs that we can't track, then don't lower any mediump. */
-            if (!var)
+            if (!var) {
+               ralloc_free(no_lower_set);
                return false;
+            }
 
             _mesa_set_add(no_lower_set, var);
             break;
@@ -825,6 +825,24 @@ assign_src(struct ra_ctx *ctx, struct ir3_register *src)
    interval->src = false;
 }
 
+static bool
+is_nontrivial_collect(struct ir3_instruction *collect)
+{
+   if (collect->opc != OPC_META_COLLECT) {
+      return false;
+   }
+
+   struct ir3_register *dst = collect->dsts[0];
+
+   foreach_src_n (src, src_n, collect) {
+      if (src->num != dst->num + src_n) {
+         return true;
+      }
+   }
+
+   return false;
+}
+
 static void
 handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
            struct ir3_register *dst)
@@ -861,10 +879,26 @@ handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
       free_space(ctx, physreg, size);
    }
 
+   dst->num = ra_physreg_to_num(physreg, dst->flags);
+
+   /* Non-trivial collects (i.e., ones that will introduce moves because the
+    * sources don't line-up with the destination) may cause source intervals to
+    * get implicitly moved when they are inserted as children of the destination
+    * interval. Since we don't support moving intervals in shared RA, this may
+    * cause illegal register allocations. Prevent this by creating a new
+    * top-level interval for the destination so that the source intervals will
+    * be left alone.
+    */
+   if (is_nontrivial_collect(instr)) {
+      dst->merge_set = NULL;
+      dst->interval_start = ctx->live->interval_offset;
+      dst->interval_end = dst->interval_start + reg_size(dst);
+      ctx->live->interval_offset = dst->interval_end;
+   }
+
    ra_update_affinity(reg_file_size(dst), dst, physreg);
    interval->physreg_start = physreg;
    interval->physreg_end = physreg + reg_size(dst);
-   dst->num = ra_physreg_to_num(physreg, dst->flags);
    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
    d("insert dst %u physreg %u", dst->name, physreg);
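The new helper above treats a collect as non-trivial when any source register is not already sitting at its destination slot, since only then would register allocation have to introduce moves. A standalone sketch of that check with plain integers (illustration only, not the ir3 types):

#include <stdbool.h>
#include <stddef.h>

/* A collect is non-trivial when source i does not already live at
 * dst_base + i, i.e. copies would be needed to gather the vector. */
static bool collect_is_nontrivial(unsigned dst_base, const unsigned *src_regs,
                                  size_t src_count)
{
   for (size_t i = 0; i < src_count; i++) {
      if (src_regs[i] != dst_base + i)
         return true;
   }
   return false;
}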
@@ -2514,6 +2514,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk,
                                         &cmd_buffer->descriptors[i].push_set.layout->vk);
       }
+      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->descriptors[i].push_set.mapped_ptr);
       memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
       cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
       cmd_buffer->descriptors[i].max_sets_bound = 0;
@@ -4630,8 +4631,8 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                       const struct tu_image_view *, pass->attachment_count);
    vk_multialloc_add(&ma, &cmd->state.clear_values, VkClearValue,
                      pRenderPassBegin->clearValueCount);
-   if (!vk_multialloc_alloc(&ma, &cmd->vk.pool->alloc,
-                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
+   if (ma.size && !vk_multialloc_alloc(&ma, &cmd->vk.pool->alloc,
+                                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
       vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
       return;
    }
@@ -61,6 +61,13 @@ etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned
    ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
 }
 
+struct etna_core_npu_info *
+etna_ml_get_core_info(struct etna_context *context) {
+   struct etna_screen *screen = context->screen;
+   struct etna_core_info *info = etna_gpu_get_core_info(screen->npu);
+   return &info->npu;
+}
+
 static bool
 needs_reshuffle(const struct pipe_ml_operation *poperation)
 {
@@ -237,7 +244,7 @@ etna_ml_subgraph_create(struct pipe_context *pcontext,
                         unsigned count)
 {
    struct etna_context *ctx = etna_context(pcontext);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    struct etna_ml_subgraph *subgraph;
    struct list_head operations;
    unsigned tensor_count;
@@ -358,7 +365,7 @@ void
 etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psubgraph, struct pipe_tensor *input)
 {
    struct etna_context *ctx = etna_context(pctx);
-   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
    struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
    struct etna_cmd_stream *stream = ctx->stream;
    static bool is_initialized = false;
@@ -7,7 +7,8 @@
 #define H_ETNA_ML
 
 #include "pipe/p_state.h"
+#include "util/u_inlines.h"
 #include "util/u_dynarray.h"
 #include "etnaviv_context.h"
 
 #define MAX_CONFIG_BOS 4
@@ -94,6 +95,8 @@ unsigned etna_ml_allocate_tensor(struct etna_ml_subgraph *subgraph);
 struct pipe_resource *etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx);
 unsigned etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx);
 
+struct etna_core_npu_info *etna_ml_get_core_info(struct etna_context *context);
+
 struct pipe_ml_subgraph *
 etna_ml_subgraph_create(struct pipe_context *context,
                         const struct pipe_ml_operation *operations,
@@ -515,8 +515,8 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
 static unsigned
 calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
 {
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
-   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
    unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;
@@ -590,8 +590,8 @@ calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *inp
 static unsigned
 calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
 {
-   unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
-   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
+   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
    unsigned input_width = operation->input_width;
    unsigned input_height = operation->input_height;
    unsigned input_channels = operation->input_channels;
@@ -639,9 +639,9 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned nn_core_version = ctx->screen->specs.nn_core_version;
-   unsigned oc_sram_size = ctx->screen->info->npu.on_chip_sram_size;
+   unsigned oc_sram_size = etna_ml_get_core_info(ctx)->on_chip_sram_size;
    struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                     sizeof(struct etna_nn_params),
                                     DRM_ETNA_GEM_CACHE_WC);
@@ -967,7 +967,7 @@ static unsigned
 write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
    struct pipe_context *pctx = subgraph->base.context;
-   unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -1047,7 +1047,7 @@ static unsigned
 write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
    struct pipe_context *pctx = subgraph->base.context;
-   unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -1134,7 +1134,7 @@ static unsigned
 write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
    struct pipe_context *pctx = subgraph->base.context;
-   unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
@@ -1221,7 +1221,7 @@ calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_op
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned header_size = ALIGN(nn_core_count * 4, 64);
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
@@ -1245,8 +1245,8 @@ calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operatio
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
-   unsigned max_zrl_bits = ctx->screen->info->npu.nn_zrl_bits;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned max_zrl_bits = etna_ml_get_core_info(ctx)->nn_zrl_bits;
    unsigned header_size = ALIGN(nn_core_count * 4, 64);
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
@@ -1298,7 +1298,7 @@ create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_oper
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned header_size = ALIGN(nn_core_count * 4, 64);
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
@@ -394,7 +394,7 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
                         unsigned tp_core, unsigned tp_cores_used)
 {
    struct etna_context *ctx = etna_context(subgraph->base.context);
-   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
    struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                     sizeof(struct etna_tp_params),
                                     DRM_ETNA_GEM_CACHE_WC);
@@ -730,7 +730,7 @@ etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
       instruction->configs[0] = create_detranspose_config(subgraph, operation);
       break;
    case ETNA_ML_TP_RESHUFFLE: {
-      unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+      unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
       unsigned tp_cores_used;
 
       tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;
@@ -756,7 +756,7 @@ etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                           unsigned idx)
 {
    struct etna_context *ctx = etna_context(subgraph->base.context);
-   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
    struct etna_cmd_stream *stream = ctx->stream;
    bool more_than_one_tp_job = operation->configs[1] != NULL;
    bool parallel = DBG_ENABLED(ETNA_DBG_NPU_PARALLEL);
@@ -141,11 +141,9 @@ nvc0_resource_from_user_memory(struct pipe_screen *pipe,
                                const struct pipe_resource *templ,
                                void *user_memory)
 {
-   ASSERTED struct nouveau_screen *screen = nouveau_screen(pipe);
-
-   assert(screen->has_svm);
-   assert(templ->target == PIPE_BUFFER);
-
+   struct nouveau_screen *screen = nouveau_screen(pipe);
+   if (!screen->has_svm || templ->target != PIPE_BUFFER)
+      return NULL;
    return nouveau_buffer_create_from_user(pipe, templ, user_memory);
 }
 
@@ -649,6 +649,7 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
                         V3D_DIRTY_BLEND |
                         V3D_DIRTY_FRAMEBUFFER |
                         V3D_DIRTY_ZSA |
+                        V3D_DIRTY_OQ |
                         V3D_DIRTY_RASTERIZER |
                         V3D_DIRTY_SAMPLE_STATE |
                         V3D_DIRTY_FRAGTEX |
@@ -677,6 +678,10 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
         }
 
         key->swap_color_rb = v3d->swap_color_rb;
+        key->can_earlyz_with_discard = s->info.fs.uses_discard &&
+                (!v3d->zsa || !job->zsbuf || !v3d->zsa->base.depth_enabled ||
+                 !v3d->zsa->base.depth_writemask) &&
+                !(v3d->active_queries && v3d->current_oq);
 
         for (int i = 0; i < v3d->framebuffer.nr_cbufs; i++) {
                 struct pipe_surface *cbuf = v3d->framebuffer.cbufs[i];
@@ -548,9 +548,27 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
 
    assert(dest.nr < XE2_MAX_GRF);
 
-   if (devinfo->ver >= 10)
-      assert(!(src0.file == IMM &&
-               src2.file == IMM));
+   if (devinfo->ver <= 9) {
+      assert(src0.file != IMM && src2.file != IMM);
+   } else if (devinfo->ver <= 11) {
+      /* On Ice Lake, BFE and CSEL cannot have any immediate sources. */
+      assert((opcode != BRW_OPCODE_BFE && opcode != BRW_OPCODE_CSEL) ||
+             (src0.file != IMM && src2.file != IMM));
+
+      /* On Ice Lake, DP4A and MAD can only have one immediate source. */
+      assert((opcode != BRW_OPCODE_DP4A && opcode != BRW_OPCODE_MAD) ||
+             !(src0.file == IMM && src2.file == IMM));
+   } else {
+      /* Having two immediate sources is allowed, but this should have been
+       * converted to a regular ADD by brw_fs_opt_algebraic.
+       */
+      assert(opcode == BRW_OPCODE_ADD3 ||
+             !(src0.file == IMM && src2.file == IMM));
+   }
+
+   /* BFI2 cannot have any immediate sources on any platform. */
+   assert(opcode != BRW_OPCODE_BFI2 ||
+          (src0.file != IMM && src2.file != IMM));
 
    assert(src0.file == IMM || src0.nr < XE2_MAX_GRF);
    assert(src1.file != IMM && src1.nr < XE2_MAX_GRF);
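Restating the immediate-source rules the new assertions encode, as a standalone predicate (a sketch for illustration, not the brw encoder itself): no immediates at all before Gfx10; on Gfx10/Gfx11, BFE and CSEL take none and DP4A/MAD at most one; on newer parts two immediates are only expected for ADD3; and BFI2 never takes immediates.

#include <stdbool.h>

static bool alu3_imm_srcs_allowed(int gfx_ver, bool is_bfe_or_csel,
                                  bool is_dp4a_or_mad, bool is_add3,
                                  bool is_bfi2, bool src0_imm, bool src2_imm)
{
   if (is_bfi2 && (src0_imm || src2_imm))
      return false;                      /* BFI2: no immediates anywhere */
   if (gfx_ver <= 9)
      return !src0_imm && !src2_imm;     /* no immediate sources at all */
   if (gfx_ver <= 11) {
      if (is_bfe_or_csel && (src0_imm || src2_imm))
         return false;                   /* BFE/CSEL: no immediates */
      return !(is_dp4a_or_mad && src0_imm && src2_imm); /* at most one */
   }
   return is_add3 || !(src0_imm && src2_imm); /* two immediates only for ADD3 */
}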
@@ -825,9 +825,8 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
     * destination of the copy, and simply replacing the sources would give a
     * program with different semantics.
     */
-   if ((brw_type_size_bits(entry->dst.type) < brw_type_size_bits(inst->src[arg].type) ||
-        entry->is_partial_write) &&
-       inst->opcode != BRW_OPCODE_MOV) {
+   if (brw_type_size_bits(entry->dst.type) < brw_type_size_bits(inst->src[arg].type) ||
+       (entry->is_partial_write && inst->opcode != BRW_OPCODE_MOV)) {
       return false;
    }
 
@@ -1506,8 +1505,7 @@ try_copy_propagate_def(const brw_compiler *compiler,
     * destination of the copy, and simply replacing the sources would give a
     * program with different semantics.
     */
-   if (inst->opcode != BRW_OPCODE_MOV &&
-       brw_type_size_bits(def->dst.type) <
+   if (brw_type_size_bits(def->dst.type) <
        brw_type_size_bits(inst->src[arg].type))
       return false;
 
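The condition rewrite above changes which copies are rejected: previously a narrowing copy slipped through whenever the consuming instruction was a MOV, now narrowing always rejects and only the partial-write case is forgiven for MOVs. A small boolean sketch of the two forms (illustrative only):

#include <stdbool.h>

/* Old form: (narrower || partial) && !is_mov */
static bool reject_old(bool narrower, bool partial, bool is_mov)
{
   return (narrower || partial) && !is_mov;
}

/* New form: narrower || (partial && !is_mov) */
static bool reject_new(bool narrower, bool partial, bool is_mov)
{
   return narrower || (partial && !is_mov);
}

/* They differ for a narrowing copy feeding a MOV:
 * reject_old(true, false, true) == false, reject_new(true, false, true) == true. */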
|
@@ -475,6 +475,19 @@ brw_fs_opt_cse_defs(fs_visitor &s)
|
||||
assert(ops_must_match);
|
||||
}
|
||||
|
||||
/* Some later instruction could depend on the flags written by
|
||||
* this instruction. It can only be removed if the previous
|
||||
* instruction that write the flags is identical.
|
||||
*/
|
||||
if (inst->flags_written(devinfo)) {
|
||||
bool ignored;
|
||||
|
||||
if (last_flag_write == NULL ||
|
||||
!instructions_match(last_flag_write, inst, &ignored)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
progress = true;
|
||||
need_remaps = true;
|
||||
remap_table[inst->dst.nr] =
|
||||
|
@@ -159,7 +159,7 @@ get_ray_query_shadow_addr(nir_builder *b,
                nir_imul(
                   b,
                   brw_load_btd_dss_id(b),
-                  brw_nir_rt_load_num_simd_lanes_per_dss(b, state->devinfo)),
+                  state->globals.num_dss_rt_stacks),
                brw_nir_rt_sync_stack_id(b)),
          BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
 
@@ -232,7 +232,8 @@ lower_ray_query_intrinsic(nir_builder *b,
    nir_def *shadow_stack_addr =
       get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
    nir_def *hw_stack_addr =
-      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr, state->devinfo);
+      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
+                                 state->globals.num_dss_rt_stacks);
    nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
 
    switch (intrin->intrinsic) {
@@ -74,15 +74,6 @@ brw_load_btd_dss_id(nir_builder *b)
    return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_DSS);
 }
 
-static inline nir_def *
-brw_nir_rt_load_num_simd_lanes_per_dss(nir_builder *b,
-                                       const struct intel_device_info *devinfo)
-{
-   return nir_imm_int(b, devinfo->num_thread_per_eu *
-                         devinfo->max_eus_per_subslice *
-                         16 /* The RT computation is based off SIMD16 */);
-}
-
 static inline nir_def *
 brw_load_eu_thread_simd(nir_builder *b)
 {
@@ -187,23 +178,27 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           const struct intel_device_info *devinfo)
+                           nir_def *num_dss_rt_stacks)
 {
-   /* For Ray queries (Synchronous Ray Tracing), the formula is similar but
-    * goes down from rtMemBasePtr :
+   /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
+    *    For Ray queries (Synchronous Ray Tracing), the formula is similar but
+    *    goes down from rtMemBasePtr :
     *
-    *  syncBase  = RTDispatchGlobals.rtMemBasePtr
-    *            - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
-    *            * syncStackSize
+    *    syncBase  = RTDispatchGlobals.rtMemBasePtr
+    *              - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
+    *              * syncStackSize
     *
-    * We assume that we can calculate a 32-bit offset first and then add it
-    * to the 64-bit base address at the end.
+    *    We assume that we can calculate a 32-bit offset first and then add it
+    *    to the 64-bit base address at the end.
+    *
+    * However, on HSD 14020275151 it's clarified that the HW uses
+    * NUM_SYNC_STACKID_PER_DSS instead.
    */
    nir_def *offset32 =
       nir_imul(b,
               nir_iadd(b,
                        nir_imul(b, brw_load_btd_dss_id(b),
-                                brw_nir_rt_load_num_simd_lanes_per_dss(b, devinfo)),
+                                num_dss_rt_stacks),
                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
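The quoted Bspec formula places each synchronous ray-query stack below rtMemBasePtr. A scalar sketch of the same arithmetic, useful for checking the address math on the CPU (assumed helper names, not driver code; the per-DSS stack count stands in for the NUM_SYNC_STACKID_PER_DSS value mentioned in the HSD note):

#include <stdint.h>

static uint64_t sync_stack_base(uint64_t rt_mem_base_ptr, uint32_t dss_id,
                                uint32_t stacks_per_dss, uint32_t sync_stack_id,
                                uint32_t sync_stack_size)
{
   /* syncBase = rtMemBasePtr - (DSSID * stacks_per_dss + SyncStackID + 1) * syncStackSize */
   uint32_t offset32 = (dss_id * stacks_per_dss + sync_stack_id + 1) * sync_stack_size;
   return rt_mem_base_ptr - offset32;   /* stacks grow downward from the base */
}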
@@ -452,10 +452,8 @@ anv_CopyImageToImageEXT(
             .y = dst_offset_el.y + y_el,
          };
          VkExtent3D extent = {
-            .width = MIN2(extent_el.width - src_offset.x,
-                          tile_width_el),
-            .height = MIN2(extent_el.height - src_offset.y,
-                           tile_height_el),
+            .width = MIN2(extent_el.width - x_el, tile_width_el),
+            .height = MIN2(extent_el.height - y_el, tile_height_el),
             .depth = 1,
          };
 
@@ -57,6 +57,72 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
    va_end(args);
 }
 
+struct anv_descriptor_limits {
+   uint32_t max_ubos;
+   uint32_t max_ssbos;
+   uint32_t max_samplers;
+   uint32_t max_images;
+   uint32_t max_resources;
+};
+
+static void
+get_device_descriptor_limits(const struct anv_physical_device *device,
+                             struct anv_descriptor_limits *limits)
+{
+   memset(limits, 0, sizeof(*limits));
+
+   /* It's a bit hard to exactly map our implementation to the limits
+    * described by Vulkan. The bindless surface handle in the extended message
+    * descriptors is 20 bits on <= Gfx12.0, 26 bits on >= Gfx12.5 and it's an
+    * index into the table of RENDER_SURFACE_STATE structs that starts at
+    * bindless surface base address. On <= Gfx12.0, this means that we can
+    * have at must 1M surface states allocated at any given time. Since most
+    * image views take two descriptors, this means we have a limit of about
+    * 500K image views. On >= Gfx12.5, we do not need 2 surfaces per
+    * descriptors and we have 33M+ descriptors (we have a 2GB limit, due to
+    * overlapping heaps for workarounds, but HW can do 4GB).
+    *
+    * However, on <= Gfx12.0, since we allocate surface states at
+    * vkCreateImageView time, this means our limit is actually something on
+    * the order of 500K image views allocated at any time. The actual limit
+    * describe by Vulkan, on the other hand, is a limit of how many you can
+    * have in a descriptor set. Assuming anyone using 1M descriptors will be
+    * using the same image view twice a bunch of times (or a bunch of null
+    * descriptors), we can safely advertise a larger limit here.
+    *
+    * Here we use the size of the heap in which the descriptors are stored and
+    * divide by the size of the descriptor to get a limit value.
+    */
+   const uint64_t descriptor_heap_size =
+      device->indirect_descriptors ?
+      device->va.indirect_descriptor_pool.size :
+      device->va.bindless_surface_state_pool.size;;
+
+   const uint32_t buffer_descriptor_size =
+      device->indirect_descriptors ?
+      sizeof(struct anv_address_range_descriptor) :
+      ANV_SURFACE_STATE_SIZE;
+   const uint32_t image_descriptor_size =
+      device->indirect_descriptors ?
+      sizeof(struct anv_address_range_descriptor) :
+      ANV_SURFACE_STATE_SIZE;
+   const uint32_t sampler_descriptor_size =
+      device->indirect_descriptors ?
+      sizeof(struct anv_sampled_image_descriptor) :
+      ANV_SAMPLER_STATE_SIZE;
+
+   limits->max_ubos = descriptor_heap_size / buffer_descriptor_size;
+   limits->max_ssbos = descriptor_heap_size / buffer_descriptor_size;
+   limits->max_images = descriptor_heap_size / image_descriptor_size;
+   limits->max_samplers = descriptor_heap_size / sampler_descriptor_size;
+
+   limits->max_resources = UINT32_MAX;
+   limits->max_resources = MIN2(limits->max_resources, limits->max_ubos);
+   limits->max_resources = MIN2(limits->max_resources, limits->max_ssbos);
+   limits->max_resources = MIN2(limits->max_resources, limits->max_images);
+   limits->max_resources = MIN2(limits->max_resources, limits->max_samplers);
+}
+
 static void
 get_device_extensions(const struct anv_physical_device *device,
                       struct vk_device_extension_table *ext)
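The helper above derives every advertised limit from one division, heap size over descriptor size, then takes the minimum across descriptor types for the combined resource limit. A worked example with made-up numbers (the real sizes come from the device):

#include <stdint.h>

static uint32_t example_descriptor_limits(void)
{
   const uint64_t heap_size = 1ull << 30;  /* assume a 1 GiB descriptor heap */
   const uint32_t image_desc_size = 64;    /* assumed sizes, for illustration only */
   const uint32_t sampler_desc_size = 32;

   uint64_t max_images = heap_size / image_desc_size;      /* 16777216 */
   uint64_t max_samplers = heap_size / sampler_desc_size;  /* 33554432 */

   /* The combined "resources" limit is the smallest of the per-type limits. */
   return (uint32_t)(max_images < max_samplers ? max_images : max_samplers);
}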
@@ -972,25 +1038,10 @@ get_properties_1_2(const struct anv_physical_device *pdevice,
    p->shaderRoundingModeRTZFloat64 = true;
    p->shaderSignedZeroInfNanPreserveFloat64 = true;
 
-   /* It's a bit hard to exactly map our implementation to the limits
-    * described by Vulkan. The bindless surface handle in the extended
-    * message descriptors is 20 bits and it's an index into the table of
-    * RENDER_SURFACE_STATE structs that starts at bindless surface base
-    * address. This means that we can have at must 1M surface states
-    * allocated at any given time. Since most image views take two
-    * descriptors, this means we have a limit of about 500K image views.
-    *
-    * However, since we allocate surface states at vkCreateImageView time,
-    * this means our limit is actually something on the order of 500K image
-    * views allocated at any time. The actual limit describe by Vulkan, on
-    * the other hand, is a limit of how many you can have in a descriptor set.
-    * Assuming anyone using 1M descriptors will be using the same image view
-    * twice a bunch of times (or a bunch of null descriptors), we can safely
-    * advertise a larger limit here.
-    */
-   const unsigned max_bindless_views =
-      anv_physical_device_bindless_heap_size(pdevice, false) / ANV_SURFACE_STATE_SIZE;
-   p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
+   struct anv_descriptor_limits desc_limits;
+   get_device_descriptor_limits(pdevice, &desc_limits);
+
+   p->maxUpdateAfterBindDescriptorsInAllPools = desc_limits.max_resources;
    p->shaderUniformBufferArrayNonUniformIndexingNative = false;
    p->shaderSampledImageArrayNonUniformIndexingNative = false;
    p->shaderStorageBufferArrayNonUniformIndexingNative = true;
@@ -998,20 +1049,20 @@ get_properties_1_2(const struct anv_physical_device *pdevice,
    p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
    p->robustBufferAccessUpdateAfterBind = true;
    p->quadDivergentImplicitLod = false;
-   p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
-   p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
-   p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
-   p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
-   p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
+   p->maxPerStageDescriptorUpdateAfterBindSamplers = desc_limits.max_samplers;
+   p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = desc_limits.max_ubos;
+   p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = desc_limits.max_ssbos;
+   p->maxPerStageDescriptorUpdateAfterBindSampledImages = desc_limits.max_images;
+   p->maxPerStageDescriptorUpdateAfterBindStorageImages = desc_limits.max_images;
    p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
-   p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
-   p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
-   p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+   p->maxPerStageUpdateAfterBindResources = desc_limits.max_resources;
+   p->maxDescriptorSetUpdateAfterBindSamplers = desc_limits.max_samplers;
+   p->maxDescriptorSetUpdateAfterBindUniformBuffers = desc_limits.max_ubos;
    p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
-   p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
+   p->maxDescriptorSetUpdateAfterBindStorageBuffers = desc_limits.max_ssbos;
    p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
-   p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
-   p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
+   p->maxDescriptorSetUpdateAfterBindSampledImages = desc_limits.max_images;
+   p->maxDescriptorSetUpdateAfterBindStorageImages = desc_limits.max_images;
    p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
 
    /* We support all of the depth resolve modes */
@@ -1125,15 +1176,8 @@ get_properties(const struct anv_physical_device *pdevice,
 
    const struct intel_device_info *devinfo = &pdevice->info;
 
-   const uint32_t max_ssbos = UINT16_MAX;
-   const uint32_t max_textures = UINT16_MAX;
-   const uint32_t max_samplers = UINT16_MAX;
-   const uint32_t max_images = UINT16_MAX;
    const VkDeviceSize max_heap_size = anx_get_physical_device_max_heap_size(pdevice);
 
-   /* Claim a high per-stage limit since we have bindless. */
-   const uint32_t max_per_stage = UINT32_MAX;
-
    const uint32_t max_workgroup_size =
       MIN2(1024, 32 * devinfo->max_cs_workgroup_threads);
 
@@ -1158,6 +1202,9 @@ get_properties(const struct anv_physical_device *pdevice,
    }
 #endif /* DETECT_OS_ANDROID */
 
+   struct anv_descriptor_limits desc_limits;
+   get_device_descriptor_limits(pdevice, &desc_limits);
+
    *props = (struct vk_properties) {
       .apiVersion = ANV_API_VERSION,
       .driverVersion = vk_get_driver_version(),
@@ -1183,20 +1230,20 @@ get_properties(const struct anv_physical_device *pdevice,
      .bufferImageGranularity = 1,
      .sparseAddressSpaceSize = sparse_addr_space_size,
      .maxBoundDescriptorSets = MAX_SETS,
-      .maxPerStageDescriptorSamplers = max_samplers,
-      .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
-      .maxPerStageDescriptorStorageBuffers = max_ssbos,
-      .maxPerStageDescriptorSampledImages = max_textures,
-      .maxPerStageDescriptorStorageImages = max_images,
+      .maxPerStageDescriptorSamplers = desc_limits.max_samplers,
+      .maxPerStageDescriptorUniformBuffers = desc_limits.max_ubos,
+      .maxPerStageDescriptorStorageBuffers = desc_limits.max_ssbos,
+      .maxPerStageDescriptorSampledImages = desc_limits.max_images,
+      .maxPerStageDescriptorStorageImages = desc_limits.max_images,
      .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
-      .maxPerStageResources = max_per_stage,
-      .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
-      .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
+      .maxPerStageResources = desc_limits.max_resources,
+      .maxDescriptorSetSamplers = desc_limits.max_samplers,
+      .maxDescriptorSetUniformBuffers = desc_limits.max_ubos,
      .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
-      .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
+      .maxDescriptorSetStorageBuffers = desc_limits.max_ssbos,
      .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
-      .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
-      .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
+      .maxDescriptorSetSampledImages = desc_limits.max_images,
+      .maxDescriptorSetStorageImages = desc_limits.max_images,
      .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
      .maxVertexInputAttributes = MAX_VES,
      .maxVertexInputBindings = MAX_VBS,
@@ -1227,7 +1274,8 @@ get_properties(const struct anv_physical_device *pdevice,
      .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
      .maxFragmentOutputAttachments = 8,
      .maxFragmentDualSrcAttachments = 1,
-      .maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images,
+      .maxFragmentCombinedOutputResources = MAX_RTS + desc_limits.max_ssbos +
+                                            desc_limits.max_images,
      .maxComputeSharedMemorySize = intel_device_info_get_max_slm_size(&pdevice->info),
      .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
      .maxComputeWorkGroupInvocations = max_workgroup_size,
@@ -3035,7 +3035,7 @@ struct anv_descriptor_pool {
     */
    bool host_only;
 
-   char host_mem[0];
+   alignas(8) char host_mem[0];
 };
 
 bool
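The alignas(8) matters because pool bookkeeping is suballocated straight out of the trailing host_mem storage, so its offset inside the struct must be 8-byte aligned. A minimal sketch showing the effect on layout (hypothetical structs, not the anv ones):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

struct pool_before { unsigned count; char host_mem[]; };
struct pool_after  { unsigned count; alignas(8) char host_mem[]; };

int main(void)
{
   /* Without the specifier the trailing member typically starts at offset 4;
    * with it the compiler pads the struct so it starts at offset 8. */
   printf("before: %zu, after: %zu\n",
          offsetof(struct pool_before, host_mem),
          offsetof(struct pool_after, host_mem));
   return 0;
}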
@@ -3151,7 +3151,7 @@ Converter::visit(nir_tex_instr *insn)
       if (lodIdx != -1 && !target.isMS())
          srcs.push_back(getSrc(&insn->src[lodIdx].src, 0));
       else if (op == OP_TXQ)
-         srcs.push_back(zero); // TXQ always needs an LOD
+         srcs.push_back(loadImm(NULL, 0)); // TXQ always needs an LOD
       else if (op == OP_TXF)
         lz = true;
      if (msIdx != -1)
@@ -643,12 +643,8 @@ nvk_cmd_dirty_cbufs_for_descriptors(struct nvk_cmd_buffer *cmd,
 
       case NVK_CBUF_TYPE_DESC_SET:
      case NVK_CBUF_TYPE_UBO_DESC:
-         if (cbuf->desc_set >= sets_start && cbuf->desc_set < sets_end)
-            group->dirty |= BITFIELD_BIT(i);
-         break;
-
      case NVK_CBUF_TYPE_DYNAMIC_UBO:
-         if (cbuf->dynamic_idx >= dyn_start && cbuf->dynamic_idx < dyn_end)
+         if (cbuf->desc_set >= sets_start && cbuf->desc_set < sets_end)
            group->dirty |= BITFIELD_BIT(i);
         break;
 
@@ -749,7 +745,7 @@ nvk_bind_descriptor_sets(struct nvk_cmd_buffer *cmd,
   assert(next_dyn_offset <= info->dynamicOffsetCount);
 
   nvk_descriptor_state_set_root_array(cmd, desc, dynamic_buffers,
-                                       dyn_buffer_start, dyn_buffer_end,
+                                       dyn_buffer_start, dyn_buffer_end - dyn_buffer_start,
                                       &dynamic_buffers[dyn_buffer_start]);
 
   /* We need to set everything above first_set because later calls to
@@ -103,8 +103,9 @@ struct nvk_descriptor_state {
      const struct nvk_root_descriptor_table *root = \
         (const struct nvk_root_descriptor_table *)(desc)->root; \
      unsigned _start = start; \
-      assert(_start + count <= ARRAY_SIZE(root->member)); \
-      for (unsigned i = 0; i < count; i++) \
+      unsigned _count = count; \
+      assert(_start + _count <= ARRAY_SIZE(root->member)); \
+      for (unsigned i = 0; i < _count; i++) \
         (dst)[i] = root->member[i + _start]; \
   } while (0)
 
@@ -125,13 +126,14 @@ struct nvk_descriptor_state {
      struct nvk_root_descriptor_table *root = \
         (struct nvk_root_descriptor_table *)_desc->root; \
      unsigned _start = start; \
-      assert(_start + count <= ARRAY_SIZE(root->member)); \
-      for (unsigned i = 0; i < count; i++) \
+      unsigned _count = count; \
+      assert(_start + _count <= ARRAY_SIZE(root->member)); \
+      for (unsigned i = 0; i < _count; i++) \
         root->member[i + _start] = (src)[i]; \
      if (_desc->flush_root != NULL) { \
         size_t offset = (char *)&root->member[_start] - (char *)root; \
         _desc->flush_root((cmd), _desc, offset, \
-                           count * sizeof(root->member[0])); \
+                           _count * sizeof(root->member[0])); \
      } \
   } while (0)
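The macro change above is classic macro hygiene: count is captured into a local _count so the argument is evaluated exactly once, even if the caller passes an expression such as dyn_buffer_end - dyn_buffer_start or something with side effects, and so the loop bound and the flush size cannot disagree. A self-contained illustration of the pitfall (not the nvk macro itself):

#include <stdio.h>

#define SUM_BAD(dst, arr, count)                 \
   do {                                          \
      for (unsigned i = 0; i < (count); i++)     \
         (dst) += (arr)[i];                      \
   } while (0)

#define SUM_GOOD(dst, arr, count)                \
   do {                                          \
      unsigned _count = (count);                 \
      for (unsigned i = 0; i < _count; i++)      \
         (dst) += (arr)[i];                      \
   } while (0)

int main(void)
{
   int data[4] = {1, 2, 3, 4};
   unsigned n;
   int bad = 0, good = 0;

   n = 4;
   SUM_BAD(bad, data, n--);   /* n-- re-evaluated every iteration: sums only part of the array */
   n = 4;
   SUM_GOOD(good, data, n--); /* evaluated once: sums all four elements */

   printf("bad=%d good=%d\n", bad, good);  /* prints bad=3 good=10 */
   return 0;
}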
@@ -71,6 +71,10 @@ panvk_per_arch(cmd_meta_compute_end)(
 
    cmdbuf->state.compute.shader = save_ctx->cs.shader;
    cmdbuf->state.compute.cs.desc = save_ctx->cs.desc;
+
+#if PAN_ARCH >= 9
+   cmdbuf->state.compute.cs.desc.res_table = 0;
+#endif
 }
 
 void
@@ -136,6 +140,9 @@ panvk_per_arch(cmd_meta_gfx_end)(
    cmdbuf->state.gfx.vs.attribs = 0;
    cmdbuf->state.gfx.vs.attrib_bufs = 0;
    cmdbuf->state.gfx.fs.rsd = 0;
+#else
+   cmdbuf->state.gfx.fs.desc.res_table = 0;
+   cmdbuf->state.gfx.vs.desc.res_table = 0;
 #endif
 
    cmdbuf->vk.dynamic_graphics_state = save_ctx->dyn_state.all;
@@ -25,11 +25,12 @@
 #define _UTIL_PERFETTO_H
 
 #include "util/u_atomic.h"
+#include "util/detect_os.h"
 
 // On Unix, pass a clockid_t to designate which clock was used to gather the timestamp
 // On Windows, this paramter is ignored, and it's expected that `timestamp` comes from QueryPerformanceCounter
-#ifndef _WIN32
-#include <sys/types.h>
+#if DETECT_OS_POSIX
+#include <time.h>
 typedef clockid_t perfetto_clock_id;
 #else
 typedef int32_t perfetto_clock_id;
@@ -1149,41 +1149,47 @@ wsi_wl_surface_get_support(VkIcdSurfaceBase *surface,
    return VK_SUCCESS;
 }
 
+/* For true mailbox mode, we need at least 4 images:
+ * 1) One to scan out from
+ * 2) One to have queued for scan-out
+ * 3) One to be currently held by the Wayland compositor
+ * 4) One to render to
+ */
+#define WSI_WL_BUMPED_NUM_IMAGES 4
+
+/* Catch-all. 3 images is a sound default for everything except MAILBOX. */
+#define WSI_WL_DEFAULT_NUM_IMAGES 3
+
 static uint32_t
 wsi_wl_surface_get_min_image_count(struct wsi_wl_display *display,
                                    const VkSurfacePresentModeEXT *present_mode)
 {
+   /* With legacy frame callback mechanism, report 4 images by default, unless
+    * EXT_surface_maintenance1 query is used to ask explicitly for FIFO. */
    if (present_mode && (present_mode->presentMode == VK_PRESENT_MODE_FIFO_KHR ||
                         present_mode->presentMode == VK_PRESENT_MODE_FIFO_RELAXED_KHR)) {
-      if (display->fifo_manager) {
-         /* When FIFO protocol is supported, applications will no longer block
-          * in QueuePresentKHR due to frame callback, so returning 4 images
-          * for a FIFO swapchain is problematic due to excessive latency. This
-          * latency can only be limited through means of presentWait which few
-          * applications use.
-          * 2 images are enough for forward progress, but 3 is used here
-          * because 2 could result in waiting for the compositor to remove an
-          * old image from scanout when we'd like to be rendering.
-          */
-         return 3;
-      }
-
       /* If we receive a FIFO present mode, only 2 images is required for forward progress.
        * Performance with 2 images will be questionable, but we only allow it for applications
        * using the new API, so we don't risk breaking any existing apps this way.
        * Other ICDs expose 2 images here already. */
       return 2;
-   } else {
-      /* For true mailbox mode, we need at least 4 images:
-       * 1) One to scan out from
-       * 2) One to have queued for scan-out
-       * 3) One to be currently held by the Wayland compositor
-       * 4) One to render to
-       */
-      return 4;
+   }
+
+   if (present_mode) {
+      return present_mode->presentMode == VK_PRESENT_MODE_MAILBOX_KHR ?
+             WSI_WL_BUMPED_NUM_IMAGES : WSI_WL_DEFAULT_NUM_IMAGES;
    }
+
+   /* If explicit present_mode is not being queried, we need to provide a safe "catch-all"
+    * which can work for any presentation mode. Implementations are allowed to bump the minImageCount
+    * on swapchain creation, so this limit should be the lowest value which can guarantee forward progress. */
+
+   /* When FIFO protocol is not supported, we always returned 4 here,
+    * despite it going against the spirit of minImageCount in the specification.
+    * To avoid any unforeseen breakage, just keep using the same values we always have.
+    * In this path, we also never consider bumping the image count in minImageCount in swapchain creation time. */
+
+   /* When FIFO protocol is supported, applications will no longer block
+    * in QueuePresentKHR due to frame callback, so returning 4 images
+    * for a FIFO swapchain is deeply problematic due to excessive latency.
+    * This latency can only be limited through means of presentWait which few applications use, and we cannot
+    * mandate that shipping applications are rewritten to avoid a regression.
+    * 2 images are enough for forward progress in FIFO, but 3 is used here as a pragmatic decision
+    * because 2 could result in waiting for the compositor to remove an
+    * old image from scanout when we'd like to be rendering,
+    * and we don't want naively written applications to head into poor performance territory by default.
+    * X11 backend has very similar logic and rationale here.
+    */
+   return display->fifo_manager ? WSI_WL_DEFAULT_NUM_IMAGES : WSI_WL_BUMPED_NUM_IMAGES;
 }
 
 static VkResult
@@ -2760,9 +2766,10 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
       old_chain->retired = true;
    }
 
-   int num_images = pCreateInfo->minImageCount;
-
-   size_t size = sizeof(*chain) + num_images * sizeof(chain->images[0]);
+   /* We need to allocate the chain handle early, since display initialization code relies on it.
+    * We do not know the actual image count until we have initialized the display handle,
+    * so allocate conservatively in case we need to bump the image count. */
+   size_t size = sizeof(*chain) + MAX2(WSI_WL_BUMPED_NUM_IMAGES, pCreateInfo->minImageCount) * sizeof(chain->images[0]);
    chain = vk_zalloc(pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (chain == NULL)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -2802,6 +2809,31 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
    if (result != VK_SUCCESS)
       goto fail;
 
+   uint32_t num_images = pCreateInfo->minImageCount;
+
+   /* If app provides a present mode list from EXT_swapchain_maintenance1,
+    * we don't know which present mode will be used.
+    * Application is assumed to be well-behaved and be spec-compliant.
+    * It needs to query all per-present mode minImageCounts individually and use the max() of those modes,
+    * so there should never be any need to bump image counts. */
+   bool uses_present_mode_group = vk_find_struct_const(
+      pCreateInfo->pNext, SWAPCHAIN_PRESENT_MODES_CREATE_INFO_EXT) != NULL;
+
+   /* If FIFO manager is not used, minImageCount is already the bumped value for reasons outlined in
+    * wsi_wl_surface_get_min_image_count(), so skip any attempt to bump the counts. */
+   if (wsi_wl_surface->display->fifo_manager && !uses_present_mode_group) {
+      /* With proper FIFO, we return a lower minImageCount to make FIFO viable without requiring the use of KHR_present_wait.
+       * The image count for MAILBOX should be bumped for performance reasons in this case.
+       * This matches strategy for X11. */
+      const VkSurfacePresentModeEXT mode =
+         { VK_STRUCTURE_TYPE_SURFACE_PRESENT_MODE_EXT, NULL, pCreateInfo->presentMode };
+
+      uint32_t min_images = wsi_wl_surface_get_min_image_count(wsi_wl_surface->display, &mode);
+      bool requires_image_count_bump = min_images == WSI_WL_BUMPED_NUM_IMAGES;
+      if (requires_image_count_bump)
+         num_images = MAX2(min_images, num_images);
+   }
+
    VkPresentModeKHR present_mode = wsi_swapchain_get_present_mode(wsi_device, pCreateInfo);
    if (present_mode == VK_PRESENT_MODE_IMMEDIATE_KHR) {
       chain->tearing_control =
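Putting the two new pieces together, the swapchain only bumps the application's requested image count when the FIFO-protocol path reported the lower default and the application did not already use the EXT_swapchain_maintenance1 present-mode group. A compact sketch of that decision as I read the added code (invented names, illustration only):

#include <stdbool.h>

#define BUMPED_NUM_IMAGES  4u
#define DEFAULT_NUM_IMAGES 3u

static unsigned choose_image_count(unsigned requested_min, unsigned surface_min,
                                   bool has_fifo_protocol, bool uses_present_mode_group)
{
   /* Without the FIFO protocol the surface already reports the bumped value,
    * and a spec-compliant present-mode group query needs no help either. */
   if (!has_fifo_protocol || uses_present_mode_group)
      return requested_min;

   /* Only bump when the per-mode minimum is the MAILBOX-style bumped count. */
   if (surface_min == BUMPED_NUM_IMAGES && requested_min < surface_min)
      return surface_min;
   return requested_min;
}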