VERSION: bump for 25.0.0-rc2

FreeBSD: Disable support for "-mtls-dialect" for FreeBSD
Clang 19 supports "-mtls-dialect=" but FreeBSD does not support "-mtls-dialect=gnu2". Skip auto-detection for FreeBSD. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31931> (cherry picked from commit f9fc7392fa)
2025-02-05 18:42:06 +01:00 · 2025-02-05 16:09:27 +01:00 · 2025-02-05 16:08:35 +01:00 · 2025-02-05 16:08:35 +01:00 · 2025-02-05 16:08:34 +01:00 · 2025-02-05 16:08:34 +01:00
63 changed files with 3370 additions and 2262 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -258,7 +258,7 @@ include:

 .ci-deqp-artifacts:
  artifacts:
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    when: always
    untracked: false
    paths:
--- a/.gitlab-ci/build/gitlab-ci.yml
+++ b/.gitlab-ci/build/gitlab-ci.yml
@@ -16,7 +16,7 @@
  # We don't want to download any previous job's artifacts
  dependencies: []
  artifacts:
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    when: always
    paths:
      - _build/meson-logs/*.txt
@@ -238,7 +238,6 @@ debian-build-testing:
  extends: .meson-build
  stage: build-for-tests
  variables:
-    BUILDTYPE: debug
    UNWIND: "enabled"
    DRI_LOADERS: >
      -D glx=dri
@@ -255,7 +254,7 @@ debian-build-testing:
      -D gallium-rusticl=false
    GALLIUM_DRIVERS: "i915,iris,nouveau,r300,r600,freedreno,llvmpipe,softpipe,svga,v3d,vc4,virgl,etnaviv,panfrost,lima,zink,d3d12,asahi,crocus"
    VULKAN_DRIVERS: "intel_hasvk,imagination-experimental,microsoft-experimental,nouveau,swrast"
-    BUILD_TYPE: "debugoptimized"
+    BUILDTYPE: "debugoptimized"
    EXTRA_OPTION: >
      -D spirv-to-dxil=true
      -D osmesa=true
--- a/.gitlab-ci/test/gitlab-ci.yml
+++ b/.gitlab-ci/test/gitlab-ci.yml
@@ -143,7 +143,7 @@ yaml-toml-shell-py-test:
 .piglit-traces-test:
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    reports:
      junit: results/junit.xml
    paths:
@@ -177,7 +177,7 @@ yaml-toml-shell-py-test:
    - ./install/fossilize-runner.sh
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results/

@@ -205,7 +205,7 @@ yaml-toml-shell-py-test:
    BM_ROOTFS: /rootfs-${DEBIAN_ARCH}
  artifacts:
    when: always
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results/
      - serial*.txt
@@ -399,7 +399,7 @@ yaml-toml-shell-py-test:

  artifacts:
    when: always
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results
    reports:
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/2
+++ b/2
@@ -1 +1 @@
-25.0.0-devel
+25.0.0-rc2
--- a/meson.build
+++ b/meson.build
@@ -525,6 +525,8 @@ if not have_mtls_dialect
  # cross-compiling, but because this is just an optimization we can skip it
  if meson.is_cross_build() and not meson.can_run_host_binaries()
    warning('cannot auto-detect -mtls-dialect when cross-compiling, using compiler default')
+  elif host_machine.system() == 'freebsd'
+    warning('cannot use -mtls-dialect for FreeBSD, using compiler default')
  else
    # The way to specify the TLSDESC dialect is architecture-specific.
    # We probe both because there is not a fallback guaranteed to work for all
--- a/src/amd/ci/gitlab-ci-inc.yml
+++ b/src/amd/ci/gitlab-ci-inc.yml
@@ -191,6 +191,9 @@
    HWCI_KERNEL_MODULES: amdgpu
    KERNEL_IMAGE_TYPE: ""
    RUNNER_TAG: mesa-ci-x86-64-lava-asus-CM1400CXA-dalboz
+    # Force fixed 6.6 kernel, amdgpu doesn't revcover from GPU resets on 6.13
+    # https://gitlab.freedesktop.org/drm/amd/-/issues/3861
+    EXTERNAL_KERNEL_TAG: "v6.6.21-mesa-f8ea"

 # Status: https://lava.collabora.dev/scheduler/device_type/lenovo-TPad-C13-Yoga-zork
 .lava-lenovo-TPad-C13-Yoga-zork:x86_64:
@@ -204,6 +207,9 @@
    HWCI_KERNEL_MODULES: amdgpu
    KERNEL_IMAGE_TYPE: ""
    RUNNER_TAG: mesa-ci-x86-64-lava-lenovo-TPad-C13-Yoga-zork
+    # Force fixed 6.6 kernel, amdgpu doesn't revcover from GPU resets on 6.13
+    # https://gitlab.freedesktop.org/drm/amd/-/issues/3861
+    EXTERNAL_KERNEL_TAG: "v6.6.21-mesa-f8ea"

 # Status: https://lava.collabora.dev/scheduler/device_type/hp-x360-14a-cb0001xx-zork
 .lava-hp-x360-14a-cb0001xx-zork:x86_64:
@@ -217,6 +223,9 @@
    HWCI_KERNEL_MODULES: amdgpu
    KERNEL_IMAGE_TYPE: ""
    RUNNER_TAG: mesa-ci-x86-64-lava-hp-x360-14a-cb0001xx-zork
+    # Force fixed 6.6 kernel, amdgpu doesn't revcover from GPU resets on 6.13
+    # https://gitlab.freedesktop.org/drm/amd/-/issues/3861
+    EXTERNAL_KERNEL_TAG: "v6.6.21-mesa-f8ea"


 ############### LAVA
--- a/src/asahi/vulkan/hk_device.c
+++ b/src/asahi/vulkan/hk_device.c
@@ -288,7 +288,7 @@ hk_check_status(struct vk_device *device)
 static VkResult
 hk_get_timestamp(struct vk_device *device, uint64_t *timestamp)
 {
-   struct hk_device *dev = container_of(device, struct hk_device, vk);
+   // struct hk_device *dev = container_of(device, struct hk_device, vk);
   unreachable("todo");
   // *timestamp = agx_get_gpu_timestamp(dev);
   return VK_SUCCESS;
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -2844,6 +2844,7 @@ cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer,

      /* Texel Buffer read */
      if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+                           VK_ACCESS_2_SHADER_READ_BIT |
                           VK_ACCESS_2_MEMORY_READ_BIT)) {
         if (vs_bin_maps->texture_map.num_desc > 0)
            return true;
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -1130,7 +1130,7 @@ builtin_variable_generator::generate_special_vars()
      add_system_value(SYSTEM_VALUE_SUBGROUP_LE_MASK, uvec4_t, "gl_SubgroupLeMask");
      add_system_value(SYSTEM_VALUE_SUBGROUP_LT_MASK, uvec4_t, "gl_SubgroupLtMask");
   }
-   if (state->is_version(300, 300) && state->OVR_multiview_enable){
+   if (state->is_version(130, 300) && state->OVR_multiview_enable) {
      add_system_value(SYSTEM_VALUE_VIEW_INDEX, int_t, GLSL_PRECISION_MEDIUM,
                      "gl_ViewID_OVR");
   }
--- a/src/gallium/auxiliary/driver_ddebug/dd_screen.c
+++ b/src/gallium/auxiliary/driver_ddebug/dd_screen.c
@@ -679,6 +679,9 @@ ddebug_screen_create(struct pipe_screen *screen)
   SCR_INIT(vertex_state_destroy);
   dscreen->base.get_driver_pipe_screen = dd_get_driver_pipe_screen;

+   /* copy all caps */
+   *(struct pipe_caps *)&dscreen->base.caps = screen->caps;
+
 #undef SCR_INIT

   dscreen->screen = screen;
--- a/src/gallium/auxiliary/driver_noop/noop_pipe.c
+++ b/src/gallium/auxiliary/driver_noop/noop_pipe.c
@@ -831,6 +831,9 @@ struct pipe_screen *noop_screen_create(struct pipe_screen *oscreen)
   screen->query_compression_modifiers = noop_query_compression_modifiers;
   screen->get_driver_pipe_screen = noop_get_driver_pipe_screen;

+   /* copy all caps */
+   *(struct pipe_caps *)&screen->caps = oscreen->caps;
+
   slab_create_parent(&noop_screen->pool_transfers,
                      sizeof(struct pipe_transfer), 64);

--- a/src/gallium/drivers/d3d12/ci/gitlab-ci.yml
+++ b/src/gallium/drivers/d3d12/ci/gitlab-ci.yml
@@ -23,7 +23,7 @@
    - . _install/piglit_run.ps1
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - logs/
  variables:
--- a/src/gallium/drivers/i915/ci/gitlab-ci.yml
+++ b/src/gallium/drivers/i915/ci/gitlab-ci.yml
@@ -4,7 +4,7 @@ include:
 # Manual test rules for using g33 in ondracka r300 farm.
 .ondracka-g33-test:
  extends:
-    - .b2c-x86_64-test-gl
+    - .b2c-x86_64-test-gl-manual # use debian-build-testing rather than debian-testing
    - .i915g-manual-rules
  tags:
    - intelgpu:codename:G33
--- a/src/gallium/drivers/panfrost/pan_csf.c
+++ b/src/gallium/drivers/panfrost/pan_csf.c
@@ -1196,7 +1196,8 @@ csf_emit_draw_state(struct panfrost_batch *batch,

         /* Also use per-sample shading if required by the shader
          */
-         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
+         cfg.evaluate_per_sample |=
+            (fs->info.fs.sample_shading && rast->multisample);

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
--- a/src/gallium/drivers/panfrost/pan_fb_preload.c
+++ b/src/gallium/drivers/panfrost/pan_fb_preload.c
@@ -258,8 +258,9 @@ pan_preload_emit_rsd(const struct pan_preload_shader_data *preload_shader,
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
      } else {
-         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_WEAK_EARLY;
-         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_WEAK_EARLY;
+         /* Skipping ATEST requires forcing Z/S */
+         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
+         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
      }

      /* However, while shaders writing Z/S can normally be killed, on v6
@@ -586,6 +587,7 @@ pan_preload_get_shader(struct pan_fb_preload_cache *cache,

   struct panfrost_compile_inputs inputs = {
      .gpu_id = cache->gpu_id,
+      .is_blit = true,
      .no_idvs = true,
   };
   struct util_dynarray binary;
@@ -1187,8 +1189,9 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.blend_count = 0;
      } else {
-         cfg.zs_update_operation = MALI_PIXEL_KILL_WEAK_EARLY;
-         cfg.pixel_kill_operation = MALI_PIXEL_KILL_WEAK_EARLY;
+         /* Skipping ATEST requires forcing Z/S */
+         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
+         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         cfg.blend = blend.gpu;
         cfg.blend_count = bd_count;
--- a/src/gallium/drivers/panfrost/pan_jm.c
+++ b/src/gallium/drivers/panfrost/pan_jm.c
@@ -593,7 +593,8 @@ jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch,

         /* Also use per-sample shading if required by the shader
          */
-         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
+         cfg.evaluate_per_sample |=
+            (fs->info.fs.sample_shading && rast->multisample);

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -1224,7 +1224,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
      }
   }

-   si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, 0, 0);

   /* If we're using a secure context, determine if cs must be secure or not */
   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -107,7 +107,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
                              struct pipe_resource *src, unsigned byte_count,
                              uint64_t remaining_size, bool *is_first, unsigned *packet_flags)
 {
-   si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, 0, 0);

   /* This must be done after need_cs_space. */
   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst),
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -277,7 +277,7 @@ static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
-   si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, 0, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -2005,7 +2005,8 @@ static inline bool util_rast_prim_is_triangles(unsigned prim)
   return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0;
 }

-static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws)
+static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws,
+                                        unsigned extra_dw_per_draw)
 {
   struct radeon_cmdbuf *cs = &ctx->gfx_cs;
   /* Don't count the needed CS space exactly and just use an upper bound.
@@ -2013,7 +2014,8 @@ static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_dra
    * Also reserve space for stopping queries at the end of IB, because
    * the number of active queries is unlimited in theory.
    */
-   unsigned reserve_dw = 2048 + ctx->num_cs_dw_queries_suspend + num_draws * 10;
+   unsigned reserve_dw = 2048 + ctx->num_cs_dw_queries_suspend +
+      num_draws * (10 + extra_dw_per_draw);

   if (!ctx->ws->cs_check_space(cs, reserve_dw))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -919,7 +919,7 @@ static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *
   si_update_prims_generated_query_state(sctx, query->b.type, 1);
   si_update_hw_pipeline_stats(sctx, query->b.type, 1);

-   si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, 0, 0);

   va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_query_hw_do_emit_start(sctx, query, query->buffer.buf, va);
@@ -1015,7 +1015,7 @@ static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *q

   /* The queries which need begin already called this in begin_query. */
   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
-      si_need_gfx_cs_space(sctx, 0);
+      si_need_gfx_cs_space(sctx, 0, 0);
      if (!si_query_buffer_alloc(sctx, &query->buffer, si_query_hw_prepare_buffer,
                                 query->result_size))
         return;
@@ -1728,7 +1728,7 @@ void si_resume_queries(struct si_context *sctx)
   struct si_query *query;

   /* Check CS space here. Resuming must not be interrupted by flushes. */
-   si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, 0, 0);

   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
      query->ops->resume(sctx, query);
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2071,7 +2071,7 @@ static void si_draw(struct pipe_context *ctx,
   else if (GFX_VERSION < GFX12)
      gfx11_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));

-   si_need_gfx_cs_space(sctx, num_draws);
+   si_need_gfx_cs_space(sctx, num_draws, ALT_HIZ_LOGIC ? 8 : 0);

   if (u_trace_perfetto_active(&sctx->ds.trace_context))
      trace_si_begin_draw(&sctx->trace);
--- a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
+++ b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
@@ -3165,6 +3165,7 @@ emit_is_helper_invocation(struct ntv_context *ctx, nir_intrinsic_instr *intr)
 {
   spirv_builder_emit_extension(&ctx->builder,
                                "SPV_EXT_demote_to_helper_invocation");
+   spirv_builder_emit_cap(&ctx->builder, SpvCapabilityDemoteToHelperInvocation);
   SpvId result = spirv_is_helper_invocation(&ctx->builder);
   store_def(ctx, intr->def.index, result, nir_type_bool);
 }
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -1820,7 +1820,7 @@ zink_flush_frontbuffer(struct pipe_screen *pscreen,
   if (!zink_kopper_acquired(res->obj->dt, res->obj->dt_idx)) {
      /* swapbuffers to an undefined surface: acquire and present garbage */
      zink_kopper_acquire(ctx, res, UINT64_MAX);
-      ctx->needs_present = res;
+      zink_resource_reference(&ctx->needs_present, res);
      /* set batch usage to submit acquire semaphore */
      zink_batch_resource_usage_set(ctx->bs, res, true, false);
      /* ensure the resource is set up to present garbage */
@@ -3403,6 +3403,8 @@ zink_internal_create_screen(const struct pipe_screen_config *config, int64_t dev
         }
      }
   }
+
+   bool maybe_has_rebar = true;
   /* iterate again to check for missing heaps */
   for (enum zink_heap i = 0; i < ZINK_HEAP_MAX; i++) {
      /* not found: use compatible heap */
@@ -3416,10 +3418,12 @@ zink_internal_create_screen(const struct pipe_screen_config *config, int64_t dev
         } else {
            memcpy(screen->heap_map[i], screen->heap_map[ZINK_HEAP_DEVICE_LOCAL], screen->heap_count[ZINK_HEAP_DEVICE_LOCAL]);
            screen->heap_count[i] = screen->heap_count[ZINK_HEAP_DEVICE_LOCAL];
+            if (i == ZINK_HEAP_DEVICE_LOCAL_VISIBLE)
+               maybe_has_rebar = false;
         }
      }
   }
-   {
+   if (maybe_has_rebar) {
      uint64_t biggest_vis_vram = 0;
      for (unsigned i = 0; i < screen->heap_count[ZINK_HEAP_DEVICE_LOCAL_VISIBLE]; i++)
         biggest_vis_vram = MAX2(biggest_vis_vram, screen->info.mem_props.memoryHeaps[screen->info.mem_props.memoryTypes[screen->heap_map[ZINK_HEAP_DEVICE_LOCAL_VISIBLE][i]].heapIndex].size);
--- a/src/gallium/frontends/lavapipe/lvp_device.c
+++ b/src/gallium/frontends/lavapipe/lvp_device.c
@@ -304,7 +304,7 @@ min_vertex_pipeline_param(struct pipe_screen *pscreen, enum pipe_shader_cap para
                                     PIPE_SHADER_CAP_MAX_INSTRUCTIONS))
         continue;

-      val = MAX2(val, pscreen->get_shader_param(pscreen, i, param));
+      val = MIN2(val, pscreen->get_shader_param(pscreen, i, param));
   }
   return val;
 }
--- a/src/gallium/frontends/rusticl/core/context.rs
+++ b/src/gallium/frontends/rusticl/core/context.rs
@@ -200,25 +200,37 @@ impl Context {
        modifier: u64,
        image_type: cl_mem_object_type,
        gl_target: cl_GLenum,
-        format: pipe_format,
+        format: cl_image_format,
        gl_props: GLMemProps,
    ) -> CLResult<HashMap<&'static Device, Arc<PipeResource>>> {
        let mut res = HashMap::new();
        let target = cl_mem_type_to_texture_target_gl(image_type, gl_target);
+        let pipe_format = if image_type == CL_MEM_OBJECT_BUFFER {
+            pipe_format::PIPE_FORMAT_NONE
+        } else {
+            format.to_pipe_format().unwrap()
+        };

        for dev in &self.devs {
+            let enable_bind_as_image = if target != pipe_texture_target::PIPE_BUFFER {
+                dev.formats[&format][&image_type] as u32 & CL_MEM_WRITE_ONLY != 0
+            } else {
+                false
+            };
+
            let resource = dev
                .screen()
                .resource_import_dmabuf(
                    handle,
                    modifier,
                    target,
-                    format,
+                    pipe_format,
                    gl_props.stride,
                    gl_props.width,
                    gl_props.height,
                    gl_props.depth,
                    gl_props.array_size,
+                    enable_bind_as_image,
                )
                .ok_or(CL_OUT_OF_RESOURCES)?;

--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -639,6 +639,12 @@ fn compile_nir_to_args(
    nir_pass!(nir, nir_scale_fdiv);
    nir.set_workgroup_size_variable_if_zero();
    nir.structurize();
+    nir_pass!(
+        nir,
+        nir_lower_variable_initializers,
+        nir_variable_mode::nir_var_function_temp
+    );
+
    while {
        let mut progress = false;
        nir_pass!(nir, nir_split_var_copies);
--- a/src/gallium/frontends/rusticl/core/memory.rs
+++ b/src/gallium/frontends/rusticl/core/memory.rs
@@ -950,7 +950,7 @@ impl MemBase {
            export_out.modifier,
            mem_type,
            export_in.target,
-            pipe_format,
+            image_format,
            gl_mem_props.clone(),
        )?;

--- a/src/gallium/frontends/rusticl/core/queue.rs
+++ b/src/gallium/frontends/rusticl/core/queue.rs
@@ -6,6 +6,7 @@ use crate::core::platform::*;
 use crate::impl_cl_type_trait;

 use mesa_rust::pipe::context::PipeContext;
+use mesa_rust_gen::*;
 use mesa_rust_util::properties::*;
 use rusticl_opencl_gen::*;

@@ -93,11 +94,20 @@ pub struct Queue {

 impl_cl_type_trait!(cl_command_queue, Queue, CL_INVALID_COMMAND_QUEUE);

-fn flush_events(evs: &mut Vec<Arc<Event>>, pipe: &PipeContext) {
+fn flush_events(evs: &mut Vec<Arc<Event>>, pipe: &PipeContext) -> cl_int {
    if !evs.is_empty() {
        pipe.flush().wait();
-        evs.drain(..).for_each(|e| e.signal());
+        if pipe.device_reset_status() != pipe_reset_status::PIPE_NO_RESET {
+            // if the context reset while executing, simply put all events into error state.
+            evs.drain(..)
+                .for_each(|e| e.set_user_status(CL_OUT_OF_RESOURCES));
+            return CL_OUT_OF_RESOURCES;
+        } else {
+            evs.drain(..).for_each(|e| e.signal());
+        }
    }
+
+    CL_SUCCESS as cl_int
 }

 impl Queue {
@@ -152,7 +162,8 @@ impl Queue {
                            // If we hit any deps from another queue, flush so we don't risk a dead
                            // lock.
                            if e.deps.iter().any(|ev| ev.queue != e.queue) {
-                                flush_events(&mut flushed, &ctx);
+                                let dep_err = flush_events(&mut flushed, &ctx);
+                                last_err = cmp::min(last_err, dep_err);
                            }

                            // check if any dependency has an error
@@ -184,20 +195,23 @@ impl Queue {
                            if e.is_user() {
                                // On each user event we flush our events as application might
                                // wait on them before signaling user events.
-                                flush_events(&mut flushed, &ctx);
+                                last_err = flush_events(&mut flushed, &ctx);

-                                // Wait on user events as they are synchronization points in the
-                                // application's control.
-                                e.wait();
+                                if last_err >= 0 {
+                                    // Wait on user events as they are synchronization points in the
+                                    // application's control.
+                                    e.wait();
+                                }
                            } else if Platform::dbg().sync_every_event {
                                flushed.push(e);
-                                flush_events(&mut flushed, &ctx);
+                                last_err = flush_events(&mut flushed, &ctx);
                            } else {
                                flushed.push(e);
                            }
                        }

-                        flush_events(&mut flushed, &ctx);
+                        let flush_err = flush_events(&mut flushed, &ctx);
+                        last_err = cmp::min(last_err, flush_err);
                    }
                })
                .unwrap(),
@@ -245,7 +259,10 @@ impl Queue {
            // Waiting on the last event is good enough here as the queue will process it in order
            // It's not a problem if the weak ref is invalid as that means the work is already done
            // and waiting isn't necessary anymore.
-            last.upgrade().map(|e| e.wait());
+            let err = last.upgrade().map(|e| e.wait()).unwrap_or_default();
+            if err < 0 {
+                return Err(err);
+            }
        }
        Ok(())
    }
--- a/src/gallium/frontends/rusticl/mesa/compiler/nir.rs
+++ b/src/gallium/frontends/rusticl/mesa/compiler/nir.rs
@@ -259,11 +259,6 @@ impl NirShader {
    }

    pub fn inline(&mut self, libclc: &NirShader) {
-        nir_pass!(
-            self,
-            nir_lower_variable_initializers,
-            nir_variable_mode::nir_var_function_temp,
-        );
        nir_pass!(self, nir_lower_returns);
        nir_pass!(self, nir_link_shader_functions, libclc.nir.as_ptr());
        nir_pass!(self, nir_inline_functions);
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@@ -591,6 +591,16 @@ impl PipeContext {
            }
        }
    }
+
+    pub fn device_reset_status(&self) -> pipe_reset_status {
+        unsafe {
+            if let Some(get_device_reset_status) = self.pipe.as_ref().get_device_reset_status {
+                get_device_reset_status(self.pipe.as_ptr())
+            } else {
+                pipe_reset_status::PIPE_NO_RESET
+            }
+        }
+    }
 }

 impl Drop for PipeContext {
--- a/src/gallium/frontends/rusticl/mesa/pipe/screen.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/screen.rs
@@ -249,6 +249,7 @@ impl PipeScreen {
        height: u16,
        depth: u16,
        array_size: u16,
+        support_image: bool,
    ) -> Option<PipeResource> {
        let mut tmpl = pipe_resource::default();
        let mut handle = winsys_handle {
@@ -267,6 +268,15 @@ impl PipeScreen {
        tmpl.depth0 = depth;
        tmpl.array_size = array_size;

+        if target == pipe_texture_target::PIPE_BUFFER {
+            tmpl.bind = PIPE_BIND_GLOBAL
+        } else {
+            tmpl.bind = PIPE_BIND_SAMPLER_VIEW;
+            if support_image {
+                tmpl.bind |= PIPE_BIND_SHADER_IMAGE;
+            }
+        }
+
        unsafe {
            PipeResource::new(
                self.screen().resource_from_handle.unwrap()(
--- a/src/gallium/targets/dril/dril_target.c
+++ b/src/gallium/targets/dril/dril_target.c
@@ -453,7 +453,9 @@ drilCreateNewScreen(int scrn, int fd,
                    const __DRIconfig ***driver_configs, void *data)
 {
   const __DRIconfig **configs = init_dri2_configs(fd);
-   if (!configs && fd == -1) {
+   if (!configs) {
+      if (fd != -1)
+         return NULL;
      // otherwise set configs to point to our config list
      configs = calloc(ARRAY_SIZE(drilConfigs) * 2 + 1, sizeof(void *));
      int c = 0;
--- a/src/intel/compiler/brw_generator.cpp
+++ b/src/intel/compiler/brw_generator.cpp
@@ -1250,8 +1250,15 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width,
         assert(inst->force_writemask_all && inst->group == 0);
         assert(inst->dst.file == BAD_FILE);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
+         brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
         brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), BRW_TYPE_UD),
                 retype(brw_mask_reg(0), BRW_TYPE_UD));
+         /* Reading certain ARF registers (like 'ce', the mask register) on
+          * Gfx12+ requires requires a dependency on all pipes on the read
+          * instruction and the next instructions
+          */
+         if (devinfo->ver >= 12)
+            brw_SYNC(p, TGL_SYNC_NOP);
         break;
      }
      case SHADER_OPCODE_BROADCAST:
--- a/src/intel/vulkan/bvh/meson.build
+++ b/src/intel/vulkan/bvh/meson.build
@@ -29,7 +29,6 @@ anv_bvh_includes = files(
  'anv_bvh.h',
 )

-bvh_spv = []
 foreach s : bvh_shaders
  command = [
    prog_glslang, '-V', '-I' + vk_bvh_include_dir, '-I' + anv_bvh_include_dir, '--target-env', 'spirv1.5', '-x', '-o', '@OUTPUT@', '@INPUT@'
--- a/src/intel/vulkan/genX_cmd_video.c
+++ b/src/intel/vulkan/genX_cmd_video.c
@@ -2098,7 +2098,7 @@ anv_av1_decode_video_tile(struct anv_cmd_buffer *cmd_buffer,
      pic.AllowScreenContentToolsFlag = std_pic_info->flags.allow_screen_content_tools;
      pic.ForceIntegerMVFlag = std_pic_info->flags.force_integer_mv;
      pic.AllowWarpedMotionFlag = std_pic_info->flags.allow_warped_motion;
-      pic.UseCDEFFilterFlag = seq_hdr->flags.enable_cdef;
+      pic.UseCDEFFilterFlag = !frame_lossless && seq_hdr->flags.enable_cdef;
      pic.UseSuperResFlag = std_pic_info->flags.use_superres;
      pic.FrameLevelLoopRestorationFilterEnable = frame_restoration_type[0] || frame_restoration_type[1] || frame_restoration_type[2];
      pic.FrameType = std_pic_info->frame_type;
@@ -2113,7 +2113,7 @@ anv_av1_decode_video_tile(struct anv_cmd_buffer *cmd_buffer,
      pic.LastActiveSegmentSegmentID = last_active_segid;
      pic.DeltaQPresentFlag = std_pic_info->flags.delta_q_present;
      pic.DeltaQRes = std_pic_info->delta_q_res;
-      pic.FrameCodedLosslessMode = frame_lossless; /* TODO */
+      pic.FrameCodedLosslessMode = frame_lossless;
      pic.SegmentMapisZeroFlag = 0; /* TODO */
      pic.SegmentIDBufferStreamInEnableFlag = 0; /* TODO */
      pic.SegmentIDBufferStreamOutEnableFlag = 0; /* TODO */
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -37,6 +37,7 @@ idep_anv_headers = declare_dependency(
  include_directories : inc_anv,
 )

+bvh_spv = []
 if with_intel_vk_rt
  if with_intel_bvh_grl
    subdir('grl')
@@ -124,7 +125,7 @@ endif
 foreach _gfx_ver : ['90', '110', '120', '125', '200', '300']
  libanv_per_hw_ver_libs += static_library(
    'anv_per_hw_ver@0@'.format(_gfx_ver),
-    [anv_per_hw_ver_files, anv_entrypoints[0]],
+    [anv_per_hw_ver_files, anv_entrypoints[0]] + bvh_spv,
    include_directories : [
      inc_include, inc_src, inc_intel,
    ],
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1000,6 +1000,11 @@ test_attachment_completeness(const struct gl_context *ctx, GLenum format,
            att->Complete = GL_FALSE;
            return;
         }
+         if (att->Zoffset + att->NumViews > texImage->Depth) {
+            att_incomplete("bad 2D-array view range");
+            att->Complete = GL_FALSE;
+            return;
+         }
         break;
      case GL_TEXTURE_CUBE_MAP_ARRAY:
         if (att->Zoffset >= texImage->Depth) {
@@ -4218,7 +4223,8 @@ static ALWAYS_INLINE void
 frame_buffer_texture(GLuint framebuffer, GLenum target,
                     GLenum attachment, GLuint texture,
                     GLint level, GLsizei samples, GLint layer, const char *func,
-                     bool dsa, bool no_error, bool check_layered, GLsizei numviews)
+                     bool dsa, bool no_error, bool check_layered, bool multiview,
+                     GLsizei numviews)
 {
   GET_CURRENT_CONTEXT(ctx);
   GLboolean layered = GL_FALSE;
@@ -4279,7 +4285,7 @@ frame_buffer_texture(GLuint framebuffer, GLenum target,
            return;
      }

-      if (numviews > 1) {
+      if (multiview) {
         /* We do this regardless of no_error because this sets multiviews */
         if (!check_multiview_texture_target(ctx, texture, texObj->Target, level, layer, numviews, func))
         {
@@ -4339,7 +4345,7 @@ _mesa_FramebufferTextureLayer_no_error(GLenum target, GLenum attachment,
                                       GLint layer)
 {
   frame_buffer_texture(0, target, attachment, texture, level, 0, layer,
-                        "glFramebufferTextureLayer", false, true, false, 0);
+                        "glFramebufferTextureLayer", false, true, false, false, 0);
 }


@@ -4348,7 +4354,7 @@ _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
                              GLuint texture, GLint level, GLint layer)
 {
   frame_buffer_texture(0, target, attachment, texture, level, 0, layer,
-                        "glFramebufferTextureLayer", false, false, false, 0);
+                        "glFramebufferTextureLayer", false, false, false, false, 0);
 }


@@ -4359,7 +4365,7 @@ _mesa_NamedFramebufferTextureLayer_no_error(GLuint framebuffer,
                                            GLint layer)
 {
   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0, layer,
-                        "glNamedFramebufferTextureLayer", true, true, false, 0);
+                        "glNamedFramebufferTextureLayer", true, true, false, false, 0);
 }


@@ -4368,7 +4374,7 @@ _mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
                                   GLuint texture, GLint level, GLint layer)
 {
   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0, layer,
-                        "glNamedFramebufferTextureLayer", true, false, false, 0);
+                        "glNamedFramebufferTextureLayer", true, false, false, false, 0);
 }


@@ -4378,7 +4384,7 @@ _mesa_FramebufferTextureMultiviewOVR_no_error(GLenum target, GLenum attachment,
                                              GLint baseViewIndex, GLsizei numViews)
 {
   frame_buffer_texture(0, target, attachment, texture, level, 0, baseViewIndex,
-                        "glFramebufferTexture", false, true, false, numViews);
+                        "glFramebufferTexture", false, true, false, true, numViews);
 }


@@ -4388,7 +4394,7 @@ _mesa_FramebufferTextureMultiviewOVR(GLenum target, GLenum attachment,
                                     GLint baseViewIndex, GLsizei numViews)
 {
   frame_buffer_texture(0, target, attachment, texture, level, 0, baseViewIndex,
-                        "glFramebufferTexture", false, false, false, numViews);
+                        "glFramebufferTexture", false, false, false, true, numViews);
 }


@@ -4398,7 +4404,7 @@ _mesa_FramebufferTextureMultisampleMultiviewOVR_no_error(GLenum target, GLenum a
                                                         GLint baseViewIndex, GLsizei numViews)
 {
   frame_buffer_texture(0, target, attachment, texture, level, samples, baseViewIndex,
-                        "FramebufferTextureMultisampleMultiviewOVR", false, true, false, numViews);
+                        "FramebufferTextureMultisampleMultiviewOVR", false, true, false, true, numViews);
 }


@@ -4408,7 +4414,7 @@ _mesa_FramebufferTextureMultisampleMultiviewOVR(GLenum target, GLenum attachment
                                                GLint baseViewIndex, GLsizei numViews)
 {
   frame_buffer_texture(0, target, attachment, texture, level, samples, baseViewIndex,
-                        "FramebufferTextureMultisampleMultiviewOVR", false, false, false, numViews);
+                        "FramebufferTextureMultisampleMultiviewOVR", false, false, false, true, numViews);
 }


@@ -4418,7 +4424,7 @@ _mesa_NamedFramebufferTextureMultiviewOVR_no_error(GLuint framebuffer, GLenum at
                                                   GLint baseViewIndex, GLsizei numViews)
 {
   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0, baseViewIndex,
-                        "glFramebufferTexture", true, true, false, numViews);
+                        "glFramebufferTexture", true, true, false, true, numViews);
 }


@@ -4428,7 +4434,7 @@ _mesa_NamedFramebufferTextureMultiviewOVR(GLuint framebuffer, GLenum attachment,
                                          GLint baseViewIndex, GLsizei numViews)
 {
   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0, baseViewIndex,
-                        "glFramebufferTexture", true, false, false, numViews);
+                        "glFramebufferTexture", true, false, false, true, numViews);
 }


@@ -4437,7 +4443,7 @@ _mesa_FramebufferTexture_no_error(GLenum target, GLenum attachment,
                                  GLuint texture, GLint level)
 {
   frame_buffer_texture(0, target, attachment, texture, level, 0, 0,
-                        "glFramebufferTexture", false, true, true, 0);
+                        "glFramebufferTexture", false, true, true, false, 0);
 }


@@ -4446,7 +4452,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                         GLuint texture, GLint level)
 {
   frame_buffer_texture(0, target, attachment, texture, level, 0, 0,
-                        "glFramebufferTexture", false, false, true, 0);
+                        "glFramebufferTexture", false, false, true, false, 0);
 }

 void GLAPIENTRY
@@ -4454,7 +4460,7 @@ _mesa_NamedFramebufferTexture_no_error(GLuint framebuffer, GLenum attachment,
                                       GLuint texture, GLint level)
 {
   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0, 0,
-                        "glNamedFramebufferTexture", true, true, true, 0);
+                        "glNamedFramebufferTexture", true, true, true, false, 0);
 }


@@ -4463,7 +4469,7 @@ _mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
                              GLuint texture, GLint level)
 {
   frame_buffer_texture(framebuffer, 0, attachment, texture, level, 0, 0,
-                        "glNamedFramebufferTexture", true, false, true, 0);
+                        "glNamedFramebufferTexture", true, false, true, false, 0);
 }


--- a/src/microsoft/ci/gitlab-ci.yml
+++ b/src/microsoft/ci/gitlab-ci.yml
@@ -18,7 +18,7 @@ test-spirv2dxil-windows:
    - . _install/spirv2dxil_run.ps1
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - spirv2dxil_results.txt

@@ -42,7 +42,7 @@ test-dozen-deqp:
    - . _install/deqp_runner_run.ps1
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results/
    reports:
--- a/src/panfrost/ci/panfrost-g52-fails.txt
+++ b/src/panfrost/ci/panfrost-g52-fails.txt
--- a/src/panfrost/ci/panfrost-g52-flakes.txt
+++ b/src/panfrost/ci/panfrost-g52-flakes.txt
@@ -32,3 +32,6 @@ dEQP-VK.memory.pipeline_barrier.host_write_uniform_texel_buffer.1048576

 # Sometime timeout
 dEQP-VK.memory.pipeline_barrier.host_write_storage_buffer.1048576
+
+# No clue what's happening to these on CI lately
+dEQP-VK.wsi.*.swapchain.simulate_oom.*
--- a/src/panfrost/ci/panfrost-g610-fails.txt
+++ b/src/panfrost/ci/panfrost-g610-fails.txt
@@ -7,31 +7,3 @@ dEQP-VK.pipeline.fast_linked_library.misc.interpolate_at_sample_no_sample_shadin
 dEQP-VK.renderpass.multiple_subpasses_multiple_command_buffers.test,Fail

 dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash
-
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.composite_alpha,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.exclusive_nonzero_queues,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_array_layers,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_extent,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_format,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_usage,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.min_image_count,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.pre_transform,Crash
-dEQP-VK.wsi.wayland.swapchain.simulate_oom.present_mode,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.clipped,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.composite_alpha,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.image_array_layers,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.image_extent,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.image_format,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.image_usage,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.min_image_count,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.pre_transform,Crash
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.present_mode,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.clipped,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.composite_alpha,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.exclusive_nonzero_queues,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.image_array_layers,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.image_extent,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.image_format,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.min_image_count,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.pre_transform,Crash
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.present_mode,Crash
--- a/src/panfrost/ci/panfrost-g610-flakes.txt
+++ b/src/panfrost/ci/panfrost-g610-flakes.txt
@@ -257,9 +257,8 @@ dEQP-VK.texture.filtering.3d.combinations.nearest.linear.clamp_to_edge.repeat.cl
 dEQP-VK.texture.shadow.1d_array.linear_mipmap_linear.equal_x8_d24_unorm_pack32
 dEQP-VK.texture.shadow.cube.linear_mipmap_nearest.equal_d24_unorm_s8_uint

-dEQP-VK.wsi.wayland.swapchain.simulate_oom.clipped
-dEQP-VK.wsi.xcb.swapchain.simulate_oom.exclusive_nonzero_queues
-dEQP-VK.wsi.xlib.swapchain.simulate_oom.image_usage
+# No clue what's happening to these on CI lately
+dEQP-VK.wsi.*.swapchain.simulate_oom.*

 # With linux kernel 6.13-rc4 uprev
 dEQP-GLES31.functional.copy_image.compressed.viewclass_astc_8x6_rgba.rgba_astc_8x6_khr_srgb8_alpha8_astc_8x6_khr.texture3d_to_cubemap
--- a/src/panfrost/compiler/bi_lower_swizzle.c
+++ b/src/panfrost/compiler/bi_lower_swizzle.c
@@ -54,12 +54,17 @@ lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
   case BI_OPCODE_CSEL_V2I16:
   case BI_OPCODE_CSEL_V2S16:
   case BI_OPCODE_CSEL_V2U16:
+      break;

   /* Despite ostensibly being 32-bit instructions, CLPER does not
    * inherently interpret the data, so it can be used for v2f16
    * derivatives, which might require swizzle lowering */
   case BI_OPCODE_CLPER_I32:
   case BI_OPCODE_CLPER_OLD_I32:
+      if (src == 0)
+         break;
+      else
+         return;

   /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
    * boolean is implemented as a 16-bit integer, the swizzle is needed
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -877,12 +877,16 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
 }

 /* Blend shaders do not need to run ATEST since they are dependent on a
- * fragment shader that runs it. */
+ * fragment shader that runs it. Blit shaders may not need to run ATEST, since
+ * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
+ * there are no writes to the coverage mask. The latter two are satisfied for
+ * all blit shaders, so we just care about early-z, which blit shaders force
+ * iff they do not write depth or stencil */

 static bool
-bi_skip_atest(bi_context *ctx)
+bi_skip_atest(bi_context *ctx, bool emit_zs)
 {
-   return ctx->inputs->is_blend;
+   return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
 }

 static void
@@ -946,7 +950,7 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
    * alpha value is only used for alpha-to-coverage, a stage which is
    * skipped for pure integer framebuffers, so the issue is moot. */

-   if (!b->shader->emitted_atest && !bi_skip_atest(b->shader)) {
+   if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
      nir_alu_type T = nir_intrinsic_src_type(instr);

      bi_index rgba = bi_src_index(&instr->src[0]);
@@ -1178,7 +1182,13 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)

      if (index_offset != 0)
         index = bi_iadd_imm_i32(b, index, index_offset);
-      bi_index address = bi_lea_buf_imm(b, index);
+
+      /* On Valhall, with IDVS varying are stored in a hardware-controlled
+       * buffer through table 61 at index 0 */
+      bi_index address = bi_temp(b->shader);
+      bi_instr *I = bi_lea_buf_imm_to(b, address, index);
+      I->table = va_res_fold_table_idx(61);
+      I->index = 0;
      bi_emit_split_i32(b, a, address, 2);

      bi_store(b, nr * src_bit_sz, data, a[0], a[1],
@@ -1735,7 +1745,7 @@ bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
    */
   if (nir_def_all_uses_ignore_sign_bit(&instr->def) && !coarse) {
      left = s0;
-      right = bi_clper(b, s0, bi_imm_u32(axis), BI_LANE_OP_XOR);
+      right = bi_clper(b, s0, bi_imm_u8(axis), BI_LANE_OP_XOR);
   } else {
      bi_index lane1, lane2;
      if (coarse) {
@@ -1748,8 +1758,8 @@ bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
         lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
      }

-      left = bi_clper(b, s0, lane1, BI_LANE_OP_NONE);
-      right = bi_clper(b, s0, lane2, BI_LANE_OP_NONE);
+      left = bi_clper(b, s0, bi_byte(lane1, 0), BI_LANE_OP_NONE);
+      right = bi_clper(b, s0, bi_byte(lane2, 0), BI_LANE_OP_NONE);
   }

   bi_fadd_to(b, sz, dst, right, bi_neg(left));
@@ -2042,7 +2052,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
         bi_subgroup_from_cluster_size(pan_subgroup_size(b->shader->arch));
      bi_clper_i32_to(b, dst,
                      bi_src_index(&instr->src[0]),
-                      bi_src_index(&instr->src[1]),
+                      bi_byte(bi_src_index(&instr->src[1]), 0),
                      inactive_result, lane_op, subgroup);
      break;
   }
@@ -4118,10 +4128,16 @@ bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
                       !narrow_indices, mask, sr_count);
      break;
   case nir_texop_txf:
-   case nir_texop_txf_ms:
+   case nir_texop_txf_ms: {
+      /* On Valhall, TEX_FETCH doesn't have CUBE support. This is not a problem
+       * as a cube is just a 2D array in any cases. */
+      if (dim == BI_DIMENSION_CUBE)
+         dim = BI_DIMENSION_2D;
+
      bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
                      explicit_offset, !narrow_indices, mask, sr_count);
      break;
+   }
   case nir_texop_tg4:
      bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim,
                       instr->component, false, regfmt, instr->is_shadow,
@@ -5538,7 +5554,7 @@ bi_compile_variant_nir(nir_shader *nir,
   /* If the shader doesn't write any colour or depth outputs, it may
    * still need an ATEST at the very end! */
   bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
-                           !ctx->emitted_atest && !bi_skip_atest(ctx);
+                           !ctx->emitted_atest && !bi_skip_atest(ctx, false);

   if (need_dummy_atest) {
      bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
--- a/src/panfrost/compiler/valhall/ISA.xml
+++ b/src/panfrost/compiler/valhall/ISA.xml
@@ -200,7 +200,7 @@

  <enum name="Lanes (8-bit)">
    <desc>Used to select the 2 bytes for shifts of 16-bit vectors</desc>
-    <value>b02</value>
+    <reserved/>
    <reserved/>
    <reserved/>
    <reserved/>
@@ -210,8 +210,8 @@
    <value>b33</value>
    <reserved/>
    <reserved/>
-    <value>b01</value>
-    <value>b23</value>
+    <reserved/>
+    <reserved/>
    <reserved/>
    <reserved/>
    <reserved/>
@@ -1977,7 +1977,7 @@
      derivatives in fragment shaders.
    </desc>
    <src>A</src>
-    <src widen="true">B</src>
+    <src lanes="true" size="8">B</src>
    <subgroup/>
    <lane_op/>
    <inactive_result/>
--- a/src/panfrost/compiler/valhall/test/assembler-cases.txt
+++ b/src/panfrost/compiler/valhall/test/assembler-cases.txt
@@ -32,7 +32,7 @@ e6 00 00 00 00 c1 91 06    MOV.i32 r1, core_id.w0
 00 00 00 00 00 c0 00 78    NOP.end 
 40 c4 c0 9c 01 c1 f0 00    ICMP_OR.u32.gt.m1 r1, ^r0, 0x1000000.b3, 0x0
 42 00 00 18 02 40 61 50    STORE.i32.slot0.reconverge @r0, ^r2, offset:0
-00 c9 8f 12 30 c0 a0 00    CLPER.i32.f1 r0, r0, 0x7060504.b0
+00 c9 8f 12 30 c0 a0 00    CLPER.i32.f1 r0, r0, 0x7060504.b00
 00 00 00 30 00 c7 90 00    S8_TO_S32 r7, r0.b3
 00 00 00 20 00 c6 90 00    S8_TO_S32 r6, r0.b2
 00 00 00 00 00 c4 90 00    S8_TO_S32 r4, r0.b0
--- a/src/panfrost/compiler/valhall/test/test-packing.cpp
+++ b/src/panfrost/compiler/valhall/test/test-packing.cpp
@@ -306,7 +306,7 @@ TEST_F(ValhallPacking, LdVarBufImmF16)
 TEST_F(ValhallPacking, LeaBufImm)
 {
   CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))),
-        0x005e840400000d7b);
+        0x005e84040000007b);
 }

 TEST_F(ValhallPacking, StoreSegment)
--- a/src/panfrost/compiler/valhall/va_lower_constants.c
+++ b/src/panfrost/compiler/valhall/va_lower_constants.c
@@ -238,7 +238,7 @@ va_lower_constants(bi_context *ctx, bi_instr *I)
          * applying the lane select puts the desired constant (at least) in the
          * bottom byte, so we can always select the bottom byte.
          */
-         if (info.lane && I->src[s].swizzle == BI_SWIZZLE_H01) {
+         if ((info.lane || info.lanes) && I->src[s].swizzle == BI_SWIZZLE_H01) {
            assert(info.size == VA_SIZE_8);
            I->src[s] = bi_byte(I->src[s], 0);
         }
--- a/src/panfrost/compiler/valhall/va_pack.c
+++ b/src/panfrost/compiler/valhall/va_pack.c
@@ -349,8 +349,6 @@ static enum va_lanes_8_bit
 va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
 {
   switch (swz) {
-   case BI_SWIZZLE_H01:
-      return VA_LANES_8_BIT_B02;
   case BI_SWIZZLE_B0000:
      return VA_LANES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
@@ -472,8 +470,8 @@ va_pack_alu(const bi_instr *I)
      break;

   case BI_OPCODE_LEA_BUF_IMM:
-      /* Buffer table index */
-      hex |= 0xD << 8;
+      hex |= ((uint64_t)I->table) << 8;
+      hex |= ((uint64_t)I->index) << 12;
      break;

   case BI_OPCODE_LEA_ATTR_IMM:
--- a/src/panfrost/drm-shim/panfrost_noop.c
+++ b/src/panfrost/drm-shim/panfrost_noop.c
@@ -216,7 +216,7 @@ panthor_ioctl_dev_query(int fd, unsigned long request, void *arg)

      /* Default values */
      priorities_info->allowed_mask =
-         PANTHOR_GROUP_PRIORITY_LOW | PANTHOR_GROUP_PRIORITY_MEDIUM;
+         BITFIELD_BIT(PANTHOR_GROUP_PRIORITY_LOW) | BITFIELD_BIT(PANTHOR_GROUP_PRIORITY_MEDIUM);

      return 0;
   }
--- a/src/panfrost/lib/genxml/decode_csf.c
+++ b/src/panfrost/lib/genxml/decode_csf.c
@@ -595,7 +595,7 @@ pandecode_run_tiling(struct pandecode_context *ctx, FILE *fp,
                 cs_get_u64(qctx, 48));

   uint64_t blend = cs_get_u64(qctx, 50);
-   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);
+   GENX(pandecode_blend_descs)(ctx, blend & ~15, blend & 15, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

@@ -610,6 +610,7 @@ pandecode_run_tiling(struct pandecode_context *ctx, FILE *fp,

   ctx->indent--;
 }
+
 static void
 pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
                   struct queue_ctx *qctx, struct MALI_CS_RUN_IDVS *I)
@@ -726,7 +727,7 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
      pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));

   uint64_t blend = cs_get_u64(qctx, 50);
-   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);
+   GENX(pandecode_blend_descs)(ctx, blend & ~15, blend & 15, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

@@ -1145,7 +1146,7 @@ record_indirect_branch_target(struct cs_code_cfg *cfg,
 {
   union {
      uint32_t u32[256];
-      uint32_t u64[256];
+      uint64_t u64[128];
   } reg_file = {0};

   list_add(&cur_blk->node, blk_stack);
@@ -1158,7 +1159,11 @@ record_indirect_branch_target(struct cs_code_cfg *cfg,
         switch (base.opcode) {
         case MALI_CS_OPCODE_MOVE: {
            cs_unpack(instr, CS_MOVE, I);
-            reg_file.u64[I.destination] = I.immediate;
+
+            assert(I.destination % 2 == 0 &&
+                   "Destination register should be aligned to 2");
+
+            reg_file.u64[I.destination / 2] = I.immediate;
            break;
         }

@@ -1176,7 +1181,14 @@ record_indirect_branch_target(struct cs_code_cfg *cfg,

         case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
            cs_unpack(instr, CS_ADD_IMMEDIATE64, I);
-            reg_file.u64[I.destination] = reg_file.u64[I.source] + I.immediate;
+
+            assert(I.destination % 2 == 0 &&
+                   "Destination register should be aligned to 2");
+            assert(I.source % 2 == 0 &&
+                   "Source register should be aligned to 2");
+
+            reg_file.u64[I.destination / 2] =
+               reg_file.u64[I.source / 2] + I.immediate;
            break;
         }

@@ -1198,8 +1210,10 @@ record_indirect_branch_target(struct cs_code_cfg *cfg,
   uint64_t *instr = &cfg->instrs[ibranch->instr_idx];
   cs_unpack(instr, CS_JUMP, I);

+   assert(I.address % 2 == 0 && "Address register should be aligned to 2");
+
   struct cs_indirect_branch_target target = {
-      .address = reg_file.u64[I.address],
+      .address = reg_file.u64[I.address / 2],
      .length = reg_file.u32[I.length],
   };

--- a/src/panfrost/lib/genxml/v10.xml
+++ b/src/panfrost/lib/genxml/v10.xml
@@ -771,7 +771,7 @@
  </struct>

  <struct name="CS PROGRESS_LOAD" size="2">
-    <field name="Destination" size="8" start="40" type="uint"/>
+    <field name="Destination" size="8" start="48" type="uint"/>
    <field name="Opcode" size="8" start="56" type="CS Opcode" default="PROGRESS_LOAD"/>
  </struct>

--- a/src/panfrost/lib/pan_format.c
+++ b/src/panfrost/lib/pan_format.c
@@ -137,6 +137,12 @@ const struct pan_blendable_format
 #define YUV_NO_SWAP (0)
 #define YUV_SWAP    (1)

+#if PAN_ARCH <= 9
+#define MALI_YUV_CR_SITING_CENTER_422 (MALI_YUV_CR_SITING_CENTER_Y)
+#else
+#define MALI_YUV_CR_SITING_CENTER_422 (MALI_YUV_CR_SITING_CENTER_X)
+#endif
+
 #define FMT_YUV(pipe, mali, swizzle, swap, siting, flags)                      \
   [PIPE_FORMAT_##pipe] = {                                                    \
      .hw = (MALI_YUV_SWIZZLE_##swizzle) | ((YUV_##swap) << 3) |               \
@@ -170,20 +176,20 @@ const struct panfrost_format GENX(panfrost_pipe_format)[PIPE_FORMAT_COUNT] = {

 #if PAN_ARCH >= 7
   /* Multiplane formats */
-   FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_Y, _T__),
-   FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP,    CENTER_Y, _T__),
-   FMT_YUV(R8B8_R8G8_UNORM, YUYV8, VYUA, NO_SWAP, CENTER_Y, _T__),
-   FMT_YUV(B8R8_G8R8_UNORM, VYUY8, VUYA, SWAP,    CENTER_Y, _T__),
+   FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_422, _T__),
+   FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP,    CENTER_422, _T__),
+   FMT_YUV(R8B8_R8G8_UNORM, YUYV8, VYUA, NO_SWAP, CENTER_422, _T__),
+   FMT_YUV(B8R8_G8R8_UNORM, VYUY8, VUYA, SWAP,    CENTER_422, _T__),
   FMT_YUV(R8_G8B8_420_UNORM, Y8_UV8_420, YUVA, NO_SWAP, CENTER, _T__),
   FMT_YUV(R8_B8G8_420_UNORM, Y8_UV8_420, YVUA, NO_SWAP, CENTER, _T__),
   FMT_YUV(R8_G8_B8_420_UNORM, Y8_U8_V8_420, YUVA, NO_SWAP, CENTER, _T__),
   FMT_YUV(R8_B8_G8_420_UNORM, Y8_U8_V8_420, YVUA, NO_SWAP, CENTER, _T__),

-   FMT_YUV(R8_G8B8_422_UNORM, Y8_UV8_422, YUVA, NO_SWAP, CENTER, _T__),
-   FMT_YUV(R8_B8G8_422_UNORM, Y8_UV8_422, YVUA, NO_SWAP, CENTER, _T__),
+   FMT_YUV(R8_G8B8_422_UNORM, Y8_UV8_422, YUVA, NO_SWAP, CENTER_422, _T__),
+   FMT_YUV(R8_B8G8_422_UNORM, Y8_UV8_422, YVUA, NO_SWAP, CENTER_422, _T__),

   FMT_YUV(R10_G10B10_420_UNORM, Y10_UV10_420, YUVA, NO_SWAP, CENTER, _T__),
-   FMT_YUV(R10_G10B10_422_UNORM, Y10_UV10_422, YUVA, NO_SWAP, CENTER, _T__),
+   FMT_YUV(R10_G10B10_422_UNORM, Y10_UV10_422, YUVA, NO_SWAP, CENTER_422, _T__),
 #endif

   FMTC(ETC1_RGB8,               ETC2_RGB8,       RGBA8_UNORM, RGB1, L),
--- a/src/panfrost/lib/pan_texture.c
+++ b/src/panfrost/lib/pan_texture.c
@@ -423,14 +423,15 @@ panfrost_emit_plane(const struct pan_image_view *iview,
   bool afbc = drm_is_afbc(layout->modifier);
   bool afrc = drm_is_afrc(layout->modifier);
   // TODO: this isn't technically guaranteed to be YUV, but it is in practice.
-   bool is_3_planar_yuv = desc->layout == UTIL_FORMAT_LAYOUT_PLANAR3;
+   bool is_chroma_2p =
+      desc->layout == UTIL_FORMAT_LAYOUT_PLANAR3 && plane_index > 0;

   pan_cast_and_pack(*payload, PLANE, cfg) {
      cfg.pointer = pointer;
      cfg.row_stride = row_stride;
      cfg.size = layout->data_size - layout->slices[level].offset;

-      if (is_3_planar_yuv) {
+      if (is_chroma_2p) {
         cfg.two_plane_yuv_chroma.secondary_pointer =
            sections[plane_index + 1].pointer;
      } else if (!panfrost_format_is_yuv(layout->format)) {
@@ -489,8 +490,8 @@ panfrost_emit_plane(const struct pan_image_view *iview,
            GENX(pan_afrc_format)(finfo, layout->modifier, plane_index);
 #endif
      } else {
-         cfg.plane_type = is_3_planar_yuv ? MALI_PLANE_TYPE_CHROMA_2P
-                                          : MALI_PLANE_TYPE_GENERIC;
+         cfg.plane_type =
+            is_chroma_2p ? MALI_PLANE_TYPE_CHROMA_2P : MALI_PLANE_TYPE_GENERIC;
         cfg.clump_format = panfrost_clump_format(iview->format);
      }

--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -98,7 +98,7 @@ struct panfrost_compile_inputs {
   struct util_debug_callback *debug;

   unsigned gpu_id;
-   bool is_blend;
+   bool is_blend, is_blit;
   struct {
      unsigned nr_samples;
      uint64_t bifrost_blend_desc;
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -171,7 +171,7 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
    * simple with this all-or-nothing approach. */
   if ((instance->debug_flags & PANVK_DEBUG_CS) &&
       cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
-       !(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
+       !cmdbuf->state.gfx.render.suspended) {
      cs_update_cmdbuf_regs(b) {
         /* Poison all cmdbuf registers to make sure we don't inherit state from
          * a previously executed cmdbuf. */
@@ -934,8 +934,8 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
      /* We need to propagate the suspending state of the secondary command
       * buffer if we want to avoid poisoning the reg file when the secondary
       * command buffer suspended the render pass. */
-      if (secondary->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)
-         primary->state.gfx.render.flags = secondary->state.gfx.render.flags;
+      primary->state.gfx.render.suspended =
+         secondary->state.gfx.render.suspended;

      /* If the render context we passed to the secondary command buffer got
       * invalidated, reset the FB/tiler descs and treat things as if we
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -1526,7 +1526,12 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf)

            cfg.pixel_kill_operation = earlyzs.kill;
            cfg.zs_update_operation = earlyzs.update;
-            cfg.evaluate_per_sample = fs->info.fs.sample_shading;
+            cfg.evaluate_per_sample = fs->info.fs.sample_shading &&
+                                      (dyns->ms.rasterization_samples > 1);
+
+            cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
+                                           fs->info.fs.can_discard ||
+                                           alpha_to_coverage;
         } else {
            cfg.allow_forward_pixel_to_kill = true;
            cfg.allow_forward_pixel_to_be_killed = true;
@@ -2076,6 +2081,7 @@ panvk_per_arch(cmd_inherit_render_state)(
      to_panvk_physical_device(dev->vk.physical);
   struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;

+   cmdbuf->state.gfx.render.suspended = false;
   cmdbuf->state.gfx.render.flags = inheritance_info->flags;

   gfx_state_set_dirty(cmdbuf, RENDER_STATE);
@@ -2609,6 +2615,7 @@ panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
    * so any barrier encountered after EndRendering() doesn't try to flush
    * draws. */
   cmdbuf->state.gfx.render.flags = 0;
+   cmdbuf->state.gfx.render.suspended = suspending;

   /* If we're not suspending, we need to resolve attachments. */
   if (!suspending)
--- a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c
@@ -251,6 +251,7 @@ panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf,

   pan_pack(rsd, RENDERER_STATE, cfg) {
      bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
+      bool msaa = dyns->ms.rasterization_samples > 1;

      if (fs) {
         pan_shader_prepare_rsd(fs_info, fs_code, &cfg);
@@ -282,7 +283,8 @@ panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf,

         cfg.properties.pixel_kill_operation = earlyzs.kill;
         cfg.properties.zs_update_operation = earlyzs.update;
-         cfg.multisample_misc.evaluate_per_sample = fs->info.fs.sample_shading;
+         cfg.multisample_misc.evaluate_per_sample =
+            (fs->info.fs.sample_shading && msaa);
      } else {
         cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
         cfg.properties.allow_forward_pixel_to_kill = true;
@@ -290,7 +292,6 @@ panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf,
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
      }

-      bool msaa = dyns->ms.rasterization_samples > 1;
      cfg.multisample_misc.multisample_enable = msaa;
      cfg.multisample_misc.sample_mask =
         msaa ? dyns->ms.sample_mask : UINT16_MAX;
@@ -312,8 +313,6 @@ panvk_draw_prepare_fs_rsd(struct panvk_cmd_buffer *cmdbuf,
      cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_mask_misc.front_facing_depth_bias = rs->depth_bias.enable;
      cfg.stencil_mask_misc.back_facing_depth_bias = rs->depth_bias.enable;
-      cfg.stencil_mask_misc.single_sampled_lines =
-         dyns->ms.rasterization_samples <= 1;

      cfg.depth_units = rs->depth_bias.constant_factor;
      cfg.depth_factor = rs->depth_bias.slope_factor;
--- a/src/panfrost/vulkan/panvk_cmd_draw.h
+++ b/src/panfrost/vulkan/panvk_cmd_draw.h
@@ -77,6 +77,9 @@ struct panvk_rendering_state {
    * inherited context, and the primary command buffer needs to know. */
   bool invalidate_inherited_ctx;

+   /* True if the last render pass was suspended. */
+   bool suspended;
+
   struct {
      /* != 0 if the render pass contains one or more occlusion queries to
       * signal. */
--- a/src/panfrost/vulkan/panvk_physical_device.c
+++ b/src/panfrost/vulkan/panvk_physical_device.c
@@ -905,6 +905,14 @@ panvk_physical_device_init(struct panvk_physical_device *device,

   unsigned arch = pan_arch(device->kmod.props.gpu_prod_id);

+   if (!device->model) {
+      result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+                            "Unknown gpu_id (%#x) or variant (%#x)",
+                            device->kmod.props.gpu_prod_id,
+                            device->kmod.props.gpu_variant);
+      goto fail;
+   }
+
   switch (arch) {
   case 6:
   case 7:
--- a/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c
+++ b/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c
@@ -172,6 +172,7 @@ get_preload_shader(struct panvk_device *dev,
   struct panfrost_compile_inputs inputs = {
      .gpu_id = phys_dev->kmod.props.gpu_prod_id,
      .no_ubo_to_push = true,
+      .is_blit = true,
   };

   pan_shader_preprocess(nir, inputs.gpu_id);
@@ -374,8 +375,9 @@ cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
      cfg.stencil_back = cfg.stencil_front;

      if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
-         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_WEAK_EARLY;
-         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_WEAK_EARLY;
+         /* Skipping ATEST requires forcing Z/S */
+         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
+         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
      } else {
         /* Writing Z/S requires late updates */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
@@ -617,8 +619,9 @@ cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,

   pan_pack(&dcds[dcd_idx], DRAW, cfg) {
      if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
-         cfg.zs_update_operation = MALI_PIXEL_KILL_WEAK_EARLY;
-         cfg.pixel_kill_operation = MALI_PIXEL_KILL_WEAK_EARLY;
+         /* Skipping ATEST requires forcing Z/S */
+         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
+         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         cfg.blend = bds.gpu;
         cfg.blend_count = bd_count;