docs: add sha sum for 25.0.7

VERSION: bump for 25.0.7
docs: add release notes for 25.0.7
2025-05-28 17:35:48 +02:00 · 2025-05-28 17:20:23 +02:00 · 2025-05-28 17:20:23 +02:00 · 2025-05-28 15:43:52 +02:00 · 2025-05-28 15:23:15 +02:00 · 2025-05-28 15:23:05 +02:00
663 changed files with 64061 additions and 6039 deletions
--- a/.ci-farms-disabled/google-freedreno
+++ b/.ci-farms-disabled/google-freedreno
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -30,11 +30,15 @@ workflow:
    # do not duplicate pipelines on merge pipelines
    - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS && $CI_PIPELINE_SOURCE == "push"
      when: never
+    # tag pipelines are disabled as it's too late to run all the tests by
+    # then: the release has already been made based on the staging pipeline results
+    - if: $CI_COMMIT_TAG
+      when: never
    # merge pipeline
    - if: &is-merge-attempt $GITLAB_USER_LOGIN == "marge-bot" && $CI_PIPELINE_SOURCE == "merge_request_event"
      variables:
        MESA_CI_PERFORMANCE_ENABLED: 1
-        VALVE_INFRA_VANGOGH_JOB_PRIORITY: ""  # Empty tags are ignored by gitlab
+        CI_TRON_JOB_PRIORITY_TAG: ""  # Empty tags are ignored by gitlab
        JOB_PRIORITY: 75
        # fast-fail in merge pipelines: stop early if we get this many unexpected fails/crashes
        DEQP_RUNNER_MAX_FAILS: 40
@@ -53,7 +57,11 @@ workflow:
        # Note: 0 = infinity = gitlab's job `timeout:` applies, which is 1h
        BUILD_JOB_TIMEOUT_OVERRIDE: 0
    # pipeline for direct pushes that bypassed the CI
-    - if: &is-direct-push $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $GITLAB_USER_LOGIN != "marge-bot"
+    - if: &is-direct-push $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH
+      variables:
+        JOB_PRIORITY: 70
+    # pipeline for direct pushes from release maintainer
+    - if: &is-staging-push $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_REF_NAME =~ /^staging\//
      variables:
        JOB_PRIORITY: 70

@@ -102,7 +110,7 @@ variables:
  # Avoid the wall of "Unsupported SPIR-V capability" warnings in CI job log, hiding away useful output
  MESA_SPIRV_LOG_LEVEL: error
  # Default priority for non-merge pipelines
-  VALVE_INFRA_VANGOGH_JOB_PRIORITY: priority:low
+  CI_TRON_JOB_PRIORITY_TAG: ci-tron:priority:low
  JOB_PRIORITY: 50
  DATA_STORAGE_PATH: data_storage

@@ -248,6 +256,9 @@ include:
    # Build everything after someone bypassed the CI
    - if: *is-direct-push
      when: on_success
+    # Build everything when pushing to staging branches
+    - if: *is-staging-push
+      when: on_success
    # Build everything in scheduled pipelines
    - if: *is-scheduled-pipeline
      when: on_success
@@ -258,7 +269,7 @@ include:

 .ci-deqp-artifacts:
  artifacts:
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    when: always
    untracked: false
    paths:
@@ -284,11 +295,11 @@ make git archive:
    # Compactify the .git directory
    - git gc --aggressive
    # Download & cache the perfetto subproject as well.
-    - rm -rf subprojects/perfetto ; mkdir -p subprojects/perfetto && curl https://android.googlesource.com/platform/external/perfetto/+archive/$(grep 'revision =' subprojects/perfetto.wrap | cut -d ' ' -f3).tar.gz | tar zxf - -C subprojects/perfetto
+    - rm -rf subprojects/perfetto ; mkdir -p subprojects/perfetto && curl --fail https://android.googlesource.com/platform/external/perfetto/+archive/$(grep 'revision =' subprojects/perfetto.wrap | cut -d ' ' -f3).tar.gz | tar zxf - -C subprojects/perfetto
    # compress the current folder
    - tar -cvzf ../$CI_PROJECT_NAME.tar.gz .

-    - ci-fairy s3cp --token-file "${S3_JWT_FILE}" ../$CI_PROJECT_NAME.tar.gz https://$S3_HOST/git-cache/$CI_PROJECT_NAMESPACE/$CI_PROJECT_NAME/$CI_PROJECT_NAME.tar.gz
+    - s3_upload ../$CI_PROJECT_NAME.tar.gz "https://$S3_HOST/git-cache/$CI_PROJECT_NAMESPACE/$CI_PROJECT_NAME/"

 # Sanity checks of MR settings and commit logs
 sanity:
--- a/.gitlab-ci/build/gitlab-ci.yml
+++ b/.gitlab-ci/build/gitlab-ci.yml
@@ -16,7 +16,7 @@
  # We don't want to download any previous job's artifacts
  dependencies: []
  artifacts:
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    when: always
    paths:
      - _build/meson-logs/*.txt
@@ -72,6 +72,8 @@
    optional: true
  - job: debian-testing-asan
    optional: true
+  - job: debian-testing-ubsan
+    optional: true
  - job: debian-build-testing
    optional: true
  - job: debian-arm32
@@ -238,7 +240,6 @@ debian-build-testing:
  extends: .meson-build
  stage: build-for-tests
  variables:
-    BUILDTYPE: debug
    UNWIND: "enabled"
    DRI_LOADERS: >
      -D glx=dri
@@ -255,7 +256,7 @@ debian-build-testing:
      -D gallium-rusticl=false
    GALLIUM_DRIVERS: "i915,iris,nouveau,r300,r600,freedreno,llvmpipe,softpipe,svga,v3d,vc4,virgl,etnaviv,panfrost,lima,zink,d3d12,asahi,crocus"
    VULKAN_DRIVERS: "intel_hasvk,imagination-experimental,microsoft-experimental,nouveau,swrast"
-    BUILD_TYPE: "debugoptimized"
+    BUILDTYPE: "debugoptimized"
    EXTRA_OPTION: >
      -D spirv-to-dxil=true
      -D osmesa=true
@@ -297,6 +298,8 @@ shader-db:
    paths:
      - shader-db
  timeout: 15m
+  tags:
+    - kvm  # FIXME: this is a hack, should not be needed

 # Test a release build with -Werror so new warnings don't sneak in.
 debian-release:
--- a/.gitlab-ci/common/init-stage2.sh
+++ b/.gitlab-ci/common/init-stage2.sh
@@ -230,7 +230,7 @@ cleanup
 # upload artifacts
 if [ -n "$S3_RESULTS_UPLOAD" ]; then
  tar --zstd -cf results.tar.zst results/;
-  ci-fairy s3cp --token-file "${S3_JWT_FILE}" results.tar.zst https://"$S3_RESULTS_UPLOAD"/results.tar.zst;
+  s3_upload results.tar.zst https://"$S3_RESULTS_UPLOAD"/
 fi

 # We still need to echo the hwci: mesa message, as some scripts rely on it, such
--- a/.gitlab-ci/container/baremetal_build.sh
+++ b/.gitlab-ci/container/baremetal_build.sh
@@ -7,7 +7,7 @@ set -o xtrace
 # network transfer, disk usage, and runtime on test jobs)

 # shellcheck disable=SC2154 # arch is assigned in previous scripts
-if curl -X HEAD -s "${ARTIFACTS_PREFIX}/${FDO_UPSTREAM_REPO}/${ARTIFACTS_SUFFIX}/${arch}/done"; then
+if curl --fail -X HEAD -s "${ARTIFACTS_PREFIX}/${FDO_UPSTREAM_REPO}/${ARTIFACTS_SUFFIX}/${arch}/done"; then
  ARTIFACTS_URL="${ARTIFACTS_PREFIX}/${FDO_UPSTREAM_REPO}/${ARTIFACTS_SUFFIX}/${arch}"
 else
  ARTIFACTS_URL="${ARTIFACTS_PREFIX}/${CI_PROJECT_PATH}/${ARTIFACTS_SUFFIX}/${arch}"
--- a/.gitlab-ci/container/build-android-x86_64-llvm.sh
+++ b/.gitlab-ci/container/build-android-x86_64-llvm.sh
@@ -110,7 +110,7 @@ tar --zstd -cf "${ANDROID_LLVM_ARTIFACT_NAME}.tar.zst" "$LLVM_INSTALL_PREFIX"
 # version does not change, and delete it.
 # The file is not deleted for non-CI because it can be useful in local runs.
 if [ -n "$CI" ]; then
-  ci-fairy s3cp --token-file "${S3_JWT_FILE}" "${ANDROID_LLVM_ARTIFACT_NAME}.tar.zst" "https://${S3_HOST}/${S3_ANDROID_BUCKET}/${CI_PROJECT_PATH}/${ANDROID_LLVM_ARTIFACT_NAME}.tar.zst"
+  s3_upload "${ANDROID_LLVM_ARTIFACT_NAME}.tar.zst" "https://${S3_HOST}/${S3_ANDROID_BUCKET}/${CI_PROJECT_PATH}/"
  rm "${ANDROID_LLVM_ARTIFACT_NAME}.tar.zst"
 fi

--- a/.gitlab-ci/container/build-fluster.sh
+++ b/.gitlab-ci/container/build-fluster.sh
@@ -25,11 +25,10 @@ if [ "${SKIP_UPDATE_FLUSTER_VECTORS}" != 1 ]; then

    # Build fluster vectors archive and upload it
    tar --zstd -cf "vectors.tar.zst" fluster/resources/
-    ci-fairy s3cp --token-file "${S3_JWT_FILE}" "vectors.tar.zst" \
-          "https://${S3_PATH_FLUSTER}/vectors.tar.zst"
+    s3_upload vectors.tar.zst "https://${S3_PATH_FLUSTER}/"

    touch /lava-files/done
-    ci-fairy s3cp --token-file "${S3_JWT_FILE}" /lava-files/done "https://${S3_PATH_FLUSTER}/done"
+    s3_upload /lava-files/done "https://${S3_PATH_FLUSTER}/"

    # Don't include the vectors in the rootfs
    rm -fr fluster/resources/*
--- a/.gitlab-ci/container/build-piglit.sh
+++ b/.gitlab-ci/container/build-piglit.sh
@@ -10,7 +10,7 @@ uncollapsed_section_start piglit "Building piglit"
 # DEBIAN_TEST_VK_TAG
 # KERNEL_ROOTFS_TAG

-REV="631b72944f56e688f56a08d26c8a9f3988801a08"
+REV="68658566da1c9cd6a378b5ca36999617e26440e7"

 git clone https://gitlab.freedesktop.org/mesa/piglit.git --single-branch --no-checkout /piglit
 pushd /piglit
--- a/.gitlab-ci/container/build-va-tools.sh
+++ b/.gitlab-ci/container/build-va-tools.sh
@@ -19,7 +19,7 @@ git clone \

 pushd /va-utils
 # Too old libva in Debian 11. TODO: when this PR gets in, refer to the patch.
-curl -L https://github.com/intel/libva-utils/pull/329.patch | git am
+curl --fail -L https://github.com/intel/libva-utils/pull/329.patch | git am

 meson setup build -D tests=true -Dprefix=/va ${EXTRA_MESON_ARGS:-}
 meson install -C build
--- a/.gitlab-ci/container/debian/maybe-add-llvm-repo.sh
+++ b/.gitlab-ci/container/debian/maybe-add-llvm-repo.sh
@@ -12,7 +12,7 @@ case "${FDO_DISTRIBUTION_VERSION%-*},${LLVM_VERSION}" in
 esac

 if [ "$NEED_LLVM_REPO" = "true" ]; then
-  curl -s https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+  curl --fail -s https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
  export LLVM_APT_REPO="deb [trusted=yes] https://apt.llvm.org/${FDO_DISTRIBUTION_VERSION%-*}/ llvm-toolchain-${FDO_DISTRIBUTION_VERSION%-*}-${LLVM_VERSION} main"
  echo "$LLVM_APT_REPO" | tee /etc/apt/sources.list.d/llvm.list
 fi
--- a/.gitlab-ci/container/lava_build.sh
+++ b/.gitlab-ci/container/lava_build.sh
@@ -444,8 +444,7 @@ popd

 . .gitlab-ci/container/container_post_build.sh

-ci-fairy s3cp --token-file "${S3_JWT_FILE}" /lava-files/"${ROOTFSTAR}" \
-      https://${S3_PATH}/"${ROOTFSTAR}"
+s3_upload /lava-files/"${ROOTFSTAR}" "https://${S3_PATH}/"

 touch /lava-files/done
-ci-fairy s3cp --token-file "${S3_JWT_FILE}" /lava-files/done https://${S3_PATH}/done
+s3_upload /lava-files/done "https://${S3_PATH}/"
--- a/.gitlab-ci/image-tags.yml
+++ b/.gitlab-ci/image-tags.yml
@@ -28,9 +28,9 @@ variables:
   DEBIAN_X86_64_TEST_ANDROID_IMAGE_PATH: "debian/x86_64_test-android"

   DEBIAN_TEST_ANDROID_TAG: "20250130-vvless"
-   DEBIAN_TEST_GL_TAG: "20250130-vvless"
-   DEBIAN_TEST_VK_TAG: "20250130-vvless"
-   KERNEL_ROOTFS_TAG: "20250130-vvless"
+   DEBIAN_TEST_GL_TAG: "20250327-piglit-250"
+   DEBIAN_TEST_VK_TAG: "20250327-piglit-250"
+   KERNEL_ROOTFS_TAG: "20250327-trace-250"

   DEBIAN_PYUTILS_IMAGE: "debian/x86_64_pyutils"
   DEBIAN_PYUTILS_TAG: "20250129-lavacli"
--- a/.gitlab-ci/lava/lava-submit.sh
+++ b/.gitlab-ci/lava/lava-submit.sh
@@ -52,7 +52,7 @@ cp artifacts/ci-common/init-*.sh results/job-rootfs-overlay/
 cp "$SCRIPTS_DIR"/setup-test-env.sh results/job-rootfs-overlay/

 tar zcf job-rootfs-overlay.tar.gz -C results/job-rootfs-overlay/ .
-ci-fairy s3cp --token-file "${S3_JWT_FILE}" job-rootfs-overlay.tar.gz "https://${JOB_ROOTFS_OVERLAY_PATH}"
+s3_upload job-rootfs-overlay.tar.gz "https://${JOB_ARTIFACTS_BASE}"

 # Prepare env vars for upload.
 section_switch variables "Environment variables passed through to device:"
--- a/.gitlab-ci/lava/utils/lava_job_definition.py
+++ b/.gitlab-ci/lava/utils/lava_job_definition.py
@@ -162,6 +162,16 @@ class LAVAJobDefinition:
                        "minutes": 5
                        * NUMBER_OF_ATTEMPTS_LAVA_BOOT,
                    },
+                    "uboot-action": {
+                        # For rockchip DUTs, the U-Boot auto-login action downloads the kernel and
+                        # sets up early networking. This takes 72 seconds on average.
+                        # The LAVA action that wraps it is `uboot-commands`, but we can't set a
+                        # timeout for it directly, it is overridden by one third of `uboot-action`
+                        # timeout.
+                        # So actually, this timeout is here to force the `uboot-commands`
+                        # timeout to be 100 seconds (300 sec / 3), which is more than enough.
+                        "minutes": 5
+                    },
                },
            },
        }
--- a/.gitlab-ci/lava/utils/ssh_job_definition.py
+++ b/.gitlab-ci/lava/utils/ssh_job_definition.py
@@ -68,7 +68,7 @@ EOF
 ping -c 5 -w 60 $(lava-target-ip)

 lava_ssh_test_case() {
-    set -x
+    set -ex
    local test_case="${1}"
    shift
    lava-test-case \"${test_case}\" --shell \\
@@ -170,7 +170,7 @@ def generate_docker_test(
        # maintainers with monitoring
        f"lava_ssh_test_case '{args.project_name}_{args.mesa_job_name}' "
        # Changing directory to /, as the HWCI_SCRIPT expects that
-        "'\"cd / && /init-stage2.sh\"'",
+        "'cd / && /init-stage2.sh'",
    ]

    return init_stages_test
--- a/.gitlab-ci/piglit/piglit-traces.sh
+++ b/.gitlab-ci/piglit/piglit-traces.sh
@@ -13,7 +13,6 @@ set -ex
 export PAGER=cat  # FIXME: export everywhere

 INSTALL=$(realpath -s "$PWD"/install)
-S3_ARGS="--token-file ${S3_JWT_FILE}"

 export PIGLIT_REPLAY_DESCRIPTION_FILE="$INSTALL/$PIGLIT_TRACES_FILE"

@@ -120,7 +119,7 @@ replay_s3_upload_images() {
            fi
            __S3_PATH="$PIGLIT_REPLAY_REFERENCE_IMAGES_BASE"
            __DESTINATION_FILE_PATH="${line##*-}"
-            if curl -L -s -I "https://${__S3_PATH}/${__DESTINATION_FILE_PATH}" | grep -q "content-type: application/octet-stream" 2>/dev/null; then
+            if curl --fail -L -s -I "https://${__S3_PATH}/${__DESTINATION_FILE_PATH}" | grep -q "content-type: application/octet-stream" 2>/dev/null; then
                continue
            fi
        else
@@ -128,8 +127,7 @@ replay_s3_upload_images() {
            __DESTINATION_FILE_PATH="$__S3_TRACES_PREFIX/${line##*-}"
        fi

-        ci-fairy s3cp $S3_ARGS "$RESULTS_DIR/$__PREFIX/$line" \
-            "https://${__S3_PATH}/${__DESTINATION_FILE_PATH}"
+        s3_upload "$RESULTS_DIR/$__PREFIX/$line" "https://${__S3_PATH}/${__DESTINATION_FILE_PATH%/*}"
    done
 }

@@ -169,7 +167,9 @@ rm -rf replayer-db
 if [ -n "$PIGLIT_REPLAY_ANGLE_TAG" ]; then
  ARCH="amd64"
  FILE="angle-bin-${ARCH}-${PIGLIT_REPLAY_ANGLE_TAG}.tar.zst"
-  ci-fairy s3cp $S3_ARGS "https://s3.freedesktop.org/mesa-tracie-private/${FILE}" "${FILE}"
+  curl --location --fail --retry-all-errors --retry 4 --retry-delay 60 \
+    --header "Authorization: Bearer $(cat "${S3_JWT_FILE}")" \
+    "https://s3.freedesktop.org/mesa-tracie-private/${FILE}" --output "${FILE}"
  mkdir -p replayer-db/angle
  tar --zstd -xf ${FILE} -C replayer-db/angle/
 fi
--- a/.gitlab-ci/prepare-artifacts-python.sh
+++ b/.gitlab-ci/prepare-artifacts-python.sh
@@ -53,7 +53,7 @@ if [ -n "$S3_ARTIFACT_NAME" ]; then
    # Pass needed files to the test stage
    S3_ARTIFACT_TAR="$S3_ARTIFACT_NAME.tar.zst"
    tar cv artifacts/ | zstd -o "${S3_ARTIFACT_TAR}"
-    ci-fairy s3cp --token-file "${S3_JWT_FILE}" "${S3_ARTIFACT_TAR}" "https://${PIPELINE_ARTIFACTS_BASE}/${S3_ARTIFACT_TAR}"
+    s3_upload "${S3_ARTIFACT_TAR}" "https://${PIPELINE_ARTIFACTS_BASE}/"
    rm "${S3_ARTIFACT_TAR}"
 fi

--- a/.gitlab-ci/prepare-artifacts.sh
+++ b/.gitlab-ci/prepare-artifacts.sh
@@ -84,7 +84,7 @@ if [ -n "$S3_ARTIFACT_NAME" ]; then
    # Pass needed files to the test stage
    S3_ARTIFACT_NAME="$S3_ARTIFACT_NAME.tar.zst"
    zstd --quiet --threads ${FDO_CI_CONCURRENT:-0} artifacts/install.tar -o ${S3_ARTIFACT_NAME}
-    ci-fairy s3cp --token-file "${S3_JWT_FILE}" ${S3_ARTIFACT_NAME} https://${PIPELINE_ARTIFACTS_BASE}/${S3_ARTIFACT_NAME}
+    s3_upload "${S3_ARTIFACT_NAME}" "https://${PIPELINE_ARTIFACTS_BASE}/"
 fi

 section_end prepare-artifacts
--- a/.gitlab-ci/setup-test-env.sh
+++ b/.gitlab-ci/setup-test-env.sh
@@ -140,5 +140,21 @@ function trap_err {
 export -f error
 export -f trap_err

+s3_upload() {
+    x_off
+    local file=$1 s3_folder_url=$2
+    if [ ! -f "$file" ] || [[ "$s3_folder_url" != https://* ]]
+    then
+      echo "s3_upload used incorrectly: first argument is the file, second argument is the s3 folder url"
+      exit 1
+    fi
+    curl --fail --retry-all-errors --retry 4 --retry-delay 60 \
+      --header "Authorization: Bearer $(cat "${S3_JWT_FILE}")" \
+      -X PUT --form file=@"$file" \
+      "$s3_folder_url"
+    x_restore
+}
+export -f s3_upload
+
 set -E
 trap 'trap_err $?' ERR
--- a/.gitlab-ci/test/gitlab-ci.yml
+++ b/.gitlab-ci/test/gitlab-ci.yml
@@ -64,7 +64,8 @@ yaml-toml-shell-py-test:
    - !reference [.disable-farm-mr-rules, rules]
    - !reference [.never-post-merge-rules, rules]
    - !reference [.no_scheduled_pipelines-rules, rules]
-    - if: $GITLAB_USER_LOGIN == "marge-bot"
+    # merge pipeline
+    - if: $GITLAB_USER_LOGIN == "marge-bot" && $CI_PIPELINE_SOURCE == "merge_request_event"
      changes: &lint_files
        - .gitlab-ci/test/gitlab-ci.yml
        - .gitlab-ci/**/*.sh
@@ -74,6 +75,14 @@ yaml-toml-shell-py-test:
        - .gitlab-ci/tests/**/*
        - bin/ci/**/*
      when: on_success
+    # direct pushes that bypassed the CI
+    - if: $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH
+      changes: *lint_files
+      when: on_success
+    # direct pushes from release manager
+    - if: $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_REF_NAME =~ /^staging\//
+      changes: *lint_files
+      when: on_success
    - changes: *lint_files
      when: manual
  tags:
@@ -135,6 +144,8 @@ yaml-toml-shell-py-test:
  artifacts:
    paths:
      - results/
+  tags:
+    - kvm

 .b2c-vkd3d-proton-test:
  variables:
@@ -143,7 +154,7 @@ yaml-toml-shell-py-test:
 .piglit-traces-test:
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    reports:
      junit: results/junit.xml
    paths:
@@ -177,7 +188,7 @@ yaml-toml-shell-py-test:
    - ./install/fossilize-runner.sh
  artifacts:
    when: on_failure
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results/

@@ -205,7 +216,7 @@ yaml-toml-shell-py-test:
    BM_ROOTFS: /rootfs-${DEBIAN_ARCH}
  artifacts:
    when: always
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results/
      - serial*.txt
@@ -399,7 +410,7 @@ yaml-toml-shell-py-test:

  artifacts:
    when: always
-    name: "{CI_PROJECT_NAME}_${CI_JOB_NAME}"
+    name: "${CI_PROJECT_NAME}_${CI_JOB_NAME}"
    paths:
      - results
    reports:
@@ -425,6 +436,8 @@ yaml-toml-shell-py-test:
  extends:
    - .use-debian/x86_64_test-vk
    - .b2c-x86_64-test
+  variables:
+    S3_ARTIFACT_NAME: "debian-build-testing"
  needs:
    - debian/x86_64_test-vk
    - debian-build-testing
@@ -443,6 +456,8 @@ yaml-toml-shell-py-test:
  extends:
    - .use-debian/x86_64_test-gl
    - .b2c-x86_64-test
+  variables:
+    S3_ARTIFACT_NAME: "debian-build-testing"
  needs:
    - debian/x86_64_test-gl
    - debian-build-testing
--- a/.gitlab-ci/tests/data/FASTBOOT_force_uart=False_job_definition.yaml
+++ b/.gitlab-ci/tests/data/FASTBOOT_force_uart=False_job_definition.yaml
@@ -16,6 +16,8 @@ timeouts:
      minutes: 1
    depthcharge-action:
      minutes: 15
+    uboot-action:
+      minutes: 5
 actions:
 - deploy:
    timeout:
@@ -43,7 +45,8 @@ actions:
        steps:
        - cat Image.gz my_dtb_filename.dtb > Image.gz+dtb
        - mkbootimg --kernel Image.gz+dtb --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard,v3
-          ip=dhcp init=/init rootwait usbcore.quirks=0bda:8153:k" --pagesize 4096 --base 0x80000000 -o boot.img
+          ip=dhcp init=/init rootwait usbcore.quirks=0bda:8153:k" --pagesize 4096
+          --base 0x80000000 -o boot.img
    namespace: dut
 - deploy:
    timeout:
@@ -118,7 +121,7 @@ actions:
            ping -c 5 -w 60 $(lava-target-ip)

            lava_ssh_test_case() {
-                set -x
+                set -ex
                local test_case="${1}"
                shift
                lava-test-case "${test_case}" --shell \
@@ -137,6 +140,6 @@ actions:
            sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh
            EOF
          - export SSH_PTY_ARGS=-tt
-          - lava_ssh_test_case 'test-project_dut' '"cd / && /init-stage2.sh"'
+          - lava_ssh_test_case 'test-project_dut' 'cd / && /init-stage2.sh'
    docker:
      image:
--- a/.gitlab-ci/tests/data/FASTBOOT_force_uart=True_job_definition.yaml
+++ b/.gitlab-ci/tests/data/FASTBOOT_force_uart=True_job_definition.yaml
@@ -16,6 +16,8 @@ timeouts:
      minutes: 1
    depthcharge-action:
      minutes: 15
+    uboot-action:
+      minutes: 5
 actions:
 - deploy:
    timeout:
@@ -42,7 +44,8 @@ actions:
        steps:
        - cat Image.gz my_dtb_filename.dtb > Image.gz+dtb
        - mkbootimg --kernel Image.gz+dtb --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard,v3
-          ip=dhcp init=/init rootwait usbcore.quirks=0bda:8153:k" --pagesize 4096 --base 0x80000000 -o boot.img
+          ip=dhcp init=/init rootwait usbcore.quirks=0bda:8153:k" --pagesize 4096
+          --base 0x80000000 -o boot.img
 - deploy:
    timeout:
      minutes: 2
--- a/.gitlab-ci/tests/data/UBOOT_force_uart=False_job_definition.yaml
+++ b/.gitlab-ci/tests/data/UBOOT_force_uart=False_job_definition.yaml
@@ -16,6 +16,8 @@ timeouts:
      minutes: 1
    depthcharge-action:
      minutes: 15
+    uboot-action:
+      minutes: 5
 actions:
 - deploy:
    timeout:
@@ -90,7 +92,7 @@ actions:
            ping -c 5 -w 60 $(lava-target-ip)

            lava_ssh_test_case() {
-                set -x
+                set -ex
                local test_case="${1}"
                shift
                lava-test-case "${test_case}" --shell \
@@ -109,6 +111,6 @@ actions:
            sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh
            EOF
          - export SSH_PTY_ARGS=-tt
-          - lava_ssh_test_case 'test-project_dut' '"cd / && /init-stage2.sh"'
+          - lava_ssh_test_case 'test-project_dut' 'cd / && /init-stage2.sh'
    docker:
      image:
--- a/.gitlab-ci/tests/data/UBOOT_force_uart=True_job_definition.yaml
+++ b/.gitlab-ci/tests/data/UBOOT_force_uart=True_job_definition.yaml
@@ -16,6 +16,8 @@ timeouts:
      minutes: 1
    depthcharge-action:
      minutes: 15
+    uboot-action:
+      minutes: 5
 actions:
 - deploy:
    timeout:
--- a/.gitlab-ci/tests/utils/test_lava_job_definition.py
+++ b/.gitlab-ci/tests/utils/test_lava_job_definition.py
@@ -211,7 +211,7 @@ def test_lava_job_definition(
    job_dict = yaml.load(job_definition)

    # Uncomment the following to update the expected YAML files
-    # yaml.dump(job_dict, Path(f"../../data/{mode}_force_uart={force_uart}_job_definition.yaml"))
+    # yaml.dump(job_dict, load_data_file(f"{mode}_force_uart={force_uart}_job_definition.yaml"))

    # Check that the generated job definition matches the expected one
    assert job_dict == expected_job_dict
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/2
+++ b/2
@@ -1 +1 @@
-25.0.0-devel
+25.0.7
--- a/bin/pick/core.py
+++ b/bin/pick/core.py
@@ -29,6 +29,7 @@ import subprocess
 import typing

 import attr
+from packaging.version import Version

 if typing.TYPE_CHECKING:
    from .ui import UI
@@ -292,11 +293,13 @@ async def resolve_nomination(commit: 'Commit', version: str) -> 'Commit':
                commit.nominated = True
                return commit

-    if backport_to := IS_BACKPORT.search(out):
-        if version in backport_to.groups():
-            commit.nominated = True
-            commit.nomination_type = NominationType.BACKPORT
-            return commit
+    if backport_to := IS_BACKPORT.findall(out):
+        for match in backport_to:
+            if any(Version(version) >= Version(backport_version)
+                   for backport_version in match if backport_version != ''):
+                commit.nominated = True
+                commit.nomination_type = NominationType.BACKPORT
+                return commit

    if cc_to := IS_CC.search(out):
        if cc_to.groups() == (None, None) or version in cc_to.groups():
--- a/bin/pick/core_test.py
+++ b/bin/pick/core_test.py
@@ -252,9 +252,8 @@ class TestRE:
                Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
            """)

-            backport_to = core.IS_BACKPORT.search(message)
-            assert backport_to is not None
-            assert backport_to.groups() == ('19.2', None)
+            backport_to = core.IS_BACKPORT.findall(message)
+            assert backport_to == [('19.2', '')]

        def test_multiple_release_space(self):
            """Tests commit with more than one branch specified"""
@@ -268,9 +267,8 @@ class TestRE:
                Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
            """)

-            backport_to = core.IS_BACKPORT.search(message)
-            assert backport_to is not None
-            assert backport_to.groups() == ('19.1', '19.2')
+            backport_to = core.IS_BACKPORT.findall(message)
+            assert backport_to == [('19.1', '19.2')]

        def test_multiple_release_comma(self):
            """Tests commit with more than one branch specified"""
@@ -284,9 +282,20 @@ class TestRE:
                Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
            """)

-            backport_to = core.IS_BACKPORT.search(message)
-            assert backport_to is not None
-            assert backport_to.groups() == ('19.1', '19.2')
+            backport_to = core.IS_BACKPORT.findall(message)
+            assert backport_to == [('19.1', '19.2')]
+
+        def test_multiple_release_lines(self):
+            """Tests commit with more than one branch specified in multiple tags"""
+            message = textwrap.dedent("""\
+                commit title
+
+                Backport-to: 19.0
+                Backport-to: 19.1, 19.2
+            """)
+
+            backport_to = core.IS_BACKPORT.findall(message)
+            assert backport_to == [('19.0', ''), ('19.1', '19.2')]


 class TestResolveNomination:
@@ -386,6 +395,17 @@ class TestResolveNomination:
        assert c.nominated
        assert c.nomination_type is core.NominationType.BACKPORT

+    @pytest.mark.asyncio
+    async def test_backport_is_nominated_after(self):
+        s = self.FakeSubprocess(b'Backport-to: 16.2')
+        c = core.Commit('abcdef1234567890', 'a commit')
+
+        with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock):
+            await core.resolve_nomination(c, '16.3')
+
+        assert c.nominated
+        assert c.nomination_type is core.NominationType.BACKPORT
+
    @pytest.mark.asyncio
    async def test_backport_is_not_nominated(self):
        s = self.FakeSubprocess(b'Backport-to: 16.2')
--- a/bin/pick/requirements.txt
+++ b/bin/pick/requirements.txt
@@ -1,2 +1,3 @@
 attrs==23.1.0
+packaging==25.0
 urwid==2.1.2
--- a/docs/android.rst
+++ b/docs/android.rst
@@ -220,13 +220,11 @@ driver libraries into the source tree of Android and patch the binary names.
   mkdir prebuilts/mesa/x86_64
   mkdir prebuilts/mesa/x86
   cp ${INSTALL_PREFIX_64}/lib/libEGL.so prebuilts/mesa/x86_64/
-   cp ${INSTALL_PREFIX_64}/lib/libglapi.so prebuilts/mesa/x86_64/
   cp ${INSTALL_PREFIX_64}/lib/libgallium_dri.so prebuilts/mesa/x86_64/
   cp ${INSTALL_PREFIX_64}/lib/libGLESv1_CM.so  prebuilts/mesa/x86_64/
   cp ${INSTALL_PREFIX_64}/lib/libGLESv2.so  prebuilts/mesa/x86_64/
   cp ${INSTALL_PREFIX_64}/lib/libvulkan_lvp.so prebuilts/mesa/x86_64/
   cp ${INSTALL_PREFIX_32}/lib/libEGL.so prebuilts/mesa/x86
-   cp ${INSTALL_PREFIX_32}/lib/libglapi.so prebuilts/mesa/x86
   cp ${INSTALL_PREFIX_32}/lib/libgallium_dri.so prebuilts/mesa/x86/
   cp ${INSTALL_PREFIX_32}/lib/libGLESv1_CM.so  prebuilts/mesa/x86
   cp ${INSTALL_PREFIX_32}/lib/libGLESv2.so  prebuilts/mesa/x86
@@ -246,24 +244,6 @@ the libraries in the build.

 .. code-block::

-   cc_prebuilt_library_shared {
-       name: "libglapi",
-       arch: {
-           x86_64: {
-               srcs: ["x86_64/libglapi.so"],
-           },
-           x86: {
-               srcs: ["x86/libglapi.so"],
-           },
-       },
-       strip: {
-           none: true,
-       },
-       relative_install_path: "egl",
-       shared_libs: ["libc", "libdl", "liblog", "libm"],
-       vendor: true
-   }
-
   cc_prebuilt_library_shared {
       name: "libgallium_dri",
       arch: {
--- a/docs/drivers/panfrost.rst
+++ b/docs/drivers/panfrost.rst
@@ -9,7 +9,8 @@ and `Mali-G610 <https://www.khronos.org/conformance/adopters/conformant-products
 but **non-conformant** on other GPUs.

 PanVK, the Vulkan implementation in the Panfrost driver stack, is currently
-**non-conformant** on all GPUs.
+**conformant** on `Mali-G610 <https://www.khronos.org/conformance/adopters/conformant-products#submission_906>`__,
+but **non-conformant** on other GPUs.

 The following hardware is currently supported:

--- a/docs/features.txt
+++ b/docs/features.txt
@@ -490,7 +490,6 @@ Vulkan 1.3 -- all DONE: anv, lvp, nvk, radv, tu, vn, v3dv
  VK_KHR_maintenance4                                   DONE (anv, hasvk, lvp, nvk, radv, tu, v3dv, vn)
  VK_KHR_shader_integer_dot_product                     DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)
  VK_KHR_shader_non_semantic_info                       DONE (anv, hasvk, nvk, panvk, radv, tu, v3dv, vn)
-  VK_KHR_shader_relaxed_extended_instruction            DONE (anv, hasvk, nvk, panvk, radv, tu, v3dv)
  VK_KHR_shader_terminate_invocation                    DONE (anv, hasvk, lvp, nvk, radv, tu, v3dv, vn)
  VK_KHR_synchronization2                               DONE (anv, dzn, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
  VK_KHR_zero_initialize_workgroup_memory               DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
@@ -522,7 +521,7 @@ Vulkan 1.4 -- all DONE: anv, lvp, nvk, radv/gfx8+, tu/a7xx+
  VK_KHR_push_descriptor                                DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, vn)
  VK_KHR_shader_expect_assume                           DONE (anv, dzn, hasvk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
  VK_KHR_shader_float_controls2                         DONE (anv, lvp, nvk, radv, tu)
-  VK_KHR_shader_subgroup_rotate                         DONE (anv, lvp, nvk, radv, tu)
+  VK_KHR_shader_subgroup_rotate                         DONE (anv, lvp, nvk, panvk, radv, tu)
  VK_KHR_vertex_attribute_divisor                       DONE (anv, lvp, nvk, panvk, radv, tu, v3dv)
  VK_EXT_host_image_copy                                DONE (anv, lvp, nvk/Turing+, tu)
  VK_EXT_pipeline_protected_access                      DONE (anv/gfx12+)
@@ -561,6 +560,7 @@ Khronos extensions that are not part of any Vulkan version:
  VK_KHR_ray_tracing_position_fetch                     DONE (anv, radv/gfx10.3+)
  VK_KHR_shader_clock                                   DONE (anv, hasvk, lvp, nvk, radv, vn)
  VK_KHR_shader_maximal_reconvergence                   DONE (anv, lvp, nvk, radv)
+  VK_KHR_shader_relaxed_extended_instruction            DONE (anv, hasvk, nvk, panvk, radv, tu, v3dv)
  VK_KHR_shader_subgroup_uniform_control_flow           DONE (anv, hasvk, nvk, radv, tu)
  VK_KHR_shader_quad_control                            DONE (anv, nvk, radv)
  VK_KHR_shared_presentable_image                       not started
@@ -597,7 +597,7 @@ Khronos extensions that are not part of any Vulkan version:
  VK_EXT_device_fault                                   DONE (radv)
  VK_EXT_device_generated_commands                      DONE (nvk/Turing+, radv/gfx8+)
  VK_EXT_device_memory_report                           DONE (vn)
-  VK_EXT_direct_mode_display                            DONE (anv, lvp, nvk, radv, tu, v3dv)
+  VK_EXT_direct_mode_display                            DONE (anv, lvp, nvk, panvk, radv, tu, v3dv)
  VK_EXT_discard_rectangles                             DONE (radv)
  VK_EXT_display_control                                DONE (anv, hasvk, nvk, radv, tu)
  VK_EXT_display_surface_counter                        DONE (anv, lvp, nvk, radv, tu)
--- a/docs/relnotes.rst
+++ b/docs/relnotes.rst
@@ -3,6 +3,14 @@ Release Notes

 The release notes summarize what's new or changed in each Mesa release.

+-  :doc:`25.0.7 release notes <relnotes/25.0.7>`
+-  :doc:`25.0.6 release notes <relnotes/25.0.6>`
+-  :doc:`25.0.5 release notes <relnotes/25.0.5>`
+-  :doc:`25.0.4 release notes <relnotes/25.0.4>`
+-  :doc:`25.0.3 release notes <relnotes/25.0.3>`
+-  :doc:`25.0.2 release notes <relnotes/25.0.2>`
+-  :doc:`25.0.1 release notes <relnotes/25.0.1>`
+-  :doc:`25.0.0 release notes <relnotes/25.0.0>`
 -  :doc:`24.3.4 release notes <relnotes/24.3.4>`
 -  :doc:`24.3.3 release notes <relnotes/24.3.3>`
 -  :doc:`24.3.2 release notes <relnotes/24.3.2>`
@@ -442,6 +450,14 @@ The release notes summarize what's new or changed in each Mesa release.
   :maxdepth: 1
   :hidden:

+   25.0.7 <relnotes/25.0.7>
+   25.0.6 <relnotes/25.0.6>
+   25.0.5 <relnotes/25.0.5>
+   25.0.4 <relnotes/25.0.4>
+   25.0.3 <relnotes/25.0.3>
+   25.0.2 <relnotes/25.0.2>
+   25.0.1 <relnotes/25.0.1>
+   25.0.0 <relnotes/25.0.0>
   24.3.4 <relnotes/24.3.4>
   24.3.3 <relnotes/24.3.3>
   24.3.2 <relnotes/24.3.2>
--- a/docs/relnotes/25.0.0.rst
+++ b/docs/relnotes/25.0.0.rst
--- a/docs/relnotes/25.0.1.rst
+++ b/docs/relnotes/25.0.1.rst
@@ -0,0 +1,251 @@
+Mesa 25.0.1 Release Notes / 2025-03-05
+======================================
+
+Mesa 25.0.1 is a bug fix release which fixes bugs found since the 25.0.0 release.
+
+Mesa 25.0.1 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.1 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: 49eb55ba5acccae91deb566573a6a73144a0f39014be1982d78c21c5b6b0bb3f  mesa-25.0.1.tar.xz
+    SHA512: 1ecb1b90c5f78de4c61f177888543778285731faccc6f78d266d4b437f7b422a78b705a6e9fc6c9eab62c08f2573db5dd725eaa9cc9e5bedcaa7d8cfe6b47a1f  mesa-25.0.1.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- Zink: Kopper's present thread causes Wayland protocol races
+- GLmatrix needs aligned malloc
+- Lavapipe crashes if no Position is output in mesh shader
+- [RADV/aco][regression][bisected] - Avowed (2457220) - GPU hangs near Watermill outside of Dawnshore
+- radv/sqtt: assertion "layout transition marker should be only emitted inside a barrier marker"
+- [radv] Glitchy ground geometry regression in Total War Warhammer III on RX 7600
+
+
+Changes
+-------
+
+Benjamin Lee (4):
+
+- panfrost: remove NIR_PASS_V usage for noperspective lowering
+- panfrost: fix large int32->float16 conversions
+- panfrost: fix condition in bi_nir_is_replicated
+- panfrost/va: remove swizzle mod from LDEXP
+
+Caio Oliveira (1):
+
+- brw: Fix size in assembler when compacting
+
+Daniel Schürmann (5):
+
+- aco/scheduler: always respect min_waves on GFX10+
+- aco/insert_exec_mask: Don't immediately set exec to zero in break/continue blocks
+- aco/insert_exec_mask: don't restore exec in continue_or_break blocks
+- aco/ssa_elimination: insert parallelcopies for p_phi immediately before branch
+- aco/assembler: Fix short jumps over chained branches
+
+Dave Airlie (1):
+
+- vulkan/wsi/x11: don't use update_region for damage if not created
+
+David Rosca (2):
+
+- frontends/va: Set AV1 max_width/height to surface size
+- radeonsi/vcn: Set all pic params for H264 encode references
+
+Dylan Baker (2):
+
+- iris: Correctly set NOS for geometry shader state changes
+- iris: fix handling of GL_*_VERTEX_CONVENTION
+
+Emmanuel Gil Peyrot (1):
+
+- panvk: Initialize out array with the correct length
+
+Eric Engestrom (8):
+
+- docs: add sha sum for 25.0.0
+- .pick_status.json: Update to b331713f20148852370a4fae5c2830d46801eb3b
+- .pick_status.json: Update to 55c476efed01121b3a64a58c304aae8ef9a79475
+- .pick_status.json: Mark b85c94fc891fe9d73b3a032aea8a6a71b8e6173b as denominated
+- .pick_status.json: Update to 4348253db5232b7be4db0a0ff47b31d51bc8f534
+- .pick_status.json: Update to fbc55afbdfc93a82c69f1cd6a1f4abbed96cfd19
+- .pick_status.json: Mark 5461ed5808421a8ffb79bdaa1449265f3e8f40a5 as denominated
+- .pick_status.json: Update to 45e771f4fbe4245b252c6360e55776080f0bf458
+
+Erik Faye-Lund (1):
+
+- mesa/main: wire up glapi bits for EXT_multi_draw_indirect
+
+Faith Ekstrand (12):
+
+- nak: Only use suld.constant on Ampere+
+- zink: Use the correct array size for signal_values[]
+- zink: Use persistent semaphores for PIPE_FD_TYPE_SYNCOBJ
+- nvk: Don't bind a fragment shading rate image pre-Turing
+- nvk: Do not set INVALIDATE_SKED_CACHES pre-MaxwellB
+- nak/qmd: Add a nak_get_qmd_cbuf_desc_layout() helper
+- nvk: Handle pre-Turing dispatch indirect commands
+- nvk: Only support deviceGeneratedCommandsMultiDrawIndirectCount on Turing+
+- nvk: Only support compute shader derivatives on Turing+
+- zink: Don't present to Wayland surfaces asynchronously
+- egl/dri2: Rework get_wl_surface_proxy()
+- egl/wayland: Pass the original wl_surface to kopper
+
+Georg Lehmann (1):
+
+- aco/insert_exec: fix continue_or_break on gfx6-7
+
+Gert Wollny (1):
+
+- r600/sfn: gather info and set lowering 64 bit after nir_lower_io
+
+Guilherme Gallo (3):
+
+- ci/lava: Drop the repeating quotes on lava-test-case
+- ci/lava: Propagate errors in SSH tests
+- ci/lava: Add U-Boot action timeout for rockchip DUTs
+
+Hans-Kristian Arntzen (1):
+
+- radv: Always set 0 dispatch offset for indirect CS.
+
+Hyunjun Ko (1):
+
+- anv: Do not support the tiling of DRM modifier if DECODE_DST
+
+Iago Toral Quiroga (1):
+
+- pan/va: fix FAU validation
+
+James Hogan (5):
+
+- mesa: Consider NumViews to reuse FBO attachments
+- mesa: Handle GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR
+- mesa: Check views don't exceed GL_MAX_ARRAY_TEXTURE_LAYERS
+- mesa: OVR_multiview framebuffer attachment parameters
+- mesa: Handle getting GL_MAX_VIEWS_OVR
+
+Job Noorman (1):
+
+- ir3/ra: prevent reusing parent interval of reloaded sources
+
+Juan A. Suarez Romero (2):
+
+- v3dv: duplicate key for texel_buffer cache
+- broadcom/simulator: use string copy instead of memcpy
+
+Karol Herbst (3):
+
+- rusticl/mem: set num_samples and num_mip_levels to 0 when importing from GL
+- rusticl/platform: advertise all extensions supported by all devices
+- intel/brw, lp: enable lower_pack_64_4x16
+
+Kevin Chuang (2):
+
+- anv/bvh: Fix encoder handling sparse buffer
+- anv/bvh: Fix copy shader handling sparse buffer
+
+Konstantin Seurer (1):
+
+- llvmpipe: Skip draw_mesh if the ms did not write gl_Position
+
+Lars-Ivar Hesselberg Simonsen (2):
+
+- panfrost: Use RUN_COMPUTE over RUN_COMPUTE_INDIRECT
+- panvk: Use RUN_COMPUTE over RUN_COMPUTE_INDIRECT
+
+Lionel Landwerlin (2):
+
+- anv: fix missing 3DSTATE_PS:Kernel0MaximumPolysperThread programming
+- vulkan/runtime: ensure robustness state is fully initialized
+
+Lorenzo Rossi (1):
+
+- nvk: Fix MSAA sparse residency lowering crash
+
+Marek Olšák (1):
+
+- mesa: allocate GLmatrix aligned to 16 bytes
+
+Mary Guillemard (1):
+
+- pan/bi: Disallow FAU special page 3 and WARP_ID on message instructions
+
+Mike Blumenkrantz (6):
+
+- zink: wait on tc fence before checking for fd semaphore
+- zink: always fully unwrap contexts
+- zink: clamp UBO sizes instead of asserting
+- llvmpipe: pass layer count to rast clear
+- gallium: fix pipe_framebuffer_state::view_mask
+- mesa: avoid creating incomplete surfaces when multiview goes out of range
+
+Natalie Vock (1):
+
+- radv/rt: Don't allocate the traversal shader in a capture/replay range
+
+Patrick Lerda (3):
+
+- r600: fix evergreen_emit_vertex_buffers() related cl regression
+- r600: fix the indirect draw 8-bits path
+- r600: fix emit_image_size() range base compatibility
+
+Paulo Zanoni (1):
+
+- brw: extend the NOP+WHILE workaround
+
+Peyton Lee (1):
+
+- radeonsi/vpe: check reduction ratio
+
+Pierre-Eric Pelloux-Prayer (2):
+
+- tc: add missing TC_SENTINEL for TC_END_BATCH
+- mesa/st: call _mesa_glthread_finish before _mesa_make_current
+
+Rhys Perry (1):
+
+- ac/nir: fix tess factor optimization when workgroup barriers are reduced
+
+Roland Scheidegger (1):
+
+- llvmpipe: Fix alpha-to-coverage without dithering
+
+Samuel Pitoiset (3):
+
+- radv/video: fix adding the query pool BO to the cmdbuf list
+- radv: fix missing SQTT barriers for fbfetch color/depth decompressions
+- radv: fix re-emitting fragment output state when resetting gfx pipeline state
+
+Tapani Pälli (2):
+
+- iris: wait for imported fences to be available in iris_fence_await
+- iris: remove dead code that cannot get hit anymore
+
+Yiwei Zhang (2):
+
+- venus: fix image format cache miss with AHB usage query
+- venus: relax the requirement for sync2
+
+Yogesh Mohan Marimuthu (1):
+
+- winsys/amdgpu: same_queue variable should be set if there is only one queue
--- a/docs/relnotes/25.0.2.rst
+++ b/docs/relnotes/25.0.2.rst
@@ -0,0 +1,234 @@
+Mesa 25.0.2 Release Notes / 2025-03-20
+======================================
+
+Mesa 25.0.2 is a bug fix release which fixes bugs found since the 25.0.1 release.
+
+Mesa 25.0.2 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.2 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: adf904d083b308df95898600ffed435f4b5c600d95fb6ec6d4c45638627fdc97  mesa-25.0.2.tar.xz
+    SHA512: 2de8e8b514619d9ad5f407f5e1ff04fff8039d66b5f32257c2e8ca3d9f3b190269066aeba0779d6e0b2a2c0739237382fc6a98ea8563ed97801a809c96163386  mesa-25.0.2.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- RADV: logic used to avoid running on CDNA is faulty
+- [LNL/BMG] Assassin's Creed Valhalla trace replay hang
+- X11 + Zink on NVK flickers older frames in Firefox based browsers
+- Vulkan conformanceVersion is reported as 0.0.0.0 in Mesa 25.0.0
+- VRAM Abnormal use on mesa 25.0
+- [radv][regression] Multiple games detect the wrong amount of vram
+- Resident Evil 2 Remake flickers
+- OpConstantNull not supported for OpTypeCooperativeMatrixKHR
+- v3dv: vkcube-wayland crashes on raspberry pi 5 kernel 6.12 and latest mesa
+- GMSH Visualization Fails with radeonsi:can't compile a main shader part,  Fedora 41 AMD 7900xt
+- AMD VDPAU deinterlacing SIGSEGV
+- radv: vkd3d-proton test_primitive_restart_list_topology_stream_output randomly fails on NAVI2X
+- Mesa 24.1 introduced a Vulkan problem with DOOM 2016 on AMD 780M GPU
+- nouveau & zink+nvk: Flashing in Firefox and Thunderbird on Hyprland
+
+
+Changes
+-------
+
+Aaron Ruby (1):
+
+- gfxstream: Downgrade log severity when enabling params in LinuxVirtGpu
+
+Alyssa Rosenzweig (2):
+
+- pan/mdg: call nir_lower_is_helper_invocation
+- nir/lower_helper_writes: fix stores after discard
+
+Ashley Smith (1):
+
+- panfrost: Reset syncobj after use to avoid kernel warnings
+
+Bas Nieuwenhuizen (1):
+
+- radv: Move support check out of winsys.
+
+Dave Airlie (1):
+
+- radv/video: don't try and send events on UVD devices.
+
+David Rosca (4):
+
+- gallium/vl: Fix video buffer supported format check
+- Revert "frontends/vdpau: Alloc interlaced surface for interlaced pics"
+- frontends/vdpau: Fix creating deinterlace filter for interleaved buffers
+- gallium/vl: Return YUV plane order for single plane formats
+
+Eric Engestrom (6):
+
+- docs: add sha sum for 25.0.1
+- .pick_status.json: Mark 61b0955308d720a6fa065e7a414d16999f7ffd03 as denominated
+- .pick_status.json: Mark 534436f8635e63a30e4d7af4837dad35cfa361ad as denominated
+- .pick_status.json: Update to 61feea6954a7526836ccbd30c657e6afc11fb4f5
+- .pick_status.json: Mark 551770ccf8bdb1e5fa45ddac854535edf2b31a22 as denominated
+- meson: announce that clover is deprecated (slated for removal)
+
+Erik Faye-Lund (2):
+
+- docs/features: add missing panvk feature
+- panvk: correct VkPhysicalDeviceProperties::deviceName
+
+Faith Ekstrand (9):
+
+- util/box: Add a intersect_2d helper
+- zink: Use pipe_box helpers for damage calculations
+- zink: Set needs_barrier after transitioning to QUEUE_FAMILY_FOREIGN
+- zink: Check queue families when binding image resources
+- nvk: Allow rendering to linear images with unaligned strides
+- nil: Relax alignment requirements for linear images
+- vtn: Support cooperative matrices in OpConstantNull
+- egl/x11: Re-order an if statement
+- egl/kopper: Update the EGLSurface size after kopperSwapBuffers()
+
+Ganesh Belgur Ramachandra (1):
+
+- amd: use 128B compression for scanout images when drm.minor <63
+
+Georg Lehmann (3):
+
+- radv: enable invariant geom for DOOM(2016)
+- aco/gfx11.5: remove vinterp ddx/ddy path
+- aco/ra: disallow vcc definitions for pseudo scalar trans instrs
+
+Ivan A. Melnikov (1):
+
+- gallium/radeon: Make sure radeonsi PCI IDs are also included
+
+Job Noorman (2):
+
+- ir3: fix false dependencies of rpt instructions
+- ir3: keep inputs at start block when creating empty preamble
+
+John Anthony (1):
+
+- panvk: Avoid division by zero for vkCmdCopyQueryPoolResults
+
+José Roberto de Souza (1):
+
+- intel/common: Retry GEM_CONTEXT_CREATE when PXP have not finished initialization
+
+Karol Herbst (6):
+
+- rusticl/program: implement CL_INVALID_PROGRAM_EXECUTABLE check in clGetProgramInfo
+- rusticl/program: pass options by reference
+- rusticl/program: loop over all devices inside Program::build
+- rusticl/program: rework build_nirs so it only touches devices we care about
+- rusticl/program: fix building kernels
+- nir/serialize: fix decoding of is_return and is_uniform
+
+Lionel Landwerlin (3):
+
+- anv: fix non page aligned descriptor bindings on <Gfx12.0
+- brw: fix spilling for Xe2+
+- brw: ensure VUE header writes in HS/DS/GS stages
+
+Lucas Stach (2):
+
+- etnaviv: rs: fix slow/fast clear transitions
+- etnaviv: fix ETNA_MESA_DEBUG=no_early_z
+
+Marek Olšák (1):
+
+- Revert "ac/nir: clamp vertex color outputs in the right place"
+
+Mary Guillemard (2):
+
+- pan/bi: Fix out of range access in bi_instr_replicates
+- pan/bi: Ensure we select b0 with halfswizzle in va_lower_constants
+
+Matt Turner (1):
+
+- glsl: Add missing break
+
+Maíra Canal (1):
+
+- v3dv: don't overwrite the primary fd if it's already set
+
+Mel Henning (1):
+
+- nvk: Don't zero imported memory
+
+Mike Blumenkrantz (1):
+
+- zink: fix refcounting of zink_surface objects
+
+Natalie Vock (2):
+
+- radv/rt: Guard leaf encoding by leaf node count
+- radv/rt: Flush L2 after writing internal node offset on GFX12
+
+Patrick Lerda (2):
+
+- r600: fix cayman main non-deterministic behavior problem
+- r600: update the software fp64 support
+
+Pierre-Eric Pelloux-Prayer (1):
+
+- st/mesa: fix nir_load_per_vertex_input parameter
+
+Rebecca Mckeever (1):
+
+- panvk: Add STORAGE_IMAGE_BIT feature for formats supporting sampled images
+
+Rhys Perry (1):
+
+- aco: insert dependency waits in certain situations
+
+Rob Clark (2):
+
+- tc: Add missing tc_set_driver_thread()
+- freedreno: Wait for imported syncobj fences to be available
+
+Samuel Pitoiset (6):
+
+- ac,radv: add a workaround for a hw bug with primitive restart on GFX10-GFX10.3
+- radv: fix a GPU hang with inherited rendering and HiZ/HiS on GFX1201
+- radv/amdgpu: fix device deduplication
+- radv: update conformance version
+- aco: do not apply OMOD/CLAMP for pseudo scalar trans instrs
+- radv: emit a dummy PS state for noop FS on GFX12
+
+Seán de Búrca (1):
+
+- rusticl/mem: don't create svm_pointers slice from null raw pointer
+
+Sviatoslav Peleshko (2):
+
+- anv: Add full subgroups workaround for the shaders that use shared memory
+- drirc: Apply assume_full_subgroups_with_shared_memory to Resident Evil 2
+
+Timothy Arceri (1):
+
+- util/u_idalloc: fix util_idalloc_sparse_alloc_range()
+
+Yiwei Zhang (4):
+
+- venus: fix a memory corruption in query records recycle
+- lavapipe: set availability bit for accel struct host queries
+- lavapipe: fix accel struct device query copy
+- venus: fix to ignore dstSet for push descriptor
--- a/docs/relnotes/25.0.3.rst
+++ b/docs/relnotes/25.0.3.rst
@@ -0,0 +1,231 @@
+Mesa 25.0.3 Release Notes / 2025-04-02
+======================================
+
+Mesa 25.0.3 is a bug fix release which fixes bugs found since the 25.0.2 release.
+
+Mesa 25.0.3 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.3 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: 5ff426ed6ce0588fd96d18975bdff451ae2ab2fe98b5d1528842ee71ec66711b  mesa-25.0.3.tar.xz
+    SHA512: a8ddfa3ac31869e82a49d14aaab0659d0496ae77db3f32aa0d5d28de8e1e4cace9fa652451a050fbc79281e8461cd70e86ad464aa387533387187fbcb604aaab  mesa-25.0.3.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- [RADV][RDNA3][Phoenix3][APU] NARAKA: BLADEPOINT (1203220) gpu hang reproducible (ice/water regression mesa 24.1 bisected SAMPLE_MASK_TRACKER_WATERMARK=15) random (maybe other apps/games)
+- GPU hangs running Octopath Traveler II with 780M
+- GPU crash on Radeon 780M with Tales of Arise
+- brw: Hit unreachable nir_op_fsign case that brw_nir_lower_fsign missed
+- The Last of Us - shadows flickering on gfx1201 without nohiz flag
+- anv: Dark pattern overlayed on objects in Eve Online DX11 mode on BMG
+- Mesa 25 removes VA-API encoding for R9 390
+- Video stuttering / anv: extend implicit fencing support
+- anv, bmg: Visual issues in AC Origins, Odyssey and Fenyx Rising when dxvk doesn't export PointSize
+- [ANV][LNL] - A Game About Digging A Hole (3244220) - Title throws an assertion failure on launch.
+- anv/video: Timestamps are exposed in video encode queue, but it crashes
+- Getting a crash with manually built llvmpipe (OpenGL)
+- [RadeonSI] Blender assetshelf icons are broken in mesa >= 25.0.0
+- radeonsi regression after 24.3.4
+- misc OpenGL CTS failures
+- glBindVertexBuffer regression due to ID reuse
+
+
+Changes
+-------
+
+Caio Oliveira (1):
+
+- brw: Fix decoding of 3-src destination stride in EU validation
+
+Connor Abbott (3):
+
+- tu: Fix GMEM offset for multisample layered separate stencil
+- tu: Fix size of frag_size_ir3 and frag_offset_ir3 driver params
+- tu: Fix reported FDM fragment size with multiview
+
+Daniel Schürmann (1):
+
+- aco: don't assume that demote doesn't cause an empty exec mask
+
+Daniel Stone (1):
+
+- ci: Re-enable trace jobs with updated Piglit
+
+Dave Airlie (2):
+
+- gallivm: check for avx512vbmi and tell LLVM the correct answer.
+- nak: add reads after setting writes
+
+David Rosca (5):
+
+- radeonsi/vce: Support old VCE firmware
+- gallium/vl: Fix rotation with scaling for compute shaders
+- gallium/vl: Fix mirror with rotation for compute shaders
+- frontends/va: Don't ignore rotation and mirror for conversions to RGB
+- radv: Add radv_format_description to remap 10/12bit formats to 16bit
+
+Eric Engestrom (11):
+
+- docs: add sha sum for 25.0.2
+- .pick_status.json: Update to 85983e060ccca163ff5c4aad51c7082b7ae8c4a0
+- ci/piglit: drop usage of s3cp for a simple download
+- ci: always abort if the curl download fails
+- ci: replace broken s3cp command with a simple curl call
+- ci: run shader-db & zink-lvp on kvm runners
+- pick-ui: fix parsing of multiple \`backport-to:` lines
+- .pick_status.json: Update to e3433489f81a75c278ff70cc5700cd028447bf76
+- [25.0 only] update ci expectations
+- .pick_status.json: Update to b60d816d6ee35cc1bfa2d2f6aed59104a09ec11d
+- .pick_status.json: Update to 0d2ebca39fd2a68bfb64dc2196e442e25dc90334
+
+Eric R. Smith (1):
+
+- panfrost: consider xfb shader when calculating thread local storage size
+
+Erik Faye-Lund (3):
+
+- panfrost: avoid accidental aliasing
+- panvk: check for texture-compression support
+- mesa/main: fix regression in extension-checking
+
+Faith Ekstrand (10):
+
+- nak: Insert the annotation in the right spot in assign_regs
+- nak: Always copy sources when handling vec/pack/mov ops
+- nak: Fix a SM check for OpPCnt
+- nvk: Free owned_gart_mem correctly
+- nvk: Fix a Volta check
+- nouveau/mme/fermi: Don't allow STATE and EMIT on the same op
+- nvk: Use the right sample mask for 8x/4pass on Maxwell A
+- vulkan/wsi: Signal buffer memory object when blitting
+- nvk: Use max_image_dimension for maxFramebufferWidth/Height
+- nvk: Disable 32k images on Pascal A
+
+Hyunjun Ko (1):
+
+- vulkan/video: Do byte-alignment when building a h264 slice header
+
+Ian Romanick (1):
+
+- brw/nir: Lower fsign again after last call to brw_nir_optimize
+
+Job Noorman (1):
+
+- ir3/legalize: take wrmask into account for delay updates
+
+Jordan Justen (2):
+
+- intel/dev: Add BMG PCI IDs (0xe210, 0xe215, 0xe216)
+- intel/dev: Add BMG 0xe211 PCI ID
+
+Lionel Landwerlin (4):
+
+- anv: fix end of pipe timestamp query writes
+- anv: disable replication when we don't have both VS/FS stages
+- brw: always write the VUE header
+- anv: limit implicit write with drirc
+
+Lucas Stach (1):
+
+- kmsro: look for graphics capable screen as renderonly device
+
+Natalie Vock (2):
+
+- radv/rt: Flush CP writes from the common BVH framework with INV_L2 on GFX12
+- vulkan/bvh: Move first PLOC task_count fetch inside PHASE
+
+Paulo Zanoni (1):
+
+- drirc/anv: DiggingGame.exe needs force_vk_vendor=-1
+
+Pierre-Eric Pelloux-Prayer (2):
+
+- ac/nir: fix nir_metadata value of ac_nir_lower_image_opcodes
+- radeonsi: use composed swizzle in cdna_emu_make_image_descriptor
+
+Rebecca Mckeever (1):
+
+- panvk: Remove lower_tg4_broadcom_swizzle from panvk_preprocess_nir()
+
+Rhys Perry (1):
+
+- aco/ra: fix free register counting when moving variables
+
+Robert Mader (3):
+
+- llvmpipe: Take offset into account when importing dmabufs
+- llvmpipe: Free dummy_dmabuf on shutdown
+- gallivm: Re-add check for passmgr before disposing it
+
+Samuel Pitoiset (8):
+
+- radv: fix creating pipeline binary from the traversal shader
+- radv: fix bpe for the stencil aspect of depth/stencil copies on transfer queue
+- radv: fix compressed depth/stencil copies on transfer queue
+- radv/meta: fix color<->depth/stencil image copies
+- radv: do not trigger FCE or FMASK decompress on compute queue
+- ac/surface: fix selecting preferred alignments for HiZ/HiS on GFX12
+- Revert "radv: program SAMPLE_MASK_TRACKER_WATERMARK optimally for GFX11 APUs"
+- Revert "radeonsi/gfx11: program SAMPLE_MASK_TRACKER_WATERMARK optimally for APUs"
+
+Taras Pisetskyi (1):
+
+- anv,driconf: Add sampler coordinate precision workaround for EVE Online
+
+Timothy Arceri (9):
+
+- mesa: fix reuse of deleted buffer object
+- mesa: fix reuse of deleted texture object
+- mesa: fix potential race condition with TexObjects
+- mesa: fix reuse of deleted sampler object
+- mesa: fix potential race conditions with FrameBuffers
+- mesa: fix potential race condition with RenderBuffers
+- mesa: fix potential race condition with ATIShaders
+- mesa: fix potential race condition with Programs
+- nir: fix uniform cloning helper
+
+Tomeu Vizoso (2):
+
+- egl/surfaceless: Only choose drivers that expose the graphics capability
+- kopper: Explicitly choose zink
+
+Trigger Huang (1):
+
+- radeonsi: Fix perfcounter start event in si_pc_emit_start
+
+Valentine Burley (1):
+
+- ci: Add missing kvm runner tags
+
+Yiwei Zhang (6):
+
+- docs: demote VK_KHR_shader_relaxed_extended_instruction
+- venus: fix unexpected ring alive status expire upon owner thread switch
+- venus: fix ahb usage caching
+- venus: fix maint4 multi-planar memory requirements
+- panvk/csf: rework cache flush reduction
+- panvk: fix memory requirement query for aliased disjoint image
+
+irql-notlessorequal (1):
+
+- hasvk: Fix non-functioning version override.
--- a/docs/relnotes/25.0.4.rst
+++ b/docs/relnotes/25.0.4.rst
@@ -0,0 +1,256 @@
+Mesa 25.0.4 Release Notes / 2025-04-17
+======================================
+
+Mesa 25.0.4 is a bug fix release which fixes bugs found since the 25.0.3 release.
+
+Mesa 25.0.4 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.4 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: 76293cf4372ca4e4e73fd6c36c567b917b608a4db9d11bd2e33068199a7df04d  mesa-25.0.4.tar.xz
+    SHA512: 562a97bd0374ff2a76f71c848df4fe542f1fc66c420a9101eb4bb1947d00eee4417d9c6f2d1be19638663753785c19384f8a6dc078c3187448ab79413d906152  mesa-25.0.4.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- RADV: Performance regression in Elden Ring on GFX8/Polaris
+- RADV: Performance regression in Elden Ring on GFX8/Polaris
+- Confidential issue #12324
+- Confidential issue #12946
+- The Last of Us Part I GPU hang on gfx1201
+- brw: new Xe2 CTS failures
+- [NVK] NAK assert in The Last of Us Part 2 shader
+- [ANV][LNL] - Lost Records: Bloom & Rage (1902960) - Title hangs on launch and subsequently crashes to desktop.
+- [BMG] Intel b580 battlemage: Fort Solis (Unreal Engine game) boots to menu, hangs while loading after hitting continue from the main menu
+- [ANV][LNL] -  NINJA GAIDEN 2 Black (3287520) - Environment assets are incorrectly rendered or missing.
+- [ANV][LNL] - The Headliners (3059070) - Title hangs a few minutes after launch.
+- anv, regression: Invisibly blinking cliffs & rocks in Satisfactory DX12 on BMG
+- vk/overlay: output_file option failing
+- [bisected, LNL] brw: 341e5117ecbc ("brw/nir: Treat load_const as convergent") regresses arb_gpu_shader5-interpolateAtOffset on LNL
+- vulkan regression mesa 24.3.4 to 25.0.0.rc3 with broadcom
+- radv: nir_opt_varyings.c:2766: deduplicate_outputs: Assertion \`list_index == 0' failed.
+- vulkan/wsi: memory leak from wsi_CreateSwapchainKHR
+
+
+Changes
+-------
+
+Aaron Ruby (2):
+
+- gfxstream: Make the virtgpu device discovery for LinuxVirtGpu more robust
+- gfxstream: Add common interfaces in the VirtGpuDevice to query DrmInfo and PciBusInfo
+
+Alyssa Rosenzweig (4):
+
+- nir/lower_blend: refactor logicop variables
+- nir/lower_blend: disable logic ops for unsupported formats
+- panfrost: invert and rename no_ubo_to_push flag
+- panfrost: do not push "true" UBOs
+
+Benjamin Lee (2):
+
+- panvk/csf: fix uninitialized read in utrace_clone_init_builder
+- panfrost/pps: fix omitting several counters
+
+Benjamin Otte (1):
+
+- lavapipe: Don't advertise support for multiplane drm formats
+
+Boris Brezillon (2):
+
+- vulkan/state: Fix input attachment map state initialization/copy
+- vk/pass: Add input attachment location info
+
+Caio Oliveira (1):
+
+- nir/load_store_vectorize: Skip new bit-sizes that are unaligned with high_offset
+
+Caterina Shablia (2):
+
+- panfrost: don't overwrite push uniforms and sysvals UBO with user's UBO
+- panfrost: update nr_uniform_buffers before dispatching XFB
+
+Connor Abbott (1):
+
+- tu: Fix layer_count with dynamic rendering + multiview
+
+David Rosca (4):
+
+- radeonsi/vcn: Disable AV1 unidir compound with rate control
+- radv/video: Fix msg header total size
+- radv/video: Fix encode session info for VCN3+
+- radeonsi/vpe: Use float division to get scaling ratio
+
+Eric Engestrom (7):
+
+- docs: add sha sum for 25.0.3
+- [25.0 only] update more ci expectations
+- .pick_status.json: Update to 7c5389695bdf106acaab6ccc69535f25c1d7a8e6
+- ci: rename ci-tron priority tag to avoid conflict with the generic fdo runners
+- .pick_status.json: Update to 2f00daf67a7990da68dfc4a8e5f2019daecb7a59
+- .pick_status.json: Update to 58321cf2e57279079bf742be1063ac2900ea2436
+- .pick_status.json: Update to 555821ff93118d4a6ea441127cd0427a95743d47
+
+Eric R. Smith (2):
+
+- panfrost,lima: use index size in panfrost minmax_cache
+- panfrost: fix transaction elimination crc valid calculation
+
+Erik Faye-Lund (4):
+
+- panfrost: fixup typo in 16x sample-pattern
+- nir/lower_tex: use texture_mask instead of shifting on use
+- panvk: set shared_addr_format
+- panvk: claim official conformance on v10
+
+Faith Ekstrand (3):
+
+- nak: Allow predicates in nir_intrinsic_as_uniform
+- nvk/nvkmd: Check the correct flag for the Kepler GART workaround
+- nil: Multiply by array_stride_B instead of adding
+
+Felix DeGrood (1):
+
+- vk/overlay-layer: fix regression in non-control pathway
+
+Georg Lehmann (2):
+
+- spirv: clamp/sign-extend non 32bit ldexp exponents
+- spirv: fix cooperative matrix by value function params
+
+Gurchetan Singh (3):
+
+- gfxstream: check device exists before using it
+- gfxstream: refactor device initialization
+- gfxstream: follow the semantics desired by distro VK loader
+
+Ian Romanick (4):
+
+- brw/algebraic: Constant folding for BROADCAST and SHUFFLE
+- brw/nir: Fix source handling of nir_intrinsic_load_barycentric_at_offset
+- brw/algebraic: Optimize derivative of convergent value
+- brw/nir: Use offset() for all uses of offs in emit_pixel_interpolater_alu_at_offset
+
+Jan Alexander Steffens (heftig) (1):
+
+- gfxstream: Use proper log format for 32-bit Vulkan
+
+Job Noorman (1):
+
+- ir3/ra: assign interval offsets to new defs after shared RA
+
+Jose Maria Casanova Crespo (1):
+
+- v3dv: avoid TFU reading unmapped pages beyond the end of the buffers
+
+Juan A. Suarez Romero (1):
+
+- v3dv: don't check if DRM device is master
+
+Kenneth Graunke (4):
+
+- brw: Track the largest VGRF size in liveness analysis
+- brw: Use live->max_vgrf_size in register coalescing
+- brw: Use live->max_vgrf_size in pre-RA scheduling
+- brw: Don't assert about MAX_VGRF_SIZE in brw_opt_split_virtual_grfs()
+
+Lars-Ivar Hesselberg Simonsen (2):
+
+- panvk: Add barrier for interleaved ZS copy cmds
+- vk/sync: Fix execution only barriers
+
+Lionel Landwerlin (3):
+
+- brw: fix shuffle with scalar/uniform index
+- anv: fix self dependency computation
+- brw: fix Wa_22013689345 emission
+
+Marek Olšák (5):
+
+- radeonsi: work around a primitive restart bug on gfx10-10.3
+- radeonsi: make si_shader_selector::main_shader_part_* an iterable union
+- radeonsi: add ACO-specific main shader parts
+- ac/surface: make gfx12_estimate_size reusable by gfx6
+- ac/surface: select 3D tile mode without overallocating too much for gfx6-8
+
+Mike Blumenkrantz (4):
+
+- gallium/util: check nr_samples in pipe_surface_equal()
+- tu: check for valid descriptor set when binding descriptors
+- zink: don't set shared block stride without KHR_workgroup_memory_explicit_layout
+- zink: stop setting ArrayStride on image arrays
+
+Natalie Vock (1):
+
+- aco: Make private_segment_buffer/scratch_offset per-resume
+
+Patrick Lerda (9):
+
+- r600: move stores to the end of shader when required
+- r600: fix textures with swizzles limited to zero and one
+- r600: fallback to util_blitter_draw_rectangle when required
+- r600: fix pa_su_vtx_cntl rounding mode
+- r600: fix points clipping
+- i915: fix i915_set_vertex_buffers() related refcnt imbalance and remove redundancies
+- i915: fix slab_create() related memory leaks
+- i915: fix nir_to_tgsi() related memory leak
+- i915: fix draw_create_fragment_shader() related memory leak
+
+Pierre-Eric Pelloux-Prayer (1):
+
+- winsys/amdgpu: disable VM_ALWAYS_VALID
+
+Rob Clark (1):
+
+- tu/vdrm: Fix userspace fence cmds
+
+Ryan Mckeever (1):
+
+- pan/format: Update format flags to follow HW spec
+
+Samuel Pitoiset (4):
+
+- radv: fix ignoring conditional rendering with vkCmdResolveImage()
+- radv: determine if HiZ/HiS is enabled earlier on GFX12
+- radv: add a workaround for buggy HiZ/HiS on GFX12
+- radv: apply the workaround for buggy HiZ/HiS on GFX12 for DGC
+
+Sviatoslav Peleshko (1):
+
+- vulkan/wsi/headless: Remove unnecessary wsi_configure_image()
+
+Tapani Pälli (3):
+
+- compiler/glsl: check that bias is not used outside fragment stage
+- mesa: clamp texbuf query size to MAX_TEXTURE_BUFFER_SIZE
+- mesa: various fixes for ClearTexImage/ClearTexSubImage
+
+Timothy Arceri (1):
+
+- glsl: fix regression in ubo cloning
+
+Timur Kristóf (4):
+
+- nir/xfb: Preserve some xfb information when gathering from intrinsics.
+- nir/opt_varyings: Fix assertion when deduplicating TCS outputs.
+- radv: Use buffers_written mask when gathering XFB info.
+- radv: Call nir_opt_undef too after nir_opt_varyings.
--- a/docs/relnotes/25.0.5.rst
+++ b/docs/relnotes/25.0.5.rst
@@ -0,0 +1,185 @@
+Mesa 25.0.5 Release Notes / 2025-04-30
+======================================
+
+Mesa 25.0.5 is a bug fix release which fixes bugs found since the 25.0.4 release.
+
+Mesa 25.0.5 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.5 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: c0d245dea0aa4b49f74b3d474b16542e4a8799791cd33d676c69f650ad4378d0  mesa-25.0.5.tar.xz
+    SHA512: d65e027829e3bef60bc0e3e71160e6b3721e797e2157c71dbeef0cd6e202f8f8098b3cd41159cd0e96e520eaf92ea49c2c9bb1af1a54867b6a7c551c2197c068  mesa-25.0.5.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- WWE 2k23 small "artifacts"
+- Variable Rate Shading (VRS) produces very aliased results on RADV with an AMD gpu
+- Vulkan issues after sleeping on 9070 XT
+- ring gfx_0.0.0 timeout after waking from sleep - RX 9070
+- radeonsi: CL conformance test \`vector_swizzle` fails since 177427877bb50ad7ba24abfa13e55a2684d804df
+- Random mesa crashes in kwin_wayland on a 6600XT
+- Patch to fix clinfo on rusticl
+- radv/aco: Ghost of Tsushima hangs and causes gpu resets on RDNA 3 GPU
+- mesa-vulkan-driver-git.x86_64 causes strange colored rectangle artifacts in Final Fantasy XIV
+
+
+Changes
+-------
+
+Connor Abbott (1):
+
+- tu: Fix flushing when using a staging buffer for copies
+
+Danylo Piliaiev (1):
+
+- tu,freedreno: Don't fallback to LINEAR with DRM_FORMAT_MOD_QCOM_COMPRESSED
+
+David Rosca (1):
+
+- radv: Use radv_format_to_pipe_format instead of vk_format_to_pipe_format
+
+Dmitry Baryshkov (1):
+
+- meson: disable SIMD blake optimisations on x32 host
+
+Ella Stanforth (1):
+
+- v3d/compiler: Fixup output types for all 8 outputs
+
+Eric Engestrom (8):
+
+- docs: add sha sum for 25.0.4
+- .pick_status.json: Update to 5f3a3740dcc6d243f2ef14138fb1c09bcbb9b5fd
+- pick-ui: make \`Backport-to: 25.0` backport to 25.0 \*and more recent release branches*
+- aco: help clang 20 do some additions and subtractions
+- .pick_status.json: Update to 091d52965f805d61dd3a8e091ac20869a794e632
+- pick-ui: add missing dependency
+- .pick_status.json: Update to 3493500abb78a4dc22aba14840bba5c777fde745
+- .pick_status.json: Update to 5a55133ce7d5bb2419f2aa99c5296037afb7ba6a
+
+Faith Ekstrand (2):
+
+- nak/legalize: Take a RegFile in copy_alu_src_and_lower_fmod
+- nak/sm70: Fix the bit74_75_ar_mod assert
+
+Georg Lehmann (2):
+
+- nir/opt_algebraic: disable fsat(a + 1.0) opt if a can be NaN
+- aco: set opsel_hi to 1 for WMMA
+
+Ian Romanick (4):
+
+- brw/algebraic: Clear condition modifier on optimized SEL instruction
+- brw/algebraic: Don't optimize float SEL.CMOD to MOV
+- elk/algebraic: Clear condition modifier on optimized SEL instruction
+- elk/algebraic: Don't optimize float SEL.CMOD to MOV
+
+Janne Grunau (2):
+
+- venus: Do not use instance pointer before NULL check
+- venus: virtgpu: Require stable wire format
+
+John Anthony (1):
+
+- panvk: Enable VK_EXT_direct_mode_display
+
+José Roberto de Souza (3):
+
+- intel: Program XY_FAST_COLOR_BLT::Destination Mocs for gfx12
+- intel: Fix the MOCS values in XY_FAST_COLOR_BLT for Xe2+
+- intel: Fix the MOCS values in XY_BLOCK_COPY_BLT for Xe2+
+
+Karol Herbst (2):
+
+- rusticl/device: fix panic when disabling 3D image write support
+- nir_lower_mem_access_bit_sizes: fix negative chunk offsets
+
+Lionel Landwerlin (1):
+
+- anv: use companion batch for operations with HIZ/STC_CCS destination
+
+Loïc Minier (1):
+
+- freedreno: check if GPU supported in fd_pipe_new2
+
+Marek Olšák (1):
+
+- radv: fix incorrect patch_outputs_read for TCS with dynamic state
+
+Mary Guillemard (3):
+
+- panvk: reset dyn_bufs map count to 0 in create_copy_table
+- panvk: Take rasterization sample into account in indirect draw on v10+
+- panvk: Take resource index in valhall_lower_get_ssbo_size
+
+Mel Henning (3):
+
+- nvk: SET_STATISTICS_COUNTER at start of meta_begin
+- nvk: Override render enable for blits and resolves
+- wsi/headless: Override finish_create
+
+Mike Blumenkrantz (1):
+
+- zink: verify that surface exists when adding implicit feedback loop
+
+Olivia Lee (1):
+
+- panfrost: allow promoting sysval UBO to push constants
+
+Patrick Lerda (1):
+
+- mesa_interface: fix legacy dri2 compatibility
+
+Pierre-Eric Pelloux-Prayer (1):
+
+- radeonsi: fix potential use after free in si_set_debug_callback
+
+Rhys Perry (3):
+
+- aco/gfx12: don't use second VALU for VOPD's OPX if there is a WaR
+- aco: combine VALU lanemask hazard into VALUMaskWriteHazard
+- aco/gfx11: create waitcnt for workgroup vmem barriers
+
+Samuel Pitoiset (3):
+
+- radv: only enable DCC for invisible VRAM on GFX12
+- radv: fix re-emitting VRS state when rendering begins
+- radv: set radv_disable_dcc=true for WWE 2k23
+
+Tapani Pälli (2):
+
+- iris: force reallocate on eglCreateImage with GFX >= 20
+- iris: make sure to not mix compressed vs non-compressed
+
+Tomeu Vizoso (1):
+
+- etnaviv: Release screen->dummy_desc_reloc.bo
+
+Yinjie Yao (2):
+
+- gallium/pipe: Increase hevc max slice to 600
+- frontends/va: Handle properly when decoding more slices than limit
+
+Yiwei Zhang (1):
+
+- venus: fix missing renderer destructions
--- a/docs/relnotes/25.0.6.rst
+++ b/docs/relnotes/25.0.6.rst
@@ -0,0 +1,182 @@
+Mesa 25.0.6 Release Notes / 2025-05-14
+======================================
+
+Mesa 25.0.6 is a bug fix release which fixes bugs found since the 25.0.5 release.
+
+Mesa 25.0.6 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.6 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: 0d179e019e3441f5d957330d7abb3b0ef38e6782cc85a382608cd1a4a77fa2e1  mesa-25.0.6.tar.xz
+    SHA512: 6a0abc8a5bbbb8ffdad7286fc5642f643b1f4183794425ba689c2c9f5c73a4131c8685074241deb1022631b4c1f1c505dbd848190ec60d5d6931e90dd9316e05  mesa-25.0.6.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- In SkinDeep, GL_LINES causes GL_INVALID_OPERATION with radeonsi and llvmpipe
+- radv: UB and artifacts when copying a \`COMBINED_IMAGE_SAMPLER` with an immutable sampler
+- RADV: Dynamic state multiple viewport corruption
+- [drm:amdgpu_uvd_cs_pass2 [amdgpu]] \*ERROR* )Handle 0x48780001 already in use!
+- glGetInternalformativ returns incorrect information for GL_STENCIL_INDEX8
+- RadeonSI: Psychonauts rendering regression since !29895
+- [r600g] Rejected CS when using dolphin's GPU texture decoder
+- radeonsi: Assertion \`src_bit_size == bit_size' failed. when running without MESA_GLSL_DISABLE_IO_OPT=1
+- radeonsi vdpau + Packed YUY2 = assert
+- Indiana Jones and The Great Circle, Graphical corruption on 9070 XT.
+- glPushAttrib/glPopAttrib broken with glColorMaterial and lighting
+- radv: Flickering in Kingdom Come: Deliverance II
+- RADV regression causes severe glitches in Hunt Showdown 1896 on Polaris
+- Z-Fighting in Tomb Raider IV - VI Remastered Linux
+- RADV:RX 9070:Mesa-25.0.5 GTA 5 Enhanced GPU HANG
+- [anv] VK_ERROR_DEVICE_LOST on Linux 6.13.8 while playing Dota 2 on Intel Graphics
+
+
+Changes
+-------
+
+Connor Abbott (4):
+
+- freedreno: Add compute_lb_size device info
+- freedreno/a6xx: Define CONSTANTRAMMODE
+- freedreno/a6xx, turnip: Set CONSTANTRAMMODE correctly
+- ir3: Take LB restriction on constlen into account on a7xx
+
+David Rosca (3):
+
+- frontends/vdpau: Fix creating surfaces with 422 chroma
+- ac/uvd: Add ac_uvd_alloc_stream_handle
+- radv/video: Use ac_uvd_alloc_stream_handle
+
+Eric Engestrom (4):
+
+- docs: add sha sum for 25.0.5
+- .pick_status.json: Update to e7a7d9ea2e2e48171fad131a7bfa7576e02ea4e0
+- .pick_status.json: Mark eeffb4e674d10db9aefebeca91c2d87c1676b81e as denominated
+- .pick_status.json: Mark 4b76d04f7f3348838239f184e68141df6409b67a as denominated
+
+Faith Ekstrand (1):
+
+- nak: Set lower_pack_64_4x16
+
+Gurchetan Singh (1):
+
+- gfxstream: make sure by default descriptor is negative
+
+José Roberto de Souza (1):
+
+- intel/tools: Fix batch buffer decoder
+
+Karmjit Mahil (1):
+
+- tu: Fix segfault in fail_submit KGSL path
+
+Karol Herbst (4):
+
+- r600: fix r600_buffer_from_user_memory for rusticl
+- iris: parse global bindings for every gen
+- iris/xe: fix compute shader start address
+- iris/xe: take the grids variable_shared_mem into account
+
+Konstantin Seurer (1):
+
+- radv: Return VK_ERROR_INCOMPATIBLE_DRIVER for unsupported devices
+
+Lars-Ivar Hesselberg Simonsen (4):
+
+- pan/texture: Correctly handle slice stride for MSAA
+- pan/texture: Set plane size to slice size
+- pan/genxml/v10: Add minus1 mod for plane width/height
+- pan/texture/v10+: Set width/height in the plane descs
+
+Lionel Landwerlin (3):
+
+- anv: force fragment shader execution when occlusion queries are active
+- intel: fix null render target setup logic
+- vulkan/runtime: fixup assert with link_geom_stages
+
+Marek Olšák (2):
+
+- nir/opt_vectorize_io: fix a failure when vectorizing different bit sizes
+- nir: fix gathering color interp modes in nir_lower_color_inputs
+
+Matthieu Oechslin (1):
+
+- r600: Take dual source blending in account when creating target mask with RATs
+
+Mel Henning (3):
+
+- nak: Remove hfma2 src 1 modifiers
+- nak: Add Src::is_unmodified() helper
+- nak: Check that swizzles are none
+
+Mike Blumenkrantz (2):
+
+- egl: fix sw fallback rejection in non-sw EGL_PLATFORM=device
+- zink: fix broken comparison for dummy pipe surface sizing
+
+Natalie Vock (2):
+
+- radv,driconf: Add radv_force_64k_sparse_alignment config
+- driconf: Add workarounds for DOOM: The Dark Ages
+
+Paul Gofman (1):
+
+- radv/amdgpu: Fix hash key in radv_amdgpu_winsys_destroy().
+
+Rhys Perry (3):
+
+- aco: swap the correct v_mov_b32 if there are two of them
+- ac/llvm: correctly split vector 8/16-bit stores
+- ac/llvm: correctly set alignment of vector global load/store
+
+Robert Mader (1):
+
+- llvmpipe: Fix dmabuf import paths for DRM_FORMAT_YUYV variants
+
+Sagar Ghuge (2):
+
+- intel/compiler: Fix stackIDs on Xe2+
+- anv: Fix untyped data port cache pipe control dump output
+
+Samuel Pitoiset (7):
+
+- radv: do not clear unwritten color attachments with dual-source blending
+- radv: disable SINGLE clear codes to workaround a hw bug with DCC on GFX11
+- radv: fix GPU hangs with image copies for ASTC/ETC2 formats on transfer queue
+- radv: ignore radv_disable_dcc_stores on GFX12
+- radv: fix SDMA copies for linear 96-bits formats
+- radv: fix emitting dynamic viewports/scissors when the count is static
+- radv: remove the optimization for equal immutable samplers
+
+Tapani Pälli (1):
+
+- mesa: add missing stencil formats to _mesa_is_stencil_format
+
+Thomas H.P. Andersen (1):
+
+- driconf: update X4 Foundations executable name
+
+Timothy Arceri (3):
+
+- util/driconf: add force_gl_depth_component_type_int workaround
+- mesa: fix color material tracking
+- mesa: relax EXT_texture_integer validation
--- a/docs/relnotes/25.0.7.rst
+++ b/docs/relnotes/25.0.7.rst
@@ -0,0 +1,199 @@
+Mesa 25.0.7 Release Notes / 2025-05-28
+======================================
+
+Mesa 25.0.7 is a bug fix release which fixes bugs found since the 25.0.6 release.
+
+Mesa 25.0.7 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.0.7 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: 592272df3cf01e85e7db300c449df5061092574d099da275d19e97ef0510f8a6  mesa-25.0.7.tar.xz
+    SHA512: 825bbd8bc5507de147488519786c0200afacf97dae621c80ead24b2c5dd55c5a442757ac8452698ae611e9344025465080795cf8f2dc4eb7ce07b5cc521b2b5c  mesa-25.0.7.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- RX9070 hard crash with Mafia Definitive Edition
+- RADV: Potential bug with vulkan fragment shader interpolation (on outputs from mesh shaders?)
+- In the game "Foundation" a buildings areas of effect is missing
+- ANV: Dota 2 May 22 2025 update crashing in vkCmdBindDescriptorSets with no validation error
+- [RADV][GFX9] Recent Mesa-git broken on AMD Vega 64 with ring sdma0 timeouts when launching DXVK games
+- Vulkan Video engages during playback of format which is not supported by my Fiji GPU
+- ACO: IR Validation error "SDWA operand selection size" triggered by compute shader on VEGA20
+- RADV: Gibberish output with llama.cpp (Vulkan compute) on Radeon VII (Vega20) with Mesa 25.1.0, works on 25.0.5
+- Blending broken in game SkinDeep
+- Radeon R5 (Mullins) H264 VA-API encoding acceleration doesn't work
+- nvk: lib_rs_gen.py requires \`rustfmt`
+- radv: vkd3d-proton test failure with predication + EXT_dgc
+- mesa-25.0.4 aborts Xserver due to ACO "Unsupported opcode" v_lshlrev_b16
+
+
+Changes
+-------
+
+Adam Jackson (2):
+
+- vtn: (Silently) handle FunctionParameterAttributeNo{Capture,Write}
+- vtn/opencl: Handle OpenCLstd_F{Min,Max}_common
+
+Calder Young (2):
+
+- iris: Fix accidental writes to global dirty bit instead of local
+- iris: set dependency between SF_CL and CC states
+
+Christian Gmeiner (1):
+
+- zink: Fix NIR validation error in cubemap-to-array lowering
+
+Dave Airlie (1):
+
+- nvk: Fix compute class comparison in dispatch indirect
+
+David Rosca (4):
+
+- radeonsi/vce: Fix bitstream buffer size
+- radeonsi/vce: Only send one task per IB
+- radeonsi/vce: Fix output quality and performance in speed preset
+- radv/video: Limit 10bit H265 decode support to stoney and newer
+
+Ella Stanforth (1):
+
+- v3d/compiler: Fix ub when using memcmp for texture comparisons.
+
+Eric Engestrom (3):
+
+- docs: add sha sum for 25.0.6
+- .pick_status.json: Mark 29d7b90cfcb67ecc2ff3e422dd7b38898abb1bbe as denominated
+- .pick_status.json: Update to 8965e60118fa17407c5bfcdca1fe2854ad2fb150
+
+Erik Faye-Lund (1):
+
+- mesa/main: remove non-existing function prototype
+
+Faith Ekstrand (2):
+
+- nvk: Allocate the correct VAB size on Kepler
+- nouveau/mme: Don't install the HW tests
+
+Georg Lehmann (2):
+
+- radeonsi: always lower alu bit sizes
+- aco: assume sram ecc is enabled on Vega20
+
+Gurchetan Singh (1):
+
+- gfxstream: get rid of logspam in virtualized case
+
+Hans-Kristian Arntzen (1):
+
+- radv: Consider that DGC might need shader reads of predicated data.
+
+José Roberto de Souza (2):
+
+- anv: Implement missing part of Wa_1604061319
+- anv: Enable preemption due 3DPRIMITIVE in GFX 12
+
+Karol Herbst (2):
+
+- nir: fix use-after-free on function parameter names
+- vtn: fix use-after-free on function parameter names
+
+Lars-Ivar Hesselberg Simonsen (2):
+
+- panvk/v9+: Set up limited texture descs for storage use
+- panvk/v9+: Set up limited texture descs for storage use
+
+LingMan (1):
+
+- etnaviv/isa: Silence warnings about non snake case names
+
+Lionel Landwerlin (4):
+
+- anv: enable preemption setting on command/batch correctly
+- anv/brw: stop turning load_push_constants into load_uniform
+- hasvk/elk: stop turning load_push_constants into load_uniform
+- anv: don't use pipeline layout at descriptor bind
+
+Marek Olšák (2):
+
+- winsys/amdgpu: fix running out of 32bit address space with high FPS
+- glsl: fix sampler and image type checking in lower_precision
+
+Matt Turner (1):
+
+- gallivm: Use \`llvm.roundeven` in lp_build_round()
+
+Mel Henning (2):
+
+- nouveau/headers: Run rustfmt after file is closed
+- nouveau/headers: Ignore PermissionError in rustfmt
+
+Mike Blumenkrantz (2):
+
+- llvmpipe: disable conditional rendering mem for blits
+- lavapipe: handle counterOffset in vkCmdDrawIndirectByteCountEXT
+
+Natalie Vock (1):
+
+- driconf: Fix DOOM: The Dark Ages workaround name in 25.0.x
+
+Olivia Lee (1):
+
+- util/u_printf: fix memory leak in u_printf_singleton_add_serialized
+
+Patrick Lerda (1):
+
+- r600: fix pop-free clipping
+
+Paulo Zanoni (1):
+
+- anv/trtt: don't avoid the TR-TT submission when there is stuff to signal
+
+Qiang Yu (1):
+
+- nir/opt_varyings: fix mesh shader miss promote varying to flat
+
+Rhys Perry (1):
+
+- aco/gfx115: consider point sample acceleration
+
+Rob Clark (1):
+
+- ci: Disable fd-farm
+
+Samuel Pitoiset (5):
+
+- radv: fix fetching conditional rendering state for DGC preprocess
+- radv: fix conditional rendering with DGC and non native 32-bit predicate
+- radv: fix missing texel scale for unaligned linear SDMA copies
+- radv: fix capture/replay with sparse images and descriptor buffer
+- radv: add radv_disable_hiz_his_gfx12 and enable for Mafia Definitive Edition
+
+Timothy Arceri (7):
+
+- st/mesa: fix _IntegerBuffers bitfield use
+- mesa/st: fix _BlendForceAlphaToOneDraw bitfield use
+- mesa/st: fix _IsRGBDraw bitfield use
+- mesa: fix _FP32Buffers bitfield use
+- mesa: update validation when draw buffer changes
+- mesa: extend linear_as_nearest work around
+- util: add workaround for the game Foundation
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@@ -1,40 +0,0 @@
-cl_khr_depth_images in rusticl
-Vulkan 1.4 on radv/gfx8+
-VK_KHR_dedicated_allocation on panvk
-VK_KHR_global_priority on panvk
-VK_KHR_index_type_uint8 on panvk
-VK_KHR_map_memory2 on panvk
-VK_KHR_multiview on panvk/v10+
-VK_KHR_shader_non_semantic_info on panvk
-VK_KHR_shader_relaxed_extended_instruction on panvk
-VK_KHR_vertex_attribute_divisor on panvk
-VK_KHR_zero_initialize_workgroup_memory on panvk
-VK_KHR_shader_draw_parameters on panvk
-VK_KHR_shader_float16_int8 on panvk
-VK_KHR_8bit_storage on panvk
-VK_EXT_4444_formats on panvk
-VK_EXT_global_priority on panvk
-VK_EXT_global_priority_query on panvk
-VK_EXT_host_query_reset on panvk
-VK_EXT_image_robustness on panvk
-VK_EXT_pipeline_robustness on panvk
-VK_EXT_provoking_vertex on panvk
-VK_EXT_queue_family_foreign on panvk
-VK_EXT_sampler_filter_minmax on panvk
-VK_EXT_scalar_block_layout on panvk
-VK_EXT_tooling_info on panvk
-depthClamp on panvk
-depthBiasClamp on panvk
-drawIndirectFirstInstance on panvk
-fragmentStoresAndAtomics on panvk/v10+
-sampleRateShading on panvk
-occlusionQueryPrecise on panvk
-shaderInt16 on panvk
-shaderInt64 on panvk
-imageCubeArray on panvk
-VK_KHR_depth_clamp_zero_one on RADV
-VK_KHR_maintenance8 on radv
-VK_KHR_shader_subgroup_rotate on panvk/v10+
-Vulkan 1.1 on panvk/v10+
-VK_EXT_subgroup_size_control on panvk/v10+
-initial GFX12 (RDNA4) support on RADV
--- a/docs/submittingpatches.rst
+++ b/docs/submittingpatches.rst
@@ -136,7 +136,9 @@ following example::

    Backport-to: 21.0

-Multiple ``Backport-to:`` lines are allowed.
+This will backport the commit to the 21.0 branch, as well as any more recent
+stable branch. Multiple ``Backport-to:`` lines are allowed, but only the
+lowest number mentioned actually matters, so for clarity, please only use one.

 The last option is deprecated and mostly here for historical reasons
 dating back to when patch submission was done via emails: using a ``Cc:``
--- a/include/drm-uapi/amdgpu_drm.h
+++ b/include/drm-uapi/amdgpu_drm.h
@@ -652,13 +652,17 @@ struct drm_amdgpu_gem_userptr {
 /* GFX12 and later: */
 #define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT			0
 #define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK			0x7
-/* These are DCC recompression setting for memory management: */
+/* These are DCC recompression settings for memory management: */
 #define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT	3
 #define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK	0x3 /* 0:64B, 1:128B, 2:256B */
 #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT		5
 #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK		0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */
 #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT		8
 #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK		0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */
+/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata
+ * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */
+#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT	14
+#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK	0x1
 /* bit gap */
 #define AMDGPU_TILING_GFX12_SCANOUT_SHIFT			63
 #define AMDGPU_TILING_GFX12_SCANOUT_MASK			0x1
--- a/include/pci_ids/iris_pci_ids.h
+++ b/include/pci_ids/iris_pci_ids.h
@@ -277,7 +277,11 @@ CHIPSET(0xe202, bmg, "BMG G21", "Intel(R) Graphics")
 CHIPSET(0xe20b, bmg, "BMG G21", "Intel(R) Graphics")
 CHIPSET(0xe20c, bmg, "BMG G21", "Intel(R) Graphics")
 CHIPSET(0xe20d, bmg, "BMG G21", "Intel(R) Graphics")
+CHIPSET(0xe210, bmg, "BMG G21", "Intel(R) Graphics")
+CHIPSET(0xe211, bmg, "BMG G21", "Intel(R) Graphics")
 CHIPSET(0xe212, bmg, "BMG G21", "Intel(R) Graphics")
+CHIPSET(0xe215, bmg, "BMG", "Intel(R) Graphics")
+CHIPSET(0xe216, bmg, "BMG", "Intel(R) Graphics")

 CHIPSET(0xb080, ptl, "PTL", "Intel(R) Graphics", FORCE_PROBE)
 CHIPSET(0xb081, ptl, "PTL", "Intel(R) Graphics", FORCE_PROBE)
--- a/meson.build
+++ b/meson.build
@@ -525,6 +525,8 @@ if not have_mtls_dialect
  # cross-compiling, but because this is just an optimization we can skip it
  if meson.is_cross_build() and not meson.can_run_host_binaries()
    warning('cannot auto-detect -mtls-dialect when cross-compiling, using compiler default')
+  elif host_machine.system() == 'freebsd'
+    warning('cannot use -mtls-dialect for FreeBSD, using compiler default')
  else
    # The way to specify the TLSDESC dialect is architecture-specific.
    # We probe both because there is not a fallback guaranteed to work for all
@@ -766,6 +768,8 @@ endif
 _opencl = get_option('gallium-opencl')
 _rtti = get_option('cpp_rtti')
 if _opencl != 'disabled'
+  warning('Clover will be removed in Mesa 25.2')
+
  if not with_gallium
    error('OpenCL Clover implementation requires at least one gallium driver.')
  endif
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -151,6 +151,7 @@ option(
  choices : ['icd', 'standalone', 'disabled'],
  value : 'disabled',
  description : 'build gallium "clover" OpenCL frontend.',
+  deprecated: true,
 )

 option(
--- a/src/amd/ci/gitlab-ci-inc.yml
+++ b/src/amd/ci/gitlab-ci-inc.yml
@@ -191,6 +191,9 @@
    HWCI_KERNEL_MODULES: amdgpu
    KERNEL_IMAGE_TYPE: ""
    RUNNER_TAG: mesa-ci-x86-64-lava-asus-CM1400CXA-dalboz
+    # Force fixed 6.6 kernel, amdgpu doesn't recover from GPU resets on 6.13
+    # https://gitlab.freedesktop.org/drm/amd/-/issues/3861
+    EXTERNAL_KERNEL_TAG: "v6.6.21-mesa-f8ea"

 # Status: https://lava.collabora.dev/scheduler/device_type/lenovo-TPad-C13-Yoga-zork
 .lava-lenovo-TPad-C13-Yoga-zork:x86_64:
@@ -204,6 +207,9 @@
    HWCI_KERNEL_MODULES: amdgpu
    KERNEL_IMAGE_TYPE: ""
    RUNNER_TAG: mesa-ci-x86-64-lava-lenovo-TPad-C13-Yoga-zork
+    # Force fixed 6.6 kernel, amdgpu doesn't recover from GPU resets on 6.13
+    # https://gitlab.freedesktop.org/drm/amd/-/issues/3861
+    EXTERNAL_KERNEL_TAG: "v6.6.21-mesa-f8ea"

 # Status: https://lava.collabora.dev/scheduler/device_type/hp-x360-14a-cb0001xx-zork
 .lava-hp-x360-14a-cb0001xx-zork:x86_64:
@@ -217,6 +223,9 @@
    HWCI_KERNEL_MODULES: amdgpu
    KERNEL_IMAGE_TYPE: ""
    RUNNER_TAG: mesa-ci-x86-64-lava-hp-x360-14a-cb0001xx-zork
+    # Force fixed 6.6 kernel, amdgpu doesn't recover from GPU resets on 6.13
+    # https://gitlab.freedesktop.org/drm/amd/-/issues/3861
+    EXTERNAL_KERNEL_TAG: "v6.6.21-mesa-f8ea"


 ############### LAVA
@@ -397,7 +406,7 @@
  tags:
    - farm:$RUNNER_FARM_LOCATION
    - amdgpu:codename:VANGOGH
-    - $VALVE_INFRA_VANGOGH_JOB_PRIORITY
+    - $CI_TRON_JOB_PRIORITY_TAG

 .navi31-test-valve:
  variables:
--- a/src/amd/ci/radeonsi-raven-fails.txt
+++ b/src/amd/ci/radeonsi-raven-fails.txt
@@ -1,10 +1,7 @@
 glx@glx-make-current,Fail
 glx@glx-multi-window-single-context,Fail
-glx@glx-visuals-depth -pixmap,Fail
-glx@glx-visuals-stencil -pixmap,Fail
 glx@glx-swap-event_async,Fail
 glx@glx-swap-pixmap-bad,Fail
-glx@glx_arb_create_context_no_error@no error,Fail
 spec@!opengl 1.0@rasterpos,Fail
 spec@!opengl 1.0@rasterpos@glsl_vs_gs_linked,Fail
 spec@!opengl 1.0@rasterpos@glsl_vs_tes_linked,Fail
@@ -13,8 +10,6 @@ spec@!opengl 3.2@gl-3.2-adj-prims cull-front pv-last,Fail
 spec@!opengl 3.2@gl-3.2-adj-prims line cull-back pv-last,Fail
 spec@!opengl 3.2@gl-3.2-adj-prims line cull-front pv-last,Fail
 spec@!opengl 3.2@gl-3.2-adj-prims pv-last,Fail
-spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
-spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail
 spec@arb_shading_language_packing@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@arb_shading_language_packing@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@egl 1.4@eglterminate then unbind context,Fail
--- a/src/amd/ci/radeonsi-stoney-fails.txt
+++ b/src/amd/ci/radeonsi-stoney-fails.txt
@@ -1,6 +1,5 @@
 glx@glx-multi-window-single-context,Fail
 glx@glx-swap-pixmap-bad,Fail
-glx@glx-visuals-depth -pixmap,Fail
 glx@glx-visuals-stencil -pixmap,Fail
 glx@glx_arb_create_context_no_error@no error,Fail
 spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail
@@ -151,3 +150,7 @@ spec@arb_fragment_layer_viewport@layer-gs-writes-out-of-range,Fail
 # glcts update
 KHR-GLES3.clip_distance.coverage,Fail
 KHR-GLES3.cull_distance.functional,Fail
+
+# since hetzner migration
+spec@ext_external_objects@vk-ping-pong-multi-sem,Fail
+spec@ext_external_objects@vk-ping-pong-single-sem,Crash
--- a/src/amd/ci/radeonsi-stoney-flakes.txt
+++ b/src/amd/ci/radeonsi-stoney-flakes.txt
@@ -27,3 +27,6 @@ dEQP-GLES3.functional.occlusion_query.conservative_scissor_stencil_clear
 dEQP-GLES3.functional.occlusion_query.conservative_depth_clear
 dEQP-GLES3.functional.occlusion_query.scissor_depth_clear_stencil_write_stencil_clear
 dEQP-GLES3.functional.occlusion_query.conservative_depth_write_stencil_clear
+
+# since hetzner migration
+spec@!opengl 1.0@gl-1.0-ortho-pos
--- a/src/amd/ci/radeonsi-vangogh-fails.txt
+++ b/src/amd/ci/radeonsi-vangogh-fails.txt
@@ -16,8 +16,6 @@ spec@!opengl 1.0@rasterpos@glsl_vs_tes_linked,Fail
 spec@!opengl 1.1@line-smooth-stipple,Fail
 spec@arb_fragment_layer_viewport@layer-gs-writes-out-of-range,Fail
 spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-frag,Fail
-spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
-spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail
 spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
 spec@arb_shading_language_packing@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@arb_shading_language_packing@execution@built-in-functions@vs-packhalf2x16,Fail
--- a/src/amd/ci/radv-gfx1200-aco-fails.txt
+++ b/src/amd/ci/radv-gfx1200-aco-fails.txt
@@ -8,11 +8,3 @@ dEQP-VK.api.copy_and_blit.core.resolve_image.whole_copy_before_resolving_transfe
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.2_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.4_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.8_bit,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
--- a/src/amd/ci/radv-navi10-aco-fails.txt
+++ b/src/amd/ci/radv-navi10-aco-fails.txt
@@ -8,11 +8,3 @@ dEQP-VK.api.copy_and_blit.core.resolve_image.whole_copy_before_resolving_transfe
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.2_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.4_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.8_bit,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal,Fail
--- a/src/amd/ci/radv-navi21-aco-fails.txt
+++ b/src/amd/ci/radv-navi21-aco-fails.txt
@@ -8,27 +8,3 @@ dEQP-VK.api.copy_and_blit.core.resolve_image.whole_copy_before_resolving_transfe
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.2_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.4_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.8_bit,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.general_optimal,Fail
--- a/src/amd/ci/radv-navi31-aco-fails.txt
+++ b/src/amd/ci/radv-navi31-aco-fails.txt
@@ -8,27 +8,3 @@ dEQP-VK.api.copy_and_blit.core.resolve_image.whole_copy_before_resolving_transfe
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.2_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.4_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.8_bit,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.optimal_optimal,Fail
--- a/src/amd/ci/radv-renoir-aco-skips.txt
+++ b/src/amd/ci/radv-renoir-aco-skips.txt
@@ -1,9 +0,0 @@
-# RADV_PERFTEST=transfer_queue hangs
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal
--- a/src/amd/ci/radv-vangogh-aco-fails.txt
+++ b/src/amd/ci/radv-vangogh-aco-fails.txt
@@ -8,27 +8,3 @@ dEQP-VK.api.copy_and_blit.core.resolve_image.whole_copy_before_resolving_transfe
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.2_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.4_bit,Fail
 dEQP-VK.api.copy_and_blit.dedicated_allocation.resolve_image.whole_copy_before_resolving_transfer.8_bit,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_separate_layouts.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.optimal_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint.optimal_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal,Fail
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_separate_layouts.optimal_optimal,Fail
--- a/src/amd/ci/radv-vega10-aco-skips.txt
+++ b/src/amd/ci/radv-vega10-aco-skips.txt
@@ -1,9 +0,0 @@
-# RADV_PERFTEST=transfer_queue hangs
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.general_optimal
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d16_unorm_s8_uint_d16_unorm_s8_uint_depth_stencil_aspects.optimal_optimal
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.general_optimal
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_general
-dEQP-VK.api.copy_and_blit.copy_commands2.image_to_image_transfer_queue.all_formats.depth_stencil.2d_to_2d.d32_sfloat_s8_uint_d32_sfloat_s8_uint_depth_stencil_aspects.optimal_optimal
--- a/src/amd/common/ac_cmdbuf.c
+++ b/src/amd/common/ac_cmdbuf.c
@@ -249,7 +249,6 @@ gfx6_init_graphics_preamble_state(const struct ac_preamble_state *state,
      /* CLEAR_STATE doesn't clear these correctly on certain generations.
       * I don't know why. Deduced by trial and error.
       */
-      ac_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
      ac_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
      ac_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
   }
@@ -678,7 +677,6 @@ gfx12_init_graphics_preamble_state(const struct ac_preamble_state *state,
   ac_pm4_set_reg(pm4, R_028AA0_VGT_DRAW_PAYLOAD_CNTL, 0);
   ac_pm4_set_reg(pm4, R_028ABC_DB_HTILE_SURFACE, 0);

-   ac_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
   ac_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
                  S_028B50_ACCUM_ISOLINE(128) |
                  S_028B50_ACCUM_TRI(128) |
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -542,8 +542,9 @@ static void handle_env_var_force_family(struct radeon_info *info)
   exit(1);
 }

-bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
-                       bool require_pci_bus_info)
+enum ac_query_gpu_info_result
+ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
+                  bool require_pci_bus_info)
 {
   struct amdgpu_gpu_info amdinfo;
   struct drm_amdgpu_info_device device_info = {0};
@@ -567,7 +568,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,

   if (!ac_query_pci_bus_info(fd, info)) {
      if (require_pci_bus_info)
-         return false;
+         return AC_QUERY_GPU_INFO_FAIL;
   }

   assert(info->drm_major == 3);
@@ -577,27 +578,27 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
      fprintf(stderr, "amdgpu: DRM version is %u.%u.%u, but this driver is "
                      "only compatible with 3.27.0 (kernel 4.20+) or later.\n",
              info->drm_major, info->drm_minor, info->drm_patchlevel);
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   uint64_t cap;
   r = drmGetCap(fd, DRM_CAP_SYNCOBJ, &cap);
   if (r != 0 || cap == 0) {
      fprintf(stderr, "amdgpu: syncobj support is missing but is required.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   /* Query hardware and driver information. */
   r = ac_drm_query_gpu_info(dev, &amdinfo);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_gpu_info failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   r = ac_drm_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_info(dev_info) failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) {
@@ -660,35 +661,35 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   /* Only require gfx or compute. */
   if (!info->ip[AMD_IP_GFX].num_queues && !info->ip[AMD_IP_COMPUTE].num_queues) {
      fprintf(stderr, "amdgpu: failed to find gfx or compute.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version,
                                     &info->me_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(me) failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_MEC, 0, 0, &info->mec_fw_version,
                                     &info->mec_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(mec) failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version,
                                     &info->pfp_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(pfp) failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   if (info->ip[AMD_IP_VCN_DEC].num_queues || info->ip[AMD_IP_VCN_UNIFIED].num_queues) {
      r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_VCN, 0, 0, &vidip_fw_version, &vidip_fw_feature);
      if (r) {
         fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(vcn) failed.\n");
-         return false;
+         return AC_QUERY_GPU_INFO_FAIL;
      } else {
         info->vcn_dec_version = (vidip_fw_version & 0x0F000000) >> 24;
         info->vcn_enc_major_version = (vidip_fw_version & 0x00F00000) >> 20;
@@ -699,7 +700,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
         r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vidip_fw_version, &vidip_fw_feature);
         if (r) {
            fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(vce) failed.\n");
-            return false;
+            return AC_QUERY_GPU_INFO_FAIL;
         } else
            info->vce_fw_version = vidip_fw_version;
      }
@@ -708,7 +709,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
         r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &vidip_fw_version, &vidip_fw_feature);
         if (r) {
            fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(uvd) failed.\n");
-            return false;
+            return AC_QUERY_GPU_INFO_FAIL;
         } else
            info->uvd_fw_version = vidip_fw_version;
      }
@@ -717,7 +718,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   r = ac_drm_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   struct drm_amdgpu_memory_info meminfo = {0};
@@ -725,7 +726,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   r = ac_drm_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_info(memory) failed.\n");
-      return false;
+      return AC_QUERY_GPU_INFO_FAIL;
   }

   /* Note: usable_heap_size values can be random and can't be relied on. */
@@ -865,7 +866,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
      else {
         fprintf(stderr, "amdgpu: Unknown gfx version: %u.%u\n",
                 info->ip[AMD_IP_GFX].ver_major, info->ip[AMD_IP_GFX].ver_minor);
-         return false;
+         return AC_QUERY_GPU_INFO_UNIMPLEMENTED_HW;
      }

      info->family_id = device_info.family;
@@ -880,7 +881,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   if (!info->name) {
      fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
              device_info.family, device_info.external_rev);
-      return false;
+      return AC_QUERY_GPU_INFO_UNIMPLEMENTED_HW;
   }

   memset(info->lowercase_name, 0, sizeof(info->lowercase_name));
@@ -1255,6 +1256,15 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   info->has_vgt_flush_ngg_legacy_bug = info->gfx_level == GFX10 ||
                                        info->family == CHIP_NAVI21;

+   /* GFX10-GFX10.3 (tested on NAVI10, NAVI21 and NAVI24 but likely all) are
+    * affected by a hw bug when primitive restart is updated and no context
+    * registers are written between draws. One workaround is to emit
+    * SQ_NON_EVENT(0) which is a NOP packet that adds a small delay and seems
+    * to fix it reliably.
+    */
+   info->has_prim_restart_sync_bug = info->gfx_level == GFX10 ||
+                                     info->gfx_level == GFX10_3;
+
   /* First Navi2x chips have a hw bug that doesn't allow to write
    * depth/stencil from a FS for multi-pixel fragments.
    */
@@ -1450,6 +1460,11 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
    */
   info->gfx12_supports_display_dcc = info->gfx_level >= GFX12 && info->drm_minor >= 58;

+   /* Before DRM 3.60, AMDGPU always enables DCC compressed writes when a BO is
+    * moved back to VRAM.
+    */
+   info->gfx12_supports_dcc_write_compress_disable = info->gfx_level >= GFX12 && info->drm_minor >= 60;
+
   info->has_stable_pstate = info->drm_minor >= 45;

   if (info->gfx_level >= GFX12) {
@@ -1691,7 +1706,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
      r = ac_drm_query_uq_fw_area_info(dev, AMDGPU_HW_IP_GFX, 0, &fw_info);
      if (r) {
         fprintf(stderr, "amdgpu: amdgpu_query_uq_fw_area_info() failed.\n");
-         return false;
+         return AC_QUERY_GPU_INFO_FAIL;
      }

      info->has_fw_based_shadowing = true;
@@ -1754,7 +1769,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
         exit(0);
      }
   }
-   return true;
+   return AC_QUERY_GPU_INFO_SUCCESS;
 }

 void ac_compute_driver_uuid(char *uuid, size_t size)
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -104,6 +104,7 @@ struct radeon_info {
   bool has_image_load_dcc_bug;
   bool has_two_planes_iterate256_bug;
   bool has_vgt_flush_ngg_legacy_bug;
+   bool has_prim_restart_sync_bug;
   bool has_cs_regalloc_hang_bug;
   bool has_async_compute_threadgroup_bug;
   bool has_async_compute_align32_bug;
@@ -161,6 +162,7 @@ struct radeon_info {
   /* Allocate both aligned and unaligned DCC and use the retile blit. */
   bool use_display_dcc_with_retile_blit;
   bool gfx12_supports_display_dcc;
+   bool gfx12_supports_dcc_write_compress_disable;

   /* Memory info. */
   uint32_t pte_fragment_size;
@@ -327,8 +329,14 @@ struct radeon_info {
   bool has_image_bvh_intersect_ray;
 };

-bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
-                       bool require_pci_bus_info);
+enum ac_query_gpu_info_result { /* Result code returned by ac_query_gpu_info(). */
+   AC_QUERY_GPU_INFO_SUCCESS, /* All queries succeeded and the info struct was filled in. */
+   AC_QUERY_GPU_INFO_FAIL, /* A required kernel/DRM query or capability check failed. */
+   AC_QUERY_GPU_INFO_UNIMPLEMENTED_HW, /* Unknown gfx version or (family_id, chip_external_rev). */
+};
+
+enum ac_query_gpu_info_result ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
+                                                bool require_pci_bus_info);

 void ac_compute_driver_uuid(char *uuid, size_t size);

--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -65,6 +65,10 @@
 #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK		0x7
 #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT		8
 #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK		0x3f
+/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata
+ * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */
+#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT   14
+#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK    0x1
 #define AMDGPU_TILING_SET(field, value) \
 	(((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT)
 #define AMDGPU_TILING_GET(value, field) \
@@ -1193,6 +1197,59 @@ static void ac_compute_cmask(const struct radeon_info *info, const struct ac_sur
   surf->cmask_size = surf->cmask_slice_size * num_layers;
 }

+static uint64_t ac_estimate_size(const struct ac_surf_config *config,
+                                 unsigned blk_w, unsigned blk_h, unsigned bpp,
+                                 unsigned in_width, unsigned in_height,
+                                 unsigned align_width, unsigned align_height,
+                                 unsigned align_depth)
+{ /* Estimates a surface size in bytes by summing per-level sizes of the aligned dimensions. */
+   assert(bpp);
+   unsigned num_samples = MAX2(1, config->info.samples); /* a sample count of 0 is treated as 1 */
+   unsigned bpe = bpp / 8; /* presumably bits -> bytes per element; confirm with callers */
+   unsigned width = align(in_width, align_width * blk_w);
+   unsigned height = align(in_height , align_height * blk_h);
+   unsigned depth = align(config->is_3d ? config->info.depth :
+                          config->is_cube ? 6 : config->info.array_size, align_depth); /* cubes use 6 layers */
+   unsigned tile_size_bytes = align_width * align_height * align_depth * num_samples * bpe;
+   /* With more than one level and align_height > 1, width/height go through util_next_power_of_two. */
+   if (config->info.levels > 1 && align_height > 1) {
+      width = util_next_power_of_two(width);
+      height = util_next_power_of_two(height);
+   }
+
+   uint64_t size = 0;
+
+   /* Accumulate one size per level. Note: This mipmap size computation is inaccurate. */
+   for (unsigned i = 0; i < config->info.levels; i++) {
+      uint64_t level_size =
+         (uint64_t)DIV_ROUND_UP(width, blk_w) * DIV_ROUND_UP(height, blk_h) * depth *
+         num_samples * bpe;
+
+      size += level_size;
+
+      if (tile_size_bytes >= 4096 && level_size <= tile_size_bytes / 2) {
+         /* We are likely in the mip tail, return. Remaining levels are not added to the total. */
+         assert(size);
+         return size;
+      }
+
+      /* Minify the level. Depth is only minified for 3D surfaces. */
+      width = u_minify(width, 1);
+      height = u_minify(height, 1);
+      if (config->is_3d)
+         depth = u_minify(depth, 1);
+   }
+
+   /* TODO: check that this is not too different from the correct value */
+   assert(size);
+   return size;
+}
+
+#define SI__GB_TILE_MODE__BANK_WIDTH(x)         (((x) >> 14) & 0x3) /* extracts bits [15:14] of x */
+#define SI__GB_TILE_MODE__BANK_HEIGHT(x)        (((x) >> 16) & 0x3) /* extracts bits [17:16] of x */
+#define SI__GB_TILE_MODE__MACRO_TILE_ASPECT(x)  (((x) >> 18) & 0x3) /* extracts bits [19:18] of x */
+#define SI__GB_TILE_MODE__NUM_BANKS(x)          (((x) >> 20) & 0x3) /* extracts bits [21:20] of x */
+
 /**
 * Fill in the tiling information in \p surf based on the given surface config.
 *
@@ -1255,11 +1312,100 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *i
         }
      } else {
         if (config->is_3d) {
-            /* GFX6 doesn't have 3D_TILED_XTHICK. */
-            if (info->gfx_level >= GFX7)
-               AddrSurfInfoIn.tileMode = ADDR_TM_3D_TILED_XTHICK;
-            else
-               AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_XTHICK;
+            /* Select the best tile mode that doesn't overallocate memory too much.
+             * The tile modes below are sorted from best to worst performance.
+             */
+            struct {
+               unsigned tile_mode;
+               unsigned gfx6_tile_mode_index;
+               unsigned gfx7_tile_mode_index;
+               unsigned microtile_width;
+               unsigned microtile_height;
+               unsigned microtile_depth;
+               bool supported; /* this comes from the tile mode arrays in the kernel */
+               /* Derived fields. */
+               unsigned bank_width;
+               unsigned bank_height;
+               unsigned num_banks;
+               unsigned macro_tile_aspect;
+               unsigned align_width;
+               unsigned align_height;
+               unsigned align_depth;
+            } modes[] = {
+               {ADDR_TM_3D_TILED_XTHICK, 0, 26, 8, 8, 8, info->gfx_level >= GFX7},
+               {ADDR_TM_2D_TILED_XTHICK, 19, 25, 8, 8, 8, true},
+               {ADDR_TM_3D_TILED_THICK, 0, 21, 8, 8, 4, info->gfx_level >= GFX7},
+               {ADDR_TM_2D_TILED_THICK, 20, 20, 8, 8, 4, true},
+               {ADDR_TM_3D_TILED_THIN1, 0, 15, 8, 8, 1, info->gfx_level >= GFX7},
+               {ADDR_TM_2D_TILED_THIN1, 14, 14, 8, 8, 1, true},
+               {ADDR_TM_1D_TILED_THICK, 18, 19, 8, 8, 4, true},
+               {ADDR_TM_1D_TILED_THIN1, 13, 13, 8, 8, 1, true},
+               /* Don't use LINEAR_ALIGNED. It doesn't work with BC formats. */
+            };
+
+            for (unsigned i = 0; i < ARRAY_SIZE(modes); i++) {
+               if (!modes[i].supported)
+                  continue;
+
+               if (modes[i].tile_mode <= ADDR_TM_1D_TILED_THICK) {
+                  modes[i].align_width = modes[i].microtile_width;
+                  modes[i].align_height = modes[i].microtile_height;
+                  modes[i].align_depth = modes[i].microtile_depth;
+                  continue;
+               }
+
+               if (info->gfx_level >= GFX7) {
+                  ADDR_GET_MACROMODEINDEX_INPUT in = {sizeof(in)};
+                  ADDR_GET_MACROMODEINDEX_OUTPUT out = {sizeof(out)};
+
+                  in.tileIndex = modes[i].gfx7_tile_mode_index;
+                  in.bpp = surf->bpe * 8;
+                  in.numFrags = 1;
+
+                  if (AddrGetMacroModeIndex(addrlib, &in, &out) != ADDR_OK) {
+                     fprintf(stderr, "amdgpu: AddrGetMacroModeIndex failed.\n");
+                     return -1;
+                  }
+
+                  uint32_t macro_mode_reg = info->cik_macrotile_mode_array[out.macroModeIndex];
+                  modes[i].bank_width = 1 << G_009990_BANK_WIDTH(macro_mode_reg);
+                  modes[i].bank_height = 1 << G_009990_BANK_HEIGHT(macro_mode_reg);
+                  modes[i].num_banks = 2 << G_009990_NUM_BANKS(macro_mode_reg);
+                  modes[i].macro_tile_aspect = 1 << G_009990_MACRO_TILE_ASPECT(macro_mode_reg);
+               } else {
+                  /* GFX6. */
+                  uint32_t tile_mode_reg = info->si_tile_mode_array[modes[i].gfx6_tile_mode_index];
+                  modes[i].bank_width = 1 << SI__GB_TILE_MODE__BANK_WIDTH(tile_mode_reg);
+                  modes[i].bank_height = 1 << SI__GB_TILE_MODE__BANK_HEIGHT(tile_mode_reg);
+                  modes[i].num_banks = 2 << SI__GB_TILE_MODE__NUM_BANKS(tile_mode_reg);
+                  modes[i].macro_tile_aspect = 1 << SI__GB_TILE_MODE__MACRO_TILE_ASPECT(tile_mode_reg);
+               }
+
+               modes[i].align_width = modes[i].microtile_width * modes[i].bank_width *
+                                      info->num_tile_pipes * modes[i].macro_tile_aspect;
+               modes[i].align_height = modes[i].microtile_height * modes[i].bank_height *
+                                       modes[i].num_banks / modes[i].macro_tile_aspect;
+               modes[i].align_depth = modes[i].microtile_depth;
+            }
+
+            uint64_t ideal_size = ac_estimate_size(config, surf->blk_w, surf->blk_h, surf->bpe * 8,
+                                                   config->info.width, config->info.height, 1, 1, 1);
+            AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1; /* used if everything else fails */
+
+            for (unsigned i = 0; i < ARRAY_SIZE(modes); i++) {
+               if (!modes[i].supported)
+                  continue;
+
+               uint64_t size = ac_estimate_size(config, surf->blk_w, surf->blk_h, surf->bpe * 8,
+                                                config->info.width, config->info.height,
+                                                modes[i].align_width, modes[i].align_height,
+                                                modes[i].align_depth);
+
+               if (size <= ideal_size * 3) {
+                  AddrSurfInfoIn.tileMode = modes[i].tile_mode;
+                  break;
+               }
+            }
         } else {
            AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
         }
@@ -2709,57 +2855,12 @@ static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_
   return 0;
 }

-static uint64_t gfx12_estimate_size(const ADDR3_COMPUTE_SURFACE_INFO_INPUT *in,
-                                    const struct radeon_surf *surf,
-                                    unsigned align_width, unsigned align_height,
-                                    unsigned align_depth)
-{
-   unsigned blk_w = surf ? surf->blk_w : 1;
-   unsigned blk_h = surf ? surf->blk_h : 1;
-   unsigned bpe = in->bpp ? in->bpp / 8 : surf->bpe;
-   unsigned width = align(in->width, align_width * blk_w);
-   unsigned height = align(in->height, align_height * blk_h);
-   unsigned depth = align(in->numSlices, align_depth);
-   unsigned tile_size = align_width * align_height * align_depth *
-                        in->numSamples * bpe;
-
-   if (in->numMipLevels > 1 && align_height > 1) {
-      width = util_next_power_of_two(width);
-      height = util_next_power_of_two(height);
-   }
-
-   uint64_t size = 0;
-
-   /* Note: This mipmap size computation is inaccurate. */
-   for (unsigned i = 0; i < in->numMipLevels; i++) {
-      uint64_t level_size =
-         (uint64_t)DIV_ROUND_UP(width, blk_w) * DIV_ROUND_UP(height, blk_h) * depth *
-         in->numSamples * bpe;
-
-      size += level_size;
-
-      if (tile_size >= 4096 && level_size <= tile_size / 2) {
-         /* We are likely in the mip tail, return. */
-         assert(size);
-         return size;
-      }
-
-      /* Minify the level. */
-      width = u_minify(width, 1);
-      height = u_minify(height, 1);
-      if (in->resourceType == ADDR_RSRC_TEX_3D)
-         depth = u_minify(depth, 1);
-   }
-
-   /* TODO: check that this is not too different from the correct value */
-   assert(size);
-   return size;
-}
-
 static unsigned gfx12_select_swizzle_mode(struct ac_addrlib *addrlib,
                                          const struct radeon_info *info,
+                                          const struct ac_surf_config *config,
                                          const struct radeon_surf *surf,
-                                          const ADDR3_COMPUTE_SURFACE_INFO_INPUT *in)
+                                          const ADDR3_COMPUTE_SURFACE_INFO_INPUT *in,
+                                          uint64_t flags)
 {
   ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT get_in = {0};
   ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT get_out = {0};
@@ -2776,9 +2877,9 @@ static unsigned gfx12_select_swizzle_mode(struct ac_addrlib *addrlib,
   get_in.numMipLevels = in->numMipLevels;
   get_in.numSamples = in->numSamples;

-   if (surf && surf->flags & RADEON_SURF_PREFER_4K_ALIGNMENT) {
+   if (flags & RADEON_SURF_PREFER_4K_ALIGNMENT) {
      get_in.maxAlign = 4 * 1024;
-   } else if (surf && surf->flags & RADEON_SURF_PREFER_64K_ALIGNMENT) {
+   } else if (flags & RADEON_SURF_PREFER_64K_ALIGNMENT) {
      get_in.maxAlign = 64 * 1024;
   } else {
      get_in.maxAlign = info->has_dedicated_vram ? (256 * 1024) : (64 * 1024);
@@ -2795,10 +2896,11 @@ static unsigned gfx12_select_swizzle_mode(struct ac_addrlib *addrlib,

   assert(get_out.validModes.value);

-   unsigned bpe = in->bpp ? in->bpp / 8 : surf->bpe;
-   unsigned log_bpp = util_logbase2(bpe);
+   unsigned log_bpp = util_logbase2(get_in.bpp / 8);
   unsigned log_samples = util_logbase2(in->numSamples);
-   uint64_t ideal_size = gfx12_estimate_size(in, surf, 1, 1, 1);
+   unsigned blk_w = surf ? surf->blk_w : 1;
+   unsigned blk_h = surf ? surf->blk_h : 1;
+   uint64_t ideal_size = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height, 1, 1, 1);

   if (in->resourceType == ADDR_RSRC_TEX_3D) {
      static unsigned block3d_size_4K[5][3] = {
@@ -2823,17 +2925,20 @@ static unsigned gfx12_select_swizzle_mode(struct ac_addrlib *addrlib,
         {16, 32, 32},
      };

-      uint64_t size_4K = gfx12_estimate_size(in, surf, block3d_size_4K[log_bpp][0],
-                                             block3d_size_4K[log_bpp][1],
-                                             block3d_size_4K[log_bpp][2]);
+      uint64_t size_4K = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                          block3d_size_4K[log_bpp][0],
+                                          block3d_size_4K[log_bpp][1],
+                                          block3d_size_4K[log_bpp][2]);

-      uint64_t size_64K = gfx12_estimate_size(in, surf, block3d_size_64K[log_bpp][0],
-                                              block3d_size_64K[log_bpp][1],
-                                              block3d_size_64K[log_bpp][2]);
+      uint64_t size_64K = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                           block3d_size_64K[log_bpp][0],
+                                           block3d_size_64K[log_bpp][1],
+                                           block3d_size_64K[log_bpp][2]);

-      uint64_t size_256K = gfx12_estimate_size(in, surf, block3d_size_256K[log_bpp][0],
-                                               block3d_size_256K[log_bpp][1],
-                                               block3d_size_256K[log_bpp][2]);;
+      uint64_t size_256K = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                            block3d_size_256K[log_bpp][0],
+                                            block3d_size_256K[log_bpp][1],
+                                            block3d_size_256K[log_bpp][2]);

      float max_3d_overalloc_256K = 1.1;
      float max_3d_overalloc_64K = 1.2;
@@ -2989,19 +3094,24 @@ static unsigned gfx12_select_swizzle_mode(struct ac_addrlib *addrlib,
      },
   };

-   uint64_t size_LINEAR = gfx12_estimate_size(in, surf, block_size_LINEAR[log_bpp], 1, 1);
+   uint64_t size_LINEAR = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                           block_size_LINEAR[log_bpp], 1, 1);

-   uint64_t size_256B = gfx12_estimate_size(in, surf, block_size_256B[log_samples][log_bpp][0],
-                                            block_size_256B[log_samples][log_bpp][1], 1);
+   uint64_t size_256B = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                         block_size_256B[log_samples][log_bpp][0],
+                                         block_size_256B[log_samples][log_bpp][1], 1);

-   uint64_t size_4K = gfx12_estimate_size(in, surf, block_size_4K[log_samples][log_bpp][0],
-                                          block_size_4K[log_samples][log_bpp][1], 1);;
+   uint64_t size_4K = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                       block_size_4K[log_samples][log_bpp][0],
+                                       block_size_4K[log_samples][log_bpp][1], 1);

-   uint64_t size_64K = gfx12_estimate_size(in, surf, block_size_64K[log_samples][log_bpp][0],
-                                           block_size_64K[log_samples][log_bpp][1], 1);
+   uint64_t size_64K = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                        block_size_64K[log_samples][log_bpp][0],
+                                        block_size_64K[log_samples][log_bpp][1], 1);

-   uint64_t size_256K = gfx12_estimate_size(in, surf, block_size_256K[log_samples][log_bpp][0],
-                                            block_size_256K[log_samples][log_bpp][1], 1);
+   uint64_t size_256K = ac_estimate_size(config, blk_w, blk_h, get_in.bpp, in->width, in->height,
+                                         block_size_256K[log_samples][log_bpp][0],
+                                         block_size_256K[log_samples][log_bpp][1], 1);

   float max_2d_overalloc_256K = 1.1;  /* relative to ideal */
   float max_2d_overalloc_64K = 1.3;   /* relative to ideal */
@@ -3032,6 +3142,7 @@ static unsigned gfx12_select_swizzle_mode(struct ac_addrlib *addrlib,
 }

 static bool gfx12_compute_hiz_his_info(struct ac_addrlib *addrlib, const struct radeon_info *info,
+                                       const struct ac_surf_config *config,
                                       struct radeon_surf *surf, struct gfx12_hiz_his_layout *hizs,
                                       const ADDR3_COMPUTE_SURFACE_INFO_INPUT *surf_in)
 {
@@ -3059,7 +3170,7 @@ static bool gfx12_compute_hiz_his_info(struct ac_addrlib *addrlib, const struct
   /* Compute the HiZ/HiS size. */
   in.width = align(DIV_ROUND_UP(surf_in->width, 8), 2);
   in.height = align(DIV_ROUND_UP(surf_in->height, 8), 2);
-   in.swizzleMode = gfx12_select_swizzle_mode(addrlib, info, NULL, &in);
+   in.swizzleMode = gfx12_select_swizzle_mode(addrlib, info, config, NULL, &in, surf->flags);

   int ret = Addr3ComputeSurfaceInfo(addrlib->handle, &in, &out);
   if (ret != ADDR_OK)
@@ -3112,7 +3223,7 @@ static bool gfx12_compute_miptree(struct ac_addrlib *addrlib, const struct radeo
      surf->surf_size = surf->u.gfx9.zs.stencil_offset + out.surfSize;

      if (info->chip_rev >= 2 &&
-          !gfx12_compute_hiz_his_info(addrlib, info, surf, &surf->u.gfx9.zs.his, in))
+          !gfx12_compute_hiz_his_info(addrlib, info, config, surf, &surf->u.gfx9.zs.his, in))
         return false;

      return true;
@@ -3175,7 +3286,7 @@ static bool gfx12_compute_miptree(struct ac_addrlib *addrlib, const struct radeo
   if (in->flags.depth) {
      assert(in->swizzleMode != ADDR3_LINEAR);

-      return gfx12_compute_hiz_his_info(addrlib, info, surf, &surf->u.gfx9.zs.hiz, in);
+      return gfx12_compute_hiz_his_info(addrlib, info, config, surf, &surf->u.gfx9.zs.hiz, in);
   }

   /* Compute tile swizzle for the color surface. All swizzle modes >= 4K support it. */
@@ -3261,7 +3372,8 @@ static bool gfx12_compute_surface(struct ac_addrlib *addrlib, const struct radeo
   } else if (surf->flags & RADEON_SURF_VIDEO_REFERENCE) {
      AddrSurfInfoIn.swizzleMode = ADDR3_256B_2D;
   } else {
-      AddrSurfInfoIn.swizzleMode = gfx12_select_swizzle_mode(addrlib, info, surf, &AddrSurfInfoIn);
+      AddrSurfInfoIn.swizzleMode = gfx12_select_swizzle_mode(addrlib, info, config, surf,
+                                                             &AddrSurfInfoIn, surf->flags);
   }

   /* Force the linear pitch from 128B (default) to 256B for multi-GPU interop. This only applies
@@ -3309,6 +3421,8 @@ static bool gfx12_compute_surface(struct ac_addrlib *addrlib, const struct radeo
                 /* Don't change the DCC settings for imported buffers - they might differ. */
                 !(surf->flags & RADEON_SURF_IMPORTED)) {
         surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
+         if ((info->drm_minor < 63) && (surf->flags & RADEON_SURF_SCANOUT))
+            surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
      }
   }

@@ -3517,6 +3631,8 @@ void ac_surface_apply_bo_metadata(enum amd_gfx_level gfx_level, struct radeon_su
         AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_DATA_FORMAT);
      surf->u.gfx9.color.dcc_number_type =
         AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_NUMBER_TYPE);
+      surf->u.gfx9.color.dcc_write_compress_disable =
+         AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_WRITE_COMPRESS_DISABLE);
      scanout = AMDGPU_TILING_GET(tiling_flags, GFX12_SCANOUT);
   } else if (gfx_level >= GFX9) {
      surf->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
@@ -3564,6 +3680,7 @@ void ac_surface_compute_bo_metadata(const struct radeon_info *info, struct radeo
                                         surf->u.gfx9.color.dcc.max_compressed_block_size);
      *tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_NUMBER_TYPE, surf->u.gfx9.color.dcc_number_type);
      *tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_DATA_FORMAT, surf->u.gfx9.color.dcc_data_format);
+      *tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_WRITE_COMPRESS_DISABLE, surf->u.gfx9.color.dcc_write_compress_disable);
      *tiling_flags |= AMDGPU_TILING_SET(GFX12_SCANOUT, (surf->flags & RADEON_SURF_SCANOUT) != 0);
   } else if (info->gfx_level >= GFX9) {
      uint64_t dcc_offset = 0;
--- a/src/amd/common/ac_surface.h
+++ b/src/amd/common/ac_surface.h
@@ -275,6 +275,7 @@ struct gfx9_surf_layout {
          */
         uint8_t dcc_number_type; /* CB_COLOR0_INFO.NUMBER_TYPE */
         uint8_t dcc_data_format; /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */
+         bool dcc_write_compress_disable;

         /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0.
          * The 3D engine doesn't support that layout except for chips with 1 RB.
--- a/src/amd/common/ac_uvd_dec.c
+++ b/src/amd/common/ac_uvd_dec.c
@@ -0,0 +1,33 @@
+/**************************************************************************
+ *
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ **************************************************************************/
+
+#include <stdint.h>
+
+#include "ac_uvd_dec.h"
+#include "util/os_time.h"
+#include "util/detect_os.h"
+#include "util/bitpack_helpers.h"
+
+#if DETECT_OS_POSIX
+#include <unistd.h>
+#endif
+
+/**
+ * Initialize a stream handle generator.
+ *
+ * The base is the bit-reversed result of os_time_get(), which is XORed with
+ * the process ID first when DETECT_OS_POSIX is set. The counter is reset to
+ * zero.
+ */
+void ac_uvd_init_stream_handle(struct ac_uvd_stream_handle *handle)
+{
+#if DETECT_OS_POSIX
+   handle->base = util_bitreverse(getpid() ^ os_time_get());
+#else
+   handle->base = util_bitreverse(os_time_get());
+#endif
+   handle->counter = 0;
+}
+
+/**
+ * Return the next stream handle.
+ *
+ * The counter is incremented first, then XORed with the base, so successive
+ * calls on the same generator yield different values until the 32-bit
+ * counter wraps around.
+ */
+unsigned ac_uvd_alloc_stream_handle(struct ac_uvd_stream_handle *handle)
+{
+   return handle->base ^ ++handle->counter;
+}
--- a/src/amd/common/ac_uvd_dec.h
+++ b/src/amd/common/ac_uvd_dec.h
@@ -406,4 +406,12 @@ struct ruvd_msg {
   } body;
 };

+/* State for generating stream handles; set up by ac_uvd_init_stream_handle()
+ * and consumed by ac_uvd_alloc_stream_handle().
+ */
+struct ac_uvd_stream_handle {
+   uint32_t base;    /* chosen at init time, XORed into every handle */
+   uint32_t counter; /* incremented on every allocation */
+};
+
+void ac_uvd_init_stream_handle(struct ac_uvd_stream_handle *handle);
+unsigned ac_uvd_alloc_stream_handle(struct ac_uvd_stream_handle *handle);
+
 #endif
--- a/src/amd/common/ac_vcn_dec.c
+++ b/src/amd/common/ac_vcn_dec.c
@@ -412,7 +412,7 @@ radv_vcn_av1_film_grain_init_scaling(uint8_t scaling_points[][2], uint8_t num, s
 }

 void
-ac_vcn_av1_init_film_grain_buffer(rvcn_dec_film_grain_params_t *fg_params, rvcn_dec_av1_fg_init_buf_t *fg_buf)
+ac_vcn_av1_init_film_grain_buffer(unsigned av1_version, rvcn_dec_film_grain_params_t *fg_params, rvcn_dec_av1_fg_init_buf_t *fg_buf)
 {
   const int32_t luma_block_size_y = LUMA_BLOCK_SIZE_Y;
   const int32_t luma_block_size_x = LUMA_BLOCK_SIZE_X;
@@ -542,24 +542,38 @@ ac_vcn_av1_init_film_grain_buffer(rvcn_dec_film_grain_params_t *fg_params, rvcn_
      }

   align_ptr = &fg_buf->luma_grain_block[0][0];
-   for (i = 0; i < 64; i++) {
-      for (j = 0; j < 80; j++)
-         *align_ptr++ = luma_grain_block_tmp[i][j];
-
-      if (((i + 1) % 4) == 0)
-         align_ptr += 64;
-   }
-
   align_ptr0 = &fg_buf->cb_grain_block[0][0];
   align_ptr1 = &fg_buf->cr_grain_block[0][0];
-   for (i = 0; i < 32; i++) {
-      for (j = 0; j < 40; j++) {
-         *align_ptr0++ = cb_grain_block_tmp[i][j];
-         *align_ptr1++ = cr_grain_block_tmp[i][j];
+
+   if (av1_version == RDECODE_AV1_VER_2) {
+      for (i = 0; i < 64; i++)
+         for (j = 0; j < 64; j++)
+            *align_ptr++ = luma_grain_block_tmp[i][j];
+
+      for (i = 0; i < 32; i++) {
+         for (j = 0; j < 32; j++) {
+            *align_ptr0++ = cb_grain_block_tmp[i][j];
+            *align_ptr1++ = cr_grain_block_tmp[i][j];
+         }
      }
-      if (((i + 1) % 8) == 0) {
-         align_ptr0 += 64;
-         align_ptr1 += 64;
+   } else {
+      for (i = 0; i < 64; i++) {
+         for (j = 0; j < 80; j++)
+            *align_ptr++ = luma_grain_block_tmp[i][j];
+
+         if (((i + 1) % 4) == 0)
+            align_ptr += 64;
+      }
+
+      for (i = 0; i < 32; i++) {
+         for (j = 0; j < 40; j++) {
+            *align_ptr0++ = cb_grain_block_tmp[i][j];
+            *align_ptr1++ = cr_grain_block_tmp[i][j];
+         }
+         if (((i + 1) % 8) == 0) {
+            align_ptr0 += 64;
+            align_ptr1 += 64;
+         }
      }
   }

--- a/src/amd/common/ac_vcn_dec.h
+++ b/src/amd/common/ac_vcn_dec.h
@@ -433,6 +433,7 @@

 #define RDECODE_AV1_VER_0  0
 #define RDECODE_AV1_VER_1  1
+#define RDECODE_AV1_VER_2  2

 typedef struct rvcn_decode_buffer_s {
   unsigned int valid_buf_flag;
@@ -1216,6 +1217,6 @@ struct jpeg_params {

 unsigned ac_vcn_dec_calc_ctx_size_av1(unsigned av1_version);
 void ac_vcn_av1_init_probs(unsigned av1_version, uint8_t *prob);
-void ac_vcn_av1_init_film_grain_buffer(rvcn_dec_film_grain_params_t *fg_params, rvcn_dec_av1_fg_init_buf_t *fg_buf);
+void ac_vcn_av1_init_film_grain_buffer(unsigned av1_version, rvcn_dec_film_grain_params_t *fg_params, rvcn_dec_av1_fg_init_buf_t *fg_buf);

 #endif
--- a/src/amd/common/meson.build
+++ b/src/amd/common/meson.build
@@ -91,6 +91,7 @@ amd_common_files = files(
  'ac_vcn_av1_default.h',
  'ac_vcn_dec.c',
  'ac_vcn_enc.c',
+  'ac_uvd_dec.c',
  'nir/ac_nir.c',
  'nir/ac_nir.h',
  'nir/ac_nir_helpers.h',
--- a/src/amd/common/nir/ac_nir_lower_image_opcodes_cdna.c
+++ b/src/amd/common/nir/ac_nir_lower_image_opcodes_cdna.c
@@ -513,6 +513,6 @@ static bool lower_image_opcodes(nir_builder *b, nir_instr *instr, void *data)
 bool ac_nir_lower_image_opcodes(nir_shader *nir)
 {
   return nir_shader_instructions_pass(nir, lower_image_opcodes,
-                                       nir_metadata_control_flow,
+                                       nir_metadata_none,
                                       NULL);
 }
--- a/src/amd/common/nir/ac_nir_lower_legacy_vs.c
+++ b/src/amd/common/nir/ac_nir_lower_legacy_vs.c
@@ -70,9 +70,6 @@ ac_nir_lower_legacy_vs(nir_shader *nir,
   /* This should be after streamout and before exports. */
   ac_nir_clamp_vertex_color_outputs(&b, &out);

-   /* This should be after streamout and before exports. */
-   ac_nir_clamp_vertex_color_outputs(&b, &out);
-
   uint64_t export_outputs = nir->info.outputs_written | VARYING_BIT_POS;
   if (kill_pointsize)
      export_outputs &= ~VARYING_BIT_PSIZ;
--- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
+++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
@@ -831,9 +831,10 @@ hs_msg_group_vote_use_memory(nir_builder *b, lower_tess_io_state *st,
   nir_pop_if(&top_b, thread0);

   /* Insert a barrier to wait for initialization above if there hasn't been any other barrier
-    * in the shader.
+    * in the shader. If tcs_out_patch_fits_subgroup=true, then TCS barriers don't have a scope
+    * larger than a subgroup.
    */
-   if (!st->tcs_info.always_executes_barrier) {
+   if (!st->tcs_info.always_executes_barrier || st->tcs_out_patch_fits_subgroup) {
      nir_barrier(b, .execution_scope = SCOPE_WORKGROUP, .memory_scope = SCOPE_WORKGROUP,
                  .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared);
   }
--- a/src/amd/compiler/README-ISA.md
+++ b/src/amd/compiler/README-ISA.md
@@ -376,11 +376,13 @@ A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff`
 ### VALUMaskWriteHazard

 Triggered by:
-SALU writing then SALU or VALU reading a SGPR that was previously used as a lane mask for a VALU.
+SALU or VALU writing then SALU or VALU reading a SGPR that was previously used as a lane mask for a
+VALU when using wave64.

 Mitigated by:
-A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the
-SALU write: `s_waitcnt_depctr 0xfffe`
+A VALU instruction reading a non-exec SGPR before the SGPR write, or a wait after the
+write: `s_waitcnt_depctr 0xfffe` for SALU, `s_waitcnt_depctr 0xf1ff` for non-VCC VALU and
+`s_waitcnt_depctr 0xfffd` for VCC VALU.

 ## RDNA4 / GFX12 hazards

--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -832,8 +832,8 @@ emit_mimg_instruction_gfx12(asm_context& ctx, std::vector<uint32_t>& out, const
   uint8_t vaddr[5] = {0, 0, 0, 0, 0};
   for (unsigned i = 3; i < instr->operands.size(); i++)
      vaddr[i - 3] = reg(ctx, instr->operands[i], 8);
-   unsigned num_vaddr = instr->operands.size() - 3;
-   for (unsigned i = 0; i < MIN2(instr->operands.back().size() - 1, 5 - num_vaddr); i++)
+   int num_vaddr = instr->operands.size() - 3;
+   for (int i = 0; i < (int)MIN2(instr->operands.back().size() - 1, ARRAY_SIZE(vaddr) - num_vaddr); i++)
      vaddr[num_vaddr + i] = reg(ctx, instr->operands.back(), 8) + i + 1;

   encoding = 0;
@@ -1538,6 +1538,8 @@ chain_branches(asm_context& ctx, std::vector<uint32_t>& out, branch_info& branch
   unsigned target = branch.target;
   branch.target = new_block->index;

+   unsigned skip_branch_target = 0; /* Target of potentially inserted short jump. */
+
   /* Find suitable insertion point:
    * We define two offset ranges within our new branch instruction should be placed.
    * Then we try to maximize the distance from either the previous branch or the target.
@@ -1604,6 +1606,7 @@ chain_branches(asm_context& ctx, std::vector<uint32_t>& out, branch_info& branch
         bld.reset(&ctx.program->blocks[insertion_block_idx].instructions, it);
      } else {
         bld.reset(&ctx.program->blocks[insertion_block_idx - 1].instructions);
+         skip_branch_target = insertion_block_idx;
      }

      /* Since we insert a branch into existing code, mitigate LdsBranchVmemWARHazard on GFX10. */
@@ -1623,6 +1626,11 @@ chain_branches(asm_context& ctx, std::vector<uint32_t>& out, branch_info& branch
   insert_code(ctx, out, insert_at, code.size(), code.data());

   new_block->offset = block_offset;
+   if (skip_branch_target) {
+      /* If we insert a short jump over the new branch at the end of a block,
+       * ensure that it gets updated accordingly after additional changes. */
+      ctx.branches.push_back({block_offset - 1, skip_branch_target});
+   }
   ctx.branches.push_back({block_offset, target});
   assert(out[ctx.branches.back().pos] == code.back());
 }
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -258,6 +258,7 @@ struct NOP_ctx_gfx11 {
   /* VALUMaskWriteHazard */
   std::bitset<128> sgpr_read_by_valu_as_lanemask;
   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
+   std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_valu;

   /* WMMAHazards */
   std::bitset<256> vgpr_written_by_wmma;
@@ -280,6 +281,8 @@ struct NOP_ctx_gfx11 {
      sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
      sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
         other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
+      sgpr_read_by_valu_as_lanemask_then_wr_by_valu |=
+         other.sgpr_read_by_valu_as_lanemask_then_wr_by_valu;
      vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
      sgpr_read_by_valu |= other.sgpr_read_by_valu;
      sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu;
@@ -299,6 +302,8 @@ struct NOP_ctx_gfx11 {
             sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
             sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
                other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
+             sgpr_read_by_valu_as_lanemask_then_wr_by_valu ==
+                other.sgpr_read_by_valu_as_lanemask_then_wr_by_valu &&
             vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
             sgpr_read_by_valu == other.sgpr_read_by_valu &&
             sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
@@ -798,24 +803,6 @@ check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& chec
                      });
 }

-template <std::size_t N>
-bool
-check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
-{
-   return std::any_of(instr->operands.begin(), instr->operands.end(),
-                      [&check_regs](const Operand& op) -> bool
-                      {
-                         if (op.isConstant())
-                            return false;
-                         bool writes_any = false;
-                         for (unsigned i = 0; i < op.size(); i++) {
-                            unsigned op_reg = op.physReg() + i;
-                            writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
-                         }
-                         return writes_any;
-                      });
-}
-
 template <std::size_t N>
 void
 mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
@@ -1464,23 +1451,62 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&

   if (state.program->gfx_level < GFX12) {
      /* VALUMaskWriteHazard
-       * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU or
-       * VALU.
+       * VALU reads SGPR as a lane mask and later written by SALU or VALU cannot safely be read by
+       * SALU or VALU.
       */
-      if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
-          check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
-         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
-         sa_sdst = 0;
+      if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU())) {
+         uint16_t imm = 0xffff;
+
+         for (Operand op : instr->operands) {
+            if (op.physReg() >= state.program->dev.sgpr_limit)
+               continue;
+
+            for (unsigned i = 0; i < op.size(); i++) {
+               unsigned reg = op.physReg() + i;
+
+               /* s_waitcnt_depctr on sa_sdst */
+               if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg]) {
+                  imm &= 0xfffe;
+                  sa_sdst = 0;
+               }
+
+               /* s_waitcnt_depctr on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */
+               if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg]) {
+                  bool is_vcc = reg == vcc || reg == vcc_hi;
+                  imm &= is_vcc ? 0xfffd : 0xf1ff;
+                  if (is_vcc)
+                     wait.va_vcc = 0;
+                  else
+                     wait.va_sdst = 0;
+               }
+            }
+         }
+
+         if (imm != 0xffff)
+            bld.sopp(aco_opcode::s_waitcnt_depctr, imm);
      }

      if (va_vdst == 0) {
         ctx.valu_since_wr_by_trans.reset();
         ctx.trans_since_wr_by_trans.reset();
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu.reset();
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();

+      if (wait.va_sdst == 0) {
+         std::bitset<128> old = ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu;
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu.reset();
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc] = old[vcc];
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc_hi] = old[vcc_hi];
+      }
+
+      if (wait.va_vcc == 0) {
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc] = false;
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc_hi] = false;
+      }
+
      if (state.program->wave_size == 64 && instr->isSALU() &&
          check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
         unsigned reg = instr->definitions[0].physReg().reg();
@@ -1511,6 +1537,15 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
               if (!op.isConstant() && op.physReg().reg() < 126)
                  ctx.sgpr_read_by_valu_as_lanemask.reset();
            }
+
+            if (!instr->definitions.empty() &&
+                instr->definitions.back().getTemp().type() == RegType::sgpr &&
+                check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
+               unsigned reg = instr->definitions.back().physReg().reg();
+               for (unsigned i = 0; i < instr->definitions.back().size(); i++)
+                  ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg + i] = 1;
+            }
+
            switch (instr->opcode) {
            case aco_opcode::v_addc_co_u32:
            case aco_opcode::v_subb_co_u32:
@@ -1745,6 +1780,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
         waitcnt_depctr &= 0xfffe;
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
      }
+      if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc] ||
+          ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc_hi]) {
+         waitcnt_depctr &= 0xfffd;
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc] = false;
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc_hi] = false;
+      }
+      if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu.any()) {
+         waitcnt_depctr &= 0xf1ff;
+         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu.reset();
+      }
      if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
         valu_read_sgpr = true;
         ctx.sgpr_read_by_valu_as_lanemask.reset();
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -300,6 +300,21 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>

   } else if (preds.size() == 1) {
      ctx.info[idx].exec = ctx.info[preds[0]].exec;
+
+      /* After continue and break blocks, we implicitly set exec to zero.
+       * This is so that parallelcopies can be inserted before the branch
+       * without being affected by the changed exec mask.
+       */
+      if (ctx.info[idx].exec.back().op.constantEquals(0)) {
+         assert(block->logical_succs.empty());
+         /* Check whether the successor block already restores exec. */
+         uint16_t block_kind = ctx.program->blocks[block->linear_succs[0]].kind;
+         if (!(block_kind & (block_kind_loop_header | block_kind_loop_exit | block_kind_invert |
+                             block_kind_merge))) {
+            /* The successor does not restore exec. */
+            restore_exec = true;
+         }
+      }
   } else {
      assert(preds.size() == 2);
      assert(ctx.info[preds[0]].exec.size() == ctx.info[preds[1]].exec.size());
@@ -627,15 +642,14 @@ add_branch_code(exec_ctx& ctx, Block* block)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();

-      bool need_parallelcopy = false;
-      while (!(ctx.info[idx].exec.back().type & mask_type_loop)) {
+      while (!(ctx.info[idx].exec.back().type & mask_type_loop))
         ctx.info[idx].exec.pop_back();
-         need_parallelcopy = true;
-      }

-      if (need_parallelcopy)
-         bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().op);
-      bld.branch(aco_opcode::p_cbranch_nz, Operand(exec, bld.lm), block->linear_succs[1],
+      Temp cond = bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc),
+                           ctx.info[idx].exec.back().op, Operand::zero(bld.lm.bytes()))
+                     .def(1)
+                     .getTemp();
+      bld.branch(aco_opcode::p_cbranch_nz, Operand(cond, scc), block->linear_succs[1],
                 block->linear_succs[0]);
   } else if (block->kind & block_kind_uniform) {
      Pseudo_branch_instruction& branch = block->instructions.back()->branch();
@@ -703,14 +717,8 @@ add_branch_code(exec_ctx& ctx, Block* block)
            break;
      }

-      /* check if the successor is the merge block, otherwise set exec to 0 */
-      // TODO: this could be done better by directly branching to the merge block
-      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
-      Block& succ = ctx.program->blocks[succ_idx];
-      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
-      }
-
+      /* Implicitly set exec to zero and branch. */
+      ctx.info[idx].exec.back().op = Operand::zero(bld.lm.bytes());
      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                 block->linear_succs[0]);
   } else if (block->kind & block_kind_continue) {
@@ -729,14 +737,8 @@ add_branch_code(exec_ctx& ctx, Block* block)
      }
      assert(cond != Temp());

-      /* check if the successor is the merge block, otherwise set exec to 0 */
-      // TODO: this could be done better by directly branching to the merge block
-      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
-      Block& succ = ctx.program->blocks[succ_idx];
-      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
-      }
-
+      /* Implicitly set exec to zero and branch. */
+      ctx.info[idx].exec.back().op = Operand::zero(bld.lm.bytes());
      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1],
                 block->linear_succs[0]);
   } else {
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -287,8 +287,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
         if (vmem_type && ctx.gfx_level < GFX12) {
            wait_event event = get_vmem_event(ctx, instr, vmem_type);
            wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
-            if ((it->second.events & ctx.info->events[type]) == event &&
-                (type != wait_type_vm || it->second.vmem_types == vmem_type))
+
+            bool event_matches = (it->second.events & ctx.info->events[type]) == event;
+            /* wait_type_vm/counter_vm can have several different vmem_types */
+            bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
+                                                         util_bitcount(vmem_type) == 1);
+
+            if (event_matches && type_matches)
               reg_imm[type] = wait_imm::unset_counter;
         }

@@ -319,9 +324,9 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned se
         if (bar_scope_lds <= subgroup_scope)
            events &= ~event_lds;

-         /* Until GFX12, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
+         /* Until GFX11, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
          * in-order for the same workgroup */
-         if (ctx.gfx_level < GFX12 && !ctx.program->wgp_mode && sync.scope <= scope_workgroup)
+         if (ctx.gfx_level < GFX11 && !ctx.program->wgp_mode && sync.scope <= scope_workgroup)
            events &= ~(event_vmem | event_vmem_store | event_smem);

         if (events)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7391,7 +7391,9 @@ Temp
 get_scratch_resource(isel_context* ctx)
 {
   Builder bld(ctx->program, ctx->block);
-   Temp scratch_addr = ctx->program->private_segment_buffer;
+   Temp scratch_addr;
+   if (!ctx->program->private_segment_buffers.empty())
+      scratch_addr = ctx->program->private_segment_buffers.back();
   if (!scratch_addr.bytes()) {
      Temp addr_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
@@ -7449,7 +7451,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
   } else {
      info.resource = get_scratch_resource(ctx);
      info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
-      info.soffset = ctx->program->scratch_offset;
+      info.soffset = ctx->program->scratch_offsets.back();
      emit_load(ctx, bld, info, scratch_mubuf_load_params);
   }
 }
@@ -7505,7 +7507,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
      offset = as_vgpr(ctx, offset);
      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
-         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
+         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offsets.back(),
                                        write_datas[i], offsets[i], true);
         mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
         unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD |
@@ -7932,7 +7934,7 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
   Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
   Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));

-   VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
+   VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0x7)->valu();
   vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
   vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
   vop3p.clamp = clamp;
@@ -8082,24 +8084,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
         aco_opcode subrev =
            instr->def.bit_size == 16 ? aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32;

-         /* v_interp with constant sources only works on GFX11/11.5,
-          * and it's only faster on GFX11.5.
-          */
-         bool use_interp = dpp_ctrl1 == dpp_quad_perm(0, 0, 0, 0) && instr->def.bit_size == 32 &&
-                           ctx->program->gfx_level == GFX11_5;
         if (!nir_src_is_divergent(&instr->src[0])) {
            bld.vop2(subrev, Definition(dst), src, src);
-         } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(1, 1, 1, 1)) {
-            bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), src,
-                              Operand::c32(0x3f800000), src)
-               ->valu()
-               .neg[2] = true;
-         } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(2, 2, 2, 2)) {
-            Builder::Result tmp = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1),
-                                                    Operand::c32(0), Operand::c32(0), src);
-            tmp->valu().neg = 0x6;
-            bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), src,
-                              Operand::c32(0x3f800000), tmp);
         } else if (ctx->program->gfx_level >= GFX8 && dpp_ctrl2 == dpp_quad_perm(0, 1, 2, 3)) {
            bld.vop2_dpp(subrev, Definition(dst), src, src, dpp_ctrl1);
         } else if (ctx->program->gfx_level >= GFX8) {
@@ -8662,6 +8648,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
      if (ctx->shader->info.maximally_reconverges)
         ctx->program->needs_wqm = true;

+      if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) {
+         ctx->cf_info.exec.potentially_empty_discard = true;
+         begin_empty_exec_skip(ctx, &instr->instr, instr->instr.block);
+      }
+
      break;
   }
   case nir_intrinsic_terminate:
@@ -10940,9 +10931,9 @@ add_startpgm(struct isel_context* ctx)
          * handling spilling.
          */
         if (ctx->args->ring_offsets.used)
-            ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
+            ctx->program->private_segment_buffers.push_back(get_arg(ctx, ctx->args->ring_offsets));

-         ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
+         ctx->program->scratch_offsets.push_back(get_arg(ctx, ctx->args->scratch_offset));
      } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
         /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
          */
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -75,6 +75,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
+      case GFX11_5: program->family = CHIP_GFX1150; break;
      case GFX12: program->family = CHIP_GFX1200; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
@@ -151,7 +152,9 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
   default: break;
   }

-   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
+   program->dev.sram_ecc_enabled = program->family == CHIP_VEGA20 ||
+                                   program->family == CHIP_MI100 || program->family == CHIP_MI200 ||
+                                   program->family == CHIP_GFX940;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
@@ -1430,15 +1433,20 @@ get_op_fixed_to_def(Instruction* instr)
 uint8_t
 get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
 {
-   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
+   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray) {
      return vmem_bvh;
-   else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
+   } else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) {
      return vmem_sampler;
-   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
-            instr->operands[1].regClass() == s4)
-      return vmem_sampler;
-   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
+   } else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
+              instr->operands[1].regClass() == s4) {
+      bool point_sample_accel =
+         gfx_level == GFX11_5 && (instr->opcode == aco_opcode::image_sample ||
+                                  instr->opcode == aco_opcode::image_sample_l ||
+                                  instr->opcode == aco_opcode::image_sample_lz);
+      return vmem_sampler | (point_sample_accel ? vmem_nosampler : 0);
+   } else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) {
      return vmem_nosampler;
+   }
   return 0;
 }

--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2130,8 +2130,9 @@ public:
   std::vector<ac_shader_debug_info> debug_info;

   std::vector<uint8_t> constant_data;
-   Temp private_segment_buffer;
-   Temp scratch_offset;
+   /* Private segment buffers and scratch offsets. One entry per start/resume block */
+   aco::small_vec<Temp, 2> private_segment_buffers;
+   aco::small_vec<Temp, 2> scratch_offsets;

   uint16_t num_waves = 0;
   uint16_t min_waves = 0;
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -3096,6 +3096,9 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
   if (needs_vop3 && !can_vop3)
      return false;

+   if (instr_info.classes[(int)instr->opcode] == instr_class::valu_pseudo_scalar_trans)
+      return false;
+
   /* SDWA omod is GFX9+. */
   bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P() &&
                       (!instr->isVINTERP_INREG() || interp_can_become_fma(ctx, instr));
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -253,6 +253,12 @@ struct DefInfo {

         if (imageGather4D16Bug)
            bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
+      } else if (instr_info.classes[(int)instr->opcode] == instr_class::valu_pseudo_scalar_trans) {
+         /* RDNA4 ISA doc, 7.10. Pseudo-scalar Transcendental ALU ops:
+          * - VCC may not be used as a destination
+          */
+         if (bounds.contains(vcc))
+            bounds.size = vcc - bounds.lo();
      }

      if (!data_stride)
@@ -1274,7 +1280,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
   RegClass rc = info.rc;

   /* check how many free regs we have */
-   unsigned regs_free = reg_file.count_zero(bounds);
+   unsigned regs_free = reg_file.count_zero(get_reg_bounds(ctx, rc));

   /* mark and count killed operands */
   unsigned killed_ops = 0;
@@ -1427,6 +1433,14 @@ get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
   if (!info.bounds.contains(reg_win) && !is_vcc && !is_m0)
      return false;

+   if (instr_info.classes[(int)instr->opcode] == instr_class::valu_pseudo_scalar_trans) {
+      /* RDNA4 ISA doc, 7.10. Pseudo-scalar Transcendental ALU ops:
+       * - VCC may not be used as a destination
+       */
+      if (vcc_win.contains(reg_win))
+         return false;
+   }
+
   if (reg_file.test(reg, info.rc.bytes()))
      return false;

@@ -1835,7 +1849,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
-   assert(reg_file.count_zero(info.bounds) >= info.size);
+   assert(reg_file.count_zero(get_reg_bounds(ctx, info.rc)) >= info.size);

   /* try using more registers */
   if (!increase_register_file(ctx, info.rc)) {
--- a/src/amd/compiler/aco_reindex_ssa.cpp
+++ b/src/amd/compiler/aco_reindex_ssa.cpp
@@ -69,10 +69,14 @@ reindex_program(idx_ctx& ctx, Program* program)
   }

   /* update program members */
-   program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()],
-                                          program->private_segment_buffer.regClass());
-   program->scratch_offset =
-      Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass());
+   for (auto& private_segment_buffer : program->private_segment_buffers) {
+      private_segment_buffer =
+         Temp(ctx.renames[private_segment_buffer.id()], private_segment_buffer.regClass());
+   }
+   for (auto& scratch_offset : program->scratch_offsets) {
+      scratch_offset =
+         Temp(ctx.renames[scratch_offset.id()], scratch_offset.regClass());
+   }
   program->temp_rc = ctx.temp_rc;
 }

--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1264,8 +1264,12 @@ schedule_program(Program* program)
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);

   assert(ctx.num_waves > 0);
-   ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
-                           int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
+   ctx.mv.max_registers = {
+      int16_t(get_addr_vgpr_from_waves(
+                 program, std::max<uint16_t>(ctx.num_waves * wave_fac, program->min_waves)) -
+              2),
+      int16_t(get_addr_sgpr_from_waves(
+         program, std::max<uint16_t>(ctx.num_waves * wave_fac, program->min_waves)))};

   /* NGG culling shaders are very sensitive to position export scheduling.
    * Schedule less aggressively when early primitive export is used, and
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -213,7 +213,7 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
 }

 bool
-is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
+is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b, bool* swap)
 {
   if ((a.is_opy_only && b.is_opy_only) || (a.is_dst_odd == b.is_dst_odd))
      return false;
@@ -222,6 +222,8 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
   if (a.has_literal && b.has_literal && a.literal != b.literal)
      return false;

+   *swap = false;
+
   /* The rest is checking src VGPR bank compatibility. */
   if ((a.src_banks & b.src_banks) == 0)
      return true;
@@ -244,11 +246,13 @@ is_vopd_compatible(const VOPDInfo& a, const VOPDInfo& b)
   if (b.op == aco_opcode::v_dual_mov_b32 && !a.is_commutative && a.is_opy_only)
      return false;

+   *swap = true;
+
   return true;
 }

 bool
-can_use_vopd(const SchedILPContext& ctx, unsigned idx)
+can_use_vopd(const SchedILPContext& ctx, unsigned idx, bool* prev_can_be_opx)
 {
   VOPDInfo cur_vopd = ctx.vopd[idx];
   Instruction* first = ctx.nodes[idx].instr;
@@ -260,9 +264,14 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
   if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
      return false;

-   if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd))
+   bool swap = false;
+   if (!is_vopd_compatible(ctx.prev_vopd_info, cur_vopd, &swap))
      return false;

+   /* If we have to swap a v_mov_b32, it will become an OPY-only opcode. */
+   if (swap && !ctx.prev_vopd_info.is_commutative && cur_vopd.op == aco_opcode::v_dual_mov_b32)
+      cur_vopd.is_opy_only = true;
+
   assert(first->definitions.size() == 1);
   assert(first->definitions[0].size() == 1);
   assert(second->definitions.size() == 1);
@@ -279,8 +288,23 @@ can_use_vopd(const SchedILPContext& ctx, unsigned idx)
         return false;
   }

-   /* WaR dependencies are not a concern. */
-   return true;
+   /* WaR dependencies are not a concern before GFX12. */
+   *prev_can_be_opx = true;
+   if (ctx.program->gfx_level >= GFX12) {
+      /* From RDNA4 ISA doc:
+       * "The OPX instruction must not overwrite sources of the OPY instruction".
+       */
+      bool war = false;
+      for (Operand op : first->operands) {
+         assert(op.size() == 1);
+         if (second->definitions[0].physReg() == op.physReg())
+            war = true;
+      }
+      if (war)
+         *prev_can_be_opx = false;
+   }
+
+   return *prev_can_be_opx || !cur_vopd.is_opy_only;
 }

 Instruction_cycle_info
@@ -619,9 +643,9 @@ select_instruction_ilp(const SchedILPContext& ctx)

 bool
 compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
-                   unsigned current, unsigned candidate)
+                   bool* prev_can_be_opx, unsigned current, unsigned candidate)
 {
-   if (can_use_vopd(ctx, candidate)) {
+   if (can_use_vopd(ctx, candidate, prev_can_be_opx)) {
      /* If we can form a VOPD instruction, always prefer to do so. */
      if (!*use_vopd) {
         *use_vopd = true;
@@ -657,7 +681,7 @@ compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool
 }

 unsigned
-select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
+select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd, bool* prev_can_be_opx)
 {
   *use_vopd = false;

@@ -679,11 +703,14 @@ select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
      if (candidate.dependency_mask)
         continue;

+      bool prev_can_be_opx_for_i;
      if (cur == -1u) {
         cur = i;
-         *use_vopd = can_use_vopd(ctx, i);
-      } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
+         *use_vopd = can_use_vopd(ctx, i, prev_can_be_opx);
+      } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, &prev_can_be_opx_for_i,
+                                    cur, i)) {
         cur = i;
+         *prev_can_be_opx = prev_can_be_opx_for_i;
      }
   }

@@ -719,24 +746,29 @@ get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const V
 }

 Instruction*
-create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
+create_vopd_instruction(const SchedILPContext& ctx, unsigned idx, bool prev_can_be_opx)
 {
   Instruction* x = ctx.prev_info.instr;
   Instruction* y = ctx.nodes[idx].instr;
   VOPDInfo x_info = ctx.prev_vopd_info;
   VOPDInfo y_info = ctx.vopd[idx];
+   x_info.is_opy_only |= !prev_can_be_opx;

   bool swap_x = false, swap_y = false;
   if (x_info.src_banks & y_info.src_banks) {
      assert(x_info.is_commutative || y_info.is_commutative);
      /* Avoid swapping v_mov_b32 because it will become an OPY-only opcode. */
-      if (x_info.op == aco_opcode::v_dual_mov_b32 && !y_info.is_commutative) {
+      if (x_info.op == aco_opcode::v_dual_mov_b32 && y_info.op == aco_opcode::v_dual_mov_b32) {
+         swap_x = x_info.is_opy_only;
+         swap_y = !swap_x;
+      } else if (x_info.op == aco_opcode::v_dual_mov_b32 && !y_info.is_commutative) {
         swap_x = true;
         x_info.is_opy_only = true;
      } else {
         swap_x = x_info.is_commutative && x_info.op != aco_opcode::v_dual_mov_b32;
         swap_y = y_info.is_commutative && !swap_x;
      }
+      y_info.is_opy_only |= swap_y && y_info.op == aco_opcode::v_dual_mov_b32;
   }

   if (x_info.is_opy_only) {
@@ -744,6 +776,7 @@ create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
      std::swap(x_info, y_info);
      std::swap(swap_x, swap_y);
   }
+   assert(!x_info.is_opy_only);

   aco_opcode x_op, y_op;
   unsigned num_operands = 0;
@@ -774,14 +807,15 @@ do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_

   ctx.prev_info.instr = NULL;
   bool use_vopd = false;
+   bool prev_can_be_opx;

   while (ctx.active_mask) {
-      unsigned next_idx =
-         ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
+      unsigned next_idx = ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd, &prev_can_be_opx)
+                                      : select_instruction_ilp(ctx);
      Instruction* next_instr = ctx.nodes[next_idx].instr;

      if (use_vopd) {
-         std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
+         std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx, prev_can_be_opx));
         ctx.prev_info.instr = NULL;
      } else {
         (insert_it++)->reset(next_instr);
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -88,13 +88,16 @@ struct spill_ctx {
   unsigned vgpr_spill_slots;
   Temp scratch_rsrc;

+   unsigned resume_idx;
+
   spill_ctx(const RegisterDemand target_pressure_, Program* program_)
       : target_pressure(target_pressure_), program(program_), memory(),
         renames(program->blocks.size(), aco::map<Temp, Temp>(memory)),
         spills_entry(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
         spills_exit(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
         processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()),
-         remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0)
+         remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0),
+         resume_idx(0)
   {}

   void add_affinity(uint32_t first, uint32_t second)
@@ -1088,7 +1091,10 @@ spill_block(spill_ctx& ctx, unsigned block_idx)
 Temp
 load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
 {
-   Temp private_segment_buffer = ctx.program->private_segment_buffer;
+   Temp private_segment_buffer;
+   if (!ctx.program->private_segment_buffers.empty())
+      private_segment_buffer = ctx.program->private_segment_buffers[ctx.resume_idx];
+
   if (!private_segment_buffer.bytes()) {
      Temp addr_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
@@ -1109,7 +1115,7 @@ load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)

      Temp carry = bld.tmp(s1);
      addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
-                         ctx.program->scratch_offset);
+                         ctx.program->scratch_offsets[ctx.resume_idx]);
      addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
                         Operand::c32(0), bld.scc(carry));

@@ -1218,7 +1224,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
   uint32_t spill_id = spill->operands[1].constantValue();
   uint32_t spill_slot = slots[spill_id];

-   Temp scratch_offset = ctx.program->scratch_offset;
+   Temp scratch_offset;
+   if (!ctx.program->scratch_offsets.empty())
+      scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx];
   unsigned offset;
   setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);

@@ -1264,7 +1272,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
   uint32_t spill_id = reload->operands[0].constantValue();
   uint32_t spill_slot = slots[spill_id];

-   Temp scratch_offset = ctx.program->scratch_offset;
+   Temp scratch_offset;
+   if (!ctx.program->scratch_offsets.empty())
+      scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx];
   unsigned offset;
   setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);

@@ -1488,6 +1498,8 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
          * we cannot reuse the current scratch_rsrc temp because its definition is unreachable */
         if (block.linear_preds.empty())
            ctx.scratch_rsrc = Temp();
+         if (block.kind & block_kind_resume)
+            ++ctx.resume_idx;
      }

      std::vector<aco_ptr<Instruction>>::iterator it;
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@@ -59,20 +59,13 @@ collect_phi_info(ssa_elimination_ctx& ctx)
 void
 insert_parallelcopies(ssa_elimination_ctx& ctx)
 {
-   /* insert the parallelcopies from logical phis before p_logical_end */
+   /* insert the parallelcopies from logical phis before branch */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& logical_phi_info = ctx.logical_phi_info[block_idx];
      if (logical_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
-      unsigned idx = block.instructions.size() - 1;
-      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
-         assert(idx > 0);
-         idx--;
-      }
-
-      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
      aco_ptr<Instruction> pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                 logical_phi_info.size(), logical_phi_info.size())};
      unsigned i = 0;
@@ -82,6 +75,7 @@ insert_parallelcopies(ssa_elimination_ctx& ctx)
         i++;
      }
      pc->pseudo().needs_scratch_reg = false;
+      auto it = std::prev(block.instructions.end());
      block.instructions.insert(it, std::move(pc));
   }

--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
@@ -340,6 +340,81 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
   }
 END_TEST

+BEGIN_TEST(insert_waitcnt.waw.point_sample_accel)
+   if (!setup_cs(NULL, GFX11_5))
+      return;
+
+   Definition def_v4(PhysReg(260), v1);
+   Operand op_v0(PhysReg(256), v1);
+   Operand desc_s4(PhysReg(0), s4);
+   Operand desc_s8(PhysReg(8), s8);
+
+   /* image_sample has point sample acceleration, but image_sample_b does not. Both are VMEM sample
+    * instructions.
+    *
+    * Every case below writes v4 twice in a row. Cases 0-4 involve at least one image_sample and
+    * expect s_waitcnt vmcnt(0) before the second write. Cases 5 and 6 repeat image_sample_b or
+    * image_load and expect no wait to be inserted between the two writes. */
+
+   //>> p_unit_test 0
+   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   //! s_waitcnt vmcnt(0)
+   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+
+   //>> p_unit_test 1
+   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   //! s_waitcnt vmcnt(0)
+   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+
+   //>> p_unit_test 2
+   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
+   //! s_waitcnt vmcnt(0)
+   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+
+   //>> p_unit_test 3
+   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   //! s_waitcnt vmcnt(0)
+   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+
+   //>> p_unit_test 4
+   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   //! s_waitcnt vmcnt(0)
+   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
+
+   //>> p_unit_test 5
+   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
+
+   /* Distinct index so this case cannot be confused with the image_sample_b case above. */
+   //>> p_unit_test 6
+   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
+   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
+   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
+
+   finish_waitcnt_test();
+END_TEST
+
 BEGIN_TEST(insert_waitcnt.vmem)
   if (!setup_cs(NULL, GFX12))
      return;
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -2083,3 +2083,18 @@ BEGIN_TEST(optimizer.trans_inline_constant)

   finish_opt_test();
 END_TEST
+
+BEGIN_TEST(optimizer.trans_no_omod)
+   //>> s1: %a = p_startpgm
+   if (!setup_cs("s1", GFX12))
+      return;
+
+   /* v_s_log_f32 writes an SGPR result here, which is then multiplied by 0.5 (0x3f000000).
+    * The expected output keeps the v_mul_legacy_f32 as a separate instruction after the
+    * optimizer has run. */
+   //! s1: %tmp0 = v_s_log_f32 %a
+   //! v1: %res = v_mul_legacy_f32 %tmp0, 0.5
+   //! p_unit_test 0, %res
+   Temp dst = bld.vop3(aco_opcode::v_s_log_f32, bld.def(s1), inputs[0]);
+   writeout(0, bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), dst,
+                        bld.copy(bld.def(v1), Operand::c32(0x3f000000))));
+
+   finish_opt_test();
+END_TEST
--- a/src/amd/compiler/tests/test_scheduler.cpp
+++ b/src/amd/compiler/tests/test_scheduler.cpp
@@ -153,3 +153,46 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)

   finish_schedule_vopd_test();
 END_TEST
+
+BEGIN_TEST(vopd_sched.war)
+   for (amd_gfx_level gfx : {GFX11, GFX12}) {
+      if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
+         continue;
+
+      PhysReg reg_v0{256};
+      PhysReg reg_v1{257};
+      PhysReg reg_v3{259};
+      PhysReg reg_v5{261};
+
+      /* In every case the second instruction overwrites v1, a register that the first
+       * instruction reads (write-after-read). The expected result differs between GFX11 and
+       * GFX12, so each case carries one expectation per generation. */
+      //>> p_unit_test 0
+      //~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3]
+      //~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
+      bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1),
+               Operand(reg_v3, v1));
+      bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
+               Operand(reg_v1, v1));
+
+      /* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the
+       * v_add_u32 because that opcode is OPY-only. On GFX12 the two instructions are therefore
+       * expected to stay separate. */
+      //>> p_unit_test 1
+      //~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3]
+      //~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3]
+      //~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+      bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1),
+               Operand(reg_v3, v1));
+      bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
+               Operand(reg_v1, v1));
+
+      /* Test that we swap the right v_mov_b32: one of the two moves is expected to become
+       * v_dual_add_nc_u32 0, <src>, and which one differs between GFX11 and GFX12. */
+      //>> p_unit_test 2
+      //~gfx11! v1: %0:v[1] = v_dual_mov_b32 %0:v[5] :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[1]
+      //~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[1] :: v1: %0:v[1] = v_dual_add_nc_u32 0, %0:v[5]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+      bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v1, v1));
+      bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v1, v1), Operand(reg_v5, v1));
+
+      finish_schedule_vopd_test();
+   }
+END_TEST
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -1578,11 +1578,14 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
         num_bytes = 16;
      }

-      /* check alignment of 16 Bit stores */
-      if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) {
-         writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
+      /* check alignment of 8/16 Bit stores */
+      uint32_t align_mul = nir_intrinsic_align_mul(instr);
+      uint32_t align_offset = nir_intrinsic_align_offset(instr) + start * elem_size_bytes;
+      uint32_t align = nir_combined_align(align_mul, align_offset & (align_mul - 1));
+      if (align < MIN2(num_bytes, 4) || (ctx->ac.gfx_level == GFX6 && elem_size_bytes < 4)) {
+         writemask |= BITFIELD_RANGE(start + 1, count - 1);
         count = 1;
-         num_bytes = 2;
+         num_bytes = elem_size_bytes;
      }

      /* Due to alignment issues, split stores of 8-bit/16-bit
@@ -1882,10 +1885,17 @@ static LLVMValueRef visit_load_global(struct ac_nir_context *ctx,

   val = LLVMBuildLoad2(ctx->ac.builder, result_type, addr, "");

-   if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) {
+   /* From the LLVM 21.0.0 language reference:
+    * > An alignment value higher than the size of the loaded type implies memory up to the
+    * > alignment value bytes can be safely loaded without trapping in the default address space.
+    * So limit the alignment to the access size, since this isn't true in NIR.
+    */
+   uint32_t align = nir_intrinsic_align(instr);
+   uint32_t size = ac_get_type_size(result_type);
+   LLVMSetAlignment(val, MIN2(align, 1 << (ffs(size) - 1)));
+
+   if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
      LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
-      LLVMSetAlignment(val, ac_get_type_size(result_type));
-   }

   return val;
 }
@@ -1904,10 +1914,12 @@ static void visit_store_global(struct ac_nir_context *ctx,

   val = LLVMBuildStore(ctx->ac.builder, data, addr);

-   if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) {
+   uint32_t align = nir_intrinsic_align(instr);
+   uint32_t size = ac_get_type_size(type);
+   LLVMSetAlignment(val, MIN2(align, 1 << (ffs(size) - 1)));
+
+   if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
      LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
-      LLVMSetAlignment(val, ac_get_type_size(type));
-   }
 }

 static LLVMValueRef visit_global_atomic(struct ac_nir_context *ctx,
--- a/src/amd/vulkan/bvh/encode.comp
+++ b/src/amd/vulkan/bvh/encode.comp
@@ -42,45 +42,47 @@ main()

   uint32_t ir_leaf_node_size;
   uint32_t output_leaf_node_size;
-   switch (args.geometry_type) {
-   case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
-      ir_leaf_node_size = SIZEOF(vk_ir_triangle_node);
-      output_leaf_node_size = SIZEOF(radv_bvh_triangle_node);
+   if (gl_GlobalInvocationID.x < args.leaf_node_count) {
+      switch (args.geometry_type) {
+      case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
+         ir_leaf_node_size = SIZEOF(vk_ir_triangle_node);
+         output_leaf_node_size = SIZEOF(radv_bvh_triangle_node);

-      vk_ir_triangle_node src_node =
-         DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
-      REF(radv_bvh_triangle_node) dst_node =
-         REF(radv_bvh_triangle_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));
+         vk_ir_triangle_node src_node =
+            DEREF(REF(vk_ir_triangle_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
+         REF(radv_bvh_triangle_node) dst_node =
+            REF(radv_bvh_triangle_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));

-      DEREF(dst_node).coords = src_node.coords;
-      DEREF(dst_node).triangle_id = src_node.triangle_id;
-      DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags;
-      DEREF(dst_node).id = 9;
+         DEREF(dst_node).coords = src_node.coords;
+         DEREF(dst_node).triangle_id = src_node.triangle_id;
+         DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags;
+         DEREF(dst_node).id = 9;

-      break;
-   }
-   case VK_GEOMETRY_TYPE_AABBS_KHR: {
-      ir_leaf_node_size = SIZEOF(vk_ir_aabb_node);
-      output_leaf_node_size = SIZEOF(radv_bvh_aabb_node);
+         break;
+      }
+      case VK_GEOMETRY_TYPE_AABBS_KHR: {
+         ir_leaf_node_size = SIZEOF(vk_ir_aabb_node);
+         output_leaf_node_size = SIZEOF(radv_bvh_aabb_node);

-      vk_ir_aabb_node src_node =
-         DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
-      REF(radv_bvh_aabb_node) dst_node =
-         REF(radv_bvh_aabb_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));
+         vk_ir_aabb_node src_node =
+            DEREF(REF(vk_ir_aabb_node)(OFFSET(args.intermediate_bvh, gl_GlobalInvocationID.x * ir_leaf_node_size)));
+         REF(radv_bvh_aabb_node) dst_node =
+            REF(radv_bvh_aabb_node)(OFFSET(args.output_bvh, dst_leaf_offset + gl_GlobalInvocationID.x * output_leaf_node_size));

-      DEREF(dst_node).primitive_id = src_node.primitive_id;
-      DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags;
+         DEREF(dst_node).primitive_id = src_node.primitive_id;
+         DEREF(dst_node).geometry_id_and_flags = src_node.geometry_id_and_flags;

-      break;
-   }
-   default:
-      /* instances */
-      ir_leaf_node_size = SIZEOF(vk_ir_instance_node);
-      output_leaf_node_size = SIZEOF(radv_bvh_instance_node);
-      /* Instance nodes have to be emitted inside the loop since encoding them
-       * loads an address from the IR node which is uninitialized for inactive nodes.
-       */
-      break;
+         break;
+      }
+      default:
+         /* instances */
+         ir_leaf_node_size = SIZEOF(vk_ir_instance_node);
+         output_leaf_node_size = SIZEOF(radv_bvh_instance_node);
+         /* Instance nodes have to be emitted inside the loop since encoding them
+          * loads an address from the IR node which is uninitialized for inactive nodes.
+          */
+         break;
+      }
   }

   if (gl_GlobalInvocationID.x >= DEREF(args.header).ir_internal_node_count)
--- a/src/amd/vulkan/meta/radv_meta.c
+++ b/src/amd/vulkan/meta/radv_meta.c
@@ -13,13 +13,6 @@
 #include "vk_pipeline_cache.h"
 #include "vk_util.h"

-#include <fcntl.h>
-#include <limits.h>
-#ifndef _WIN32
-#include <pwd.h>
-#endif
-#include <sys/stat.h>
-
 static void
 radv_suspend_queries(struct radv_meta_saved_state *state, struct radv_cmd_buffer *cmd_buffer)
 {
@@ -292,54 +285,11 @@ meta_free(void *_device, void *data)
   device->vk.alloc.pfnFree(device->vk.alloc.pUserData, data);
 }

-#ifndef _WIN32
-static bool
-radv_builtin_cache_path(char *path)
-{
-   char *xdg_cache_home = secure_getenv("XDG_CACHE_HOME");
-   const char *suffix = "/radv_builtin_shaders";
-   const char *suffix2 = "/.cache/radv_builtin_shaders";
-   struct passwd pwd, *result;
-   char path2[PATH_MAX + 1]; /* PATH_MAX is not a real max,but suffices here. */
-   int ret;
-
-   if (xdg_cache_home) {
-      ret = snprintf(path, PATH_MAX + 1, "%s%s%zd", xdg_cache_home, suffix, sizeof(void *) * 8);
-      return ret > 0 && ret < PATH_MAX + 1;
-   }
-
-   getpwuid_r(getuid(), &pwd, path2, PATH_MAX - strlen(suffix2), &result);
-   if (!result)
-      return false;
-
-   strcpy(path, pwd.pw_dir);
-   strcat(path, "/.cache");
-   if (mkdir(path, 0755) && errno != EEXIST)
-      return false;
-
-   ret = snprintf(path, PATH_MAX + 1, "%s%s%zd", pwd.pw_dir, suffix2, sizeof(void *) * 8);
-   return ret > 0 && ret < PATH_MAX + 1;
-}
-#endif
-
-static uint32_t
-num_cache_entries(VkPipelineCache cache)
-{
-   struct set *s = vk_pipeline_cache_from_handle(cache)->object_cache;
-   if (!s)
-      return 0;
-   return s->entries;
-}
-
 static void
-radv_load_meta_pipeline(struct radv_device *device)
+radv_init_meta_cache(struct radv_device *device)
 {
-#ifndef _WIN32
-   char path[PATH_MAX + 1];
-   struct stat st;
-   void *data = NULL;
-   int fd = -1;
-   struct vk_pipeline_cache *cache = NULL;
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+   struct vk_pipeline_cache *cache;

   VkPipelineCacheCreateInfo create_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,
@@ -347,81 +297,12 @@ radv_load_meta_pipeline(struct radv_device *device)

   struct vk_pipeline_cache_create_info info = {
      .pCreateInfo = &create_info,
-      .skip_disk_cache = true,
+      .disk_cache = pdev->disk_cache_meta,
   };

-   if (!radv_builtin_cache_path(path))
-      goto fail;
-
-   fd = open(path, O_RDONLY);
-   if (fd < 0)
-      goto fail;
-   if (fstat(fd, &st))
-      goto fail;
-   data = malloc(st.st_size);
-   if (!data)
-      goto fail;
-   if (read(fd, data, st.st_size) == -1)
-      goto fail;
-
-   create_info.initialDataSize = st.st_size;
-   create_info.pInitialData = data;
-
-fail:
   cache = vk_pipeline_cache_create(&device->vk, &info, NULL);
-
-   if (cache) {
+   if (cache)
      device->meta_state.cache = vk_pipeline_cache_to_handle(cache);
-      device->meta_state.initial_cache_entries = num_cache_entries(device->meta_state.cache);
-   }
-
-   free(data);
-   if (fd >= 0)
-      close(fd);
-#endif
-}
-
-static void
-radv_store_meta_pipeline(struct radv_device *device)
-{
-#ifndef _WIN32
-   char path[PATH_MAX + 1], path2[PATH_MAX + 7];
-   size_t size;
-   void *data = NULL;
-
-   if (device->meta_state.cache == VK_NULL_HANDLE)
-      return;
-
-   /* Skip serialization if no entries were added. */
-   if (num_cache_entries(device->meta_state.cache) <= device->meta_state.initial_cache_entries)
-      return;
-
-   if (vk_common_GetPipelineCacheData(radv_device_to_handle(device), device->meta_state.cache, &size, NULL))
-      return;
-
-   if (!radv_builtin_cache_path(path))
-      return;
-
-   strcpy(path2, path);
-   strcat(path2, "XXXXXX");
-   int fd = mkstemp(path2); // open(path, O_WRONLY | O_CREAT, 0600);
-   if (fd < 0)
-      return;
-   data = malloc(size);
-   if (!data)
-      goto fail;
-
-   if (vk_common_GetPipelineCacheData(radv_device_to_handle(device), device->meta_state.cache, &size, data))
-      goto fail;
-   if (write(fd, data, size) == -1)
-      goto fail;
-
-   rename(path2, path);
-fail:
-   free(data);
-   close(fd);
-   unlink(path2);
-#endif
 }

 VkResult
@@ -439,7 +320,7 @@ radv_device_init_meta(struct radv_device *device)
      .pfnFree = meta_free,
   };

-   radv_load_meta_pipeline(device);
+   radv_init_meta_cache(device);

   result = vk_meta_device_init(&device->vk, &device->meta_state.device);
   if (result != VK_SUCCESS)
@@ -488,7 +369,6 @@ radv_device_finish_meta(struct radv_device *device)

   radv_device_finish_accel_struct_build_state(device);

-   radv_store_meta_pipeline(device);
   vk_common_DestroyPipelineCache(radv_device_to_handle(device), device->meta_state.cache, NULL);
   mtx_destroy(&device->meta_state.mtx);

@@ -612,8 +492,8 @@ radv_break_on_count(nir_builder *b, nir_variable *var, nir_def *count)
 VkResult
 radv_meta_get_noop_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-noop";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_NOOP;

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, NULL, key_data, strlen(key_data),
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, NULL, &key, sizeof(key),
                                      layout_out);
 }
--- a/src/amd/vulkan/meta/radv_meta.h
+++ b/src/amd/vulkan/meta/radv_meta.h
@@ -102,6 +102,52 @@ radv_meta_dst_layout_to_layout(enum radv_meta_dst_layout layout)

 extern const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS];

+/* Identifies which RADV meta operation a vk_meta object (pipeline layout or pipeline) belongs to.
+ * A value is passed to the vk_meta helpers either directly as the key or as the leading `type`
+ * field of a per-operation key struct. Numbering starts at VK_META_OBJECT_KEY_DRIVER_OFFSET. */
+enum radv_meta_object_key_type {
+   RADV_META_OBJECT_KEY_NOOP = VK_META_OBJECT_KEY_DRIVER_OFFSET,
+   RADV_META_OBJECT_KEY_BLIT,
+   RADV_META_OBJECT_KEY_BLIT2D,
+   RADV_META_OBJECT_KEY_BLIT2D_COLOR,
+   RADV_META_OBJECT_KEY_BLIT2D_DEPTH,
+   RADV_META_OBJECT_KEY_BLIT2D_STENCIL,
+   RADV_META_OBJECT_KEY_FILL_BUFFER,
+   RADV_META_OBJECT_KEY_COPY_BUFFER,
+   RADV_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER,
+   RADV_META_OBJECT_KEY_COPY_BUFFER_TO_IMAGE,
+   RADV_META_OBJECT_KEY_COPY_BUFFER_TO_IMAGE_R32G32B32,
+   RADV_META_OBJECT_KEY_COPY_IMAGE,
+   RADV_META_OBJECT_KEY_COPY_IMAGE_R32G32B32,
+   RADV_META_OBJECT_KEY_COPY_VRS_HTILE,
+   RADV_META_OBJECT_KEY_CLEAR_CS,
+   RADV_META_OBJECT_KEY_CLEAR_CS_R32G32B32,
+   RADV_META_OBJECT_KEY_CLEAR_COLOR,
+   RADV_META_OBJECT_KEY_CLEAR_DS,
+   RADV_META_OBJECT_KEY_CLEAR_HTILE,
+   RADV_META_OBJECT_KEY_CLEAR_DCC_COMP_TO_SINGLE,
+   RADV_META_OBJECT_KEY_FAST_CLEAR_ELIMINATE,
+   RADV_META_OBJECT_KEY_DCC_DECOMPRESS,
+   RADV_META_OBJECT_KEY_DCC_RETILE,
+   RADV_META_OBJECT_KEY_HTILE_EXPAND_GFX,
+   RADV_META_OBJECT_KEY_HTILE_EXPAND_CS,
+   RADV_META_OBJECT_KEY_FMASK_COPY,
+   RADV_META_OBJECT_KEY_FMASK_EXPAND,
+   RADV_META_OBJECT_KEY_FMASK_DECOMPRESS,
+   RADV_META_OBJECT_KEY_RESOLVE_HW,
+   RADV_META_OBJECT_KEY_RESOLVE_CS,
+   RADV_META_OBJECT_KEY_RESOLVE_COLOR_CS,
+   RADV_META_OBJECT_KEY_RESOLVE_DS_CS,
+   RADV_META_OBJECT_KEY_RESOLVE_FS,
+   RADV_META_OBJECT_KEY_RESOLVE_COLOR_FS,
+   RADV_META_OBJECT_KEY_RESOLVE_DS_FS,
+   RADV_META_OBJECT_KEY_DGC,
+   RADV_META_OBJECT_KEY_QUERY,
+   RADV_META_OBJECT_KEY_QUERY_OCCLUSION,
+   RADV_META_OBJECT_KEY_QUERY_PIPELINE_STATS,
+   RADV_META_OBJECT_KEY_QUERY_TFB,
+   RADV_META_OBJECT_KEY_QUERY_TIMESTAMP,
+   RADV_META_OBJECT_KEY_QUERY_PRIMS_GEN,
+   RADV_META_OBJECT_KEY_QUERY_MESH_PRIMS_GEN,
+};
+
 VkResult radv_device_init_meta(struct radv_device *device);
 void radv_device_finish_meta(struct radv_device *device);

--- a/src/amd/vulkan/meta/radv_meta_blit.c
+++ b/src/amd/vulkan/meta/radv_meta_blit.c
@@ -165,7 +165,7 @@ translate_sampler_dim(VkImageType type)
 static VkResult
 get_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-blit";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_BLIT;

   const VkDescriptorSetLayoutBinding binding = {
      .binding = 0,
@@ -183,10 +183,17 @@ get_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)

   const VkPushConstantRange pc_range = {VK_SHADER_STAGE_VERTEX_BIT, 0, 20};

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+/* Lookup key for the blit graphics pipelines. It is handed to the vk_meta helpers as raw bytes
+ * (&key, sizeof(key)); get_pipeline() zeroes the whole struct before filling the fields. */
+struct radv_blit_key {
+   enum radv_meta_object_key_type type; /* always RADV_META_OBJECT_KEY_BLIT */
+   VkImageAspectFlags aspects;          /* aspects of the source image */
+   VkImageType image_type;              /* image type of the source image */
+   uint32_t fs_key;                     /* only set for color sources, otherwise left at 0 */
+};
+
 static VkResult
 get_pipeline(struct radv_device *device, const struct radv_image_view *src_iview,
             const struct radv_image_view *dst_iview, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
@@ -195,18 +202,26 @@ get_pipeline(struct radv_device *device, const struct radv_image_view *src_iview
   const struct radv_image *src_image = src_iview->image;
   const struct radv_image *dst_image = dst_iview->image;
   const enum glsl_sampler_dim tex_dim = translate_sampler_dim(src_image->vk.image_type);
-   unsigned fs_key = 0;
-   char key_data[64];
+   struct radv_blit_key key;
   VkResult result;

   result = get_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   if (src_image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT)
-      fs_key = radv_format_meta_fs_key(device, dst_image->vk.format);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_BLIT;
+   key.aspects = src_image->vk.aspects;
+   key.image_type = src_image->vk.image_type;

-   snprintf(key_data, sizeof(key_data), "radv-blit-%d-%d-%d", src_image->vk.aspects, src_image->vk.image_type, fs_key);
+   if (src_image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT)
+      key.fs_key = radv_format_meta_fs_key(device, dst_image->vk.format);
+
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
+   if (pipeline_from_cache != VK_NULL_HANDLE) {
+      *pipeline_out = pipeline_from_cache;
+      return VK_SUCCESS;
+   }

   nir_shader *fs;
   nir_shader *vs = build_nir_vertex_shader(device);
@@ -331,7 +346,7 @@ get_pipeline(struct radv_device *device, const struct radv_image_view *src_iview
   case VK_IMAGE_ASPECT_COLOR_BIT:
      pipeline_create_info.pColorBlendState = &color_blend_info;
      render.color_attachment_count = 1;
-      render.color_attachment_formats[0] = radv_fs_key_format_exemplars[fs_key];
+      render.color_attachment_formats[0] = radv_fs_key_format_exemplars[key.fs_key];
      break;
   case VK_IMAGE_ASPECT_DEPTH_BIT:
      pipeline_create_info.pDepthStencilState = &depth_info;
@@ -346,7 +361,7 @@ get_pipeline(struct radv_device *device, const struct radv_image_view *src_iview
   }

   result = vk_meta_create_graphics_pipeline(&device->vk, &device->meta_state.device, &pipeline_create_info, &render,
-                                             key_data, strlen(key_data), pipeline_out);
+                                             &key, sizeof(key), pipeline_out);

   ralloc_free(vs);
   ralloc_free(fs);
--- a/src/amd/vulkan/meta/radv_meta_blit2d.c
+++ b/src/amd/vulkan/meta/radv_meta_blit2d.c
@@ -167,11 +167,21 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer, struct radv_meta
         if (vk_format_is_color(src_img->image->vk.format) && vk_format_is_depth_or_stencil(dst->image->vk.format)) {
            assert(src_img->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT);
            src_aspect_mask = src_img->aspect_mask;
+         } else if (vk_format_is_depth_or_stencil(src_img->image->vk.format) &&
+                    vk_format_is_color(dst->image->vk.format)) {
+            if (src_img->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
+               depth_format = vk_format_stencil_only(src_img->image->vk.format);
+            } else {
+               assert(src_img->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
+               depth_format = vk_format_depth_only(src_img->image->vk.format);
+            }
         }
      }

      struct radv_image_view dst_iview;
-      create_iview(cmd_buffer, dst, &dst_iview, depth_format, aspect_mask);
+      create_iview(cmd_buffer, dst, &dst_iview,
+                   aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) ? depth_format : 0,
+                   aspect_mask);

      const VkRenderingAttachmentInfo att_info = {
         .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
@@ -446,14 +456,22 @@ build_nir_copy_fragment_shader_stencil(struct radv_device *device, texel_fetch_b
   return b.shader;
 }

+/* Lookup key for the blit2d pipeline layouts, passed to vk_meta_get_pipeline_layout() as raw
+ * bytes. create_layout() zeroes it and stores its `idx` argument in `index`. */
+struct radv_blit2d_key {
+   enum radv_meta_object_key_type type; /* always RADV_META_OBJECT_KEY_BLIT2D */
+   uint32_t index;                      /* presumably a blit2d_src_type value; confirm with callers */
+};
+
 static VkResult
 create_layout(struct radv_device *device, int idx, VkPipelineLayout *layout_out)
 {
   const VkDescriptorType desc_type =
      (idx == BLIT2D_SRC_TYPE_BUFFER) ? VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
-   char key_data[64];

-   snprintf(key_data, sizeof(key_data), "radv-blit2d-%d", idx);
+   struct radv_blit2d_key key;
+
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_BLIT2D;
+   key.index = idx;

   const VkDescriptorSetLayoutBinding binding = {
      .binding = 0,
@@ -474,16 +492,22 @@ create_layout(struct radv_device *device, int idx, VkPipelineLayout *layout_out)
      .size = 20,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+/* Lookup key for the blit2d color pipelines. get_color_pipeline() zeroes it, fills every field
+ * and passes it to the vk_meta helpers as raw bytes (&key, sizeof(key)). */
+struct radv_blit2d_color_key {
+   enum radv_meta_object_key_type type; /* always RADV_META_OBJECT_KEY_BLIT2D_COLOR */
+   enum blit2d_src_type src_type;
+   uint32_t log2_samples;
+   uint32_t fs_key;                     /* radv_format_meta_fs_key() of the destination format */
+};
+
 static VkResult
 get_color_pipeline(struct radv_device *device, enum blit2d_src_type src_type, VkFormat format, uint32_t log2_samples,
                   VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const unsigned fs_key = radv_format_meta_fs_key(device, format);
-   char key_data[64];
+   struct radv_blit2d_color_key key;
   const char *name;
   VkResult result;

@@ -491,7 +515,17 @@ get_color_pipeline(struct radv_device *device, enum blit2d_src_type src_type, Vk
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-blit2d-color-%d-%d-%d", src_type, log2_samples, fs_key);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_BLIT2D_COLOR;
+   key.src_type = src_type;
+   key.log2_samples = log2_samples;
+   key.fs_key = radv_format_meta_fs_key(device, format);
+
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
+   if (pipeline_from_cache != VK_NULL_HANDLE) {
+      *pipeline_out = pipeline_from_cache;
+      return VK_SUCCESS;
+   }

   texel_fetch_build_func src_func;
   switch (src_type) {
@@ -597,18 +631,24 @@ get_color_pipeline(struct radv_device *device, enum blit2d_src_type src_type, Vk
   };

   result = vk_meta_create_graphics_pipeline(&device->vk, &device->meta_state.device, &pipeline_create_info, &render,
-                                             key_data, strlen(key_data), pipeline_out);
+                                             &key, sizeof(key), pipeline_out);

   ralloc_free(vs_module);
   ralloc_free(fs_module);
   return result;
 }

+/* Lookup key shared by the blit2d depth-only and stencil-only pipelines; the two are told apart
+ * by `type`. The struct is zeroed before use and passed to the vk_meta helpers as raw bytes. */
+struct radv_blit2d_ds_key {
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_BLIT2D_DEPTH or _STENCIL */
+   enum blit2d_src_type src_type;
+   uint32_t log2_samples;
+};
+
 static VkResult
 get_depth_only_pipeline(struct radv_device *device, enum blit2d_src_type src_type, uint32_t log2_samples,
                        VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   char key_data[64];
+   struct radv_blit2d_ds_key key;
   const char *name;
   VkResult result;

@@ -616,7 +656,16 @@ get_depth_only_pipeline(struct radv_device *device, enum blit2d_src_type src_typ
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-blit2d-depth-%d-%d", src_type, log2_samples);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_BLIT2D_DEPTH;
+   key.src_type = src_type;
+   key.log2_samples = log2_samples;
+
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
+   if (pipeline_from_cache != VK_NULL_HANDLE) {
+      *pipeline_out = pipeline_from_cache;
+      return VK_SUCCESS;
+   }

   texel_fetch_build_func src_func;
   switch (src_type) {
@@ -746,7 +795,7 @@ get_depth_only_pipeline(struct radv_device *device, enum blit2d_src_type src_typ
   };

   result = vk_meta_create_graphics_pipeline(&device->vk, &device->meta_state.device, &pipeline_create_info, &render,
-                                             key_data, strlen(key_data), pipeline_out);
+                                             &key, sizeof(key), pipeline_out);

   ralloc_free(vs_module);
   ralloc_free(fs_module);
@@ -757,7 +806,7 @@ static VkResult
 get_stencil_only_pipeline(struct radv_device *device, enum blit2d_src_type src_type, uint32_t log2_samples,
                          VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   char key_data[64];
+   struct radv_blit2d_ds_key key;
   const char *name;
   VkResult result;

@@ -765,7 +814,16 @@ get_stencil_only_pipeline(struct radv_device *device, enum blit2d_src_type src_t
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-blit2d-stencil-%d-%d", src_type, log2_samples);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_BLIT2D_STENCIL;
+   key.src_type = src_type;
+   key.log2_samples = log2_samples;
+
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
+   if (pipeline_from_cache != VK_NULL_HANDLE) {
+      *pipeline_out = pipeline_from_cache;
+      return VK_SUCCESS;
+   }

   texel_fetch_build_func src_func;
   switch (src_type) {
@@ -890,7 +948,7 @@ get_stencil_only_pipeline(struct radv_device *device, enum blit2d_src_type src_t
   };

   result = vk_meta_create_graphics_pipeline(&device->vk, &device->meta_state.device, &pipeline_create_info, &render,
-                                             key_data, strlen(key_data), pipeline_out);
+                                             &key, sizeof(key), pipeline_out);

   ralloc_free(vs_module);
   ralloc_free(fs_module);
--- a/src/amd/vulkan/meta/radv_meta_buffer.c
+++ b/src/amd/vulkan/meta/radv_meta_buffer.c
@@ -39,7 +39,7 @@ struct fill_constants {
 static VkResult
 get_fill_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-fill-buffer";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_FILL_BUFFER;
   VkResult result;

   const VkPushConstantRange pc_range = {
@@ -47,12 +47,12 @@ get_fill_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipeli
      .size = sizeof(struct fill_constants),
   };

-   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, key_data,
-                                        strlen(key_data), layout_out);
+   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, &key, sizeof(key),
+                                        layout_out);
   if (result != VK_SUCCESS)
      return result;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -75,8 +75,8 @@ get_fill_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipeli
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -114,7 +114,7 @@ struct copy_constants {
 static VkResult
 get_copy_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-copy-buffer";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_COPY_BUFFER;
   VkResult result;

   const VkPushConstantRange pc_range = {
@@ -122,12 +122,12 @@ get_copy_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipeli
      .size = sizeof(struct copy_constants),
   };

-   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, key_data,
-                                        strlen(key_data), layout_out);
+   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, &key, sizeof(key),
+                                        layout_out);
   if (result != VK_SUCCESS)
      return result;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -150,8 +150,8 @@ get_copy_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipeli
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -301,11 +301,21 @@ radv_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSi
 {
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
+   bool old_predicating;
+
+   /* VK_EXT_conditional_rendering says that copy commands should not be
+    * affected by conditional rendering.
+    */
+   old_predicating = cmd_buffer->state.predicating;
+   cmd_buffer->state.predicating = false;

   fillSize = vk_buffer_range(&dst_buffer->vk, dstOffset, fillSize) & ~3ull;

   radv_fill_buffer(cmd_buffer, NULL, dst_buffer->bo,
                    radv_buffer_get_va(dst_buffer->bo) + dst_buffer->offset + dstOffset, fillSize, data);
+
+   /* Restore conditional rendering. */
+   cmd_buffer->state.predicating = old_predicating;
 }

 static void
@@ -369,6 +379,7 @@ radv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDevice
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+   bool old_predicating;
   uint64_t va = radv_buffer_get_va(dst_buffer->bo);
   va += dstOffset + dst_buffer->offset;

@@ -378,6 +389,12 @@ radv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDevice
   if (!dataSize)
      return;

+   /* VK_EXT_conditional_rendering says that copy commands should not be
+    * affected by conditional rendering.
+    */
+   old_predicating = cmd_buffer->state.predicating;
+   cmd_buffer->state.predicating = false;
+
   if (dataSize < RADV_BUFFER_UPDATE_THRESHOLD && cmd_buffer->qf != RADV_QUEUE_TRANSFER) {
      radv_cs_add_buffer(device->ws, cmd_buffer->cs, dst_buffer->bo);
      radv_update_buffer_cp(cmd_buffer, va, pData, dataSize);
@@ -387,4 +404,7 @@ radv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDevice
      radv_copy_buffer(cmd_buffer, cmd_buffer->upload.upload_bo, dst_buffer->bo, buf_offset,
                       dstOffset + dst_buffer->offset, dataSize);
   }
+
+   /* Restore conditional rendering. */
+   cmd_buffer->state.predicating = old_predicating;
 }
--- a/src/amd/vulkan/meta/radv_meta_bufimage.c
+++ b/src/amd/vulkan/meta/radv_meta_bufimage.c
@@ -58,7 +58,7 @@ build_nir_itob_compute_shader(struct radv_device *dev, bool is_3d)
 static VkResult
 get_itob_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-itob";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER;

   const VkDescriptorSetLayoutBinding bindings[] = {
      {
@@ -87,25 +87,32 @@ get_itob_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_ou
      .size = 16,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+struct radv_copy_buffer_image_key { /* pipeline lookup key for get_itob_pipeline() and get_btoi_pipeline() */
+   enum radv_meta_object_key_type type; /* ..._COPY_IMAGE_TO_BUFFER or ..._COPY_BUFFER_TO_IMAGE */
+   bool is_3d; /* image->vk.image_type == VK_IMAGE_TYPE_3D */
+};
+
 static VkResult
 get_itob_pipeline(struct radv_device *device, const struct radv_image *image, VkPipeline *pipeline_out,
                  VkPipelineLayout *layout_out)
 {
   const bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D;
-   char key_data[64];
+   struct radv_copy_buffer_image_key key;
   VkResult result;

   result = get_itob_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-itob-%d", is_3d);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER;
+   key.is_3d = is_3d;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -128,8 +135,8 @@ get_itob_pipeline(struct radv_device *device, const struct radv_image *image, Vk
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -178,7 +185,7 @@ build_nir_btoi_compute_shader(struct radv_device *dev, bool is_3d)
 static VkResult
 get_btoi_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-btoi";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_COPY_BUFFER_TO_IMAGE;

   const VkDescriptorSetLayoutBinding bindings[] = {
      {
@@ -207,8 +214,8 @@ get_btoi_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_ou
      .size = 16,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

 static VkResult
@@ -216,16 +223,18 @@ get_btoi_pipeline(struct radv_device *device, const struct radv_image *image, Vk
                  VkPipelineLayout *layout_out)
 {
   const bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D;
-   char key_data[64];
+   struct radv_copy_buffer_image_key key;
   VkResult result;

   result = get_btoi_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-btoi-%d", is_3d);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_COPY_BUFFER_TO_IMAGE;
+   key.is_3d = is_3d;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -248,8 +257,8 @@ get_btoi_pipeline(struct radv_device *device, const struct radv_image *image, Vk
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -306,7 +315,7 @@ build_nir_btoi_r32g32b32_compute_shader(struct radv_device *dev)
 static VkResult
 get_btoi_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-btoi-r32g32b32";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_COPY_BUFFER_TO_IMAGE_R32G32B32;
   VkResult result;

   const VkDescriptorSetLayoutBinding bindings[] = {
@@ -336,12 +345,12 @@ get_btoi_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out
      .size = 16,
   };

-   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                        strlen(key_data), layout_out);
+   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key,
+                                        sizeof(key), layout_out);
   if (result != VK_SUCCESS)
      return result;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -364,8 +373,8 @@ get_btoi_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -428,7 +437,7 @@ build_nir_itoi_compute_shader(struct radv_device *dev, bool src_3d, bool dst_3d,
 static VkResult
 get_itoi_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-itoi";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_COPY_IMAGE;

   const VkDescriptorSetLayoutBinding bindings[] = {
      {
@@ -457,10 +466,17 @@ get_itoi_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_ou
      .size = 24,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+struct radv_copy_image_key { /* pipeline lookup key filled in by get_itoi_pipeline() */
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_COPY_IMAGE */
+   bool src_3d; /* presumably: source image is VK_IMAGE_TYPE_3D (its computation is outside the visible hunk) */
+   bool dst_3d; /* dst_image->vk.image_type == VK_IMAGE_TYPE_3D */
+   uint8_t samples_log2; /* ffs(samples) - 1, narrowed from uint32_t */
+};
+
 static VkResult
 get_itoi_pipeline(struct radv_device *device, const struct radv_image *src_image, const struct radv_image *dst_image,
                  int samples, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
@@ -469,15 +485,19 @@ get_itoi_pipeline(struct radv_device *device, const struct radv_image *src_image
   const bool dst_3d = dst_image->vk.image_type == VK_IMAGE_TYPE_3D;
   const uint32_t samples_log2 = ffs(samples) - 1;
   VkResult result;
-   char key_data[64];
+   struct radv_copy_image_key key;

   result = get_itoi_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-itoi-%d-%d-%d", src_3d, dst_3d, samples_log2);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_COPY_IMAGE;
+   key.src_3d = src_3d;
+   key.dst_3d = dst_3d;
+   key.samples_log2 = samples_log2;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -500,8 +520,8 @@ get_itoi_pipeline(struct radv_device *device, const struct radv_image *src_image
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -560,7 +580,7 @@ build_nir_itoi_r32g32b32_compute_shader(struct radv_device *dev)
 static VkResult
 get_itoi_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-itoi-r32g32b32";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_COPY_IMAGE_R32G32B32;
   VkResult result;

   const VkDescriptorSetLayoutBinding bindings[] = {
@@ -590,12 +610,12 @@ get_itoi_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out
      .size = 24,
   };

-   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                        strlen(key_data), layout_out);
+   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key,
+                                        sizeof(key), layout_out);
   if (result != VK_SUCCESS)
      return result;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -618,8 +638,8 @@ get_itoi_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -665,7 +685,7 @@ build_nir_cleari_compute_shader(struct radv_device *dev, bool is_3d, int samples
 static VkResult
 get_cleari_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-cleari";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_CLEAR_CS;

   const VkDescriptorSetLayoutBinding binding = {
      .binding = 0,
@@ -686,10 +706,16 @@ get_cleari_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_
      .size = 20,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+struct radv_clear_key { /* pipeline lookup key filled in by get_cleari_pipeline() */
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_CLEAR_CS */
+   bool is_3d; /* image->vk.image_type == VK_IMAGE_TYPE_3D */
+   uint8_t samples_log2; /* ffs(image->vk.samples) - 1, narrowed from uint32_t */
+};
+
 static VkResult
 get_cleari_pipeline(struct radv_device *device, const struct radv_image *image, VkPipeline *pipeline_out,
                    VkPipelineLayout *layout_out)
@@ -697,16 +723,19 @@ get_cleari_pipeline(struct radv_device *device, const struct radv_image *image,
   const bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D;
   const uint32_t samples = image->vk.samples;
   const uint32_t samples_log2 = ffs(samples) - 1;
-   char key_data[64];
+   struct radv_clear_key key;
   VkResult result;

   result = get_cleari_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-cleari-%d-%d", is_3d, samples_log2);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_CLEAR_CS;
+   key.is_3d = is_3d;
+   key.samples_log2 = samples_log2;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -729,8 +758,8 @@ get_cleari_pipeline(struct radv_device *device, const struct radv_image *image,
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -774,7 +803,7 @@ build_nir_cleari_r32g32b32_compute_shader(struct radv_device *dev)
 static VkResult
 get_cleari_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-cleari-r32g32b32";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_CLEAR_CS_R32G32B32;
   VkResult result;

   const VkDescriptorSetLayoutBinding binding = {
@@ -796,12 +825,12 @@ get_cleari_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_o
      .size = 16,
   };

-   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                        strlen(key_data), layout_out);
+   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key,
+                                        sizeof(key), layout_out);
   if (result != VK_SUCCESS)
      return result;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -824,8 +853,8 @@ get_cleari_r32g32b32_pipeline(struct radv_device *device, VkPipeline *pipeline_o
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -1344,12 +1373,21 @@ radv_meta_image_to_image_cs(struct radv_cmd_buffer *cmd_buffer, struct radv_meta
      if (vk_format_is_color(src->image->vk.format) && vk_format_is_depth_or_stencil(dst->image->vk.format)) {
         assert(src->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT);
         src_aspect_mask = src->aspect_mask;
+      } else if (vk_format_is_depth_or_stencil(src->image->vk.format) && vk_format_is_color(dst->image->vk.format)) {
+         if (src->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
+            depth_format = vk_format_stencil_only(src->image->vk.format);
+         } else {
+            assert(src->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
+            depth_format = vk_format_depth_only(src->image->vk.format);
+         }
      }

      create_iview(cmd_buffer, src, &src_view,
                   (src_aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) ? depth_format : 0,
                   src_aspect_mask);
-      create_iview(cmd_buffer, dst, &dst_view, depth_format, dst_aspect_mask);
+      create_iview(cmd_buffer, dst, &dst_view,
+                   dst_aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) ? depth_format : 0,
+                   dst_aspect_mask);

      radv_meta_push_descriptor_set(
         cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, 2,
--- a/src/amd/vulkan/meta/radv_meta_clear.c
+++ b/src/amd/vulkan/meta/radv_meta_clear.c
@@ -57,32 +57,43 @@ build_color_shaders(struct radv_device *dev, struct nir_shader **out_vs, struct
 static VkResult
 get_color_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-clear-color";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_CLEAR_COLOR;

   const VkPushConstantRange pc_range = {
      .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      .size = 16,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+struct radv_clear_color_key { /* pipeline lookup key filled in by get_color_pipeline() */
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_CLEAR_COLOR */
+   uint8_t samples; /* narrowed from the uint32_t parameter; assumes it fits in 8 bits - TODO confirm */
+   uint8_t frag_output; /* narrowed from the uint32_t parameter; assumes it fits in 8 bits - TODO confirm */
+   uint32_t fs_key; /* radv_format_meta_fs_key(device, format) */
+};
+
 static VkResult
 get_color_pipeline(struct radv_device *device, uint32_t samples, uint32_t frag_output, VkFormat format,
                   VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
   const uint32_t fs_key = radv_format_meta_fs_key(device, format);
-   char key_data[64];
+   struct radv_clear_color_key key;
   VkResult result;

   result = get_color_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-clear-color-%d-%d-%d", samples, frag_output, fs_key);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_CLEAR_COLOR;
+   key.samples = samples;
+   key.frag_output = frag_output;
+   key.fs_key = fs_key;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -198,7 +209,7 @@ get_color_pipeline(struct radv_device *device, uint32_t samples, uint32_t frag_o
      render.color_attachment_formats[i] = format;

   result = vk_meta_create_graphics_pipeline(&device->vk, &device->meta_state.device, &pipeline_create_info, &render,
-                                             key_data, strlen(key_data), pipeline_out);
+                                             &key, sizeof(key), pipeline_out);

   ralloc_free(vs_module);
   ralloc_free(fs_module);
@@ -317,37 +328,57 @@ static bool radv_can_fast_clear_depth(struct radv_cmd_buffer *cmd_buffer, const
                                      const VkClearRect *clear_rect, const VkClearDepthStencilValue clear_value,
                                      uint32_t view_mask);

+struct radv_clear_ds_layout_key { /* layout lookup key for get_depth_stencil_pipeline_layout() */
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_CLEAR_DS */
+   bool unrestricted; /* selects the push-constant stage: fragment when true, vertex otherwise */
+};
+
 static VkResult
 get_depth_stencil_pipeline_layout(struct radv_device *device, bool unrestricted, VkPipelineLayout *layout_out)
 {
-   char key_data[64];
+   struct radv_clear_ds_layout_key key;

-   snprintf(key_data, sizeof(key_data), "radv-clear-ds-%d", unrestricted);
+   memset(&key, 0, sizeof(key)); /* zero every byte first: the whole struct is handed over as the key below */
+   key.type = RADV_META_OBJECT_KEY_CLEAR_DS;
+   key.unrestricted = unrestricted;

   const VkPushConstantRange pc_range = {
      .stageFlags = unrestricted ? VK_SHADER_STAGE_FRAGMENT_BIT : VK_SHADER_STAGE_VERTEX_BIT,
      .size = 4,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, NULL, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+struct radv_clear_ds_key { /* pipeline lookup key filled in by get_depth_stencil_pipeline() */
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_CLEAR_DS */
+   VkImageAspectFlags aspects;
+   uint8_t samples; /* narrowed from the int parameter; assumes it fits in 8 bits - TODO confirm */
+   bool fast;
+   bool unrestricted; /* device->vk.enabled_extensions.EXT_depth_range_unrestricted */
+};
+
 static VkResult
 get_depth_stencil_pipeline(struct radv_device *device, int samples, VkImageAspectFlags aspects, bool fast,
                           VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
   const bool unrestricted = device->vk.enabled_extensions.EXT_depth_range_unrestricted;
-   char key_data[64];
+   struct radv_clear_ds_key key;
   VkResult result;

   result = get_depth_stencil_pipeline_layout(device, unrestricted, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-clear-ds-%d-%d-%d-%d", aspects, samples, fast, unrestricted);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_CLEAR_DS;
+   key.aspects = aspects;
+   key.samples = samples;
+   key.fast = fast;
+   key.unrestricted = unrestricted;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -475,7 +506,7 @@ get_depth_stencil_pipeline(struct radv_device *device, int samples, VkImageAspec
   };

   result = vk_meta_create_graphics_pipeline(&device->vk, &device->meta_state.device, &pipeline_create_info, &render,
-                                             key_data, strlen(key_data), pipeline_out);
+                                             &key, sizeof(key), pipeline_out);

   ralloc_free(vs_module);
   ralloc_free(fs_module);
@@ -584,7 +615,7 @@ build_clear_htile_mask_shader(struct radv_device *dev)
 static VkResult
 get_clear_htile_mask_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-clear-htile-mask";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_CLEAR_HTILE;
   VkResult result;

   const VkDescriptorSetLayoutBinding binding = {
@@ -606,12 +637,12 @@ get_clear_htile_mask_pipeline(struct radv_device *device, VkPipeline *pipeline_o
      .size = 8,
   };

-   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                        strlen(key_data), layout_out);
+   result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key,
+                                        sizeof(key), layout_out);
   if (result != VK_SUCCESS)
      return result;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -634,8 +665,8 @@ get_clear_htile_mask_pipeline(struct radv_device *device, VkPipeline *pipeline_o
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -1033,7 +1064,7 @@ radv_clear_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, con
 static VkResult
 get_clear_dcc_comp_to_single_pipeline_layout(struct radv_device *device, VkPipelineLayout *layout_out)
 {
-   const char *key_data = "radv-clear-dcc-comp-to-single";
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_CLEAR_DCC_COMP_TO_SINGLE;

   const VkDescriptorSetLayoutBinding binding = {
      .binding = 0,
@@ -1054,24 +1085,31 @@ get_clear_dcc_comp_to_single_pipeline_layout(struct radv_device *device, VkPipel
      .size = 24,
   };

-   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, key_data,
-                                      strlen(key_data), layout_out);
+   return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
+                                      layout_out);
 }

+struct radv_clear_dcc_comp_to_single_key { /* pipeline lookup key for get_clear_dcc_comp_to_single_pipeline() */
+   enum radv_meta_object_key_type type; /* RADV_META_OBJECT_KEY_CLEAR_DCC_COMP_TO_SINGLE */
+   bool is_msaa;
+};
+
 static VkResult
 get_clear_dcc_comp_to_single_pipeline(struct radv_device *device, bool is_msaa, VkPipeline *pipeline_out,
                                      VkPipelineLayout *layout_out)
 {
-   char key_data[64];
+   struct radv_clear_dcc_comp_to_single_key key;
   VkResult result;

   result = get_clear_dcc_comp_to_single_pipeline_layout(device, layout_out);
   if (result != VK_SUCCESS)
      return result;

-   snprintf(key_data, sizeof(key_data), "radv-clear-dcc-comp-to-single-%d", is_msaa);
+   memset(&key, 0, sizeof(key));
+   key.type = RADV_META_OBJECT_KEY_CLEAR_DCC_COMP_TO_SINGLE;
+   key.is_msaa = is_msaa;

-   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, key_data, strlen(key_data));
+   VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
   if (pipeline_from_cache != VK_NULL_HANDLE) {
      *pipeline_out = pipeline_from_cache;
      return VK_SUCCESS;
@@ -1094,8 +1132,8 @@ get_clear_dcc_comp_to_single_pipeline(struct radv_device *device, bool is_msaa,
      .layout = *layout_out,
   };

-   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, key_data,
-                                            strlen(key_data), pipeline_out);
+   result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
+                                            pipeline_out);

   ralloc_free(cs);
   return result;
@@ -1312,7 +1350,7 @@ gfx8_get_fast_clear_parameters(struct radv_device *device, const struct radv_ima
      *can_avoid_fast_clear_elim = false;
   }

-   const struct util_format_description *desc = vk_format_description(iview->vk.format);
+   const struct util_format_description *desc = radv_format_description(iview->vk.format);
   if (iview->vk.format == VK_FORMAT_B10G11R11_UFLOAT_PACK32 || iview->vk.format == VK_FORMAT_R5G6B5_UNORM_PACK16 ||
       iview->vk.format == VK_FORMAT_B5G6R5_UNORM_PACK16)
      extra_channel = -1;
@@ -1392,7 +1430,7 @@ static bool
 gfx11_get_fast_clear_parameters(struct radv_device *device, const struct radv_image_view *iview,
                                const VkClearColorValue *clear_value, uint32_t *reset_value)
 {
-   const struct util_format_description *desc = vk_format_description(iview->vk.format);
+   const struct util_format_description *desc = radv_format_description(iview->vk.format);
   unsigned start_bit = UINT_MAX;
   unsigned end_bit = 0;

--- a/Show More
+++ b/Show More