CHROMIUM: i965: Implement EGL_KHR_mutable_render_buffer

Tested with a low-latency handwriting application on Android Nougat on the Chrome OS Pixelbook (codename Eve) with Kabylake.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.
Change-Id: Ia816fa6b0a1158f81e5b63477451bf337c2001aa
CHROMIUM: egl/android: Implement EGL_KHR_mutable_render_buffer
2018-07-30 10:50:41 -07:00 · 2018-07-30 10:50:41 -07:00 · 2018-07-30 10:50:41 -07:00 · 2018-07-30 10:50:41 -07:00 · 2018-07-30 10:50:40 -07:00 · 2018-07-30 09:32:54 -07:00
912 changed files with 53354 additions and 36157 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,8 +21,8 @@ env:
    - LIBXCB_VERSION=libxcb-1.13
    - LIBXSHMFENCE_VERSION=libxshmfence-1.2
    - LIBVDPAU_VERSION=libvdpau-1.1
-    - LIBVA_VERSION=libva-1.6.2
-    - LIBWAYLAND_VERSION=wayland-1.11.1
+    - LIBVA_VERSION=libva-1.7.0
+    - LIBWAYLAND_VERSION=wayland-1.15.0
    - WAYLAND_PROTOCOLS_VERSION=wayland-protocols-1.8
    - PKG_CONFIG_PATH=$HOME/prefix/lib/pkgconfig:$HOME/prefix/share/pkgconfig
    - LD_LIBRARY_PATH="$HOME/prefix/lib:$LD_LIBRARY_PATH"
@@ -33,18 +33,18 @@ matrix:
    - env:
        - LABEL="meson Vulkan"
        - BUILD=meson
-        - MESON_OPTIONS="-Ddri-drivers= -Dgallium-drivers="
-        - LLVM_VERSION=4.0
+        - MESON_OPTIONS="-Ddri-drivers=[] -Dgallium-drivers=[]"
+        - LLVM_VERSION=5.0
        - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
      addons:
        apt:
          sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
          packages:
            # LLVM packaging is broken and misses these dependencies
            - libedit-dev
            # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
            # Common
            - xz-utils
            - libexpat1-dev
@@ -53,7 +53,7 @@ matrix:
    - env:
        - LABEL="meson loaders/classic DRI"
        - BUILD=meson
-        - MESON_OPTIONS="-Dvulkan-drivers= -Dgallium-drivers="
+        - MESON_OPTIONS="-Dvulkan-drivers=[] -Dgallium-drivers=[]"
      addons:
        apt:
          packages:
@@ -123,7 +123,7 @@ matrix:
        - BUILD=make
        - MAKEFLAGS="-j4"
        - MAKE_CHECK_COMMAND="true"
-        - LLVM_VERSION=4.0
+        - LLVM_VERSION=5.0
        - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
        - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
        - DRI_DRIVERS=""
@@ -134,12 +134,12 @@ matrix:
      addons:
        apt:
          sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
          packages:
            # LLVM packaging is broken and misses these dependencies
            - libedit-dev
            # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
            # Common
            - xz-utils
            - x11proto-xf86vidmode-dev
@@ -159,7 +159,7 @@ matrix:
        - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
        - DRI_DRIVERS=""
        - GALLIUM_ST="--enable-dri --disable-opencl --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx-bellagio --disable-gallium-osmesa"
-        - GALLIUM_DRIVERS="i915,nouveau,pl111,r300,r600,freedreno,svga,swrast,vc4,virgl,etnaviv,imx"
+        - GALLIUM_DRIVERS="i915,nouveau,pl111,r300,r600,freedreno,svga,swrast,v3d,vc4,virgl,etnaviv,imx"
        - VULKAN_DRIVERS=""
        - LIBUNWIND_FLAGS="--enable-libunwind"
      addons:
@@ -231,7 +231,7 @@ matrix:
        - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
        - DRI_DRIVERS=""
        - GALLIUM_ST="--disable-dri --enable-opencl --enable-opencl-icd --enable-llvm --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx-bellagio --disable-gallium-osmesa"
-        - GALLIUM_DRIVERS="r600,radeonsi"
+        - GALLIUM_DRIVERS="r600"
        - VULKAN_DRIVERS=""
        - LIBUNWIND_FLAGS="--enable-libunwind"
      addons:
@@ -290,6 +290,39 @@ matrix:
            - libx11-xcb-dev
            - libelf-dev
            - libunwind8-dev
+    - env:
+        # NOTE: Analogous to SWR above, building Clover is quite slow.
+        - LABEL="make Gallium ST Clover LLVM-6.0"
+        - BUILD=make
+        - MAKEFLAGS="-j4"
+        - MAKE_CHECK_COMMAND="true"
+        - LLVM_VERSION=6.0
+        - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
+        - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
+        - DRI_DRIVERS=""
+        - GALLIUM_ST="--disable-dri --enable-opencl --enable-opencl-icd --enable-llvm --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx-bellagio --disable-gallium-osmesa"
+        - GALLIUM_DRIVERS="r600,radeonsi"
+        - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--enable-libunwind"
+      addons:
+        apt:
+          sources:
+            - llvm-toolchain-trusty-6.0
+            # llvm-6 depends on gcc-4.9 which is not in main repo
+            - ubuntu-toolchain-r-test
+          packages:
+            - libclc-dev
+            # From sources above
+            - llvm-6.0-dev
+            - clang-6.0
+            - libclang-6.0-dev
+            # Common
+            - xz-utils
+            - x11proto-xf86vidmode-dev
+            - libexpat1-dev
+            - libx11-xcb-dev
+            - libelf-dev
+            - libunwind8-dev
    - env:
        - LABEL="make Gallium ST Other"
        - BUILD=make
@@ -331,7 +364,7 @@ matrix:
        - BUILD=make
        - MAKEFLAGS="-j4"
        - MAKE_CHECK_COMMAND="make -C src/gtest check && make -C src/intel check"
-        - LLVM_VERSION=4.0
+        - LLVM_VERSION=5.0
        - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
        - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl --with-platforms=x11,wayland"
        - DRI_DRIVERS=""
@@ -342,12 +375,12 @@ matrix:
      addons:
        apt:
          sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
          packages:
            # LLVM packaging is broken and misses these dependencies
            - libedit-dev
            # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
            # Common
            - xz-utils
            - x11proto-xf86vidmode-dev
@@ -558,7 +591,9 @@ script:

      export CFLAGS="$CFLAGS -isystem`pwd`";

-      ./autogen.sh --enable-debug
+      mkdir build &&
+      cd build &&
+      ../autogen.sh --enable-debug
        $LIBUNWIND_FLAGS
        $DRI_LOADERS
        --with-dri-drivers=$DRI_DRIVERS
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -73,6 +73,7 @@ LOCAL_CFLAGS += \
 	-DHAVE_ENDIAN_H \
 	-DHAVE_ZLIB \
 	-DMAJOR_IN_SYSMACROS \
+	-DVK_USE_PLATFORM_ANDROID_KHR \
 	-fvisibility=hidden \
 	-Wno-sign-compare

--- a/Makefile.am
+++ b/Makefile.am
@@ -77,6 +77,7 @@ noinst_HEADERS = \
 	include/drm-uapi/drm_mode.h \
 	include/drm-uapi/i915_drm.h \
 	include/drm-uapi/tegra_drm.h \
+	include/drm-uapi/v3d_drm.h \
 	include/drm-uapi/vc4_drm.h \
 	include/D3D9 \
 	include/GL/wglext.h \
--- a/PRESUBMIT.cfg
+++ b/PRESUBMIT.cfg
@@ -0,0 +1,10 @@
+# This sample config file disables all of the ChromiumOS source style checks.
+# Comment out the disable-flags for any checks you want to leave enabled.
+
+[Hook Overrides]
+stray_whitespace_check: false
+long_line_check: false
+cros_license_check: false
+tab_check: false
+bug_field_check: false
+test_field_check: false
--- a/README.rst
+++ b/README.rst
@@ -0,0 +1,79 @@
+`Mesa <https://mesa3d.org>`_ - The 3D Graphics Library
+======================================================
+
+
+Source
+------
+
+This repository lives at https://gitlab.freedesktop.org/mesa/mesa.
+Other repositories are likely forks, and code found there is not supported.
+
+
+Build status
+------------
+
+Travis:
+
+.. image:: https://travis-ci.org/mesa3d/mesa.svg?branch=master
+    :target: https://travis-ci.org/mesa3d/mesa
+
+Appveyor:
+
+.. image:: https://img.shields.io/appveyor/ci/mesa3d/mesa.svg
+    :target: https://ci.appveyor.com/project/mesa3d/mesa
+
+Coverity:
+
+.. image:: https://scan.coverity.com/projects/139/badge.svg?flat=1
+    :target: https://scan.coverity.com/projects/mesa
+
+
+Build & install
+---------------
+
+You can find more information in our documentation (`docs/install.html
+<https://mesa3d.org/install.html>`_), but the recommended way is to use
+Meson (`docs/meson.html <https://mesa3d.org/meson.html>`_):
+
+.. code-block:: sh
+
+  $ mkdir build
+  $ cd build
+  $ meson ..
+  $ sudo ninja install
+
+
+Support
+-------
+
+Many Mesa devs hang out on IRC; if you're not sure which channel is
+appropriate, you should ask your question on `Freenode's #dri-devel
+<irc://chat.freenode.net#dri-devel>`_, someone will redirect you if
+necessary.
+Remember that not everyone is in the same timezone as you, so it might
+take a while before someone qualified sees your question.
+To figure out who you're talking to, or which nick to ping for your
+question, check out `Who's Who on IRC
+<https://dri.freedesktop.org/wiki/WhosWho/>`_.
+
+The next best option is to ask your question in an email to the
+mailing lists: `mesa-dev\@lists.freedesktop.org
+<https://lists.freedesktop.org/mailman/listinfo/mesa-dev>`_
+
+
+Bug reports
+-----------
+
+If you think something isn't working properly, please file a bug report
+(`docs/bugs.html <https://mesa3d.org/bugs.html>`_).
+
+
+Contributing
+------------
+
+Contributions are welcome, and step-by-step instructions can be found in our
+documentation (`docs/submittingpatches.html
+<https://mesa3d.org/submittingpatches.html>`_).
+
+Note that Mesa uses email mailing lists for patch submission, review and
+discussion.
--- a/1
+++ b/1
@@ -116,6 +116,7 @@ MESON BUILD
 R: Dylan Baker <dylan@pnwbakers.com>
 R: Eric Engestrom <eric@engestrom.ch>
 F: */meson.build
+F: meson.build
 F: meson_options.txt

 ANDROID EGL SUPPORT
--- a/2
+++ b/2
@@ -1 +1 @@
-18.1.0-devel
+18.2.0-devel
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -35,13 +35,13 @@ clone_depth: 100

 cache:
 - win_flex_bison-2.5.9.zip
- llvm-3.3.1-msvc2015-mtd.7z
+- llvm-5.0.1-msvc2015-mtd.7z

 os: Visual Studio 2015

 environment:
  WINFLEXBISON_ARCHIVE: win_flex_bison-2.5.9.zip
-  LLVM_ARCHIVE: llvm-3.3.1-msvc2015-mtd.7z
+  LLVM_ARCHIVE: llvm-5.0.1-msvc2015-mtd.7z

 install:
 # Check pip
--- a/bin/bugzilla_mesa.sh
+++ b/bin/bugzilla_mesa.sh
@@ -23,7 +23,7 @@ echo "<ul>"
 echo ""

 # extract fdo urls from commit log
-git log $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before | sort -n -u | sed -e $use_after |\
+git log --pretty=medium $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before | sort -n -u | sed -e $use_after |\
 while read url
 do
 	id=$(echo $url | cut -d'=' -f2)
--- a/bin/get-fixes-pick-list.sh
+++ b/bin/get-fixes-pick-list.sh
@@ -16,7 +16,7 @@ latest_branchpoint=`git merge-base origin/master HEAD`
 git log --reverse --pretty=%H $latest_branchpoint > already_landed

 # ... and the ones cherry-picked.
-git log --reverse --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
+git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
 	grep "cherry picked from commit" |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//'  > already_picked

@@ -38,7 +38,7 @@ do

 	# Place every "fixes:" tag on its own line and join with the next word
 	# on its line or a later one.
-	fixes=`git show -s $sha | tr -d "\n" | sed -e 's/fixes:[[:space:]]*/\nfixes:/Ig' | grep "fixes:" | sed -e 's/\(fixes:[a-zA-Z0-9]*\).*$/\1/'`
+	fixes=`git show --pretty=medium -s $sha | tr -d "\n" | sed -e 's/fixes:[[:space:]]*/\nfixes:/Ig' | grep "fixes:" | sed -e 's/\(fixes:[a-zA-Z0-9]*\).*$/\1/'`

 	# For each one try to extract the tag
 	fixes_count=`echo "$fixes" | wc -l`
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -12,7 +12,7 @@
 latest_branchpoint=`git merge-base origin/master HEAD`

 # Grep for commits with "cherry picked from commit" in the commit message.
-git log --reverse --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
+git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
 	grep "cherry picked from commit" |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked

--- a/bin/install_megadrivers.py
+++ b/bin/install_megadrivers.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # encoding=utf-8
-# Copyright © 2017 Intel Corporation
+# Copyright © 2017-2018 Intel Corporation

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -35,30 +35,34 @@ def main():
    parser.add_argument('drivers', nargs='+')
    args = parser.parse_args()

-    to = os.path.join(os.environ.get('MESON_INSTALL_DESTDIR_PREFIX'), args.libdir)
+    if os.path.isabs(args.libdir):
+        to = os.path.join(os.environ.get('DESTDIR', '/'), args.libdir[1:])
+    else:
+        to = os.path.join(os.environ['MESON_INSTALL_DESTDIR_PREFIX'], args.libdir)
+
    master = os.path.join(to, os.path.basename(args.megadriver))

    if not os.path.exists(to):
        os.makedirs(to)
    shutil.copy(args.megadriver, master)

-    for each in args.drivers:
-        driver = os.path.join(to, each)
+    for driver in args.drivers:
+        abs_driver = os.path.join(to, driver)

-        if os.path.exists(driver):
-            os.unlink(driver)
-        print('installing {} to {}'.format(args.megadriver, driver))
-        os.link(master, driver)
+        if os.path.exists(abs_driver):
+            os.unlink(abs_driver)
+        print('installing {} to {}'.format(args.megadriver, abs_driver))
+        os.link(master, abs_driver)

        try:
            ret = os.getcwd()
            os.chdir(to)

-            name, ext = os.path.splitext(each)
+            name, ext = os.path.splitext(driver)
            while ext != '.so':
                if os.path.exists(name):
                    os.unlink(name)
-                os.symlink(each, name)
+                os.symlink(driver, name)
                name, ext = os.path.splitext(name)
        finally:
            os.chdir(ret)
--- a/configure.ac
+++ b/configure.ac
@@ -78,17 +78,19 @@ LIBDRM_AMDGPU_REQUIRED=2.4.91
 LIBDRM_INTEL_REQUIRED=2.4.75
 LIBDRM_NVVIEUX_REQUIRED=2.4.66
 LIBDRM_NOUVEAU_REQUIRED=2.4.66
-LIBDRM_FREEDRENO_REQUIRED=2.4.91
+LIBDRM_FREEDRENO_REQUIRED=2.4.92
 LIBDRM_ETNAVIV_REQUIRED=2.4.89
+LIBDRM_VC4_REQUIRED=2.4.89

 dnl Versions for external dependencies
 DRI2PROTO_REQUIRED=2.8
 GLPROTO_REQUIRED=1.4.14
 LIBOMXIL_BELLAGIO_REQUIRED=0.0
 LIBOMXIL_TIZONIA_REQUIRED=0.10.0
-LIBVA_REQUIRED=0.38.0
+LIBVA_REQUIRED=0.39.0
 VDPAU_REQUIRED=1.1
 WAYLAND_REQUIRED=1.11
+WAYLAND_EGL_BACKEND_REQUIRED=3
 WAYLAND_PROTOCOLS_REQUIRED=1.8
 XCB_REQUIRED=1.9.3
 XCBDRI2_REQUIRED=1.8
@@ -106,8 +108,8 @@ dnl LLVM versions
 LLVM_REQUIRED_GALLIUM=3.3.0
 LLVM_REQUIRED_OPENCL=3.9.0
 LLVM_REQUIRED_R600=3.9.0
-LLVM_REQUIRED_RADEONSI=4.0.0
-LLVM_REQUIRED_RADV=4.0.0
+LLVM_REQUIRED_RADEONSI=5.0.0
+LLVM_REQUIRED_RADV=5.0.0
 LLVM_REQUIRED_SWR=4.0.0

 dnl Check for progs
@@ -119,6 +121,7 @@ dnl other CC/CXX flags related help
 AC_ARG_VAR([CXX11_CXXFLAGS], [Compiler flag to enable C++11 support (only needed if not
                              enabled by default and different  from -std=c++11)])
 AM_PROG_CC_C_O
+AC_PROG_GREP
 AC_PROG_NM
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
@@ -433,26 +436,40 @@ fi
 AM_CONDITIONAL([SSE41_SUPPORTED], [test x$SSE41_SUPPORTED = x1])
 AC_SUBST([SSE41_CFLAGS], $SSE41_CFLAGS)

-dnl Check for new-style atomic builtins
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+dnl Check for new-style atomic builtins. We first check without linking to
+dnl -latomic.
+AC_MSG_CHECKING(whether __atomic_load_n is supported)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#include <stdint.h>
 int main() {
-    int n;
-    return __atomic_load_n(&n, __ATOMIC_ACQUIRE);
-}]])], GCC_ATOMIC_BUILTINS_SUPPORTED=1)
-if test "x$GCC_ATOMIC_BUILTINS_SUPPORTED" = x1; then
+    struct {
+        uint64_t *v;
+    } x;
+    return (int)__atomic_load_n(x.v, __ATOMIC_ACQUIRE) &
+           (int)__atomic_add_fetch(x.v, (uint64_t)1, __ATOMIC_ACQ_REL);
+}]])], GCC_ATOMIC_BUILTINS_SUPPORTED=yes, GCC_ATOMIC_BUILTINS_SUPPORTED=no)
+
+dnl If that didn't work, we try linking with -latomic, which is needed on some
+dnl platforms.
+if test "x$GCC_ATOMIC_BUILTINS_SUPPORTED" != xyes; then
+   save_LDFLAGS=$LDFLAGS
+   LDFLAGS="$LDFLAGS -latomic"
+   AC_LINK_IFELSE([AC_LANG_SOURCE([[
+   #include <stdint.h>
+   int main() {
+        struct {
+            uint64_t *v;
+        } x;
+        return (int)__atomic_load_n(x.v, __ATOMIC_ACQUIRE) &
+               (int)__atomic_add_fetch(x.v, (uint64_t)1, __ATOMIC_ACQ_REL);
+   }]])], GCC_ATOMIC_BUILTINS_SUPPORTED=yes LIBATOMIC_LIBS="-latomic",
+          GCC_ATOMIC_BUILTINS_SUPPORTED=no)
+   LDFLAGS=$save_LDFLAGS
+fi
+AC_MSG_RESULT($GCC_ATOMIC_BUILTINS_SUPPORTED)
+
+if test "x$GCC_ATOMIC_BUILTINS_SUPPORTED" = xyes; then
    DEFINES="$DEFINES -DUSE_GCC_ATOMIC_BUILTINS"
-    dnl On some platforms, new-style atomics need a helper library
-    AC_MSG_CHECKING(whether -latomic is needed)
-    AC_LINK_IFELSE([AC_LANG_SOURCE([[
-    #include <stdint.h>
-    uint64_t v;
-    int main() {
-        return (int)__atomic_load_n(&v, __ATOMIC_ACQUIRE);
-    }]])], GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC=no, GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC=yes)
-    AC_MSG_RESULT($GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC)
-    if test "x$GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC" = xyes; then
-        LIBATOMIC_LIBS="-latomic"
-    fi
 fi
 AC_SUBST([LIBATOMIC_LIBS])

@@ -746,21 +763,6 @@ esac

 AC_SUBST([LIB_EXT])

-dnl
-dnl potentially-infringing-but-nobody-knows-for-sure stuff
-dnl
-AC_ARG_ENABLE([texture-float],
-    [AS_HELP_STRING([--enable-texture-float],
-        [enable floating-point textures and renderbuffers @<:@default=disabled@:>@])],
-    [enable_texture_float="$enableval"],
-    [enable_texture_float=no]
-)
-if test "x$enable_texture_float" = xyes; then
-    AC_MSG_WARN([Floating-point textures enabled.])
-    AC_MSG_WARN([Please consult docs/patents.txt with your lawyer before building Mesa.])
-    DEFINES="$DEFINES -DTEXTURE_FLOAT_ENABLED"
-fi
-
 dnl
 dnl Arch/platform-specific settings
 dnl
@@ -1359,7 +1361,7 @@ GALLIUM_DRIVERS_DEFAULT="r300,r600,svga,swrast"
 AC_ARG_WITH([gallium-drivers],
    [AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
        [comma delimited Gallium drivers list, e.g.
-        "i915,nouveau,r300,r600,radeonsi,freedreno,pl111,svga,swrast,swr,tegra,vc4,vc5,virgl,etnaviv,imx"
+        "i915,nouveau,r300,r600,radeonsi,freedreno,pl111,svga,swrast,swr,tegra,v3d,vc4,virgl,etnaviv,imx"
        @<:@default=r300,r600,svga,swrast@:>@])],
    [with_gallium_drivers="$withval"],
    [with_gallium_drivers="$GALLIUM_DRIVERS_DEFAULT"])
@@ -1794,6 +1796,9 @@ for plat in $platforms; do
        PKG_CHECK_MODULES([WAYLAND_CLIENT], [wayland-client >= $WAYLAND_REQUIRED])
        PKG_CHECK_MODULES([WAYLAND_SERVER], [wayland-server >= $WAYLAND_REQUIRED])
        PKG_CHECK_MODULES([WAYLAND_PROTOCOLS], [wayland-protocols >= $WAYLAND_PROTOCOLS_REQUIRED])
+        if test "x$enable_egl" = xyes; then
+          PKG_CHECK_MODULES([WAYLAND_EGL], [wayland-egl-backend >= $WAYLAND_EGL_BACKEND_REQUIRED])
+        fi
        WAYLAND_PROTOCOLS_DATADIR=`$PKG_CONFIG --variable=pkgdatadir wayland-protocols`

        PKG_CHECK_MODULES([WAYLAND_SCANNER], [wayland-scanner],
@@ -1826,6 +1831,9 @@ for plat in $platforms; do

    android)
        PKG_CHECK_MODULES([ANDROID], [cutils hardware sync])
+        if test -n "$with_gallium_drivers"; then
+            PKG_CHECK_MODULES([BACKTRACE], [backtrace])
+        fi
        DEFINES="$DEFINES -DHAVE_ANDROID_PLATFORM"
        ;;

@@ -2085,6 +2093,9 @@ if test -n "$with_vulkan_drivers"; then
            PKG_CHECK_MODULES([AMDGPU], [libdrm >= $LIBDRM_AMDGPU_REQUIRED libdrm_amdgpu >= $LIBDRM_AMDGPU_REQUIRED])
            radeon_llvm_check $LLVM_REQUIRED_RADV "radv"
            require_x11_dri3 "radv"
+            if test "x$acv_mako_found" = xno; then
+                AC_MSG_ERROR([Python mako module v$PYTHON_MAKO_REQUIRED or higher not found])
+            fi
            HAVE_RADEON_VULKAN=yes
            ;;
        *)
@@ -2714,20 +2725,20 @@ if test -n "$with_gallium_drivers"; then
            ;;
        xvc4)
            HAVE_GALLIUM_VC4=yes
-            require_libdrm "vc4"
+            PKG_CHECK_MODULES([VC4], [libdrm >= $LIBDRM_VC4_REQUIRED])

            PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
                              [USE_VC4_SIMULATOR=yes;
                               DEFINES="$DEFINES -DUSE_VC4_SIMULATOR"],
                              [USE_VC4_SIMULATOR=no])
            ;;
-        xvc5)
-            HAVE_GALLIUM_VC5=yes
+        xv3d)
+            HAVE_GALLIUM_V3D=yes

-            PKG_CHECK_MODULES([VC5_SIMULATOR], [v3dv3],
-                              [USE_VC5_SIMULATOR=yes;
-                               DEFINES="$DEFINES -DUSE_VC5_SIMULATOR"],
-                              [AC_MSG_ERROR([vc5 requires the simulator])])
+            PKG_CHECK_MODULES([V3D_SIMULATOR], [v3dv3],
+                              [USE_V3D_SIMULATOR=yes;
+                               DEFINES="$DEFINES -DUSE_V3D_SIMULATOR"],
+                              [USE_V3D_SIMULATOR=no])
            ;;
        xpl111)
            HAVE_GALLIUM_PL111=yes
@@ -2879,8 +2890,8 @@ AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_SWRAST, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes -o \
                                         "x$HAVE_GALLIUM_LLVMPIPE" = xyes -o \
                                         "x$HAVE_GALLIUM_SWR" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_V3D, test "x$HAVE_GALLIUM_V3D" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
-AM_CONDITIONAL(HAVE_GALLIUM_VC5, test "x$HAVE_GALLIUM_VC5" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)

 AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno)
@@ -2908,7 +2919,7 @@ AM_CONDITIONAL(HAVE_AMD_DRIVERS, test "x$HAVE_GALLIUM_RADEONSI" = xyes -o \
                                      "x$HAVE_RADEON_VULKAN" = xyes)

 AM_CONDITIONAL(HAVE_BROADCOM_DRIVERS, test "x$HAVE_GALLIUM_VC4" = xyes -o \
-                                      "x$HAVE_GALLIUM_VC5" = xyes)
+                                      "x$HAVE_GALLIUM_V3D" = xyes)

 AM_CONDITIONAL(HAVE_INTEL_DRIVERS, test "x$HAVE_INTEL_VULKAN" = xyes -o \
                                        "x$HAVE_I965_DRI" = xyes)
@@ -2919,8 +2930,8 @@ AM_CONDITIONAL(NEED_RADEON_DRM_WINSYS, test "x$HAVE_GALLIUM_R300" = xyes -o \
 AM_CONDITIONAL(NEED_WINSYS_XLIB, test "x$enable_glx" = xgallium-xlib)
 AM_CONDITIONAL(HAVE_GALLIUM_COMPUTE, test x$enable_opencl = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_LLVM, test "x$enable_llvm" = xyes)
+AM_CONDITIONAL(USE_V3D_SIMULATOR, test x$USE_V3D_SIMULATOR = xyes)
 AM_CONDITIONAL(USE_VC4_SIMULATOR, test x$USE_VC4_SIMULATOR = xyes)
-AM_CONDITIONAL(USE_VC5_SIMULATOR, test x$USE_VC5_SIMULATOR = xyes)

 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
@@ -2956,7 +2967,7 @@ AC_SUBST([XVMC_MAJOR], 1)
 AC_SUBST([XVMC_MINOR], 0)

 AC_SUBST([XA_MAJOR], 2)
-AC_SUBST([XA_MINOR], 3)
+AC_SUBST([XA_MINOR], 4)
 AC_SUBST([XA_PATCH], 0)
 AC_SUBST([XA_VERSION], "$XA_MAJOR.$XA_MINOR.$XA_PATCH")

@@ -3005,8 +3016,6 @@ AC_CONFIG_FILES([Makefile
                 src/egl/Makefile
                 src/egl/main/egl.pc
                 src/egl/wayland/wayland-drm/Makefile
-                 src/egl/wayland/wayland-egl/Makefile
-                 src/egl/wayland/wayland-egl/wayland-egl.pc
                 src/gallium/Makefile
                 src/gallium/auxiliary/Makefile
                 src/gallium/auxiliary/pipe-loader/Makefile
@@ -3024,8 +3033,8 @@ AC_CONFIG_FILES([Makefile
                 src/gallium/drivers/tegra/Makefile
                 src/gallium/drivers/etnaviv/Makefile
                 src/gallium/drivers/imx/Makefile
+                 src/gallium/drivers/v3d/Makefile
                 src/gallium/drivers/vc4/Makefile
-                 src/gallium/drivers/vc5/Makefile
                 src/gallium/drivers/virgl/Makefile
                 src/gallium/state_trackers/clover/Makefile
                 src/gallium/state_trackers/dri/Makefile
@@ -3072,8 +3081,8 @@ AC_CONFIG_FILES([Makefile
                 src/gallium/winsys/sw/wrapper/Makefile
                 src/gallium/winsys/sw/xlib/Makefile
                 src/gallium/winsys/tegra/drm/Makefile
+                 src/gallium/winsys/v3d/drm/Makefile
                 src/gallium/winsys/vc4/drm/Makefile
-                 src/gallium/winsys/vc5/drm/Makefile
                 src/gallium/winsys/virgl/drm/Makefile
                 src/gallium/winsys/virgl/vtest/Makefile
                 src/gbm/Makefile
@@ -3109,6 +3118,7 @@ AC_CONFIG_FILES([Makefile
                 src/util/Makefile
                 src/util/tests/hash_table/Makefile
                 src/util/tests/string_buffer/Makefile
+                 src/util/tests/vma/Makefile
                 src/util/xmlpool/Makefile
                 src/vulkan/Makefile])

--- a/docs/codingstyle.html
+++ b/docs/codingstyle.html
@@ -83,7 +83,7 @@ We try to quote the OpenGL specification where prudent:
    *     "An INVALID_OPERATION error is generated for any of the following
    *     conditions:
    *
-    *     * <length> is zero."
+    *     * &lt;length&gt; is zero."
    *
    * Additionally, page 94 of the PDF of the OpenGL 4.5 core spec
    * (30.10.2014) also says this, so it's no longer allowed for desktop GL,
@@ -94,7 +94,7 @@ Function comment example:
 <pre>
   /**
    * Create and initialize a new buffer object.  Called via the
-    * ctx->Driver.CreateObject() driver callback function.
+    * ctx-&gt;Driver.CreateObject() driver callback function.
    * \param  name  integer name of the object
    * \param  type  one of GL_FOO, GL_BAR, etc.
    * \return  pointer to new object or NULL if error
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -168,6 +168,7 @@ the X server directly using (XCB-)DRI2 protocol.</p>
 <p>This driver can share DRI drivers with <code>libGL</code>.</p>

 </dd>
+</dl>

 <h2>Packaging</h2>

--- a/docs/favicon.ico
+++ b/docs/favicon.ico
--- a/docs/favicon.png
+++ b/docs/favicon.png
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -36,7 +36,7 @@ context as extensions.
 Feature                                                 Status
 ------------------------------------------------------- ------------------------

-GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr
+GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr, virgl

  glBindFragDataLocation, glGetFragDataLocation         DONE
  GL_NV_conditional_render (Conditional rendering)      DONE ()
@@ -68,7 +68,7 @@ GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv
 (*) freedreno, llvmpipe, softpipe, and swr have fake Multisample anti-aliasing support


-GL 3.1, GLSL 1.40 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr
+GL 3.1, GLSL 1.40 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr, virgl

  Forward compatible context support/deprecations       DONE ()
  GL_ARB_draw_instanced (Instanced drawing)             DONE ()
@@ -81,7 +81,7 @@ GL 3.1, GLSL 1.40 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv
  GL_EXT_texture_snorm (Signed normalized textures)     DONE ()


-GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr
+GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr, virgl

  Core/compatibility profiles                           DONE
  Geometry shaders                                      DONE ()
@@ -96,7 +96,7 @@ GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
  GLX_ARB_create_context_profile                        DONE


-GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
+GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, virgl

  GL_ARB_blend_func_extended                            DONE (freedreno/a3xx, swr)
  GL_ARB_explicit_attrib_location                       DONE (all drivers that support GLSL)
@@ -110,7 +110,7 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
  GL_ARB_vertex_type_2_10_10_10_rev                     DONE (freedreno, swr)


-GL 4.0, GLSL 4.00 --- all DONE: i965/gen7+, nvc0, r600, radeonsi
+GL 4.0, GLSL 4.00 --- all DONE: i965/gen7+, nvc0, r600, radeonsi, virgl

  GL_ARB_draw_buffers_blend                             DONE (freedreno, i965/gen6+, nv50, llvmpipe, softpipe, swr)
  GL_ARB_draw_indirect                                  DONE (freedreno, i965/gen7+, llvmpipe, softpipe, swr)
@@ -139,7 +139,7 @@ GL 4.0, GLSL 4.00 --- all DONE: i965/gen7+, nvc0, r600, radeonsi
  GL_ARB_transform_feedback3                            DONE (i965/gen7+, llvmpipe, softpipe, swr)


-GL 4.1, GLSL 4.10 --- all DONE: i965/gen7+, nvc0, r600, radeonsi
+GL 4.1, GLSL 4.10 --- all DONE: i965/gen7+, nvc0, r600, radeonsi, virgl

  GL_ARB_ES2_compatibility                              DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr)
  GL_ARB_get_program_binary                             DONE (0 or 1 binary formats)
@@ -151,17 +151,17 @@ GL 4.1, GLSL 4.10 --- all DONE: i965/gen7+, nvc0, r600, radeonsi

 GL 4.2, GLSL 4.20 -- all DONE: i965/gen7+, nvc0, r600, radeonsi

-  GL_ARB_texture_compression_bptc                       DONE (freedreno, i965)
+  GL_ARB_texture_compression_bptc                       DONE (freedreno, i965, virgl)
  GL_ARB_compressed_texture_pixel_storage               DONE (all drivers)
  GL_ARB_shader_atomic_counters                         DONE (freedreno/a5xx, i965, softpipe)
  GL_ARB_texture_storage                                DONE (all drivers)
-  GL_ARB_transform_feedback_instanced                   DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_base_instance                                  DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr)
+  GL_ARB_transform_feedback_instanced                   DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_base_instance                                  DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr, virgl)
  GL_ARB_shader_image_load_store                        DONE (freedreno/a5xx, i965, softpipe)
  GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
  GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
  GL_ARB_shading_language_packing                       DONE (all drivers)
-  GL_ARB_internalformat_query                           DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr)
+  GL_ARB_internalformat_query                           DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr, virgl)
  GL_ARB_map_buffer_alignment                           DONE (all drivers)


@@ -174,17 +174,17 @@ GL 4.3, GLSL 4.30 -- all DONE: i965/gen8+, nvc0, r600, radeonsi
  GL_ARB_copy_image                                     DONE (i965, nv50, softpipe, llvmpipe)
  GL_KHR_debug                                          DONE (all drivers)
  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, llvmpipe, softpipe, virgl)
  GL_ARB_framebuffer_no_attachments                     DONE (freedreno, i965, softpipe)
  GL_ARB_internalformat_query2                          DONE (all drivers)
  GL_ARB_invalidate_subdata                             DONE (all drivers)
-  GL_ARB_multi_draw_indirect                            DONE (freedreno, i965, llvmpipe, softpipe, swr)
+  GL_ARB_multi_draw_indirect                            DONE (freedreno, i965, llvmpipe, softpipe, swr, virgl)
  GL_ARB_program_interface_query                        DONE (all drivers)
  GL_ARB_robust_buffer_access_behavior                  DONE (i965)
  GL_ARB_shader_image_size                              DONE (freedreno/a5xx, i965, softpipe)
  GL_ARB_shader_storage_buffer_object                   DONE (freedreno/a5xx, i965, softpipe)
-  GL_ARB_stencil_texturing                              DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_texture_buffer_range                           DONE (freedreno, nv50, i965, llvmpipe)
+  GL_ARB_stencil_texturing                              DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_texture_buffer_range                           DONE (freedreno, nv50, i965, llvmpipe, virgl)
  GL_ARB_texture_query_levels                           DONE (all drivers that support GLSL 1.30)
  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
  GL_ARB_texture_view                                   DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr)
@@ -205,17 +205,17 @@ GL 4.4, GLSL 4.40 -- all DONE: i965/gen8+, nvc0, r600, radeonsi
  - input/output block locations                        DONE
  GL_ARB_multi_bind                                     DONE (all drivers)
  GL_ARB_query_buffer_object                            DONE (i965/hsw+)
-  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_texture_stencil8                               DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, llvmpipe, softpipe, swr)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_texture_stencil8                               DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, llvmpipe, softpipe, swr, virgl)

 GL 4.5, GLSL 4.50 -- all DONE: nvc0, radeonsi

  GL_ARB_ES3_1_compatibility                            DONE (i965/hsw+, r600)
  GL_ARB_clip_control                                   DONE (freedreno, i965, nv50, r600, llvmpipe, softpipe, swr)
-  GL_ARB_conditional_render_inverted                    DONE (freedreno, i965, nv50, r600, llvmpipe, softpipe, swr)
-  GL_ARB_cull_distance                                  DONE (i965, nv50, r600, llvmpipe, softpipe, swr)
-  GL_ARB_derivative_control                             DONE (i965, nv50, r600)
+  GL_ARB_conditional_render_inverted                    DONE (freedreno, i965, nv50, r600, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_cull_distance                                  DONE (i965, nv50, r600, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_derivative_control                             DONE (i965, nv50, r600, virgl)
  GL_ARB_direct_state_access                            DONE (all drivers)
  GL_ARB_get_texture_sub_image                          DONE (all drivers)
  GL_ARB_shader_texture_image_samples                   DONE (i965, nv50, r600)
@@ -229,13 +229,13 @@ GL 4.6, GLSL 4.60
  GL_ARB_gl_spirv                                       in progress (Nicolai Hähnle, Ian Romanick)
  GL_ARB_indirect_parameters                            DONE (i965/gen7+, nvc0, radeonsi)
  GL_ARB_pipeline_statistics_query                      DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_polygon_offset_clamp                           DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, swr)
+  GL_ARB_polygon_offset_clamp                           DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, swr, virgl)
  GL_ARB_shader_atomic_counter_ops                      DONE (freedreno/a5xx, i965/gen7+, nvc0, r600, radeonsi, softpipe)
  GL_ARB_shader_draw_parameters                         DONE (i965, nvc0, radeonsi)
  GL_ARB_shader_group_vote                              DONE (i965, nvc0, radeonsi)
  GL_ARB_spirv_extensions                               in progress (Nicolai Hähnle, Ian Romanick)
  GL_ARB_texture_filter_anisotropic                     DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, softpipe (*), llvmpipe (*))
-  GL_ARB_transform_feedback_overflow_query              DONE (i965/gen6+, nvc0, radeonsi, llvmpipe, softpipe)
+  GL_ARB_transform_feedback_overflow_query              DONE (i965/gen6+, nvc0, radeonsi, llvmpipe, softpipe, virgl)
  GL_KHR_no_error                                       DONE (all drivers)

 (*) softpipe and llvmpipe advertise 16x anisotropy but simply ignore the setting
@@ -245,7 +245,7 @@ GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi

  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
  GL_ARB_compute_shader                                 DONE (freedreno/a5xx, i965/gen7+, softpipe)
-  GL_ARB_draw_indirect                                  DONE (freedreno, i965/gen7+, llvmpipe, softpipe, swr)
+  GL_ARB_draw_indirect                                  DONE (freedreno, i965/gen7+, llvmpipe, softpipe, swr, virgl)
  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
  GL_ARB_framebuffer_no_attachments                     DONE (freedreno, i965/gen7+, softpipe)
  GL_ARB_program_interface_query                        DONE (all drivers)
@@ -255,12 +255,12 @@ GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi
  GL_ARB_shader_storage_buffer_object                   DONE (freedreno/a5xx, i965/gen7+, softpipe)
  GL_ARB_shading_language_packing                       DONE (all drivers)
  GL_ARB_separate_shader_objects                        DONE (all drivers)
-  GL_ARB_stencil_texturing                              DONE (freedreno, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_texture_multisample (Multisample textures)     DONE (i965/gen7+, nv50, llvmpipe, softpipe)
+  GL_ARB_stencil_texturing                              DONE (freedreno, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_texture_multisample (Multisample textures)     DONE (i965/gen7+, nv50, llvmpipe, softpipe, virgl)
  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
-  GS5 Enhanced textureGather                            DONE (freedreno, i965/gen7+,)
-  GS5 Packing/bitfield/conversion functions             DONE (i965/gen6+)
+  GS5 Enhanced textureGather                            DONE (freedreno, i965/gen7+, virgl)
+  GS5 Packing/bitfield/conversion functions             DONE (i965/gen6+, virgl)
  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)

  Additional functionality not covered above:
@@ -300,16 +300,16 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
  GL_ARB_cl_event                                       not started
  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
  GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
-  GL_ARB_fragment_shader_interlock                      not started
+  GL_ARB_fragment_shader_interlock                      DONE (i965)
  GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
  GL_ARB_parallel_shader_compile                        not started, but Chia-I Wu did some related work in 2014
-  GL_ARB_post_depth_coverage                            DONE (i965)
+  GL_ARB_post_depth_coverage                            DONE (i965, nvc0)
  GL_ARB_robustness_isolation                           not started
-  GL_ARB_sample_locations                               not started
-  GL_ARB_seamless_cubemap_per_texture                   DONE (i965, nvc0, radeonsi, r600, softpipe, swr)
+  GL_ARB_sample_locations                               DONE (nvc0)
+  GL_ARB_seamless_cubemap_per_texture                   DONE (i965, nvc0, radeonsi, r600, softpipe, swr, virgl)
  GL_ARB_shader_ballot                                  DONE (i965/gen8+, nvc0, radeonsi)
  GL_ARB_shader_clock                                   DONE (i965/gen7+, nv50, nvc0, r600, radeonsi)
-  GL_ARB_shader_stencil_export                          DONE (i965/gen9+, r600, radeonsi, softpipe, llvmpipe, swr)
+  GL_ARB_shader_stencil_export                          DONE (i965/gen9+, r600, radeonsi, softpipe, llvmpipe, swr, virgl)
  GL_ARB_shader_viewport_layer_array                    DONE (i965/gen6+, nvc0, radeonsi)
  GL_ARB_sparse_buffer                                  DONE (radeonsi/CIK+)
  GL_ARB_sparse_texture                                 not started
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,47 @@

 <h1>News</h1>

+<h2>June 3, 2018</h2>
+<p>
+<a href="relnotes/18.0.5.html">Mesa 18.0.5</a> is released.
+This is a bug-fix release.
+<br>
+NOTE: It is anticipated that 18.0.5 will be the final release in the
+18.0 series. Users of 18.0 are encouraged to migrate to the 18.1
+series in order to obtain future fixes.
+</p>
+
+<h2>June 1, 2018</h2>
+<p>
+<a href="relnotes/18.1.1.html">Mesa 18.1.1</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 18, 2018</h2>
+<p>
+<a href="relnotes/18.1.0.html">Mesa 18.1.0</a> is released.  This is a
+new development release.  See the release notes for more information
+about the release.
+</p>
+
+<h2>May 17, 2018</h2>
+<p>
+<a href="relnotes/18.0.4.html">Mesa 18.0.4</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 7, 2018</h2>
+<p>
+<a href="relnotes/18.0.3.html">Mesa 18.0.3</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>April 28, 2018</h2>
+<p>
+<a href="relnotes/18.0.2.html">Mesa 18.0.2</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>April 18, 2018</h2>
 <p>
 <a href="relnotes/18.0.1.html">Mesa 18.0.1</a> is released.
--- a/docs/meson.html
+++ b/docs/meson.html
@@ -24,10 +24,7 @@ for production</strong></p>
 <p>The meson build is tested on on Linux, macOS, Cygwin and Haiku, it should
 work on FreeBSD, DragonflyBSD, NetBSD, and OpenBSD.</p>

-<p><strong>Mesa requires Meson >= 0.42.0 to build in general.</strong>
-
-Additionaly, to build the Clover OpenCL state tracker or the OpenSWR driver
-meson 0.44.0 or greater is required.
+<p><strong>Mesa requires Meson >= 0.44.1 to build.</strong>

 Some older versions of meson do not check that they are too old and will error
 out in odd ways.
@@ -36,7 +33,7 @@ out in odd ways.
 <p>
 The meson program is used to configure the source directory and generates
 either a ninja build file or Visual Studio® build files. The latter must
-be enabled via the --backend switch, as ninja is the default backend on all
+be enabled via the <code>--backend</code> switch, as ninja is the default backend on all
 operating systems. Meson only supports out-of-tree builds, and must be passed a
 directory to put built and generated sources into. We'll call that directory
 "build" for examples.
@@ -52,7 +49,7 @@ along with a build directory to view the selected options for. This will show
 your meson global arguments and project arguments, along with their defaults
 and your local settings.

-Moes does not currently support listing options before configure a build
+Meson does not currently support listing options before configuring a build
 directory, but this feature is being discussed upstream.
 </p>

@@ -63,13 +60,21 @@ directory, but this feature is being discussed upstream.
 <p>
 With additional arguments <code>meson configure</code> is used to change
 options on already configured build directory. All options passed to this
-command are in the form -D "command"="value".
+command are in the form <code>-D "command"="value"</code>.
 </p>

 <pre>
    meson configure build/ -Dprefix=/tmp/install -Dglx=true
 </pre>

+<p>
+Note that options taking lists (such as <code>platforms</code>) are
+<a href="http://mesonbuild.com/Build-options.html#using-build-options">a bit
+more complicated</a>, but the simplest form compatible with Mesa options
+is to use a comma to separate values (<code>-D platforms=drm,wayland</code>)
+and brackets to represent an empty list (<code>-D platforms=[]</code>).
+</p>
+
 <p>
 Once you've run the initial <code>meson</code> command successfully you can use
 your configured backend to build the project. With ninja, the -C option can be
@@ -85,13 +90,14 @@ Without arguments, it will produce libGL.so and/or several other libraries
 depending on the options you have chosen. Later, if you want to rebuild for a
 different configuration, you should run <code>ninja clean</code> before
 changing the configuration, or create a new out of tree build directory for
-each configuration you want to build.
-
-http://mesonbuild.com/Using-multiple-build-directories.html
+each configuration you want to build
+<a href="http://mesonbuild.com/Using-multiple-build-directories.html">as
+recommended in the documentation</a>.
 </p>

+<dl>
 <dt><code>Environment Variables</code></dt>
-<dd><p>Meson supports the standard CC and CXX envrionment variables for
+<dd><p>Meson supports the standard CC and CXX environment variables for
 changing the default compiler, and CFLAGS, CXXFLAGS, and LDFLAGS for setting
 options to the compiler and linker.

@@ -102,9 +108,9 @@ the popular compilers, a complete list is available
 These arguments are consumed and stored by meson when it is initialized or
 re-initialized. Therefore passing them to meson configure will not do anything,
 and passing them to ninja will only do something if ninja decides to
-re-initialze meson, for example, if a meson.build file has been changed.
+re-initialize meson, for example, if a meson.build file has been changed.
 Changing these variables will not cause all targets to be rebuilt, so running
-ninja clean is recomended when changing CFLAGS or CXXFLAGS. meson will never
+ninja clean is recommended when changing CFLAGS or CXXFLAGS. Meson will never
 change compiler in a configured build directory.
 </p>

@@ -116,14 +122,13 @@ change compiler in a configured build directory.
    CFLAGS=-Wno-typedef-redefinition ninja -C build-clang
 </pre>

-<p>Meson also honors DESTDIR for installs</p>
+<p>Meson also honors <code>DESTDIR</code> for installs.</p>
 </dd>


-<dl>
 <dt><code>LLVM</code></dt>
 <dd><p>Meson includes upstream logic to wrap llvm-config using it's standard
-dependncy interface. It will search $PATH (or %PATH% on windows) for
+dependency interface. It will search <code>$PATH</code> (or <code>%PATH%</code> on Windows) for
 llvm-config, so using an LLVM from a non-standard path is as easy as
 <code>PATH=/path/with/llvm-config:$PATH meson build</code>.
 </p></dd>
@@ -146,7 +151,7 @@ One of the oddities of meson is that some options are different when passed to
 the <code>meson</code> than to <code>meson configure</code>. These options are
 passed as --option=foo to <code>meson</code>, but -Doption=foo to <code>meson
 configure</code>. Mesa defined options are always passed as -Doption=foo.
-<p>
+</p>

 <p>For those coming from autotools be aware of the following:</p>

@@ -155,13 +160,13 @@ configure</code>. Mesa defined options are always passed as -Doption=foo.
 <dd><p>This option will set the compiler debug/optimisation levels to aid
 debugging the Mesa libraries.</p>

-<p>Note that in meson this defaults to "debugoptimized", and  not setting it to
-"release" will yield non-optimal performance and binary size. Not using "debug"
-may interfer with debbugging as some code and validation will be optimized
-away.
+<p>Note that in meson this defaults to <code>debugoptimized</code>, and
+not setting it to <code>release</code> will yield non-optimal
+performance and binary size. Not using <code>debug</code> may interfere
+with debugging as some code and validation will be optimized away.
 </p>

-<p> For those wishing to pass their own optimization flags, use the "plain"
+<p> For those wishing to pass their own optimization flags, use the <code>plain</code>
 buildtype, which causes meson to inject no additional compiler arguments, only
 those in the C/CXXFLAGS and those that mesa itself defines.</p>
 </dd>
@@ -169,10 +174,14 @@ those in the C/CXXFLAGS and those that mesa itself defines.</p>

 <dl>
 <dt><code>-Db_ndebug</code></dt>
-<dd><p>This option controls assertions in meson projects. When set to false
+<dd><p>This option controls assertions in meson projects. When set to <code>false</code>
 (the default) assertions are enabled, when set to true they are disabled. This
 is unrelated to the <code>buildtype</code>; setting the latter to
 <code>release</code> will not turn off assertions.
 </p>
 </dd>
 </dl>
+
+</div>
+</body>
+</html>
--- a/docs/patents.txt
+++ b/docs/patents.txt
@@ -1,31 +0,0 @@
-ARB_texture_float:
-
-    Silicon Graphics, Inc. owns US Patent #6,650,327, issued November 18,
-    2003 [1].
-
-    SGI believes this patent contains necessary IP for graphics systems
-    implementing floating point rasterization and floating point
-    framebuffer capabilities described in ARB_texture_float extension, and
-    will discuss licensing on RAND terms, on an individual basis with
-    companies wishing to use this IP in the context of conformant OpenGL
-    implementations [2].
-
-    The source code to implement ARB_texture_float extension is included
-    and can be toggled on at compile time, for those who purchased a
-    license from SGI, or are in a country where the patent does not apply,
-    etc.
-
-    The software is provided "as is", without warranty of any kind, express
-    or implied, including but not limited to the warranties of
-    merchantability, fitness for a particular purpose and noninfringement.
-    In no event shall the authors or copyright holders be liable for any
-    claim, damages or other liability, whether in an action of contract,
-    tort or otherwise, arising from, out of or in connection with the
-    software or the use or other dealings in the software.
-
-    You should contact a lawyer or SGI's legal department if you want to
-    enable this extension.
-
-
-[1] https://patents.google.com/patent/US6650327B1
-[2] https://www.opengl.org/registry/specs/ARB/texture_float.txt
--- a/docs/precompiled.html
+++ b/docs/precompiled.html
@@ -24,10 +24,12 @@ Some Linux distributions closely follow the latest Mesa releases. On others one
 has to use unofficial channels.
 <br>
 There are some general directions:
+<ul>
 <li>Debian/Ubuntu based distros - PPA: xorg-edgers, oibaf and padoka</li>
 <li>Fedora - Corp: erp and che</li>
 <li>OpenSuse/SLES - OBS: X11:XOrg and pontostroy:X11</li>
 <li>Gentoo/Archlinux - officially provided/supported</li>
+</ul>
 </p>

 </div>
--- a/docs/release-calendar.html
+++ b/docs/release-calendar.html
@@ -37,73 +37,36 @@ if you'd like to nominate a patch in the next stable release.
 <th>Release</th>
 <th>Release manager</th>
 <th>Notes</th>
-<tr>
-<td rowspan="3">18.0</td>
-<td>2018-04-20</td>
-<td>18.0.2</td>
-<td>Juan A. Suarez Romero</td>
-<td></td>
 </tr>
 <tr>
-<td>2018-05-04</td>
-<td>18.0.3</td>
-<td>Juan A. Suarez Romero</td>
-<td></td>
-</tr>
-<tr>
-<td>2018-05-18</td>
-<td>18.0.4</td>
-<td>Juan A. Suarez Romero</td>
-<td>Last planned 18.0.x release</td>
-</tr>
-<tr>
-<td rowspan="8">18.1</td>
-<td>2018-04-20</td>
-<td>18.1.0rc1</td>
-<td>Dylan Baker</td>
-<td></td>
-</tr>
-<tr>
-<td>2018-04-27</td>
-<td>18.1.0rc2</td>
-<td>Dylan Baker</td>
-<td></td>
-</tr>
-<tr>
-<td>2018-05-04</td>
-<td>18.1.0rc3</td>
-<td>Dylan Baker</td>
-<td></td>
-</tr>
-<tr>
-<td>2018-05-11</td>
-<td>18.1.0rc4</td>
-<td>Dylan Baker</td>
-<td>Last planned RC/Final release</td>
-</tr>
-<tr>
-<td>TBD</td>
-<td>18.1.1</td>
-<td>Emil Velikov</td>
-<td></td>
-</tr>
-<tr>
-<td>TBD</td>
-<td>18.1.2</td>
-<td>Emil Velikov</td>
-<td></td>
-</tr>
-<tr>
-<td>TBD</td>
+<td>2018-06-29</td>
 <td>18.1.3</td>
-<td>Emil Velikov</td>
+<td>Dylan Baker</td>
 <td></td>
 </tr>
 <tr>
-<td>TBD</td>
+<td>2018-07-13</td>
 <td>18.1.4</td>
-<td>Emil Velikov</td>
-<td>Last planned RC/Final release</td>
+<td>Dylan Baker</td>
+<td></td>
+</tr>
+<tr>
+<td>2018-07-27</td>
+<td>18.1.5</td>
+<td>Dylan Baker</td>
+<td></td>
+</tr>
+<tr>
+<td>2018-08-10</td>
+<td>18.1.6</td>
+<td>Dylan Baker</td>
+<td></td>
+</tr>
+<tr>
+<td>2018-08-24</td>
+<td>18.1.7</td>
+<td>Dylan Baker</td>
+<td>Last planned 18.1.x release</td>
 </tr>
 <tr>
 <td rowspan="4">18.2</td>
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,13 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>

 <ul>
+<li><a href="relnotes/18.1.2.html">18.1.2 release notes</a>
+<li><a href="relnotes/18.0.5.html">18.0.5 release notes</a>
+<li><a href="relnotes/18.1.1.html">18.1.1 release notes</a>
+<li><a href="relnotes/18.1.0.html">18.1.0 release notes</a>
+<li><a href="relnotes/18.0.4.html">18.0.4 release notes</a>
+<li><a href="relnotes/18.0.3.html">18.0.3 release notes</a>
+<li><a href="relnotes/18.0.2.html">18.0.2 release notes</a>
 <li><a href="relnotes/18.0.1.html">18.0.1 release notes</a>
 <li><a href="relnotes/17.3.9.html">17.3.9 release notes</a>
 <li><a href="relnotes/17.3.8.html">17.3.8 release notes</a>
--- a/docs/relnotes/18.0.0.html
+++ b/docs/relnotes/18.0.0.html
@@ -48,8 +48,8 @@ Note: some of the new features are only available with certain drivers.
 <li>Disk shader cache support for i965 when MESA_GLSL_CACHE_DISABLE environment variable is set to "0" or "false"</li>
 <li>GL_ARB_shader_atomic_counters and GL_ARB_shader_atomic_counter_ops on r600/evergreen+</li>
 <li>GL_ARB_shader_image_load_store and GL_ARB_shader_image_size on r600/evergreen+</li>
-<li>GL_ARB_shader_storage_buffer_object on r600/evergreen+<li>
-<li>GL_ARB_compute_shader on r600/evergreen+<li>
+<li>GL_ARB_shader_storage_buffer_object on r600/evergreen+</li>
+<li>GL_ARB_compute_shader on r600/evergreen+</li>
 <li>GL_ARB_cull_distance on r600/evergreen+</li>
 <li>GL_ARB_enhanced_layouts on r600/evergreen+</li>
 <li>GL_ARB_bindless_texture on nvc0/kepler</li>
--- a/docs/relnotes/18.0.2.html
+++ b/docs/relnotes/18.0.2.html
@@ -0,0 +1,144 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.0.2 Release Notes / April 28, 2018</h1>
+
+<p>
+Mesa 18.0.2 is a bug fix release which fixes bugs found since the 18.0.1 release.
+</p>
+<p>
+Mesa 18.0.2 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+SHA256: ffd8dfe3337b474a3baa085f0e7ef1a32c7cdc3bed1ad810b2633919a9324840  mesa-18.0.2.tar.gz
+SHA256: 98fa159768482dc568b9f8bf0f36c7acb823fa47428ffd650b40784f16b9e7b3  mesa-18.0.2.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95009">Bug 95009</a> - [SNB] amd_shader_trinary_minmax.execution.built-in-functions.gs-mid3-ivec2-ivec2-ivec2 intermittent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95012">Bug 95012</a> - [SNB] glsl-1_50.execution.built-in-functions.gs-op tests intermittent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98281">Bug 98281</a> - 'message's in ctx-&gt;Debug.LogMessages[] seem to leak.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105320">Bug 105320</a> - Storage texel buffer access produces wrong results (RX Vega)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105775">Bug 105775</a> - SI reaches the maximum IB size in dwords and fail to submit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105994">Bug 105994</a> - surface state leak when creating and destroying image views with aspectMask depth and stencil</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106074">Bug 106074</a> - radv: si_scissor_from_viewport returns incorrect result when using half-pixel viewport offset</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106126">Bug 106126</a> - eglMakeCurrent does not always ensure dri_drawable-&gt;update_drawable_info has been called for a new EGLSurface if another has been created and destroyed first</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Bas Nieuwenhuizen (2):</p>
+<ul>
+  <li>ac/nir: Make the GFX9 buffer size fix apply to image loads/atomics too.</li>
+  <li>radv: Mark GTT memory as device local for APUs.</li>
+</ul>
+
+<p>Dylan Baker (2):</p>
+<ul>
+  <li>bin/install_megadrivers: fix DESTDIR and -D*-path</li>
+  <li>meson: don't build classic mesa tests without dri_drivers</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>intel/compiler: Add scheduler deps for instructions that implicitly read g0</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>i965/fs: Return mlen * 8 for size_read() for INTERPOLATE_AT_*</li>
+</ul>
+
+<p>Johan Klokkhammer Helsing (1):</p>
+<ul>
+  <li>st/dri: Fix dangling pointer to a destroyed dri_drawable</li>
+</ul>
+
+<p>Juan A. Suarez Romero (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.0.1</li>
+  <li>travis: radv needs LLVM 4.0</li>
+  <li>cherry-ignore: add explicit 18.1 only nominations</li>
+  <li>Update version to 18.0.2</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>i965: Fix shadow batches to be the same size as the real BO.</li>
+</ul>
+
+<p>Lionel Landwerlin (1):</p>
+<ul>
+  <li>anv: fix number of planes for depth &amp; stencil</li>
+</ul>
+
+<p>Lucas Stach (1):</p>
+<ul>
+  <li>etnaviv: fix texture_format_needs_swiz</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>radeonsi/gfx9: fix a hang with an empty first IB</li>
+  <li>glsl_to_tgsi: try harder to lower unsupported ir_binop_vector_extract</li>
+  <li>Revert "st/dri: Fix dangling pointer to a destroyed dri_drawable"</li>
+</ul>
+
+<p>Samuel Pitoiset (2):</p>
+<ul>
+  <li>radv: fix scissor computation when using half-pixel viewport offset</li>
+  <li>radv/winsys: allow to submit up to 4 IBs for chips without chaining</li>
+</ul>
+
+<p>Thomas Hellstrom (1):</p>
+<ul>
+  <li>svga: Fix incorrect advertizing of EGL_KHR_gl_colorspace</li>
+</ul>
+
+<p>Timothy Arceri (1):</p>
+<ul>
+  <li>mesa: free debug messages when destroying the debug state</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/18.0.3.html
+++ b/docs/relnotes/18.0.3.html
@@ -0,0 +1,107 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.0.3 Release Notes / May 7, 2018</h1>
+
+<p>
+Mesa 18.0.3 is a bug fix release which fixes bugs found since the 18.0.2 release.
+</p>
+<p>
+Mesa 18.0.3 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+58cc5c5b1ab2a44e6e47f18ef6c29836ad06f95450adce635ce3c317507a171b  mesa-18.0.3.tar.gz
+099d9667327a76a61741a533f95067d76ea71a656e66b91507b3c0caf1d49e30  mesa-18.0.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105374">Bug 105374</a> - texture3d, a SaschaWillems demo, assert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106147">Bug 106147</a> - SIGBUS in write_reloc() when Sacha Willems' &quot;texture3d&quot; Vulkan demo starts</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andres Rodriguez (1):</p>
+<ul>
+  <li>radv/winsys: fix leaking resources from bo's imported by fd</li>
+</ul>
+
+<p>Boyuan Zhang (1):</p>
+<ul>
+  <li>radeon/vcn: fix mpeg4 msg buffer settings</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>gallium/util: Fix incorrect refcounting of separate stencil.</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>anv/allocator: Don't shrink either end of the block pool</li>
+</ul>
+
+<p>Juan A. Suarez Romero (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.0.2</li>
+  <li>cherry-ignore: add explicit 18.1 only nominations</li>
+  <li>Update version to 18.0.3</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>st/omx/enc: fix blit setup for YUV LoadImage</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>util/u_queue: fix a deadlock in util_queue_finish</li>
+  <li>radeonsi/gfx9: workaround for INTERP with indirect indexing</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>i965/tex_image: Avoid the ASTC LDR workaround on gen9lp</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radv: compute the number of subpass attachments correctly</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/18.0.4.html
+++ b/docs/relnotes/18.0.4.html
@@ -0,0 +1,157 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.0.4 Release Notes / May 17, 2018</h1>
+
+<p>
+Mesa 18.0.4 is a bug fix release which fixes bugs found since the 18.0.3 release.
+</p>
+<p>
+Mesa 18.0.4 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+d1dc3469faccdd73439479426952d71a9e8f684e8d03b6687063c12b13430801  mesa-18.0.4.tar.gz
+1f3bcfe7cef0a5c20dae2b41df5d7e0a985e06be0183fa4d43b6068fcba2920f  mesa-18.0.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91808">Bug 91808</a> - trine1 misrender r600g</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100430">Bug 100430</a> - [radv] graphical glitches on dolphin emulator</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106243">Bug 106243</a> - [kbl] GPU HANG: 9:0:0x85dffffb, in Cinnamon</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106480">Bug 106480</a> - A2B10G10R10_SNORM vertex attribute doesn't work.</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Bas Nieuwenhuizen (3):</p>
+<ul>
+  <li>radv: Translate logic ops.</li>
+  <li>radv: Fix up 2_10_10_10 alpha sign.</li>
+  <li>radv: Disable texel buffers with A2 SNORM/SSCALED/SINT for pre-vega.</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>r600: fix constant buffer bounds.</li>
+  <li>radv: resolve all layers in compute resolve path.</li>
+  <li>radv: use compute path for multi-layer images.</li>
+</ul>
+
+<p>Deepak Rawat (1):</p>
+<ul>
+  <li>egl/x11: Send invalidate to driver on copy_region path in swap_buffer</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>mesa: Add missing support for glFogiv(GL_FOG_DISTANCE_MODE_NV)</li>
+</ul>
+
+<p>Jan Vesely (8):</p>
+<ul>
+  <li>clover: Add explicit virtual destructor to argument class</li>
+  <li>eg/compute: Drop reference on code_bo in destructor.</li>
+  <li>r600: Cleanup constant buffers on context destruction</li>
+  <li>eg/compute: Drop reference to kernel_param bo in destructor</li>
+  <li>pipe-loader: Free driver_name in error path</li>
+  <li>gallium/auxiliary: Add helper function to count the number of entries in hash table</li>
+  <li>winsys/radeon: Destroy fd_hash table when the last winsys is removed.</li>
+  <li>winsys/amdgpu: Destroy dev_hash table when the last winsys is removed.</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>i965,anv: Set the CS stall bit on the ISP disable PIPE_CONTROL</li>
+</ul>
+
+<p>Jose Maria Casanova Crespo (2):</p>
+<ul>
+  <li>intel/compiler: fix 16-bit int brw_negate_immediate and brw_abs_immediate</li>
+  <li>intel/compiler: fix brw_imm_w for negative 16-bit integers</li>
+</ul>
+
+<p>Juan A. Suarez Romero (7):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.0.3</li>
+  <li>cherry-ignore: add explicit 18.1 only nominations</li>
+  <li>cherry-ignore: glsl: change ast_type_qualifier bitset size to work around GCC 5.4 bug</li>
+  <li>cherry-ignore: mesa: fix glGetInteger/Float/etc queries for vertex arrays attribs</li>
+  <li>cherry-ignore: mesa: revert GL_[SECONDARY_]COLOR_ARRAY_SIZE glGet type to TYPE_INT</li>
+  <li>cherry-ignore: radv/resolve: do fmask decompress on all layers.</li>
+  <li>Update version to 18.0.4</li>
+</ul>
+
+<p>Kai Wasserbäch (1):</p>
+<ul>
+  <li>opencl: autotools: Fix linking order for OpenCL target</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>i965: Don't leak blorp on Gen4-5.</li>
+</ul>
+
+<p>Lionel Landwerlin (2):</p>
+<ul>
+  <li>i965: require pixel scoreboard stall prior to ISP disable</li>
+  <li>anv: emit pixel scoreboard stall before ISP disable</li>
+</ul>
+
+<p>Matthew Nicholls (1):</p>
+<ul>
+  <li>radv: fix multisample image copies</li>
+</ul>
+
+<p>Neil Roberts (1):</p>
+<ul>
+  <li>spirv: Apply OriginUpperLeft to FragCoord</li>
+</ul>
+
+<p>Rhys Perry (1):</p>
+<ul>
+  <li>mesa: fix error handling in get_framebuffer_parameteriv</li>
+</ul>
+
+<p>Ross Burton (1):</p>
+<ul>
+  <li>src/intel/Makefile.vulkan.am: add missing MKDIR_GEN</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/18.0.5.html
+++ b/docs/relnotes/18.0.5.html
@@ -0,0 +1,162 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.0.5 Release Notes / June 3, 2018</h1>
+
+<p>
+Mesa 18.0.5 is a bug fix release which fixes bugs found since the 18.0.4 release.
+</p>
+<p>
+Mesa 18.0.5 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+ea3e00329cea899b1e32db812fd2f426832be37e4baa2e2fd9288a3480f30531  mesa-18.0.5.tar.gz
+5187bba8d72aea78f2062d134ec6079a508e8216062dce9ec9048b5eb2c4fc6b  mesa-18.0.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=78097">Bug 78097</a> - glUniform1ui and friends not supported by display lists</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102390">Bug 102390</a> - centroid interpolation causes broken attribute values</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105351">Bug 105351</a> - [Gen6+] piglit's arb_shader_image_load_store-host-mem-barrier fails with a glGetTexSubImage fallback path</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106090">Bug 106090</a> - Compiling compute shader crashes RADV</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106315">Bug 106315</a> - The witness + dxvk suffers flickering garbage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106465">Bug 106465</a> - No test for Image Load/Store on format-incompatible texture buffer</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106479">Bug 106479</a> - NDEBUG not defined for libamdgpu_addrlib</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106481">Bug 106481</a> - No test for Image Load/Store on texture buffer sized greater than MAX_TEXTURE_BUFFER_SIZE_ARB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106504">Bug 106504</a> - vulkan SPIR-V parsing failed at ../src/compiler/spirv/vtn_cfg.c:381</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106587">Bug 106587</a> - Dota2 is very dark when using vulkan render on a Intel &lt;&lt; AMD prime setup</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106629">Bug 106629</a> - [SNB,IVB,HSW,BDW] dEQP-EGL.functional.image.create.gles2_cubemap_negative_z_rgb_read_pixels</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>i965/glk: Add l3 banks count for 2x6 configuration</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (2):</p>
+<ul>
+  <li>amd/addrlib: Use defines in autotools build.</li>
+  <li>radv: Fix SRGB compute copies.</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>tgsi/scan: add hw atomic to the list of memory accessing files</li>
+</ul>
+
+<p>Francisco Jerez (4):</p>
+<ul>
+  <li>Revert "mesa: simplify _mesa_is_image_unit_valid for buffers"</li>
+  <li>i965: Move buffer texture size calculation into a common helper function.</li>
+  <li>i965: Handle non-zero texture buffer offsets in buffer object range calculation.</li>
+  <li>i965: Use intel_bufferobj_buffer() wrapper in image surface state setup.</li>
+</ul>
+
+<p>Jan Vesely (1):</p>
+<ul>
+  <li>eg/compute: Use reference counting to handle compute memory pool.</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>intel/eu: Set EXECUTE_1 when setting the rounding mode in cr0</li>
+  <li>intel/blorp: Support blits and clears on surfaces with offsets</li>
+</ul>
+
+<p>Jose Dapena Paz (1):</p>
+<ul>
+  <li>mesa: do not leak ctx-&gt;Shader.ReferencedProgram references</li>
+</ul>
+
+<p>Juan A. Suarez Romero (8):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.0.4</li>
+  <li>cherry-ignore: i965/miptree: Fix handling of uninitialized MCS buffers</li>
+  <li>cherry-ignore: add explicit 18.1 only nominations</li>
+  <li>cherry-ignore: mesa/st: handle vert_attrib_mask in nir case too</li>
+  <li>cherry-ignore: Tegra is not supported</li>
+  <li>cherry-ignore: st/mesa: fix assertion failures with GL_UNSIGNED_INT64_ARB (v2)</li>
+  <li>cherry-ignore: nv30: ensure that displayable formats are marked accordingly</li>
+  <li>Update version to 18.0.5</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>st/mesa: simplify lastLevel determination in st_finalize_texture</li>
+  <li>radeonsi: fix incorrect parentheses around VS-PS varying elimination</li>
+  <li>mesa: handle GL_UNSIGNED_INT64_ARB properly (v2)</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>dri3: Stricter SBC wraparound handling</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>i965/miptree: Zero-initialize CCS_D buffers</li>
+</ul>
+
+<p>Samuel Pitoiset (2):</p>
+<ul>
+  <li>spirv: fix visiting inner loops with same break/continue block</li>
+  <li>radv: fix centroid interpolation</li>
+</ul>
+
+<p>Stuart Young (1):</p>
+<ul>
+  <li>etnaviv: Fix missing rnndb file in tarballs</li>
+</ul>
+
+<p>Timothy Arceri (1):</p>
+<ul>
+  <li>mesa: add glUniform*ui{v} support to display lists</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/18.1.0.html
+++ b/docs/relnotes/18.1.0.html
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">

-<h1>Mesa 18.1.0 Release Notes / TBD</h1>
+<h1>Mesa 18.1.0 Release Notes / May 18, 2018</h1>

 <p>
 Mesa 18.1.0 is a new development release. People who are concerned
@@ -33,7 +33,8 @@ Compatibility contexts may report a lower version depending on each driver.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD.
+b1c1dbb42597190503d3abc518b12de880623f097c6cb6c293ecf69ae87e6fbf  mesa-18.1.0.tar.gz
+c855c5b67ef993b7621f76d8b120769ec0415f1c3616eaff44ef7f7f300aceba  mesa-18.1.0.tar.xz
 </pre>


@@ -58,7 +59,201 @@ Note: some of the new features are only available with certain drivers.
 <h2>Bug fixes</h2>

 <ul>
-TBD
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90311">Bug 90311</a> - Fail to build libglx with clang at linking stage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91808">Bug 91808</a> - trine1 misrender r600g</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95009">Bug 95009</a> - [SNB] amd_shader_trinary_minmax.execution.built-in-functions.gs-mid3-ivec2-ivec2-ivec2 intermittent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95012">Bug 95012</a> - [SNB] glsl-1_50.execution.built-in-functions.gs-op tests intermittent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98281">Bug 98281</a> - 'message's in ctx-&gt;Debug.LogMessages[] seem to leak.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99549">Bug 99549</a> - pp: Failed to translate a shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100259">Bug 100259</a> - [EGL] [GBM] undefined reference to `gbm_bo_create_with_modifiers'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101408">Bug 101408</a> - [Gen8+] Xonotic fails to render one of the weapons</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101442">Bug 101442</a> - Piglit shaders&#64;ssa&#64;fs-if-def-else-break fails with sb but passes with R600_DEBUG=nosb</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102342">Bug 102342</a> - mesa-17.1.7/src/gallium/auxiliary/pipebuffer/pb_cache.c:169]: (style) Suspicious condition</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102542">Bug 102542</a> - mesa-17.2.0/src/gallium/state_trackers/nine/nine_ff.c:1938: bad assignment ?</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102905">Bug 102905</a> - [R600] Miscompilation of TGSI to VLIW causes artifacts in Gallium Nine with Crysis2 bump mapping</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103006">Bug 103006</a> - [OpenGL CTS] [HSW] KHR-GL45.vertex_attrib_binding.basic-inputL-case1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103142">Bug 103142</a> - R600g+sb: optimizer apparently stuck in an endless loop</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103626">Bug 103626</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103746">Bug 103746</a> - [BDW BSW SKL KBL] dEQP-GLES31.functional.copy_image regressions</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104302">Bug 104302</a> - Wolfenstein 2 (2017) under wine graphical artifacting on RADV</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104335">Bug 104335</a> - [OpenGL CTS][SKL,KBL] KHR-GL45.vertex_attrib_64bit.limits_test occasionally fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104625">Bug 104625</a> - semicolon after if</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104636">Bug 104636</a> - [BSW/HD400] Aztec Ruins GL version GPU hangs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104642">Bug 104642</a> - Android: NULL pointer dereference with i965 mesa-dev, seems build_id_length related</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104654">Bug 104654</a> - r600/sb: Alien Isolation GPU lock</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104668">Bug 104668</a> - dEQP-GLES31.functional.shaders.linkage.uniform.block.differing_precision regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104717">Bug 104717</a> - Rocket League: grass rendering broken with nir</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104732">Bug 104732</a> - [radv] Binding descriptor sets disturbs other pipeline bindings</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104741">Bug 104741</a> - Graphic corruption for Android apps Telegram and KineMaster</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104762">Bug 104762</a> - Various segfaults/problems in qt/plasma</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104777">Bug 104777</a> - Attaching multiple shader objects for the same stage to a GLSL program triggers a linker error</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104794">Bug 104794</a> - piglit.spec.arb_internalformat_query2.samples and num_sample_counts pname checks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104803">Bug 104803</a> - SIGSEGV in state_tracker/st_glsl_to_tgsi_temprename.cpp</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104863">Bug 104863</a> - 186 assertions in piglit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104884">Bug 104884</a> - memory leak with intel i965 mesa when running android container in Ubuntu</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104905">Bug 104905</a> - SpvOpFOrdEqual doesn't return correct results for NaNs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104908">Bug 104908</a> - Texture Compression Hint not converted to enum16</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104915">Bug 104915</a> - Indexed SHADING_LANGUAGE_VERSION query not supported</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104923">Bug 104923</a> - anv: Dota2 rendering corruption</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104989">Bug 104989</a> - [r600] [bisected] OpenGL applications can't render anything at all</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105013">Bug 105013</a> - [regression] GLX+VA-API+clutter-gst video playback is corrupt with Mesa 17.3 (but is fine with 17.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105026">Bug 105026</a> - glxgears asserts with pp_jimenezmlaa=1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105029">Bug 105029</a> - simdlib_512_avx512.inl:371:57: error: could not convert ‘_mm512_mask_blend_epi32((__mmask16)(ImmT), a, b)’ from ‘__m512i’ {aka ‘__vector(8) long long int’} to ‘SIMDImpl::SIMD512Impl::Float’</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105052">Bug 105052</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105065">Bug 105065</a> - Qt Programs occasionally fail to render with new Mesa (glGetProgramBinary)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105067">Bug 105067</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105088">Bug 105088</a> - brw_nir_uniforms.cpp:256:10: error: non-constant-expression cannot be narrowed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105098">Bug 105098</a> - [RADV] GPU freeze with simple Vulkan App</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105103">Bug 105103</a> - Wayland master causes Mesa to fail to compile</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105120">Bug 105120</a> - meson build broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105161">Bug 105161</a> - KHR_blend_equation_advanced doesn't work in GLSL 1.10-1.40 shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105183">Bug 105183</a> - Weird assertion in NIR linker</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105211">Bug 105211</a> - build failure after zwp_dmabuf commit if wayland-protocols is not installed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105224">Bug 105224</a> - Webgl Pointclouds flickers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105229">Bug 105229</a> - [KBL SKL BDW HSW] [Regression] KHR-GLES31.core.shader_image_load_store.advanced-sso-simple failures</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105238">Bug 105238</a> - ast.h:648:16: error: union member 'i' has a non-trivial constructor</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105255">Bug 105255</a> - Waiting for fences without waitAll is not implemented</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105262">Bug 105262</a> - [R600] [BISECTED] ttf fonts are invisible in many programs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105271">Bug 105271</a> - WebGL2 shader crashes i965_dri.so 17.3.3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105274">Bug 105274</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105290">Bug 105290</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105292">Bug 105292</a> - vkGetQueryPoolResults returns incorrect query status for large query buffers (bisected)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105317">Bug 105317</a> - The GPU Vega 56 was hang while try to pass #GraphicsFuzz shader15 test</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105320">Bug 105320</a> - Storage texel buffer access produces wrong results (RX Vega)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105374">Bug 105374</a> - texture3d, a SaschaWillems demo, assert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105436">Bug 105436</a> - Blinking textures in UT2004 [bisected]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105440">Bug 105440</a> - GEN7: rendering issue on citra</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105442">Bug 105442</a> - Hang when running nine ff lighting shader with radeonsi</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105444">Bug 105444</a> - Enable GL disk shader cache when transform feedback is enabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105464">Bug 105464</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105471">Bug 105471</a> - [g33] [bisected] dEQP-GLES2.functional.shaders failures</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105497">Bug 105497</a> - shader-db crashes on 72 core system after ast_type_qualifier bitset change</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105529">Bug 105529</a> - u_debug_stack.c:268: error: #pragma GCC diagnostic not allowed inside functions</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105567">Bug 105567</a> - meson/ninja: 1. mesa/vdpau incorrect symlinks in DESTDIR and 2. Ddri-drivers-path Dvdpau-libs-path overrides DESTDIR</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105621">Bug 105621</a> - Build failure on GNOME Continuous</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105634">Bug 105634</a> - Android build test fails when building brw_oa_metrics.c</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105670">Bug 105670</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105704">Bug 105704</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105717">Bug 105717</a> - [bisected] Mesa build tests fails: BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105737">Bug 105737</a> - st_tests_common.cpp:140:42: error: no matching function for call to 'tgsi_get_opcode_info'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105738">Bug 105738</a> - commit f7ffa504a065dc2631fd38cc5fe885b277f4e7e7 causes artifacting in radv</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105740">Bug 105740</a> - glsl_types.cpp(524): error: a dynamically-initialized local static variable is not allowed inside of a statement expression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105775">Bug 105775</a> - SI reaches the maximum IB size in dwords and fail to submit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105807">Bug 105807</a> - [Regression, bisected]: 3D Rendering not working correctly in Warhammer 40k: Dawn of War II</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105817">Bug 105817</a> - scons build broken by glSpecializeShaderARB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105820">Bug 105820</a> - [m32] piglit regressions relinking program without shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105942">Bug 105942</a> - Graphical artefacts after update to mesa 18.0.0-2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105952">Bug 105952</a> - radv causes GPU hang on SI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105960">Bug 105960</a> - [bisected] meson build test fails with: undefined reference to `etna_pm_create_query'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105994">Bug 105994</a> - surface state leak when creating and destroying image views with aspectMask depth and stencil</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106074">Bug 106074</a> - radv: si_scissor_from_viewport returns incorrect result when using half-pixel viewport offset</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106126">Bug 106126</a> - eglMakeCurrent does not always ensure dri_drawable-&gt;update_drawable_info has been called for a new EGLSurface if another has been created and destroyed first</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106131">Bug 106131</a> - meson/ninja build missing file gtest.h</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106133">Bug 106133</a> - make check &quot;OSError: [Errno 24] Too many open files&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106147">Bug 106147</a> - SIGBUS in write_reloc() when Sacha Willems' &quot;texture3d&quot; Vulkan demo starts</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106174">Bug 106174</a> - vulkan dota2 broken (segfaulting), found bug commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106180">Bug 106180</a> - [bisected] radv vulkan smoke test black screen (Add support for DRI3 v1.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106243">Bug 106243</a> - [kbl] GPU HANG: 9:0:0x85dffffb, in Cinnamon</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106450">Bug 106450</a> - </li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106462">Bug 106462</a> - piglit.spec.arb_vertex_array_bgra.get regression</li>
+
 </ul>

 <h2>Changes</h2>
--- a/docs/relnotes/18.1.1.html
+++ b/docs/relnotes/18.1.1.html
@@ -0,0 +1,168 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.1.1 Release Notes / June 1, 2018</h1>
+
+<p>
+Mesa 18.1.1 is a bug fix release which fixes bugs found since the 18.1.0 release.
+</p>
+<p>
+Mesa 18.1.1 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+366a35f7530a016f2a8284fb0ee5759eeb216b4d6fa47f0e96b89ad2e43faf96  mesa-18.1.1.tar.gz
+d3312a2ede5aac14a47476b208b8e3a401367838330197c4588ab8ad420d7781  mesa-18.1.1.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>None</p>
+
+<h2>Changes</h2>
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>i965/glk: Add l3 banks count for 2x6 configuration</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (7):</p>
+<ul>
+  <li>radv: Fix multiview queries.</li>
+  <li>radv: Translate logic ops.</li>
+  <li>radv: Fix up 2_10_10_10 alpha sign.</li>
+  <li>radv: Disable texel buffers with A2 SNORM/SSCALED/SINT for pre-vega.</li>
+  <li>amd/addrlib: Use defines in autotools build.</li>
+  <li>radv: Fix SRGB compute copies.</li>
+  <li>radv: Only expose subgroup shuffles on VI+.</li>
+</ul>
+
+<p>Christoph Haag (1):</p>
+<ul>
+  <li>radv: fix VK_EXT_descriptor_indexing</li>
+</ul>
+
+<p>Dave Airlie (5):</p>
+<ul>
+  <li>radv/resolve: do fmask decompress on all layers.</li>
+  <li>radv: resolve all layers in compute resolve path.</li>
+  <li>radv: use compute path for multi-layer images.</li>
+  <li>virgl: set texture buffer offset alignment to disable ARB_texture_buffer_range.</li>
+  <li>tgsi/scan: add hw atomic to the list of memory accessing files</li>
+</ul>
+
+<p>Dylan Baker (2):</p>
+<ul>
+  <li>docs: Add sha sums for release</li>
+  <li>VERSION: bump to 18.1.1 for next release</li>
+</ul>
+
+<p>Eric Engestrom (1):</p>
+<ul>
+  <li>vulkan: don't free uninitialised memory</li>
+</ul>
+
+<p>Francisco Jerez (4):</p>
+<ul>
+  <li>Revert "mesa: simplify _mesa_is_image_unit_valid for buffers"</li>
+  <li>i965: Move buffer texture size calculation into a common helper function.</li>
+  <li>i965: Handle non-zero texture buffer offsets in buffer object range calculation.</li>
+  <li>i965: Use intel_bufferobj_buffer() wrapper in image surface state setup.</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nv30: ensure that displayable formats are marked accordingly</li>
+</ul>
+
+<p>Jan Vesely (1):</p>
+<ul>
+  <li>eg/compute: Use reference counting to handle compute memory pool.</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>intel/eu: Set EXECUTE_1 when setting the rounding mode in cr0</li>
+  <li>intel/blorp: Support blits and clears on surfaces with offsets</li>
+</ul>
+
+<p>Jose Dapena Paz (1):</p>
+<ul>
+  <li>mesa: do not leak ctx-&gt;Shader.ReferencedProgram references</li>
+</ul>
+
+<p>Kai Wasserbäch (1):</p>
+<ul>
+  <li>opencl: autotools: Fix linking order for OpenCL target</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>st/mesa: simplify lastLevel determination in st_finalize_texture</li>
+  <li>radeonsi: fix incorrect parentheses around VS-PS varying elimination</li>
+  <li>mesa: handle GL_UNSIGNED_INT64_ARB properly (v2)</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>dri3: Stricter SBC wraparound handling</li>
+</ul>
+
+<p>Nanley Chery (4):</p>
+<ul>
+  <li>i965: Add and use a getter for the miptree aux buffer</li>
+  <li>i965: Add and use a single miptree aux_buf field</li>
+  <li>i965/miptree: Fix handling of uninitialized MCS buffers</li>
+  <li>i965/miptree: Zero-initialize CCS_D buffers</li>
+</ul>
+
+<p>Samuel Pitoiset (2):</p>
+<ul>
+  <li>spirv: fix visiting inner loops with same break/continue block</li>
+  <li>radv: fix centroid interpolation</li>
+</ul>
+
+<p>Stuart Young (1):</p>
+<ul>
+  <li>etnaviv: Fix missing rnndb file in tarballs</li>
+</ul>
+
+<p>Thierry Reding (3):</p>
+<ul>
+  <li>tegra: Treat resources with modifiers as scanout</li>
+  <li>tegra: Fix scanout resources without modifiers</li>
+  <li>tegra: Remove usage of non-stable UAPI</li>
+</ul>
+
+<p>Timothy Arceri (1):</p>
+<ul>
+  <li>mesa: add glUniform*ui{v} support to display lists</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/18.1.2.html
+++ b/docs/relnotes/18.1.2.html
@@ -0,0 +1,170 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.1.2 Release Notes / June 15 2018</h1>
+
+<p>
+Mesa 18.1.2 is a bug fix release which fixes bugs found since the 18.1.1 release.
+</p>
+<p>
+Mesa 18.1.2 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+a644df23937f4078a2bd9a54349f6315c1955f5e3a4ac272832da51dea4d3c11  mesa-18.1.2.tar.gz
+070bf0648ba5b242d7303ceed32aed80842f4c0ba16e5acc1a650a46eadfb1f9  mesa-18.1.2.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>None</p>
+
+<h2>Changes</h2>
+
+<p>Alex Smith (4):</p>
+<ul>
+  <li>radv: Consolidate GFX9 merged shader lookup logic</li>
+  <li>radv: Handle GFX9 merged shaders in radv_flush_constants()</li>
+  <li>radeonsi: Fix crash on shaders using MSAA image load/store</li>
+  <li>radv: Set active_stages the same whether or not shaders were cached</li>
+</ul>
+
+<p>Andrew Galante (2):</p>
+<ul>
+  <li>meson: Test for __atomic_add_fetch in atomic checks</li>
+  <li>configure.ac: Test for __atomic_add_fetch in atomic checks</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (1):</p>
+<ul>
+  <li>radv: Don't pass a TESS_EVAL shader when tesselation is not enabled.</li>
+</ul>
+
+<p>Cameron Kumar (1):</p>
+<ul>
+  <li>vulkan/wsi: Destroy swapchain images after terminating FIFO queues</li>
+</ul>
+
+<p>Dylan Baker (6):</p>
+<ul>
+  <li>docs/relnotes: Add sha256 sums for mesa 18.1.1</li>
+  <li>cherry-ignore: add commits not to pull</li>
+  <li>cherry-ignore: Add patches from Jason that he rebased on 18.1</li>
+  <li>meson: work around gentoo applying -m32 to host compiler in cross builds</li>
+  <li>cherry-ignore: Add another patch</li>
+  <li>version: bump version for 18.1.2 release</li>
+</ul>
+
+<p>Eric Engestrom (3):</p>
+<ul>
+  <li>autotools: add missing android file to package</li>
+  <li>configure: radv depends on mako</li>
+  <li>i965: fix resource leak</li>
+</ul>
+
+<p>Jason Ekstrand (10):</p>
+<ul>
+  <li>intel/eu: Add some brw_get_default_ helpers</li>
+  <li>intel/eu: Copy fields manually in brw_next_insn</li>
+  <li>intel/eu: Set flag [sub]register number differently for 3src</li>
+  <li>intel/blorp: Don't vertex fetch directly from clear values</li>
+  <li>intel/isl: Add bounds-checking assertions in isl_format_get_layout</li>
+  <li>intel/isl: Add bounds-checking assertions for the format_info table</li>
+  <li>i965/screen: Refactor query_dma_buf_formats</li>
+  <li>i965/screen: Use RGBA non-sRGB formats for images</li>
+  <li>anv: Set fence/semaphore types to NONE in impl_cleanup</li>
+  <li>i965/screen: Return false for unsupported formats in query_modifiers</li>
+</ul>
+
+<p>Jordan Justen (1):</p>
+<ul>
+  <li>mesa/program_binary: add implicit UseProgram after successful ProgramBinary</li>
+</ul>
+
+<p>Juan A. Suarez Romero (1):</p>
+<ul>
+  <li>glsl: Add ir_binop_vector_extract in NIR</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Fix batch-last mode to properly swap BOs.</li>
+  <li>anv: Disable __gen_validate_value if NDEBUG is set.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>r300g/swtcl: make pipe_context uploaders use malloc'd memory as before</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>meson: Fix -latomic check</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>glx: Fix number of property values to read in glXImportContextEXT</li>
+</ul>
+
+<p>Nicolas Boichat (1):</p>
+<ul>
+  <li>configure.ac/meson.build: Fix -latomic test</li>
+</ul>
+
+<p>Philip Rebohle (1):</p>
+<ul>
+  <li>radv: Use correct color format for fast clears</li>
+</ul>
+
+<p>Samuel Pitoiset (3):</p>
+<ul>
+  <li>radv: fix a GPU hang when MRTs are sparse</li>
+  <li>radv: fix missing ZRANGE_PRECISION(1) for GFX9+</li>
+  <li>radv: add a workaround for DXVK hangs by setting amdgpu-skip-threshold</li>
+</ul>
+
+<p>Scott D Phillips (1):</p>
+<ul>
+  <li>intel/tools: add intel_sanitize_gpu to EXTRA_DIST</li>
+</ul>
+
+<p>Thomas Petazzoni (1):</p>
+<ul>
+  <li>configure.ac: rework -latomic check</li>
+</ul>
+
+<p>Timothy Arceri (2):</p>
+<ul>
+  <li>ac: fix possible truncation of intrinsic name</li>
+  <li>radeonsi: fix possible truncation on renderer string</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/18.2.0.html
+++ b/docs/relnotes/18.2.0.html
@@ -0,0 +1,72 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.0 Release Notes / TBD</h1>
+
+<p>
+Mesa 18.2.0 is a new development release. People who are concerned
+with stability and reliability should stick with a previous release or
+wait for Mesa 18.2.1.
+</p>
+<p>
+Mesa 18.2.0 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+<p>
+libwayland-egl is now distributed by Wayland (since 1.15,
+<a href="https://lists.freedesktop.org/archives/wayland-devel/2018-April/037767.html">see announcement</a>),
+and has been removed from Mesa in this release. Make sure you're using
+an up-to-date version of Wayland to keep the functionality.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD.
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+<li>GL_ARB_fragment_shader_interlock on i965</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+<ul>
+<li>GL_ARB_sample_locations and GL_NV_sample_locations on nvc0 (GM200+)</li>
+</ul>
+
+<h2>Changes</h2>
+
+<ul>
+<li>Removed GL_EXT_polygon_offset; applications should use glPolygonOffset instead.</li>
+<li>Removed libwayland-egl, now part of Wayland</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/submittingpatches.html
+++ b/docs/submittingpatches.html
@@ -122,9 +122,9 @@ Please use common sense and do <strong>not</strong> blindly add everyone.
 <pre>
    $ scripts/get_reviewer.pl --help # to get the help screen
    $ scripts/get_reviewer.pl -f src/egl/drivers/dri2/platform_android.c
-    Rob Herring <robh@kernel.org> (reviewer:ANDROID EGL SUPPORT,added_lines:188/700=27%,removed_lines:58/283=20%)
-    Tomasz Figa <tfiga@chromium.org> (reviewer:ANDROID EGL SUPPORT,authored:12/41=29%,added_lines:308/700=44%,removed_lines:115/283=41%)
-    Emil Velikov <emil.l.velikov@gmail.com> (authored:13/41=32%,removed_lines:76/283=27%)
+    Rob Herring &lt;robh@kernel.org&gt; (reviewer:ANDROID EGL SUPPORT,added_lines:188/700=27%,removed_lines:58/283=20%)
+    Tomasz Figa &lt;tfiga@chromium.org&gt; (reviewer:ANDROID EGL SUPPORT,authored:12/41=29%,added_lines:308/700=44%,removed_lines:115/283=41%)
+    Emil Velikov &lt;emil.l.velikov@gmail.com&gt; (authored:13/41=32%,removed_lines:76/283=27%)
 </pre>
 </ul>

--- a/docs/utilities.html
+++ b/docs/utilities.html
@@ -31,7 +31,7 @@
  <dd>is a very useful tool for tracking down
  memory-related problems in your code.</dd>

-  <dt><a href="https://scan.coverity.com/projects/mesa">Coverity</a><dt>
+  <dt><a href="https://scan.coverity.com/projects/mesa">Coverity</a></dt>
  <dd>provides static code analysis of Mesa.  If you create an account
  you can see the results and try to fix outstanding issues.</dd>
 </dl>
--- a/docs/viewperf.html
+++ b/docs/viewperf.html
@@ -18,8 +18,8 @@

 <p>
 This page lists known issues with
-<a href="https://www.spec.org/gwpg/gpc.static/vp11info.html" target="_main">SPEC Viewperf 11</a>
-and <a href="https://www.spec.org/gwpg/gpc.static/vp12info.html" target="_main">SPEC Viewperf 12</a>
+<a href="https://www.spec.org/gwpg/gpc.static/vp11info.html">SPEC Viewperf 11</a>
+and <a href="https://www.spec.org/gwpg/gpc.static/vp12info.html">SPEC Viewperf 12</a>
 when running on Mesa-based drivers.
 </p>

@@ -66,13 +66,10 @@ either in Viewperf or the Mesa driver.

 <p>
 These tests use features of the
-<a href="https://www.opengl.org/registry/specs/NV/fragment_program2.txt"
-target="_main">
-GL_NV_fragment_program2</a> and
-<a href="https://www.opengl.org/registry/specs/NV/vertex_program3.txt"
-target="_main">
-GL_NV_vertex_program3</a> extensions without checking if the driver supports
-them.
+<a href="https://www.opengl.org/registry/specs/NV/fragment_program2.txt">GL_NV_fragment_program2</a>
+and
+<a href="https://www.opengl.org/registry/specs/NV/vertex_program3.txt">GL_NV_vertex_program3</a>
+extensions without checking if the driver supports them.
 </p>
 <p>
 When Mesa tries to compile the vertex/fragment programs it generates errors
@@ -86,8 +83,8 @@ Subsequent drawing calls become no-ops and the rendering is incorrect.

 <p>
 These tests depend on the
-<a href="https://www.opengl.org/registry/specs/NV/primitive_restart.txt"
-target="_main">GL_NV_primitive_restart</a> extension.
+<a href="https://www.opengl.org/registry/specs/NV/primitive_restart.txt">GL_NV_primitive_restart</a>
+extension.
 </p>

 <p>
@@ -124,7 +121,7 @@ never specified.

 <p>
 A trace captured with
-<a href="https://github.com/apitrace/apitrace" target="_main">API trace</a>
+<a href="https://github.com/apitrace/apitrace">API trace</a>
 shows this sequences of calls like this:

 <pre>
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -48,6 +48,7 @@ typedef unsigned int drm_drawable_t;
 typedef struct drm_clip_rect drm_clip_rect_t;
 #endif

+#include <stdbool.h>
 #include <stdint.h>

 /**
@@ -589,7 +590,7 @@ struct __DRIdamageExtensionRec {
 * SWRast Loader extension.
 */
 #define __DRI_SWRAST_LOADER "DRI_SWRastLoader"
-#define __DRI_SWRAST_LOADER_VERSION 3
+#define __DRI_SWRAST_LOADER_VERSION 4
 struct __DRIswrastLoaderExtensionRec {
    __DRIextension base;

@@ -631,6 +632,24 @@ struct __DRIswrastLoaderExtensionRec {
   void (*getImage2)(__DRIdrawable *readable,
 		     int x, int y, int width, int height, int stride,
 		     char *data, void *loaderPrivate);
+
+    /**
+     * Put shm image to drawable
+     *
+     * \since 4
+     */
+    void (*putImageShm)(__DRIdrawable *drawable, int op,
+                        int x, int y, int width, int height, int stride,
+                        int shmid, char *shmaddr, unsigned offset,
+                        void *loaderPrivate);
+    /**
+     * Get shm image from readable
+     *
+     * \since 4
+     */
+    void (*getImageShm)(__DRIdrawable *readable,
+                        int x, int y, int width, int height,
+                        int shmid, void *loaderPrivate);
 };

 /**
@@ -728,7 +747,8 @@ struct __DRIuseInvalidateExtensionRec {
 #define __DRI_ATTRIB_BIND_TO_TEXTURE_TARGETS	46
 #define __DRI_ATTRIB_YINVERTED			47
 #define __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE	48
-#define __DRI_ATTRIB_MAX			(__DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE + 1)
+#define __DRI_ATTRIB_MUTABLE_RENDER_BUFFER	49 /* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR */
+#define __DRI_ATTRIB_MAX			50

 /* __DRI_ATTRIB_RENDER_TYPE */
 #define __DRI_ATTRIB_RGBA_BIT			0x01	
@@ -1253,6 +1273,7 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_FORMAT_YUYV         0x100f
 #define __DRI_IMAGE_FORMAT_XBGR2101010  0x1010
 #define __DRI_IMAGE_FORMAT_ABGR2101010  0x1011
+#define __DRI_IMAGE_FORMAT_SABGR8       0x1012

 #define __DRI_IMAGE_USE_SHARE		0x0001
 #define __DRI_IMAGE_USE_SCANOUT		0x0002
@@ -1289,6 +1310,7 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_FOURCC_ABGR8888	0x34324241
 #define __DRI_IMAGE_FOURCC_XBGR8888	0x34324258
 #define __DRI_IMAGE_FOURCC_SARGB8888	0x83324258
+#define __DRI_IMAGE_FOURCC_SABGR8888	0x84324258
 #define __DRI_IMAGE_FOURCC_ARGB2101010	0x30335241
 #define __DRI_IMAGE_FOURCC_XRGB2101010	0x30335258
 #define __DRI_IMAGE_FOURCC_ABGR2101010	0x30334241
@@ -1868,9 +1890,57 @@ struct __DRI2rendererQueryExtensionRec {
 * Image Loader extension. Drivers use this to allocate color buffers
 */

+/**
+ * See __DRIimageLoaderExtensionRec::getBuffers::buffer_mask.
+ */
 enum __DRIimageBufferMask {
   __DRI_IMAGE_BUFFER_BACK = (1 << 0),
-   __DRI_IMAGE_BUFFER_FRONT = (1 << 1)
+   __DRI_IMAGE_BUFFER_FRONT = (1 << 1),
+
+   /**
+    * A buffer shared between application and compositor. The buffer may be
+    * simultaneously accessed by each.
+    *
+    * A shared buffer is equivalent to an EGLSurface whose EGLConfig contains
+    * EGL_MUTABLE_RENDER_BUFFER_BIT_KHR and whose active EGL_RENDER_BUFFER (as
+    * opposed to any pending, requested change to EGL_RENDER_BUFFER) is
+    * EGL_SINGLE_BUFFER.
+    *
+    * If buffer_mask contains __DRI_IMAGE_BUFFER_SHARED, then it must contain no
+    * other bits. As a corollary, a __DRIdrawable that has a "shared" buffer
+    * has no front nor back buffer.
+    *
+    * The loader returns __DRI_IMAGE_BUFFER_SHARED in buffer_mask if and only
+    * if:
+    *     - The loader supports __DRI_MUTABLE_RENDER_BUFFER_LOADER.
+    *     - The driver supports __DRI_MUTABLE_RENDER_BUFFER_DRIVER.
+    *     - The EGLConfig of the drawable EGLSurface contains
+    *       EGL_MUTABLE_RENDER_BUFFER_BIT_KHR.
+    *     - The EGLContext's EGL_RENDER_BUFFER is EGL_SINGLE_BUFFER.
+    *       Equivalently, the EGLSurface's active EGL_RENDER_BUFFER (as
+    *       opposed to any pending, requested change to EGL_RENDER_BUFFER) is
+    *       EGL_SINGLE_BUFFER. (See the EGL 1.5 and
+    *       EGL_KHR_mutable_render_buffer spec for details about "pending" vs
+    *       "active" EGL_RENDER_BUFFER state).
+    *
+    * A shared buffer is similar to a front buffer in that all rendering to the
+    * buffer should appear promptly on the screen. It is different from
+    * a front buffer in that its behavior is independent from the
+    * GL_DRAW_BUFFER state. Specifically, if GL_DRAW_FRAMEBUFFER is 0 and the
+    * __DRIdrawable's buffer_mask is __DRI_IMAGE_BUFFER_SHARED, then all
+    * rendering should appear promptly on the screen if GL_DRAW_BUFFER is not
+    * GL_NONE.
+    *
+    * The difference between a shared buffer and a front buffer is motivated
+    * by the constraints of Android and OpenGL ES. OpenGL ES does not support
+    * front-buffer rendering. Android's SurfaceFlinger protocol provides the
+    * EGL driver only a back buffer and no front buffer. The shared buffer
+    * mode introduced by EGL_KHR_mutable_render_buffer is a backdoor through
+    * EGL that allows Android OpenGL ES applications to render to what is
+    * effectively the front buffer, a backdoor that required no change to the
+    * OpenGL ES API and little change to the SurfaceFlinger API.
+    */
+   __DRI_IMAGE_BUFFER_SHARED = (1 << 2),
 };

 struct __DRIimageList {
@@ -1895,7 +1965,8 @@ struct __DRIimageLoaderExtensionRec {
    * \param stamp              Address of variable to be updated when
    *                           getBuffers must be called again
    * \param loaderPrivate      The loaderPrivate for driDrawable
-    * \param buffer_mask        Set of buffers to allocate
+    * \param buffer_mask        Set of buffers to allocate. A bitmask of
+    *                           __DRIimageBufferMask.
    * \param buffers            Returned buffers
    */
   int (*getBuffers)(__DRIdrawable *driDrawable,
@@ -2009,4 +2080,85 @@ struct __DRIbackgroundCallableExtensionRec {
   GLboolean (*isThreadSafe)(void *loaderPrivate);
 };

+/**
+ * The driver portion of EGL_KHR_mutable_render_buffer.
+ *
+ * If the driver creates a __DRIconfig with
+ * __DRI_ATTRIB_MUTABLE_RENDER_BUFFER, then it must support this extension.
+ *
+ * To support this extension:
+ *
+ *    - The driver should create at least one __DRIconfig with
+ *      __DRI_ATTRIB_MUTABLE_RENDER_BUFFER. This is strongly recommended but
+ *      not required.
+ *
+ *    - The driver must be able to handle __DRI_IMAGE_BUFFER_SHARED if
+ *      returned by __DRIimageLoaderExtension::getBuffers().
+ *
+ *    - When rendering to __DRI_IMAGE_BUFFER_SHARED, it must call
+ *      __DRImutableRenderBufferLoaderExtension::displaySharedBuffer() in
+ *      response to glFlush and glFinish.  (This requirement is not documented
+ *      in EGL_KHR_mutable_render_buffer, but is a de-facto requirement in the
+ *      Android ecosystem. Android applications expect that glFlush will
+ *      immediately display the buffer when in shared buffer mode, and Android
+ *      drivers comply with this expectation).  It :may: call
+ *      displaySharedBuffer() more often than required.
+ *
+ *    - When rendering to __DRI_IMAGE_BUFFER_SHARED, it must ensure that the
+ *      buffer is always in a format compatible for display because the
+ *      display engine (usually SurfaceFlinger or hwcomposer) may display the
+ *      image at any time, even concurrently with 3D rendering. For example,
+ *      display hardware and the GL hardware may be able to access the buffer
+ *      simultaneously. In particular, if the buffer is compressed then take
+ *      care that SurfaceFlinger and hwcomposer can consume the compression
+ *      format.
+ *
+ * \see __DRI_IMAGE_BUFFER_SHARED
+ * \see __DRI_ATTRIB_MUTABLE_RENDER_BUFFER
+ * \see __DRI_MUTABLE_RENDER_BUFFER_LOADER
+ */
+#define __DRI_MUTABLE_RENDER_BUFFER_DRIVER "DRI_MutableRenderBufferDriver"
+#define __DRI_MUTABLE_RENDER_BUFFER_DRIVER_VERSION 1
+
+typedef struct __DRImutableRenderBufferDriverExtensionRec __DRImutableRenderBufferDriverExtension;
+struct __DRImutableRenderBufferDriverExtensionRec {
+   __DRIextension base;
+};
+
+/**
+ * The loader portion of EGL_KHR_mutable_render_buffer.
+ *
+ * Requires loader extension DRI_IMAGE_LOADER, through which the loader sends
+ * __DRI_IMAGE_BUFFER_SHARED to the driver.
+ *
+ * \see __DRI_MUTABLE_RENDER_BUFFER_DRIVER
+ */
+#define __DRI_MUTABLE_RENDER_BUFFER_LOADER "DRI_MutableRenderBufferLoader"
+#define __DRI_MUTABLE_RENDER_BUFFER_LOADER_VERSION 1
+
+typedef struct __DRImutableRenderBufferLoaderExtensionRec __DRImutableRenderBufferLoaderExtension;
+struct __DRImutableRenderBufferLoaderExtensionRec {
+   __DRIextension base;
+
+   /**
+    * Inform the display engine (that is, SurfaceFlinger and/or hwcomposer)
+    * that the __DRIdrawable has new content.
+    *
+    * The display engine may ignore this call, for example, if it continually
+    * refreshes and displays the buffer on every frame, as in
+    * EGL_ANDROID_front_buffer_auto_refresh. On the other extreme, the display
+    * engine may refresh and display the buffer only in frames in which the
+    * driver calls this.
+    *
+    * If the fence_fd is not -1, then the display engine will display the
+    * buffer only after the fence signals.
+    *
+    * The drawable's current __DRIimageBufferMask, as returned by
+    * __DRIimageLoaderExtension::getBuffers(), must be
+    * __DRI_IMAGE_BUFFER_SHARED.
+    */
+   void (*displaySharedBuffer)(__DRIdrawable *drawable, int fence_fd,
+                               void *loaderPrivate);
+};
+
 #endif
--- a/src/gallium/drivers/vc5/vc5_drm.h
+++ b/src/gallium/drivers/vc5/vc5_drm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014-2017 Broadcom
+ * Copyright © 2014-2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -21,8 +21,8 @@
 * IN THE SOFTWARE.
 */

-#ifndef _VC5_DRM_H_
-#define _VC5_DRM_H_
+#ifndef _V3D_DRM_H_
+#define _V3D_DRM_H_

 #include "drm.h"

@@ -30,28 +30,28 @@
 extern "C" {
 #endif

-#define DRM_VC5_SUBMIT_CL                         0x00
-#define DRM_VC5_WAIT_BO                           0x01
-#define DRM_VC5_CREATE_BO                         0x02
-#define DRM_VC5_MMAP_BO                           0x03
-#define DRM_VC5_GET_PARAM                         0x04
-#define DRM_VC5_GET_BO_OFFSET                     0x05
+#define DRM_V3D_SUBMIT_CL                         0x00
+#define DRM_V3D_WAIT_BO                           0x01
+#define DRM_V3D_CREATE_BO                         0x02
+#define DRM_V3D_MMAP_BO                           0x03
+#define DRM_V3D_GET_PARAM                         0x04
+#define DRM_V3D_GET_BO_OFFSET                     0x05

-#define DRM_IOCTL_VC5_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_SUBMIT_CL, struct drm_vc5_submit_cl)
-#define DRM_IOCTL_VC5_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_WAIT_BO, struct drm_vc5_wait_bo)
-#define DRM_IOCTL_VC5_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_CREATE_BO, struct drm_vc5_create_bo)
-#define DRM_IOCTL_VC5_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_MMAP_BO, struct drm_vc5_mmap_bo)
-#define DRM_IOCTL_VC5_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_GET_PARAM, struct drm_vc5_get_param)
-#define DRM_IOCTL_VC5_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_GET_BO_OFFSET, struct drm_vc5_get_bo_offset)
+#define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
+#define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
+#define DRM_IOCTL_V3D_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_CREATE_BO, struct drm_v3d_create_bo)
+#define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
+#define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
+#define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)

 /**
- * struct drm_vc5_submit_cl - ioctl argument for submitting commands to the 3D
+ * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
 * engine.
 *
 * This asks the kernel to have the GPU execute an optional binner
 * command list, and a render command list.
 */
-struct drm_vc5_submit_cl {
+struct drm_v3d_submit_cl {
 	/* Pointer to the binner command list.
 	 *
 	 * This is the first set of commands executed, which runs the
@@ -101,29 +101,32 @@ struct drm_vc5_submit_cl {

 	/* Number of BO handles passed in (size is that times 4). */
 	__u32 bo_handle_count;
+
+	/* Pad, must be zero-filled. */
+	__u32 pad;
 };

 /**
- * struct drm_vc5_wait_bo - ioctl argument for waiting for
- * completion of the last DRM_VC5_SUBMIT_CL on a BO.
+ * struct drm_v3d_wait_bo - ioctl argument for waiting for
+ * completion of the last DRM_V3D_SUBMIT_CL on a BO.
 *
 * This is useful for cases where multiple processes might be
 * rendering to a BO and you want to wait for all rendering to be
 * completed.
 */
-struct drm_vc5_wait_bo {
+struct drm_v3d_wait_bo {
 	__u32 handle;
 	__u32 pad;
 	__u64 timeout_ns;
 };

 /**
- * struct drm_vc5_create_bo - ioctl argument for creating VC5 BOs.
+ * struct drm_v3d_create_bo - ioctl argument for creating V3D BOs.
 *
 * There are currently no values for the flags argument, but it may be
 * used in a future extension.
 */
-struct drm_vc5_create_bo {
+struct drm_v3d_create_bo {
 	__u32 size;
 	__u32 flags;
 	/** Returned GEM handle for the BO. */
@@ -140,7 +143,7 @@ struct drm_vc5_create_bo {
 };

 /**
- * struct drm_vc5_mmap_bo - ioctl argument for mapping VC5 BOs.
+ * struct drm_v3d_mmap_bo - ioctl argument for mapping V3D BOs.
 *
 * This doesn't actually perform an mmap.  Instead, it returns the
 * offset you need to use in an mmap on the DRM device node.  This
@@ -150,7 +153,7 @@ struct drm_vc5_create_bo {
 * There are currently no values for the flags argument, but it may be
 * used in a future extension.
 */
-struct drm_vc5_mmap_bo {
+struct drm_v3d_mmap_bo {
 	/** Handle for the object being mapped. */
 	__u32 handle;
 	__u32 flags;
@@ -158,17 +161,17 @@ struct drm_vc5_mmap_bo {
 	__u64 offset;
 };

-enum drm_vc5_param {
-	DRM_VC5_PARAM_V3D_UIFCFG,
-	DRM_VC5_PARAM_V3D_HUB_IDENT1,
-	DRM_VC5_PARAM_V3D_HUB_IDENT2,
-	DRM_VC5_PARAM_V3D_HUB_IDENT3,
-	DRM_VC5_PARAM_V3D_CORE0_IDENT0,
-	DRM_VC5_PARAM_V3D_CORE0_IDENT1,
-	DRM_VC5_PARAM_V3D_CORE0_IDENT2,
+enum drm_v3d_param {
+	DRM_V3D_PARAM_V3D_UIFCFG,
+	DRM_V3D_PARAM_V3D_HUB_IDENT1,
+	DRM_V3D_PARAM_V3D_HUB_IDENT2,
+	DRM_V3D_PARAM_V3D_HUB_IDENT3,
+	DRM_V3D_PARAM_V3D_CORE0_IDENT0,
+	DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+	DRM_V3D_PARAM_V3D_CORE0_IDENT2,
 };

-struct drm_vc5_get_param {
+struct drm_v3d_get_param {
 	__u32 param;
 	__u32 pad;
 	__u64 value;
@@ -176,10 +179,10 @@ struct drm_vc5_get_param {

 /**
 * Returns the offset for the BO in the V3D address space for this DRM fd.
- * This is the same value returned by drm_vc5_create_bo, if that was called
+ * This is the same value returned by drm_v3d_create_bo, if that was called
 * from this DRM fd.
 */
-struct drm_vc5_get_bo_offset {
+struct drm_v3d_get_bo_offset {
 	__u32 handle;
 	__u32 offset;
 };
@@ -188,4 +191,4 @@ struct drm_vc5_get_bo_offset {
 }
 #endif

-#endif /* _VC5_DRM_H_ */
+#endif /* _V3D_DRM_H_ */
--- a/include/drm-uapi/vc4_drm.h
+++ b/include/drm-uapi/vc4_drm.h
@@ -183,10 +183,17 @@ struct drm_vc4_submit_cl {
 	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
 	__u32 perfmonid;

-	/* Unused field to align this struct on 64 bits. Must be set to 0.
-	 * If one ever needs to add an u32 field to this struct, this field
-	 * can be used.
+	/* Syncobj handle to wait on. If set, processing of this render job
+	 * will not start until the syncobj is signaled. 0 means ignore.
 	 */
+	__u32 in_sync;
+
+	/* Syncobj handle to export fence to. If set, the fence in the syncobj
+	 * will be replaced with a fence that signals upon completion of this
+	 * render job. 0 means ignore.
+	 */
+	__u32 out_sync;
+
 	__u32 pad2;
 };

--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -156,6 +156,7 @@ CHIPSET(0x5912, kbl_gt2, "Intel(R) HD Graphics 630 (Kaby Lake GT2)")
 CHIPSET(0x5916, kbl_gt2, "Intel(R) HD Graphics 620 (Kaby Lake GT2)")
 CHIPSET(0x591A, kbl_gt2, "Intel(R) HD Graphics P630 (Kaby Lake GT2)")
 CHIPSET(0x591B, kbl_gt2, "Intel(R) HD Graphics 630 (Kaby Lake GT2)")
+CHIPSET(0x591C, kbl_gt2, "Intel(R) Kaby Lake GT2")
 CHIPSET(0x591D, kbl_gt2, "Intel(R) HD Graphics P630 (Kaby Lake GT2)")
 CHIPSET(0x591E, kbl_gt2, "Intel(R) HD Graphics 615 (Kaby Lake GT2)")
 CHIPSET(0x5921, kbl_gt2, "Intel(R) Kabylake GT2F")
--- a/include/vulkan/vk_icd.h
+++ b/include/vulkan/vk_icd.h
@@ -24,13 +24,34 @@
 #define VKICD_H

 #include "vulkan.h"
+#include <stdbool.h>

-/*
- * Loader-ICD version negotiation API
- */
-#define CURRENT_LOADER_ICD_INTERFACE_VERSION 3
+// Loader-ICD version negotiation API.  Versions add the following features:
+//   Version 0 - Initial.  Doesn't support vk_icdGetInstanceProcAddr
+//               or vk_icdNegotiateLoaderICDInterfaceVersion.
+//   Version 1 - Add support for vk_icdGetInstanceProcAddr.
+//   Version 2 - Add Loader/ICD Interface version negotiation
+//               via vk_icdNegotiateLoaderICDInterfaceVersion.
+//   Version 3 - Add ICD creation/destruction of KHR_surface objects.
+//   Version 4 - Add unknown physical device extension querying via
+//               vk_icdGetPhysicalDeviceProcAddr.
+//   Version 5 - Tells ICDs that the loader is now paying attention to the
+//               application version of Vulkan passed into the ApplicationInfo
+//               structure during vkCreateInstance.  This will tell the ICD
+//               that if the loader is older, it should automatically fail a
+//               call for any API version > 1.0.  Otherwise, the loader will
+//               manually determine if it can support the expected version.
+#define CURRENT_LOADER_ICD_INTERFACE_VERSION 5
 #define MIN_SUPPORTED_LOADER_ICD_INTERFACE_VERSION 0
-typedef VkResult (VKAPI_PTR *PFN_vkNegotiateLoaderICDInterfaceVersion)(uint32_t *pVersion);
+#define MIN_PHYS_DEV_EXTENSION_ICD_INTERFACE_VERSION 4
+typedef VkResult(VKAPI_PTR *PFN_vkNegotiateLoaderICDInterfaceVersion)(uint32_t *pVersion);
+
+// This is defined in vk_layer.h which will be found by the loader, but if an ICD is building against this
+// file directly, it won't be found.
+#ifndef PFN_GetPhysicalDeviceProcAddr
+typedef PFN_vkVoidFunction(VKAPI_PTR *PFN_GetPhysicalDeviceProcAddr)(VkInstance instance, const char *pName);
+#endif
+
 /*
 * The ICD must reserve space for a pointer for the loader's dispatch
 * table, at the start of <each object>.
@@ -64,6 +85,9 @@ typedef enum {
    VK_ICD_WSI_PLATFORM_WIN32,
    VK_ICD_WSI_PLATFORM_XCB,
    VK_ICD_WSI_PLATFORM_XLIB,
+    VK_ICD_WSI_PLATFORM_ANDROID,
+    VK_ICD_WSI_PLATFORM_MACOS,
+    VK_ICD_WSI_PLATFORM_IOS,
    VK_ICD_WSI_PLATFORM_DISPLAY
 } VkIcdWsiPlatform;

@@ -77,7 +101,7 @@ typedef struct {
    MirConnection *connection;
    MirSurface *mirSurface;
 } VkIcdSurfaceMir;
-#endif // VK_USE_PLATFORM_MIR_KHR
+#endif  // VK_USE_PLATFORM_MIR_KHR

 #ifdef VK_USE_PLATFORM_WAYLAND_KHR
 typedef struct {
@@ -85,7 +109,7 @@ typedef struct {
    struct wl_display *display;
    struct wl_surface *surface;
 } VkIcdSurfaceWayland;
-#endif // VK_USE_PLATFORM_WAYLAND_KHR
+#endif  // VK_USE_PLATFORM_WAYLAND_KHR

 #ifdef VK_USE_PLATFORM_WIN32_KHR
 typedef struct {
@@ -93,7 +117,7 @@ typedef struct {
    HINSTANCE hinstance;
    HWND hwnd;
 } VkIcdSurfaceWin32;
-#endif // VK_USE_PLATFORM_WIN32_KHR
+#endif  // VK_USE_PLATFORM_WIN32_KHR

 #ifdef VK_USE_PLATFORM_XCB_KHR
 typedef struct {
@@ -101,7 +125,7 @@ typedef struct {
    xcb_connection_t *connection;
    xcb_window_t window;
 } VkIcdSurfaceXcb;
-#endif // VK_USE_PLATFORM_XCB_KHR
+#endif  // VK_USE_PLATFORM_XCB_KHR

 #ifdef VK_USE_PLATFORM_XLIB_KHR
 typedef struct {
@@ -109,13 +133,28 @@ typedef struct {
    Display *dpy;
    Window window;
 } VkIcdSurfaceXlib;
-#endif // VK_USE_PLATFORM_XLIB_KHR
+#endif  // VK_USE_PLATFORM_XLIB_KHR

 #ifdef VK_USE_PLATFORM_ANDROID_KHR
 typedef struct {
-    ANativeWindow* window;
+    VkIcdSurfaceBase base;
+    struct ANativeWindow *window;
 } VkIcdSurfaceAndroid;
-#endif //VK_USE_PLATFORM_ANDROID_KHR
+#endif  // VK_USE_PLATFORM_ANDROID_KHR
+
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+typedef struct {
+    VkIcdSurfaceBase base;
+    const void *pView;
+} VkIcdSurfaceMacOS;
+#endif  // VK_USE_PLATFORM_MACOS_MVK
+
+#ifdef VK_USE_PLATFORM_IOS_MVK
+typedef struct {
+    VkIcdSurfaceBase base;
+    const void *pView;
+} VkIcdSurfaceIOS;
+#endif  // VK_USE_PLATFORM_IOS_MVK

 typedef struct {
    VkIcdSurfaceBase base;
@@ -128,4 +167,4 @@ typedef struct {
    VkExtent2D imageExtent;
 } VkIcdSurfaceDisplay;

-#endif // VKICD_H
+#endif  // VKICD_H
--- a/include/vulkan/vulkan_core.h
+++ b/include/vulkan/vulkan_core.h
@@ -43,7 +43,7 @@ extern "C" {
 #define VK_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3ff)
 #define VK_VERSION_PATCH(version) ((uint32_t)(version) & 0xfff)
 // Version of this file
-#define VK_HEADER_VERSION 73
+#define VK_HEADER_VERSION 76


 #define VK_NULL_HANDLE 0
@@ -350,6 +350,11 @@ typedef enum VkStructureType {
    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SURFACE_INFO_2_KHR = 1000119000,
    VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR = 1000119001,
    VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR = 1000119002,
+    VK_STRUCTURE_TYPE_DISPLAY_PROPERTIES_2_KHR = 1000121000,
+    VK_STRUCTURE_TYPE_DISPLAY_PLANE_PROPERTIES_2_KHR = 1000121001,
+    VK_STRUCTURE_TYPE_DISPLAY_MODE_PROPERTIES_2_KHR = 1000121002,
+    VK_STRUCTURE_TYPE_DISPLAY_PLANE_INFO_2_KHR = 1000121003,
+    VK_STRUCTURE_TYPE_DISPLAY_PLANE_CAPABILITIES_2_KHR = 1000121004,
    VK_STRUCTURE_TYPE_IOS_SURFACE_CREATE_INFO_MVK = 1000122000,
    VK_STRUCTURE_TYPE_MACOS_SURFACE_CREATE_INFO_MVK = 1000123000,
    VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT = 1000128000,
@@ -2715,6 +2720,16 @@ typedef struct VkDrawIndirectCommand {
    uint32_t    firstInstance;
 } VkDrawIndirectCommand;

+typedef struct VkBaseOutStructure {
+    VkStructureType               sType;
+    struct VkBaseOutStructure*    pNext;
+} VkBaseOutStructure;
+
+typedef struct VkBaseInStructure {
+    VkStructureType                    sType;
+    const struct VkBaseInStructure*    pNext;
+} VkBaseInStructure;
+

 typedef VkResult (VKAPI_PTR *PFN_vkCreateInstance)(const VkInstanceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkInstance* pInstance);
 typedef void (VKAPI_PTR *PFN_vkDestroyInstance)(VkInstance instance, const VkAllocationCallbacks* pAllocator);
@@ -5572,6 +5587,70 @@ typedef VkPhysicalDeviceVariablePointerFeatures VkPhysicalDeviceVariablePointerF



+#define VK_KHR_get_display_properties2 1
+#define VK_KHR_GET_DISPLAY_PROPERTIES_2_SPEC_VERSION 1
+#define VK_KHR_GET_DISPLAY_PROPERTIES_2_EXTENSION_NAME "VK_KHR_get_display_properties2"
+
+typedef struct VkDisplayProperties2KHR {
+    VkStructureType           sType;
+    void*                     pNext;
+    VkDisplayPropertiesKHR    displayProperties;
+} VkDisplayProperties2KHR;
+
+typedef struct VkDisplayPlaneProperties2KHR {
+    VkStructureType                sType;
+    void*                          pNext;
+    VkDisplayPlanePropertiesKHR    displayPlaneProperties;
+} VkDisplayPlaneProperties2KHR;
+
+typedef struct VkDisplayModeProperties2KHR {
+    VkStructureType               sType;
+    void*                         pNext;
+    VkDisplayModePropertiesKHR    displayModeProperties;
+} VkDisplayModeProperties2KHR;
+
+typedef struct VkDisplayPlaneInfo2KHR {
+    VkStructureType     sType;
+    const void*         pNext;
+    VkDisplayModeKHR    mode;
+    uint32_t            planeIndex;
+} VkDisplayPlaneInfo2KHR;
+
+typedef struct VkDisplayPlaneCapabilities2KHR {
+    VkStructureType                  sType;
+    void*                            pNext;
+    VkDisplayPlaneCapabilitiesKHR    capabilities;
+} VkDisplayPlaneCapabilities2KHR;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceDisplayProperties2KHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkDisplayProperties2KHR* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceDisplayPlaneProperties2KHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkDisplayPlaneProperties2KHR* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkGetDisplayModeProperties2KHR)(VkPhysicalDevice physicalDevice, VkDisplayKHR display, uint32_t* pPropertyCount, VkDisplayModeProperties2KHR* pProperties);
+typedef VkResult (VKAPI_PTR *PFN_vkGetDisplayPlaneCapabilities2KHR)(VkPhysicalDevice physicalDevice, const VkDisplayPlaneInfo2KHR* pDisplayPlaneInfo, VkDisplayPlaneCapabilities2KHR* pCapabilities);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceDisplayProperties2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayProperties2KHR*                    pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceDisplayPlaneProperties2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayPlaneProperties2KHR*               pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayModeProperties2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkDisplayKHR                                display,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayModeProperties2KHR*                pProperties);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetDisplayPlaneCapabilities2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    const VkDisplayPlaneInfo2KHR*               pDisplayPlaneInfo,
+    VkDisplayPlaneCapabilities2KHR*             pCapabilities);
+#endif
+
 #define VK_KHR_dedicated_allocation 1
 #define VK_KHR_DEDICATED_ALLOCATION_SPEC_VERSION 3
 #define VK_KHR_DEDICATED_ALLOCATION_EXTENSION_NAME "VK_KHR_dedicated_allocation"
@@ -5727,6 +5806,33 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorSetLayoutSupportKHR(
    VkDescriptorSetLayoutSupport*               pSupport);
 #endif

+#define VK_KHR_draw_indirect_count 1
+#define VK_KHR_DRAW_INDIRECT_COUNT_SPEC_VERSION 1
+#define VK_KHR_DRAW_INDIRECT_COUNT_EXTENSION_NAME "VK_KHR_draw_indirect_count"
+
+typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirectCountKHR)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride);
+typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirectCountKHR)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectCountKHR(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset,
+    VkBuffer                                    countBuffer,
+    VkDeviceSize                                countBufferOffset,
+    uint32_t                                    maxDrawCount,
+    uint32_t                                    stride);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCountKHR(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset,
+    VkBuffer                                    countBuffer,
+    VkDeviceSize                                countBufferOffset,
+    uint32_t                                    maxDrawCount,
+    uint32_t                                    stride);
+#endif
+
 #define VK_EXT_debug_report 1
 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT)

--- a/meson.build
+++ b/meson.build
@@ -25,10 +25,13 @@ project(
    [find_program('python', 'python2', 'python3'), 'bin/meson_get_version.py']
  ).stdout(),
  license : 'MIT',
-  meson_version : '>= 0.42',
+  meson_version : '>= 0.44.1',
  default_options : ['buildtype=debugoptimized', 'c_std=c99', 'cpp_std=c++11']
 )

+cc = meson.get_compiler('c')
+cpp = meson.get_compiler('cpp')
+
 null_dep = dependency('', required : false)

 system_has_kms_drm = ['openbsd', 'netbsd', 'freebsd', 'dragonfly', 'linux'].contains(host_machine.system())
@@ -50,16 +53,13 @@ with_tests = get_option('build-tests')
 with_valgrind = get_option('valgrind')
 with_libunwind = get_option('libunwind')
 with_asm = get_option('asm')
+with_glx_read_only_text = get_option('glx-read-only-text')
 with_osmesa = get_option('osmesa')
-with_swr_arches = get_option('swr-arches').split(',')
-with_tools = get_option('tools').split(',')
+with_swr_arches = get_option('swr-arches')
+with_tools = get_option('tools')
 if with_tools.contains('all')
  with_tools = ['freedreno', 'glsl', 'intel', 'nir', 'nouveau']
 endif
-if get_option('texture-float')
-  pre_args += '-DTEXTURE_FLOAT_ENABLED'
-  message('WARNING: Floating-point texture enabled. Please consult docs/patents.txt and your lawyer before building mesa.')
-endif

 dri_drivers_path = get_option('dri-drivers-path')
 if dri_drivers_path == ''
@@ -93,128 +93,102 @@ endif

 system_has_kms_drm = ['openbsd', 'netbsd', 'freebsd', 'dragonfly', 'linux'].contains(host_machine.system())

-with_dri = false
-with_dri_i915 = false
-with_dri_i965 = false
-with_dri_r100 = false
-with_dri_r200 = false
-with_dri_nouveau = false
-with_dri_swrast = false
 _drivers = get_option('dri-drivers')
-if _drivers == 'auto'
+if _drivers.contains('auto')
  if system_has_kms_drm
    # TODO: PPC, Sparc
    if ['x86', 'x86_64'].contains(host_machine.cpu_family())
-      _drivers = 'i915,i965,r100,r200,nouveau'
+      _drivers = ['i915', 'i965', 'r100', 'r200', 'nouveau']
    elif ['arm', 'aarch64'].contains(host_machine.cpu_family())
-      _drivers = ''
+      _drivers = []
    else
      error('Unknown architecture. Please pass -Ddri-drivers to set driver options. Patches gladly accepted to fix this.')
    endif
  elif ['darwin', 'windows', 'cygwin', 'haiku'].contains(host_machine.system())
    # only swrast would make sense here, but gallium swrast is a much better default
-    _drivers = ''
+    _drivers = []
  else
    error('Unknown OS. Please pass -Ddri-drivers to set driver options. Patches gladly accepted to fix this.')
  endif
 endif
-if _drivers != ''
-  _split = _drivers.split(',')
-  with_dri_i915 = _split.contains('i915')
-  with_dri_i965 = _split.contains('i965')
-  with_dri_r100 = _split.contains('r100')
-  with_dri_r200 = _split.contains('r200')
-  with_dri_nouveau = _split.contains('nouveau')
-  with_dri_swrast = _split.contains('swrast')
-  with_dri = true
-endif

-with_gallium = false
-with_gallium_pl111 = false
-with_gallium_radeonsi = false
-with_gallium_r300 = false
-with_gallium_r600 = false
-with_gallium_nouveau = false
-with_gallium_freedreno = false
-with_gallium_softpipe = false
-with_gallium_vc4 = false
-with_gallium_vc5 = false
-with_gallium_etnaviv = false
-with_gallium_imx = false
-with_gallium_tegra = false
-with_gallium_i915 = false
-with_gallium_svga = false
-with_gallium_virgl = false
-with_gallium_swr = false
+with_dri_i915 = _drivers.contains('i915')
+with_dri_i965 = _drivers.contains('i965')
+with_dri_r100 = _drivers.contains('r100')
+with_dri_r200 = _drivers.contains('r200')
+with_dri_nouveau = _drivers.contains('nouveau')
+with_dri_swrast = _drivers.contains('swrast')
+
+with_dri = _drivers.length() != 0 and _drivers != ['']
+
 _drivers = get_option('gallium-drivers')
-if _drivers == 'auto'
+if _drivers.contains('auto')
  if system_has_kms_drm
    # TODO: PPC, Sparc
    if ['x86', 'x86_64'].contains(host_machine.cpu_family())
-      _drivers = 'r300,r600,radeonsi,nouveau,virgl,svga,swrast'
+      _drivers = [
+        'r300', 'r600', 'radeonsi', 'nouveau', 'virgl', 'svga', 'swrast'
+      ]
    elif ['arm', 'aarch64'].contains(host_machine.cpu_family())
-      _drivers = 'pl111,vc4,vc5,freedreno,etnaviv,imx,nouveau,tegra,virgl,swrast'
+      _drivers = [
+        'pl111', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'imx', 'nouveau',
+        'tegra', 'virgl', 'swrast',
+      ]
    else
      error('Unknown architecture. Please pass -Dgallium-drivers to set driver options. Patches gladly accepted to fix this.')
    endif
  elif ['darwin', 'windows', 'cygwin', 'haiku'].contains(host_machine.system())
-    _drivers = 'swrast'
+    _drivers = ['swrast']
  else
    error('Unknown OS. Please pass -Dgallium-drivers to set driver options. Patches gladly accepted to fix this.')
  endif
 endif
-if _drivers != ''
-  _split = _drivers.split(',')
-  with_gallium_pl111 = _split.contains('pl111')
-  with_gallium_radeonsi = _split.contains('radeonsi')
-  with_gallium_r300 = _split.contains('r300')
-  with_gallium_r600 = _split.contains('r600')
-  with_gallium_nouveau = _split.contains('nouveau')
-  with_gallium_freedreno = _split.contains('freedreno')
-  with_gallium_softpipe = _split.contains('swrast')
-  with_gallium_vc4 = _split.contains('vc4')
-  with_gallium_vc5 = _split.contains('vc5')
-  with_gallium_etnaviv = _split.contains('etnaviv')
-  with_gallium_imx = _split.contains('imx')
-  with_gallium_tegra = _split.contains('tegra')
-  with_gallium_i915 = _split.contains('i915')
-  with_gallium_svga = _split.contains('svga')
-  with_gallium_virgl = _split.contains('virgl')
-  with_gallium_swr = _split.contains('swr')
-  with_gallium = true
-  if system_has_kms_drm
-    _glx = get_option('glx')
-    _egl = get_option('egl')
-    if _glx == 'dri' or _egl == 'true' or (_glx == 'disabled' and _egl != 'false')
-      with_dri = true
-    endif
+with_gallium_pl111 = _drivers.contains('pl111')
+with_gallium_radeonsi = _drivers.contains('radeonsi')
+with_gallium_r300 = _drivers.contains('r300')
+with_gallium_r600 = _drivers.contains('r600')
+with_gallium_nouveau = _drivers.contains('nouveau')
+with_gallium_freedreno = _drivers.contains('freedreno')
+with_gallium_softpipe = _drivers.contains('swrast')
+with_gallium_vc4 = _drivers.contains('vc4')
+with_gallium_v3d = _drivers.contains('v3d')
+with_gallium_etnaviv = _drivers.contains('etnaviv')
+with_gallium_imx = _drivers.contains('imx')
+with_gallium_tegra = _drivers.contains('tegra')
+with_gallium_i915 = _drivers.contains('i915')
+with_gallium_svga = _drivers.contains('svga')
+with_gallium_virgl = _drivers.contains('virgl')
+with_gallium_swr = _drivers.contains('swr')
+
+with_gallium = _drivers.length() != 0 and _drivers != ['']
+
+if with_gallium and system_has_kms_drm
+  _glx = get_option('glx')
+  _egl = get_option('egl')
+  if _glx == 'dri' or _egl == 'true' or (_glx == 'disabled' and _egl != 'false')
+    with_dri = true
  endif
 endif

-with_intel_vk = false
-with_amd_vk = false
-with_any_vk = false
 _vulkan_drivers = get_option('vulkan-drivers')
-if _vulkan_drivers == 'auto'
+if _vulkan_drivers.contains('auto')
  if system_has_kms_drm
    if host_machine.cpu_family().startswith('x86')
-      _vulkan_drivers = 'amd,intel'
+      _vulkan_drivers = ['amd', 'intel']
    else
      error('Unknown architecture. Please pass -Dvulkan-drivers to set driver options. Patches gladly accepted to fix this.')
    endif
  elif ['darwin', 'windows', 'cygwin', 'haiku'].contains(host_machine.system())
    # No vulkan driver supports windows or macOS currently
-    _vulkan_drivers = ''
+    _vulkan_drivers = []
  else
    error('Unknown OS. Please pass -Dvulkan-drivers to set driver options. Patches gladly accepted to fix this.')
  endif
 endif
-if _vulkan_drivers != ''
-  _split = _vulkan_drivers.split(',')
-  with_intel_vk = _split.contains('intel')
-  with_amd_vk = _split.contains('amd')
-  with_any_vk = with_amd_vk or with_intel_vk
-endif
+
+with_intel_vk = _vulkan_drivers.contains('intel')
+with_amd_vk = _vulkan_drivers.contains('amd')
+with_any_vk = _vulkan_drivers.length() != 0 and _vulkan_drivers != ['']

 if with_dri_swrast and (with_gallium_softpipe or with_gallium_swr)
  error('Only one swrast provider can be built')
@@ -247,33 +221,30 @@ else
  with_dri_platform = 'none'
 endif

-with_platform_android = false
-with_platform_wayland = false
-with_platform_x11 = false
-with_platform_drm = false
-with_platform_surfaceless = false
-egl_native_platform = ''
 _platforms = get_option('platforms')
-if _platforms == 'auto'
+if _platforms.contains('auto')
  if system_has_kms_drm
-    _platforms = 'x11,wayland,drm,surfaceless'
+    _platforms = ['x11', 'wayland', 'drm', 'surfaceless']
  elif ['darwin', 'windows', 'cygwin'].contains(host_machine.system())
-    _platforms = 'x11,surfaceless'
+    _platforms = ['x11', 'surfaceless']
  elif ['haiku'].contains(host_machine.system())
-    _platforms = 'haiku'
+    _platforms = ['haiku']
  else
    error('Unknown OS. Please pass -Dplatforms to set platforms. Patches gladly accepted to fix this.')
  endif
 endif
-if _platforms != ''
-  _split = _platforms.split(',')
-  with_platform_android = _split.contains('android')
-  with_platform_x11 = _split.contains('x11')
-  with_platform_wayland = _split.contains('wayland')
-  with_platform_drm = _split.contains('drm')
-  with_platform_haiku = _split.contains('haiku')
-  with_platform_surfaceless = _split.contains('surfaceless')
-  egl_native_platform = _split[0]
+
+with_platform_android = _platforms.contains('android')
+with_platform_x11 = _platforms.contains('x11')
+with_platform_wayland = _platforms.contains('wayland')
+with_platform_drm = _platforms.contains('drm')
+with_platform_haiku = _platforms.contains('haiku')
+with_platform_surfaceless = _platforms.contains('surfaceless')
+
+with_platforms = false
+if _platforms.length() != 0 and _platforms != ['']
+  with_platforms = true
+  egl_native_platform = _platforms[0]
 endif

 with_glx = get_option('glx')
@@ -315,13 +286,13 @@ endif

 _egl = get_option('egl')
 if _egl == 'auto'
-  with_egl = with_dri and with_shared_glapi and egl_native_platform != ''
+  with_egl = with_dri and with_shared_glapi and with_platforms
 elif _egl == 'true'
  if not with_dri
    error('EGL requires dri')
  elif not with_shared_glapi
    error('EGL requires shared-glapi')
-  elif egl_native_platform == ''
+  elif not with_platforms
    error('No platforms specified, consider -Dplatforms=drm,x11 at least')
  elif not ['disabled', 'dri'].contains(with_glx)
    error('EGL requires dri, but a GLX is being built without dri')
@@ -343,11 +314,7 @@ endif
 pre_args += '-DGLX_USE_TLS'
 if with_glx != 'disabled'
  if not (with_platform_x11 and with_any_opengl)
-    if with_glx == 'auto'
-      with_glx = 'disabled'
-    else
-      error('Cannot build GLX support without X11 platform support and at least one OpenGL API')
-    endif
+    error('Cannot build GLX support without X11 platform support and at least one OpenGL API')
  elif with_glx == 'gallium-xlib' 
    if not with_gallium
      error('Gallium-xlib based GLX requires at least one gallium driver')
@@ -360,8 +327,12 @@ if with_glx != 'disabled'
    if with_dri
      error('xlib conflicts with any dri driver')
    endif
-  elif with_glx == 'dri' and not with_dri
-    error('dri based GLX requires at least one DRI driver')
+  elif with_glx == 'dri'
+    if not with_dri
+      error('dri based GLX requires at least one DRI driver')
+    elif not with_shared_glapi
+      error('dri based GLX requires shared-glapi')
+    endif
  endif
 endif

@@ -584,7 +555,7 @@ endif
 with_gallium_va = _va == 'true'
 dep_va = null_dep
 if with_gallium_va
-  dep_va = dependency('libva', version : '>= 0.38.0')
+  dep_va = dependency('libva', version : '>= 0.39.0')
  dep_va_headers = declare_dependency(
    compile_args : run_command(prog_pkgconfig, ['libva', '--cflags']).stdout().split()
  )
@@ -630,13 +601,34 @@ if with_gallium_st_nine
  endif
 endif

+if get_option('power8') != 'false'
+  if host_machine.cpu_family() == 'ppc64le'
+    if cc.get_id() == 'gcc' and cc.version().version_compare('< 4.8')
+      error('Altivec is not supported with gcc version < 4.8.')
+    endif
+    if cc.compiles('''
+        #include <altivec.h>
+        int main() {
+          vector unsigned char r;
+          vector unsigned int v = vec_splat_u32 (1);
+          r = __builtin_vec_vgbbd ((vector unsigned char) v);
+          return 0;
+        }''',
+        args : '-mpower8-vector',
+        name : 'POWER8 intrinsics')
+      pre_args += ['-D_ARCH_PWR8', '-mpower8-vector']
+    elif get_option('power8') == 'true'
+      error('POWER8 intrinsic support required but not found.')
+    endif
+  endif
+endif
+
 _opencl = get_option('gallium-opencl')
 if _opencl != 'disabled'
  if not with_gallium
    error('OpenCL Clover implementation requires at least one gallium driver.')
  endif

-  # TODO: alitvec?
  dep_clc = dependency('libclc')
  with_gallium_opencl = true
  with_opencl_icd = _opencl == 'icd'
@@ -697,7 +689,6 @@ if has_mako.returncode() != 0
  error('Python (2.x) mako module required to build mesa.')
 endif

-cc = meson.get_compiler('c')
 if cc.get_id() == 'gcc' and cc.version().version_compare('< 4.4.6')
  error('When using GCC, version 4.4.6 or later is required.')
 endif
@@ -777,7 +768,6 @@ if cc.has_argument('-fvisibility=hidden')
 endif

 # Check for generic C++ arguments
-cpp = meson.get_compiler('cpp')
 cpp_args = []
 foreach a : ['-Wall', '-fno-math-errno', '-fno-trapping-math',
             '-Qunused-arguments']
@@ -836,7 +826,15 @@ endif
 # Check for GCC style atomics
 dep_atomic = null_dep

-if cc.compiles('int main() { int n; return __atomic_load_n(&n, __ATOMIC_ACQUIRE); }',
+if cc.compiles('''#include <stdint.h>
+                  int main() {
+                    struct {
+                      uint64_t *v;
+                    } x;
+                    return (int)__atomic_load_n(x.v, __ATOMIC_ACQUIRE) &
+                           (int)__atomic_add_fetch(x.v, (uint64_t)1, __ATOMIC_ACQ_REL);
+
+                  }''',
               name : 'GCC atomic builtins')
  pre_args += '-DUSE_GCC_ATOMIC_BUILTINS'

@@ -848,8 +846,11 @@ if cc.compiles('int main() { int n; return __atomic_load_n(&n, __ATOMIC_ACQUIRE)
  # as ARM.
  if not cc.links('''#include <stdint.h>
                     int main() {
-                       uint64_t n;
-                       return (int)__atomic_load_n(&n, __ATOMIC_ACQUIRE);
+                       struct {
+                         uint64_t *v;
+                       } x;
+                       return (int)__atomic_load_n(x.v, __ATOMIC_ACQUIRE) &
+                              (int)__atomic_add_fetch(x.v, (uint64_t)1, __ATOMIC_ACQ_REL);
                     }''',
                  name : 'GCC atomic builtins required -latomic')
    dep_atomic = cc.find_library('atomic')
@@ -864,30 +865,44 @@ if not cc.links('''#include <stdint.h>
  pre_args += '-DMISSING_64_BIT_ATOMICS'
 endif

-# TODO: endian
-# TODO: powr8
 # TODO: shared/static? Is this even worth doing?

-# Building x86 assembly code requires running x86 binaries. It is possible for
-# x86_64 OSes to run x86 binaries, so don't disable asm in those cases
-# TODO: it should be possible to use an exe_wrapper to run the binary during
-# the build. 
+# When cross compiling we generally need to turn off the use of assembly,
+# because mesa's assembly relies on building an executable for the host system,
+# and running it to get information about struct sizes. There is at least one
+# case of cross compiling where we can use asm, and that's x86_64 -> x86 when
+# host OS == build OS, since in that case the build machine can run the host's
+# binaries.
 if meson.is_cross_build() 
-  if not (build_machine.cpu_family() == 'x86_64' and host_machine.cpu_family() == 'x86'
-          and build_machine.system() == host_machine.system())
-    message('Cross compiling to x86 from non-x86, disabling asm')
+  if build_machine.system() != host_machine.system()
+    # TODO: It may be possible to do this with an exe_wrapper (like wine).
+    message('Cross compiling from one OS to another, disabling assembly.')
+    with_asm = false
+  elif not (build_machine.cpu_family().startswith('x86') and host_machine.cpu_family() == 'x86')
+    # FIXME: Gentoo always sets -m32 for x86_64 -> x86 builds, resulting in an
+    # x86 -> x86 cross compile. We use startswith rather than == to handle this
+    # case.
+    # TODO: There may be other cases where the 64 bit version of the
+    # architecture can run 32 bit binaries (aarch64 and armv7 for example)
+    message('''
+      Cross compiling to different architectures, and the build machine
+      cannot run the host's binaries. Disabling assembly.
+    ''')
    with_asm = false
  endif
 endif

 with_asm_arch = ''
 if with_asm
-  # TODO: SPARC and PPC
  if host_machine.cpu_family() == 'x86'
    if system_has_kms_drm
      with_asm_arch = 'x86'
      pre_args += ['-DUSE_X86_ASM', '-DUSE_MMX_ASM', '-DUSE_3DNOW_ASM',
                   '-DUSE_SSE_ASM']
+
+      if with_glx_read_only_text
+         pre_args += ['-DGLX_X86_READONLY_TEXT']
+      endif
    endif
  elif host_machine.cpu_family() == 'x86_64'
    if system_has_kms_drm
@@ -904,6 +919,16 @@ if with_asm
      with_asm_arch = 'aarch64'
      pre_args += ['-DUSE_AARCH64_ASM']
    endif
+  elif host_machine.cpu_family() == 'sparc64'
+    if system_has_kms_drm
+      with_asm_arch = 'sparc'
+      pre_args += ['-DUSE_SPARC_ASM']
+    endif
+  elif host_machine.cpu_family() == 'ppc64le'
+    if system_has_kms_drm
+      with_asm_arch = 'ppc64le'
+      pre_args += ['-DUSE_PPC64LE_ASM']
+    endif
  endif
 endif

@@ -1038,7 +1063,7 @@ _drm_amdgpu_ver = '2.4.91'
 _drm_radeon_ver = '2.4.71'
 _drm_nouveau_ver = '2.4.66'
 _drm_etnaviv_ver = '2.4.89'
-_drm_freedreno_ver = '2.4.91'
+_drm_freedreno_ver = '2.4.92'
 _drm_intel_ver = '2.4.75'
 _drm_ver = '2.4.75'

@@ -1052,6 +1077,12 @@ _libdrm_checks = [
  ['freedreno', with_gallium_freedreno],
 ]

+# VC4 only needs core libdrm support of this version, not a libdrm_vc4
+# library.
+if with_gallium_vc4
+  _drm_ver = '2.4.89'
+endif
+
 # Loop over the enables versions and get the highest libdrm requirement for all
 # active drivers.
 foreach d : _libdrm_checks
@@ -1084,6 +1115,7 @@ if dep_libdrm.found()
 endif

 llvm_modules = ['bitwriter', 'engine', 'mcdisassembler', 'mcjit']
+llvm_optional_modules = []
 if with_amd_vk or with_gallium_radeonsi or with_gallium_r600
  llvm_modules += ['amdgpu', 'bitreader', 'ipo']
  if with_gallium_r600
@@ -1095,10 +1127,12 @@ if with_gallium_opencl
    'all-targets', 'linker', 'coverage', 'instrumentation', 'ipo', 'irreader',
    'lto', 'option', 'objcarcopts', 'profiledata',
  ]
-  # TODO: optional modules
+  llvm_optional_modules += ['coroutines', 'opencl']
 endif

-if with_amd_vk or with_gallium_radeonsi or with_gallium_swr
+if with_amd_vk or with_gallium_radeonsi
+  _llvm_version = '>= 5.0.0'
+elif with_gallium_swr
  _llvm_version = '>= 4.0.0'
 elif with_gallium_opencl or with_gallium_r600
  _llvm_version = '>= 3.9.0'
@@ -1109,12 +1143,20 @@ endif
 _llvm = get_option('llvm')
 if _llvm == 'auto'
  dep_llvm = dependency(
-    'llvm', version : _llvm_version, modules : llvm_modules,
+    'llvm',
+    version : _llvm_version,
+    modules : llvm_modules,
+    optional_modules : llvm_optional_modules,
    required : with_amd_vk or with_gallium_radeonsi or with_gallium_swr or with_gallium_opencl,
  )
  with_llvm = dep_llvm.found()
 elif _llvm == 'true'
-  dep_llvm = dependency('llvm', version : _llvm_version, modules : llvm_modules)
+  dep_llvm = dependency(
+    'llvm',
+    version : _llvm_version,
+    modules : llvm_modules,
+    optional_modules : llvm_optional_modules,
+  )
  with_llvm = true
 else
  dep_llvm = null_dep
@@ -1143,6 +1185,13 @@ if with_llvm
    '-DHAVE_LLVM=0x0@0@0@1@'.format(_llvm_version[0], _llvm_version[1]),
    '-DMESA_LLVM_VERSION_PATCH=@0@'.format(_llvm_patch),
  ]
+
+  # LLVM can be built without RTTI. Turning off RTTI changes the ABI of C++
+  # programs, so we need to build all C++ code in mesa without RTTI as well to
+  # ensure that linking works.
+  if dep_llvm.get_configtool_variable('has-rtti') == 'NO'
+    cpp_args += '-fno-rtti'
+  endif
 elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr
  error('The following drivers require LLVM: Radv, RadeonSI, SWR. One of these is enabled, but LLVM is disabled.')
 endif
@@ -1173,8 +1222,6 @@ if get_option('selinux')
  pre_args += '-DMESA_SELINUX'
 endif

-# TODO: llvm-prefix and llvm-shared-libs
-
 if with_libunwind != 'false'
  dep_unwind = dependency('libunwind', required : with_libunwind == 'true')
  if dep_unwind.found()
@@ -1184,8 +1231,6 @@ else
  dep_unwind = null_dep
 endif

-# TODO: gallium-hud
-
 if with_osmesa != 'none'
  if with_osmesa == 'classic' and not with_dri_swrast
    error('OSMesa classic requires dri (classic) swrast.')
@@ -1213,6 +1258,11 @@ if with_platform_wayland
  dep_wl_protocols = dependency('wayland-protocols', version : '>= 1.8')
  dep_wayland_client = dependency('wayland-client', version : '>=1.11')
  dep_wayland_server = dependency('wayland-server', version : '>=1.11')
+  if with_egl
+    dep_wayland_egl = dependency('wayland-egl-backend', version : '>= 3')
+    dep_wayland_egl_headers = declare_dependency(
+      compile_args : run_command(prog_pkgconfig, ['wayland-egl-backend', '--cflags']).stdout().split())
+  endif
  wayland_dmabuf_xml = join_paths(
    dep_wl_protocols.get_pkgconfig_variable('pkgdatadir'), 'unstable',
    'linux-dmabuf', 'linux-dmabuf-unstable-v1.xml'
@@ -1305,18 +1355,6 @@ else
  dep_lmsensors = null_dep
 endif

-# TODO: various libdirs
-
-# TODO: gallium driver dirs
-
-# FIXME: this is a workaround for #2326
-prog_touch = find_program('touch')
-dummy_cpp = custom_target(
-  'dummy_cpp',
-  output : 'dummy.cpp',
-  command : [prog_touch, '@OUTPUT@'],
-)
-
 foreach a : pre_args
  add_project_arguments(a, language : ['c', 'cpp'])
 endforeach
@@ -1329,18 +1367,24 @@ endforeach

 inc_include = include_directories('include')

-gl_priv_reqs = [
-  'x11', 'xext', 'xdamage >= 1.1', 'xfixes', 'x11-xcb', 'xcb',
-  'xcb-glx >= 1.8.1']
+gl_priv_reqs = []
+
+if with_glx == 'xlib' or with_glx == 'gallium-xlib'
+  gl_priv_reqs += ['x11', 'xext', 'xcb']
+elif with_glx == 'dri'
+  gl_priv_reqs += [
+    'x11', 'xext', 'xdamage >= 1.1', 'xfixes', 'x11-xcb', 'xcb',
+    'xcb-glx >= 1.8.1']
+  if with_dri_platform == 'drm'
+    gl_priv_reqs += 'xcb-dri2 >= 1.8'
+  endif
+endif
 if dep_libdrm.found()
  gl_priv_reqs += 'libdrm >= 2.4.75'
 endif
 if dep_xxf86vm.found()
  gl_priv_reqs += 'xxf86vm'
 endif
-if with_dri_platform == 'drm'
-  gl_priv_reqs += 'xcb-dri2 >= 1.8'
-endif

 gl_priv_libs = []
 if dep_thread.found()
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1,4 +1,4 @@
-# Copyright © 2017 Intel Corporation
+# Copyright © 2017-2018 Intel Corporation

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,8 +20,11 @@

 option(
  'platforms',
-  type : 'string',
-  value : 'auto',
+  type : 'array',
+  value : ['auto'],
+  choices : [
+    '', 'auto', 'x11', 'wayland', 'drm', 'surfaceless', 'haiku', 'android',
+  ],
  description : 'comma separated list of window systems to support. If this is set to auto all platforms applicable to the OS will be enabled.'
 )
 option(
@@ -33,9 +36,10 @@ option(
 )
 option(
  'dri-drivers',
-  type : 'string',
-  value : 'auto',
-  description : 'comma separated list of dri drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
+  type : 'array',
+  value : ['auto'],
+  choices : ['', 'auto', 'i915', 'i965', 'r100', 'r200', 'nouveau', 'swrast'],
+  description : 'List of dri drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
 )
 option(
  'dri-drivers-path',
@@ -51,9 +55,14 @@ option(
 )
 option(
  'gallium-drivers',
-  type : 'string',
-  value : 'auto',
-  description : 'comma separated list of gallium drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
+  type : 'array',
+  value : ['auto'],
+  choices : [
+    '', 'auto', 'pl111', 'radeonsi', 'r300', 'r600', 'nouveau', 'freedreno',
+    'swrast', 'v3d', 'vc4', 'etnaviv', 'imx', 'tegra', 'i915', 'svga', 'virgl',
+    'swr',
+  ],
+  description : 'List of gallium drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
 )
 option(
  'gallium-extra-hud',
@@ -141,9 +150,10 @@ option(
 )
 option(
  'vulkan-drivers',
-  type : 'string',
-  value : 'auto',
-  description : 'comma separated list of vulkan drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
+  type : 'array',
+  value : ['auto'],
+  choices : ['', 'auto', 'amd', 'intel'],
+  description : 'List of vulkan drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
 )
 option(
  'shader-cache',
@@ -214,6 +224,12 @@ option(
  value : true,
  description : 'Build assembly code if possible'
 )
+option(
+   'glx-read-only-text',
+   type : 'boolean',
+   value : false,
+   description : 'Disable writable .text section on x86 (decreases performance)'
+)
 option(
  'llvm',
  type : 'combo',
@@ -248,12 +264,6 @@ option(
  value : false,
  description : 'Build unit tests. Currently this will build *all* unit tests, which may build more than expected.'
 )
-option(
-  'texture-float',
-  type : 'boolean',
-  value : false,
-  description : 'Enable floating point textures and renderbuffers. This option may be patent encumbered, please read docs/patents.txt and consult with your lawyer before turning this on.'
-)
 option(
  'selinux',
  type : 'boolean',
@@ -276,13 +286,22 @@ option(
 )
 option(
  'swr-arches',
-  type : 'string',
-  value : 'avx,avx2',
-  description : 'Comma delemited swr architectures. choices : avx,avx2,knl,skx'
+  type : 'array',
+  value : ['avx', 'avx2'],
+  choices : ['avx', 'avx2', 'knl', 'skx'],
+  description : 'Architectures to build SWR support for.',
 )
 option(
  'tools',
-  type : 'string',
-  value : '',
-  description : 'Comma delimited list of tools to build. choices : freedreno,glsl,intel,nir,nouveau or all'
+  type : 'array',
+  value : [],
+  choices : ['freedreno', 'glsl', 'intel', 'nir', 'nouveau', 'all'],
+  description : 'List of tools to build.',
+)
+option(
+  'power8',
+  type : 'combo',
+  value : 'auto',
+  choices : ['auto', 'true', 'false'],
+  description : 'Enable power8 optimizations.',
 )
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -392,10 +392,6 @@ def generate(env):
        cppdefines += ['PIPE_SUBSYSTEM_WINDOWS_USER']
    if env['embedded']:
        cppdefines += ['PIPE_SUBSYSTEM_EMBEDDED']
-    if env['texture_float']:
-        print('warning: Floating-point textures enabled.')
-        print('warning: Please consult docs/patents.txt with your lawyer before building Mesa.')
-        cppdefines += ['TEXTURE_FLOAT_ENABLED']
    env.Append(CPPDEFINES = cppdefines)

    # C compiler options
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -123,6 +123,10 @@ def generate(env):
                'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF',
                'LLVMBinaryFormat',
            ])
+            if env['platform'] == 'windows' and env['crosscompile']:
+                # LLVM 5.0 requires MinGW w/ pthreads due to use of std::thread and friends.
+                assert env['gcc']
+                env['CXX'] = env['CXX'] + '-posix'
        elif llvm_version >= distutils.version.LooseVersion('4.0'):
            env.Prepend(LIBS = [
                'LLVMX86Disassembler', 'LLVMX86AsmParser',
@@ -211,8 +215,11 @@ def generate(env):
            'imagehlp',
            'psapi',
            'shell32',
-            'advapi32'
+            'advapi32',
+            'ole32',
+            'uuid',
        ])
+
        if env['msvc']:
            # Some of the LLVM C headers use the inline keyword without
            # defining it.
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -95,11 +95,6 @@ if HAVE_GBM
 SUBDIRS += gbm
 endif

-## Optionally required by EGL
-if HAVE_PLATFORM_WAYLAND
-SUBDIRS += egl/wayland/wayland-egl
-endif
-
 if HAVE_EGL
 SUBDIRS += egl
 endif
--- a/src/amd/Makefile.addrlib.am
+++ b/src/amd/Makefile.addrlib.am
@@ -22,6 +22,7 @@
 ADDRLIB_LIBS = addrlib/libamdgpu_addrlib.la

 addrlib_libamdgpu_addrlib_la_CPPFLAGS = \
+	$(DEFINES) \
 	-I$(top_srcdir)/src/ \
 	-I$(srcdir)/common \
 	-I$(srcdir)/addrlib \
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -97,7 +97,6 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
 		       struct amdgpu_gpu_info *amdinfo)
 {
 	struct amdgpu_buffer_size_alignments alignment_info = {};
-	struct amdgpu_heap_info vram, vram_vis, gtt;
 	struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {};
 	struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {};
 	struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {};
@@ -131,26 +130,6 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
 		return false;
 	}

-	r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
-		return false;
-	}
-
-	r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM,
-				AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
-				&vram_vis);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
-		return false;
-	}
-
-	r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
-	if (r) {
-		fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
-		return false;
-	}
-
 	r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma);
 	if (r) {
 		fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
@@ -255,6 +234,60 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
 		return false;
 	}

+	if (info->drm_minor >= 9) {
+		struct drm_amdgpu_memory_info meminfo = {};
+
+		r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
+		if (r) {
+			fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n");
+			return false;
+		}
+
+		/* Note: usable_heap_size values can be random and can't be relied on. */
+		info->gart_size = meminfo.gtt.total_heap_size;
+		info->vram_size = meminfo.vram.total_heap_size;
+		info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size;
+
+		info->max_alloc_size = MAX2(meminfo.vram.max_allocation,
+					    meminfo.gtt.max_allocation);
+	} else {
+		/* This is a deprecated interface, which reports usable sizes
+		 * (total minus pinned), but the pinned size computation is
+		 * buggy, so the values returned from these functions can be
+		 * random.
+		 */
+		struct amdgpu_heap_info vram, vram_vis, gtt;
+
+		r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
+		if (r) {
+			fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
+			return false;
+		}
+
+		r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM,
+					AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
+					&vram_vis);
+		if (r) {
+			fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
+			return false;
+		}
+
+		r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
+		if (r) {
+			fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
+			return false;
+		}
+
+		info->gart_size = gtt.heap_size;
+		info->vram_size = vram.heap_size;
+		info->vram_vis_size = vram_vis.heap_size;
+
+		/* The kernel can split large buffers in VRAM but not in GTT, so large
+		 * allocations can fail or cause buffer movement failures in the kernel.
+		 */
+		info->max_alloc_size = MAX2(info->vram_size * 0.9, info->gart_size * 0.7);
+	}
+
 	/* Set chip identification. */
 	info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
 	info->vce_harvest_config = amdinfo->vce_harvest_config;
@@ -287,15 +320,8 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
 		!(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);

 	/* Set hardware information. */
-	info->gart_size = gtt.heap_size;
-	info->vram_size = vram.heap_size;
-	info->vram_vis_size = vram_vis.heap_size;
 	info->gds_size = gds.gds_total_size;
 	info->gds_gfx_partition_size = gds.gds_gfx_partition_size;
-	/* The kernel can split large buffers in VRAM but not in GTT, so large
-	 * allocations can fail or cause buffer movement failures in the kernel.
-	 */
-	info->max_alloc_size = MIN2(info->vram_size * 0.9, info->gart_size * 0.7);
 	/* convert the shader clock from KHz to MHz */
 	info->max_shader_clock = amdinfo->max_engine_clk / 1000;
 	info->max_se = amdinfo->num_shader_engines;
@@ -316,7 +342,35 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
 	/* TODO: Enable this once the kernel handles it efficiently. */
 	info->has_local_buffers = info->drm_minor >= 20 &&
 				  !info->has_dedicated_vram;
+	info->kernel_flushes_hdp_before_ib = true;
+	info->htile_cmask_support_1d_tiling = true;
+	info->si_TA_CS_BC_BASE_ADDR_allowed = true;
+	info->has_bo_metadata = true;
+	info->has_gpu_reset_status_query = true;
+	info->has_gpu_reset_counter_query = false;
+	info->has_eqaa_surface_allocator = true;
+	info->has_format_bc1_through_bc7 = true;
+	/* DRM 3.1.0 doesn't flush TC for VI correctly. */
+	info->kernel_flushes_tc_l2_after_ib = info->chip_class != VI ||
+					      info->drm_minor >= 2;
+	info->has_indirect_compute_dispatch = true;
+	/* SI doesn't support unaligned loads. */
+	info->has_unaligned_shader_loads = info->chip_class != SI;
+	/* Disable sparse mappings on SI due to VM faults in CP DMA. Enable them once
+	 * these faults are mitigated in software.
+	 * Disable sparse mappings on GFX9 due to hangs.
+	 */
+	info->has_sparse_vm_mappings =
+		info->chip_class >= CIK && info->chip_class <= VI &&
+		info->drm_minor >= 13;
+	info->has_2d_tiling = true;
+	info->has_read_registers_query = true;
+
 	info->num_render_backends = amdinfo->rb_pipes;
+	/* The value returned by the kernel driver was wrong. */
+	if (info->family == CHIP_KAVERI)
+		info->num_render_backends = 2;
+
 	info->clock_crystal_freq = amdinfo->gpu_counter_freq;
 	if (!info->clock_crystal_freq) {
 		fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
@@ -449,7 +503,7 @@ void ac_print_gpu_info(struct radeon_info *info)
 	printf("    vce_fw_version = %u\n", info->vce_fw_version);
 	printf("    vce_harvest_config = %i\n", info->vce_harvest_config);

-	printf("Kernel info:\n");
+	printf("Kernel & winsys capabilities:\n");
 	printf("    drm = %i.%i.%i\n", info->drm_major,
 	       info->drm_minor, info->drm_patchlevel);
 	printf("    has_userptr = %i\n", info->has_userptr);
@@ -458,6 +512,20 @@ void ac_print_gpu_info(struct radeon_info *info)
 	printf("    has_fence_to_handle = %u\n", info->has_fence_to_handle);
 	printf("    has_ctx_priority = %u\n", info->has_ctx_priority);
 	printf("    has_local_buffers = %u\n", info->has_local_buffers);
+	printf("    kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib);
+	printf("    htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling);
+	printf("    si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed);
+	printf("    has_bo_metadata = %u\n", info->has_bo_metadata);
+	printf("    has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query);
+	printf("    has_gpu_reset_counter_query = %u\n", info->has_gpu_reset_counter_query);
+	printf("    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
+	printf("    has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7);
+	printf("    kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib);
+	printf("    has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch);
+	printf("    has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads);
+	printf("    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
+	printf("    has_2d_tiling = %u\n", info->has_2d_tiling);
+	printf("    has_read_registers_query = %u\n", info->has_read_registers_query);

 	printf("Shader core info:\n");
 	printf("    max_shader_clock = %i\n", info->max_shader_clock);
@@ -521,3 +589,235 @@ void ac_print_gpu_info(struct radeon_info *info)
 		       G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
 	}
 }
+
+int
+ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family)
+{
+	if (chip_class >= GFX9)
+		return -1;
+
+	switch (family) {
+	case CHIP_OLAND:
+	case CHIP_HAINAN:
+	case CHIP_KAVERI:
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+	case CHIP_ICELAND:
+	case CHIP_CARRIZO:
+	case CHIP_STONEY:
+		return 16;
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+	case CHIP_VERDE:
+	case CHIP_BONAIRE:
+	case CHIP_HAWAII:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+		return 32;
+	default:
+		unreachable("Unknown GPU");
+	}
+}
+
+void
+ac_get_raster_config(struct radeon_info *info,
+		     uint32_t *raster_config_p,
+		     uint32_t *raster_config_1_p)
+{
+	unsigned raster_config, raster_config_1;
+
+	switch (info->family) {
+	/* 1 SE / 1 RB */
+	case CHIP_HAINAN:
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+	case CHIP_STONEY:
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 1 SE / 4 RBs */
+	case CHIP_VERDE:
+		raster_config = 0x0000124a;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 1 SE / 2 RBs (Oland is special) */
+	case CHIP_OLAND:
+		raster_config = 0x00000082;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 1 SE / 2 RBs */
+	case CHIP_KAVERI:
+	case CHIP_ICELAND:
+	case CHIP_CARRIZO:
+		raster_config = 0x00000002;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 2 SEs / 4 RBs */
+	case CHIP_BONAIRE:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 2 SEs / 8 RBs */
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+		raster_config = 0x2a00126a;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 4 SEs / 8 RBs */
+	case CHIP_TONGA:
+	case CHIP_POLARIS10:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	/* 4 SEs / 16 RBs */
+	case CHIP_HAWAII:
+	case CHIP_FIJI:
+	case CHIP_VEGAM:
+		raster_config = 0x3a00161a;
+		raster_config_1 = 0x0000002e;
+		break;
+	default:
+		fprintf(stderr,
+			"ac: Unknown GPU, using 0 for raster_config\n");
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	}
+
+	/* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
+	 * This decreases performance by up to 50% when the RB is the bottleneck.
+	 */
+	if (info->family == CHIP_KAVERI && info->drm_major == 2)
+		raster_config = 0x00000000;
+
+	/* Fiji: Old kernels have incorrect tiling config. This decreases
+	 * RB performance by 25%. (it disables 1 RB in the second packer)
+	 */
+	if (info->family == CHIP_FIJI &&
+	    info->cik_macrotile_mode_array[0] == 0x000000e8) {
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+	}
+
+	*raster_config_p = raster_config;
+	*raster_config_1_p = raster_config_1;
+}
+
+void
+ac_get_harvested_configs(struct radeon_info *info,
+			 unsigned raster_config,
+			 unsigned *cik_raster_config_1_p,
+			 unsigned *raster_config_se)
+{
+	unsigned sh_per_se = MAX2(info->max_sh_per_se, 1);
+	unsigned num_se = MAX2(info->max_se, 1);
+	unsigned rb_mask = info->enabled_rb_mask;
+	unsigned num_rb = MIN2(info->num_render_backends, 16);
+	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
+	unsigned rb_per_se = num_rb / num_se;
+	unsigned se_mask[4];
+	unsigned se;
+
+	se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
+	se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
+	se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
+	se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
+
+	assert(num_se == 1 || num_se == 2 || num_se == 4);
+	assert(sh_per_se == 1 || sh_per_se == 2);
+	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
+
+
+	if (info->chip_class >= CIK) {
+		unsigned raster_config_1 = *cik_raster_config_1_p;
+		if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
+				     (!se_mask[2] && !se_mask[3]))) {
+			raster_config_1 &= C_028354_SE_PAIR_MAP;
+
+			if (!se_mask[0] && !se_mask[1]) {
+				raster_config_1 |=
+					S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
+			} else {
+				raster_config_1 |=
+					S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
+			}
+			*cik_raster_config_1_p = raster_config_1;
+		}
+	}
+
+	for (se = 0; se < num_se; se++) {
+		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
+		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
+		int idx = (se / 2) * 2;
+
+		raster_config_se[se] = raster_config;
+		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
+			raster_config_se[se] &= C_028350_SE_MAP;
+
+			if (!se_mask[idx]) {
+				raster_config_se[se] |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+			} else {
+				raster_config_se[se] |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+			}
+		}
+
+		pkr0_mask &= rb_mask;
+		pkr1_mask &= rb_mask;
+		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
+			raster_config_se[se] &= C_028350_PKR_MAP;
+
+			if (!pkr0_mask) {
+				raster_config_se[se] |=
+					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
+			} else {
+				raster_config_se[se] |=
+					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
+			}
+		}
+
+		if (rb_per_se >= 2) {
+			unsigned rb0_mask = 1 << (se * rb_per_se);
+			unsigned rb1_mask = rb0_mask << 1;
+
+			rb0_mask &= rb_mask;
+			rb1_mask &= rb_mask;
+			if (!rb0_mask || !rb1_mask) {
+				raster_config_se[se] &= C_028350_RB_MAP_PKR0;
+
+				if (!rb0_mask) {
+					raster_config_se[se] |=
+						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
+				} else {
+					raster_config_se[se] |=
+						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
+				}
+			}
+
+			if (rb_per_se > 2) {
+				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
+				rb1_mask = rb0_mask << 1;
+				rb0_mask &= rb_mask;
+				rb1_mask &= rb_mask;
+				if (!rb0_mask || !rb1_mask) {
+					raster_config_se[se] &= C_028350_RB_MAP_PKR1;
+
+					if (!rb0_mask) {
+						raster_config_se[se] |=
+							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
+					} else {
+						raster_config_se[se] |=
+							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
+					}
+				}
+			}
+		}
+	}
+}
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -86,7 +86,7 @@ struct radeon_info {
 	uint32_t                    vce_fw_version;
 	uint32_t                    vce_harvest_config;

-	/* Kernel info. */
+	/* Kernel & winsys capabilities. */
 	uint32_t                    drm_major; /* version */
 	uint32_t                    drm_minor;
 	uint32_t                    drm_patchlevel;
@@ -96,6 +96,20 @@ struct radeon_info {
 	bool                        has_fence_to_handle;
 	bool                        has_ctx_priority;
 	bool                        has_local_buffers;
+	bool                        kernel_flushes_hdp_before_ib;
+	bool                        htile_cmask_support_1d_tiling;
+	bool                        si_TA_CS_BC_BASE_ADDR_allowed;
+	bool                        has_bo_metadata;
+	bool                        has_gpu_reset_status_query;
+	bool                        has_gpu_reset_counter_query;
+	bool                        has_eqaa_surface_allocator;
+	bool                        has_format_bc1_through_bc7;
+	bool                        kernel_flushes_tc_l2_after_ib;
+	bool                        has_indirect_compute_dispatch;
+	bool                        has_unaligned_shader_loads;
+	bool                        has_sparse_vm_mappings;
+	bool                        has_2d_tiling;
+	bool                        has_read_registers_query;

 	/* Shader cores. */
 	uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
@@ -130,6 +144,29 @@ void ac_compute_driver_uuid(char *uuid, size_t size);

 void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size);
 void ac_print_gpu_info(struct radeon_info *info);
+int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family);
+void ac_get_raster_config(struct radeon_info *info,
+			  uint32_t *raster_config_p,
+			  uint32_t *raster_config_1_p);
+void ac_get_harvested_configs(struct radeon_info *info,
+			      unsigned raster_config,
+			      unsigned *cik_raster_config_1_p,
+			      unsigned *raster_config_se);
+
+static inline unsigned ac_get_max_simd_waves(enum radeon_family family)
+{
+
+	switch (family) {
+	/* These always have 8 waves: */
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+		return 8;
+	default:
+		return 10;
+	}
+}

 #ifdef __cplusplus
 }
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -888,37 +888,36 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 			    bool writeonly_memory,
 			    bool swizzle_enable_hint)
 {
+	/* Split 3 channel stores, because LLVM doesn't support 3-channel
+	 * intrinsics. */
+	if (num_channels == 3) {
+		LLVMValueRef v[3], v01;
+
+		for (int i = 0; i < 3; i++) {
+			v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
+					LLVMConstInt(ctx->i32, i, 0), "");
+		}
+		v01 = ac_build_gather_values(ctx, v, 2);
+
+		ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
+					    soffset, inst_offset, glc, slc,
+					    writeonly_memory, swizzle_enable_hint);
+		ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
+					    soffset, inst_offset + 8,
+					    glc, slc,
+					    writeonly_memory, swizzle_enable_hint);
+		return;
+	}
+
 	/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
 	 * (voffset is swizzled, but soffset isn't swizzled).
 	 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
 	 */
 	if (!swizzle_enable_hint) {
-		/* Split 3 channel stores, becase LLVM doesn't support 3-channel
-		 * intrinsics. */
-		if (num_channels == 3) {
-			LLVMValueRef v[3], v01;
-
-			for (int i = 0; i < 3; i++) {
-				v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
-						LLVMConstInt(ctx->i32, i, 0), "");
-			}
-			v01 = ac_build_gather_values(ctx, v, 2);
-
-			ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
-						    soffset, inst_offset, glc, slc,
-						    writeonly_memory, swizzle_enable_hint);
-			ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
-						    soffset, inst_offset + 8,
-						    glc, slc,
-						    writeonly_memory, swizzle_enable_hint);
-			return;
-		}
-
-		unsigned func = CLAMP(num_channels, 1, 3) - 1;
-		static const char *types[] = {"f32", "v2f32", "v4f32"};
-		char name[256];
 		LLVMValueRef offset = soffset;

+		static const char *types[] = {"f32", "v2f32", "v4f32"};
+
 		if (inst_offset)
 			offset = LLVMBuildAdd(ctx->builder, offset,
 					      LLVMConstInt(ctx->i32, inst_offset, 0), "");
@@ -934,53 +933,46 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 			LLVMConstInt(ctx->i1, slc, 0),
 		};

+		char name[256];
 		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
-			 types[func]);
+			 types[CLAMP(num_channels, 1, 3) - 1]);

 		ac_build_intrinsic(ctx, name, ctx->voidt,
 				   args, ARRAY_SIZE(args),
 				   writeonly_memory ?
-					   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-					   AC_FUNC_ATTR_WRITEONLY);
+				   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+				   AC_FUNC_ATTR_WRITEONLY);
 		return;
 	}

-	static unsigned dfmt[] = {
+	static const unsigned dfmt[] = {
 		V_008F0C_BUF_DATA_FORMAT_32,
 		V_008F0C_BUF_DATA_FORMAT_32_32,
 		V_008F0C_BUF_DATA_FORMAT_32_32_32,
 		V_008F0C_BUF_DATA_FORMAT_32_32_32_32
 	};
-	assert(num_channels >= 1 && num_channels <= 4);
-
+	static const char *types[] = {"i32", "v2i32", "v4i32"};
 	LLVMValueRef args[] = {
-		rsrc,
 		vdata,
-		LLVMConstInt(ctx->i32, num_channels, 0),
-		voffset ? voffset : LLVMGetUndef(ctx->i32),
+		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+		LLVMConstInt(ctx->i32, 0, 0),
+		voffset ? voffset : LLVMConstInt(ctx->i32, 0, 0),
 		soffset,
 		LLVMConstInt(ctx->i32, inst_offset, 0),
 		LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
 		LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
-		LLVMConstInt(ctx->i32, voffset != NULL, 0),
-		LLVMConstInt(ctx->i32, 0, 0), /* idxen */
-		LLVMConstInt(ctx->i32, glc, 0),
-		LLVMConstInt(ctx->i32, slc, 0),
-		LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
+		LLVMConstInt(ctx->i1, glc, 0),
+		LLVMConstInt(ctx->i1, slc, 0),
 	};
-
-	/* The instruction offset field has 12 bits */
-	assert(voffset || inst_offset < (1 << 12));
-
-	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
-	unsigned func = CLAMP(num_channels, 1, 3) - 1;
-	const char *types[] = {"i32", "v2i32", "v4i32"};
 	char name[256];
-	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+	snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+		 types[CLAMP(num_channels, 1, 3) - 1]);

 	ac_build_intrinsic(ctx, name, ctx->voidt,
 			   args, ARRAY_SIZE(args),
-			   AC_FUNC_ATTR_LEGACY);
+			   writeonly_memory ?
+				   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+				   AC_FUNC_ATTR_WRITEONLY);
 }

 static LLVMValueRef
@@ -1178,7 +1170,21 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
 	LLVMValueRef tl, trbl, args[2];
 	LLVMValueRef result;

-	if (ctx->chip_class >= VI) {
+	if (HAVE_LLVM >= 0x0700) {
+		unsigned tl_lanes[4], trbl_lanes[4];
+
+		for (unsigned i = 0; i < 4; ++i) {
+			tl_lanes[i] = i & mask;
+			trbl_lanes[i] = (i & mask) + idx;
+		}
+
+		tl = ac_build_quad_swizzle(ctx, val,
+		                           tl_lanes[0], tl_lanes[1],
+		                           tl_lanes[2], tl_lanes[3]);
+		trbl = ac_build_quad_swizzle(ctx, val,
+		                             trbl_lanes[0], trbl_lanes[1],
+		                             trbl_lanes[2], trbl_lanes[3]);
+	} else if (ctx->chip_class >= VI) {
 		LLVMValueRef thread_id, tl_tid, trbl_tid;
 		thread_id = ac_get_thread_id(ctx);

@@ -1248,6 +1254,13 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
 	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
 	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
 	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+	if (HAVE_LLVM >= 0x0700) {
+		result = ac_build_intrinsic(ctx,
+			"llvm.amdgcn.wqm.f32", ctx->f32,
+			&result, 1, 0);
+	}
+
 	return result;
 }

@@ -1367,66 +1380,41 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,

 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
-	if (HAVE_LLVM >= 0x0500) {
-		return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
-				     ctx->f32_1);
-	}
-
-	LLVMValueRef args[3] = {
-		value,
-		LLVMConstReal(ctx->f32, 0),
-		LLVMConstReal(ctx->f32, 1),
-	};
-
-	return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+	return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
+			     ctx->f32_1);
 }

 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
 {
 	LLVMValueRef args[9];

-	if (HAVE_LLVM >= 0x0500) {
-		args[0] = LLVMConstInt(ctx->i32, a->target, 0);
-		args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
+	args[0] = LLVMConstInt(ctx->i32, a->target, 0);
+	args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);

-		if (a->compr) {
-			LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
-			LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
+	if (a->compr) {
+		LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
+		LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);

-			args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
-						   v2i16, "");
-			args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
-						   v2i16, "");
-			args[4] = LLVMConstInt(ctx->i1, a->done, 0);
-			args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+		args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
+				v2i16, "");
+		args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
+				v2i16, "");
+		args[4] = LLVMConstInt(ctx->i1, a->done, 0);
+		args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

-			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
-					   ctx->voidt, args, 6, 0);
-		} else {
-			args[2] = a->out[0];
-			args[3] = a->out[1];
-			args[4] = a->out[2];
-			args[5] = a->out[3];
-			args[6] = LLVMConstInt(ctx->i1, a->done, 0);
-			args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+		ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
+				   ctx->voidt, args, 6, 0);
+	} else {
+		args[2] = a->out[0];
+		args[3] = a->out[1];
+		args[4] = a->out[2];
+		args[5] = a->out[3];
+		args[6] = LLVMConstInt(ctx->i1, a->done, 0);
+		args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

-			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
-					   ctx->voidt, args, 8, 0);
-		}
-		return;
+		ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
+				   ctx->voidt, args, 8, 0);
 	}
-
-	args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
-	args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
-	args[2] = LLVMConstInt(ctx->i32, a->done, 0);
-	args[3] = LLVMConstInt(ctx->i32, a->target, 0);
-	args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
-	memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
-
-	ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
-			   AC_FUNC_ATTR_LEGACY);
 }

 void ac_build_export_null(struct ac_llvm_context *ctx)
@@ -1485,8 +1473,26 @@ static unsigned ac_num_derivs(enum ac_image_dim dim)
 	}
 }

-LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
-				   struct ac_image_args *a)
+static const char *get_atomic_name(enum ac_atomic_op op)
+{
+	switch (op) {
+	case ac_atomic_swap: return "swap";
+	case ac_atomic_add: return "add";
+	case ac_atomic_sub: return "sub";
+	case ac_atomic_smin: return "smin";
+	case ac_atomic_umin: return "umin";
+	case ac_atomic_smax: return "smax";
+	case ac_atomic_umax: return "umax";
+	case ac_atomic_and: return "and";
+	case ac_atomic_or: return "or";
+	case ac_atomic_xor: return "xor";
+	}
+	unreachable("bad atomic op");
+}
+
+/* LLVM 6 and older */
+static LLVMValueRef ac_build_image_opcode_llvm6(struct ac_llvm_context *ctx,
+						struct ac_image_args *a)
 {
 	LLVMValueRef args[16];
 	LLVMTypeRef retty = ctx->v4f32;
@@ -1494,16 +1500,6 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
 	const char *atomic_subop = "";
 	char intr_name[128], coords_type[64];

-	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
-	       !a->level_zero);
-	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
-		a->opcode != ac_image_store_mip) ||
-	       a->lod);
-	assert((a->bias ? 1 : 0) +
-	       (a->lod ? 1 : 0) +
-	       (a->level_zero ? 1 : 0) +
-	       (a->derivs[0] ? 1 : 0) <= 1);
-
 	bool sample = a->opcode == ac_image_sample ||
 		      a->opcode == ac_image_gather4 ||
 		      a->opcode == ac_image_get_lod;
@@ -1521,6 +1517,20 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
 	LLVMValueRef addr;
 	unsigned num_addr = 0;

+	if (a->opcode == ac_image_get_lod) {
+		switch (a->dim) {
+		case ac_image_1darray:
+			num_coords = 1;
+			break;
+		case ac_image_2darray:
+		case ac_image_cube:
+			num_coords = 2;
+			break;
+		default:
+			break;
+		}
+	}
+
 	if (a->offset)
 		args[num_addr++] = ac_to_integer(ctx, a->offset);
 	if (a->bias)
@@ -1601,18 +1611,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
 		if (a->opcode == ac_image_atomic_cmpswap) {
 			atomic_subop = "cmpswap";
 		} else {
-			switch (a->atomic) {
-			case ac_atomic_swap: atomic_subop = "swap"; break;
-			case ac_atomic_add: atomic_subop = "add"; break;
-			case ac_atomic_sub: atomic_subop = "sub"; break;
-			case ac_atomic_smin: atomic_subop = "smin"; break;
-			case ac_atomic_umin: atomic_subop = "umin"; break;
-			case ac_atomic_smax: atomic_subop = "smax"; break;
-			case ac_atomic_umax: atomic_subop = "umax"; break;
-			case ac_atomic_and: atomic_subop = "and"; break;
-			case ac_atomic_or: atomic_subop = "or"; break;
-			case ac_atomic_xor: atomic_subop = "xor"; break;
-			}
+			atomic_subop = get_atomic_name(a->atomic);
 		}
 		break;
 	case ac_image_get_lod:
@@ -1656,22 +1655,175 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
 	return result;
 }

+LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
+				   struct ac_image_args *a)
+{ /* Build one llvm.amdgcn.image.* intrinsic call from the fields of *a. */
+	const char *overload[3] = { "", "", "" }; /* type suffixes appended to the intrinsic name */
+	unsigned num_overloads = 0;
+	LLVMValueRef args[18];
+	unsigned num_args = 0;
+	enum ac_image_dim dim = a->dim; /* local copy; may be reduced for get_lod below */
+
+	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
+	       !a->level_zero); /* with level_zero, lod must be absent or constant zero */
+	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
+		a->opcode != ac_image_store_mip) ||
+	       a->lod); /* get_resinfo, load_mip and store_mip require a lod */
+	assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+	       (!a->compare && !a->offset)); /* compare/offset only with sample or gather4 */
+	assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+		a->opcode == ac_image_get_lod) ||
+	       !a->bias); /* bias only with sample, gather4 or get_lod */
+	assert((a->bias ? 1 : 0) +
+	       (a->lod ? 1 : 0) +
+	       (a->level_zero ? 1 : 0) +
+	       (a->derivs[0] ? 1 : 0) <= 1); /* at most one of bias, lod, level_zero, derivs */
+
+	if (HAVE_LLVM < 0x0700) /* dispatch happens after the asserts, so they cover both paths */
+		return ac_build_image_opcode_llvm6(ctx, a);
+
+	if (a->opcode == ac_image_get_lod) { /* get_lod: 1darray -> 1d, 2darray/cube -> 2d */
+		switch (dim) {
+		case ac_image_1darray:
+			dim = ac_image_1d;
+			break;
+		case ac_image_2darray:
+		case ac_image_cube:
+			dim = ac_image_2d;
+			break;
+		default:
+			break;
+		}
+	}
+
+	bool sample = a->opcode == ac_image_sample ||
+		      a->opcode == ac_image_gather4 ||
+		      a->opcode == ac_image_get_lod;
+	bool atomic = a->opcode == ac_image_atomic ||
+		      a->opcode == ac_image_atomic_cmpswap;
+	LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32; /* sample-type opcodes take f32 coordinates, all others i32 */
+
+	if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { /* data operands go first */
+		args[num_args++] = a->data[0];
+		if (a->opcode == ac_image_atomic_cmpswap)
+			args[num_args++] = a->data[1];
+	}
+
+	if (!atomic) /* atomics get no dmask operand */
+		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
+
+	if (a->offset)
+		args[num_args++] = ac_to_integer(ctx, a->offset);
+	if (a->bias) {
+		args[num_args++] = ac_to_float(ctx, a->bias);
+		overload[num_overloads++] = ".f32"; /* bias type overload */
+	}
+	if (a->compare)
+		args[num_args++] = ac_to_float(ctx, a->compare);
+	if (a->derivs[0]) {
+		unsigned count = ac_num_derivs(dim);
+		for (unsigned i = 0; i < count; ++i)
+			args[num_args++] = ac_to_float(ctx, a->derivs[i]);
+		overload[num_overloads++] = ".f32"; /* derivative type overload */
+	}
+	unsigned num_coords =
+		a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; /* get_resinfo passes no coordinates */
+	for (unsigned i = 0; i < num_coords; ++i)
+		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
+	if (a->lod)
+		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+	overload[num_overloads++] = sample ? ".f32" : ".i32"; /* coordinate type overload, always emitted */
+
+	args[num_args++] = a->resource;
+	if (sample) { /* sampler and unorm operands exist only for sample-type opcodes */
+		args[num_args++] = a->sampler;
+		args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
+	}
+
+	args[num_args++] = ctx->i32_0; /* texfailctrl */
+	args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
+
+	const char *name;
+	const char *atomic_subop = "";
+	switch (a->opcode) {
+	case ac_image_sample: name = "sample"; break;
+	case ac_image_gather4: name = "gather4"; break;
+	case ac_image_load: name = "load"; break;
+	case ac_image_load_mip: name = "load.mip"; break;
+	case ac_image_store: name = "store"; break;
+	case ac_image_store_mip: name = "store.mip"; break;
+	case ac_image_atomic:
+		name = "atomic.";
+		atomic_subop = get_atomic_name(a->atomic);
+		break;
+	case ac_image_atomic_cmpswap:
+		name = "atomic.";
+		atomic_subop = "cmpswap";
+		break;
+	case ac_image_get_lod: name = "getlod"; break;
+	case ac_image_get_resinfo: name = "getresinfo"; break;
+	default: unreachable("invalid image opcode");
+	}
+
+	const char *dimname;
+	switch (dim) {
+	case ac_image_1d: dimname = "1d"; break;
+	case ac_image_2d: dimname = "2d"; break;
+	case ac_image_3d: dimname = "3d"; break;
+	case ac_image_cube: dimname = "cube"; break;
+	case ac_image_1darray: dimname = "1darray"; break;
+	case ac_image_2darray: dimname = "2darray"; break;
+	case ac_image_2dmsaa: dimname = "2dmsaa"; break;
+	case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
+	default: unreachable("invalid dim");
+	}
+
+	bool lod_suffix =
+		a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); /* ".l" only for sample/gather4 with a lod */
+	char intr_name[96];
+	snprintf(intr_name, sizeof(intr_name),
+		 "llvm.amdgcn.image.%s%s" /* base name */
+		 "%s%s%s" /* sample/gather modifiers */
+		 ".%s.%s%s%s%s", /* dimension and type overloads */
+		 name, atomic_subop,
+		 a->compare ? ".c" : "",
+		 a->bias ? ".b" :
+		 lod_suffix ? ".l" :
+		 a->derivs[0] ? ".d" :
+		 a->level_zero ? ".lz" : "",
+		 a->offset ? ".o" : "",
+		 dimname,
+		 atomic ? "i32" : "v4f32", /* first type overload: i32 for atomics, v4f32 otherwise (stores included) */
+		 overload[0], overload[1], overload[2]);
+
+	LLVMTypeRef retty;
+	if (atomic)
+		retty = ctx->i32;
+	else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+		retty = ctx->voidt;
+	else
+		retty = ctx->v4f32;
+
+	LLVMValueRef result =
+		ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+				   a->attributes);
+	if (!sample && retty == ctx->v4f32) { /* load, load_mip and get_resinfo results are handed back as v4i32 */
+		result = LLVMBuildBitCast(ctx->builder, result,
+					  ctx->v4i32, "");
+	}
+	return result;
+}
+
 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
 				    LLVMValueRef args[2])
 {
-	if (HAVE_LLVM >= 0x0500) {
-		LLVMTypeRef v2f16 =
-			LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
-		LLVMValueRef res =
-			ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
-					   v2f16, args, 2,
-					   AC_FUNC_ATTR_READNONE);
-		return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
-	}
-
-	return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+	LLVMTypeRef v2f16 =
+		LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2); /* two-element half-float vector: the intrinsic's return type */
+	LLVMValueRef res =
+		ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
+				   v2f16, args, 2,
+				   AC_FUNC_ATTR_READNONE);
+	return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); /* hand the v2f16 pair back as one i32 */
 }

 /* Upper 16 bits must be zero. */
@@ -1855,20 +2007,11 @@ LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
 		width,
 	};

-	if (HAVE_LLVM >= 0x0500) {
-		return ac_build_intrinsic(ctx,
-					  is_signed ? "llvm.amdgcn.sbfe.i32" :
-						      "llvm.amdgcn.ubfe.i32",
-					  ctx->i32, args, 3,
-					  AC_FUNC_ATTR_READNONE);
-	}
-
 	return ac_build_intrinsic(ctx,
-				  is_signed ? "llvm.AMDGPU.bfe.i32" :
-					      "llvm.AMDGPU.bfe.u32",
+				  is_signed ? "llvm.amdgcn.sbfe.i32" :
+					      "llvm.amdgcn.ubfe.i32",
 				  ctx->i32, args, 3,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+				  AC_FUNC_ATTR_READNONE);
 }

 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
@@ -1948,9 +2091,9 @@ LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
 	return val;
 }

-#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define AC_EXP_ENABLED_CHANNELS (HAVE_LLVM >= 0x0500 ? 1 : 0)
-#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+#define AC_EXP_TARGET		0
+#define AC_EXP_ENABLED_CHANNELS 1
+#define AC_EXP_OUT0		2

 enum ac_ir_type {
 	AC_IR_UNDEF,
@@ -2604,11 +2747,13 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
 	final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
 				    LLVMConstInt(ac->i32, 4, 0), "");
 	final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
+	/* Mask the sample index by 0x7, because 0x8 means an unknown value
+	 * with EQAA, so those will map to 0. */
 	final_sample = LLVMBuildAnd(ac->builder, final_sample,
-				    LLVMConstInt(ac->i32, 0xF, 0), "");
+				    LLVMConstInt(ac->i32, 0x7, 0), "");

 	/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
-	 * resource descriptor is 0 (invalid),
+	 * resource descriptor is 0 (invalid).
 	 */
 	LLVMValueRef tmp;
 	tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
@@ -2852,7 +2997,7 @@ static LLVMValueRef
 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
 		      LLVMValueRef inactive)
 {
-	char name[32], type[8];
+	char name[33], type[8];
 	LLVMTypeRef src_type = LLVMTypeOf(src);
 	src = ac_to_integer(ctx, src);
 	inactive = ac_to_integer(ctx, inactive);
--- a/src/amd/common/ac_llvm_helper.cpp
+++ b/src/amd/common/ac_llvm_helper.cpp
@@ -37,22 +37,10 @@
 #include <llvm/IR/CallSite.h>
 #include <llvm/IR/IRBuilder.h>

-#if HAVE_LLVM < 0x0500
-namespace llvm {
-typedef AttributeSet AttributeList;
-}
-#endif
-
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
 {
   llvm::Argument *A = llvm::unwrap<llvm::Argument>(val);
-#if HAVE_LLVM < 0x0500
-   llvm::AttrBuilder B;
-   B.addDereferenceableAttr(bytes);
-   A->addAttr(llvm::AttributeList::get(A->getContext(), A->getArgNo() + 1,  B));
-#else
   A->addAttr(llvm::Attribute::getWithDereferenceableBytes(A->getContext(), bytes));
-#endif
 }

 bool ac_is_sgpr_param(LLVMValueRef arg)
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -115,15 +115,19 @@ const char *ac_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_VEGAM:
 		return "polaris11";
 	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_RAVEN:
 		return "gfx900";
+	case CHIP_RAVEN:
+		return "gfx902";
+	case CHIP_VEGA12:
+		return HAVE_LLVM >= 0x0700 ? "gfx904" : "gfx902";
 	default:
 		return "";
 	}
 }

-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options)
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
+					      enum ac_target_machine_options tm_options,
+					      const char **out_triple)
 {
 	assert(family >= CHIP_TAHITI);
 	char features[256];
@@ -146,6 +150,8 @@ LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac
 	                             LLVMRelocDefault,
 	                             LLVMCodeModelDefault);

+	if (out_triple)
+		*out_triple = triple;
 	return tm;
 }

--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -68,7 +68,9 @@ enum ac_float_mode {
 };

 const char *ac_get_llvm_processor_name(enum radeon_family family);
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options);
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
+					      enum ac_target_machine_options tm_options,
+					      const char **out_triple);

 LLVMTargetRef ac_get_llvm_target(const char *triple);
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -87,7 +87,6 @@ get_ac_sampler_dim(const struct ac_llvm_context *ctx, enum glsl_sampler_dim dim,
 		return is_array ? ac_image_1darray : ac_image_1d;
 	case GLSL_SAMPLER_DIM_2D:
 	case GLSL_SAMPLER_DIM_RECT:
-	case GLSL_SAMPLER_DIM_SUBPASS:
 	case GLSL_SAMPLER_DIM_EXTERNAL:
 		return is_array ? ac_image_2darray : ac_image_2d;
 	case GLSL_SAMPLER_DIM_3D:
@@ -95,8 +94,11 @@ get_ac_sampler_dim(const struct ac_llvm_context *ctx, enum glsl_sampler_dim dim,
 	case GLSL_SAMPLER_DIM_CUBE:
 		return ac_image_cube;
 	case GLSL_SAMPLER_DIM_MS:
-	case GLSL_SAMPLER_DIM_SUBPASS_MS:
 		return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
+	case GLSL_SAMPLER_DIM_SUBPASS:
+		return ac_image_2darray;
+	case GLSL_SAMPLER_DIM_SUBPASS_MS:
+		return ac_image_2darraymsaa;
 	default:
 		unreachable("bad sampler dim");
 	}
@@ -1307,6 +1309,14 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
 		}
 	}

+	/* Fixup for GFX9 which allocates 1D textures as 2D. */
+	if (instr->op == nir_texop_lod && ctx->ac.chip_class >= GFX9) {
+		if ((args->dim == ac_image_2darray ||
+		     args->dim == ac_image_2d) && !args->coords[1]) {
+			args->coords[1] = ctx->ac.i32_0;
+		}
+	}
+
 	args->attributes = AC_FUNC_ATTR_READNONE;
 	return ac_build_image_opcode(&ctx->ac, args);
 }
@@ -1562,6 +1572,11 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
 		        LLVMConstInt(ctx->ac.i32, 6, false), LLVMConstInt(ctx->ac.i32, 7, false)
 		};

+		if (num_components == 6) {
+			/* we end up with a v4f32 and v2f32 but shuffle fails on that */
+			results[1] = ac_build_expand_to_vec4(&ctx->ac, results[1], 4);
+		}
+
 		LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
 		ret = LLVMBuildShuffleVector(ctx->ac.builder, results[0],
 					     results[num_components > 4 ? 1 : 0], swizzle, "");
@@ -2090,18 +2105,6 @@ static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx,
 	return sample_index;
 }

-static bool
-glsl_is_array_image(const struct glsl_type *type)
-{
-	const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
-
-	if (glsl_sampler_type_is_array(type))
-		return true;
-
-	return dim == GLSL_SAMPLER_DIM_SUBPASS ||
-	       dim == GLSL_SAMPLER_DIM_SUBPASS_MS;
-}
-
 static void get_image_coords(struct ac_nir_context *ctx,
 			     const nir_intrinsic_instr *instr,
 			     struct ac_image_args *args)
@@ -2247,7 +2250,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
 		args.resource = get_sampler_desc(ctx, instr->variables[0],
 						 AC_DESC_IMAGE, NULL, true, false);
 		args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
-					    glsl_is_array_image(type));
+					    glsl_sampler_type_is_array(type));
 		args.dmask = 15;
 		args.attributes = AC_FUNC_ATTR_READONLY;
 		if (var->data.image._volatile || var->data.image.coherent)
@@ -2290,7 +2293,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
 		args.resource = get_sampler_desc(ctx, instr->variables[0],
 						 AC_DESC_IMAGE, NULL, true, false);
 		args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
-					    glsl_is_array_image(type));
+					    glsl_sampler_type_is_array(type));
 		args.dmask = 15;
 		if (force_glc || var->data.image._volatile || var->data.image.coherent)
 			args.cache_policy |= ac_glc;
@@ -2381,7 +2384,7 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
 		args.resource = get_sampler_desc(ctx, instr->variables[0],
 						 AC_DESC_IMAGE, NULL, true, false);
 		args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
-					    glsl_is_array_image(type));
+					    glsl_sampler_type_is_array(type));

 		return ac_build_image_opcode(&ctx->ac, &args);
 	}
@@ -3397,6 +3400,13 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 	}

 	/* Texture coordinates fixups */
+	if (instr->coord_components > 1 &&
+	    instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
+	    instr->is_array &&
+	    instr->op != nir_texop_txf) {
+		args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]);
+	}
+
 	if (instr->coord_components > 2 &&
 	    (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
 	     instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -227,8 +227,16 @@ ADDR_HANDLE amdgpu_addr_create(const struct radeon_info *info,
 	return addrCreateOutput.hLib;
 }

-static int surf_config_sanity(const struct ac_surf_config *config)
+static int surf_config_sanity(const struct ac_surf_config *config,
+			      unsigned flags)
 {
+	/* FMASK is allocated together with the color surface and can't be
+	 * allocated separately.
+	 */
+	assert(!(flags & RADEON_SURF_FMASK));
+	if (flags & RADEON_SURF_FMASK)
+		return -EINVAL;
+
 	/* all dimension must be at least 1 ! */
 	if (!config->info.width || !config->info.height || !config->info.depth ||
 	    !config->info.array_size || !config->info.levels)
@@ -241,10 +249,27 @@ static int surf_config_sanity(const struct ac_surf_config *config)
 	case 4:
 	case 8:
 		break;
+	case 16:
+		if (flags & RADEON_SURF_Z_OR_SBUFFER)
+			return -EINVAL;
+		break;
 	default:
 		return -EINVAL;
 	}

+	if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) {
+		switch (config->info.color_samples) {
+		case 0:
+		case 1:
+		case 2:
+		case 4:
+		case 8:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
 	if (config->is_3d && config->info.array_size > 1)
 		return -EINVAL;
 	if (config->is_cube && config->info.depth > 1)
@@ -276,10 +301,10 @@ static int gfx6_compute_level(ADDR_HANDLE addrlib,
 	 */
 	if (config->info.levels == 1 &&
 	    AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED &&
-	    AddrSurfInfoIn->bpp) {
+	    AddrSurfInfoIn->bpp &&
+	    util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp)) {
 		unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8);

-		assert(util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp));
 		AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment);
 	}

@@ -343,6 +368,9 @@ static int gfx6_compute_level(ADDR_HANDLE addrlib,
 	/* The previous level's flag tells us if we can use DCC for this level. */
 	if (AddrSurfInfoIn->flags.dccCompatible &&
 	    (level == 0 || AddrDccOut->subLvlCompressible)) {
+		bool prev_level_clearable = level == 0 ||
+					    AddrDccOut->dccRamSizeAligned;
+
 		AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
 		AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
 		AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
@@ -355,10 +383,26 @@ static int gfx6_compute_level(ADDR_HANDLE addrlib,

 		if (ret == ADDR_OK) {
 			surf_level->dcc_offset = surf->dcc_size;
-			surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
 			surf->num_dcc_levels = level + 1;
 			surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
 			surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
+
+			/* If the DCC size of a subresource (1 mip level or 1 slice)
+			 * is not aligned, the DCC memory layout is not contiguous for
+			 * that subresource, which means we can't use fast clear.
+			 *
+			 * We only do fast clears for whole mipmap levels. If we did
+			 * per-slice fast clears, the same restriction would apply.
+			 * (i.e. only compute the slice size and see if it's aligned)
+			 *
+			 * The last level can be non-contiguous and still be clearable
+			 * if it's interleaved with the next level that doesn't exist.
+			 */
+			if (AddrDccOut->dccRamSizeAligned ||
+			    (prev_level_clearable && level == config->info.levels - 1))
+				surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
+			else
+				surf_level->dcc_fast_clear_size = 0;
 		}
 	}

@@ -426,7 +470,6 @@ static bool get_display_flag(const struct ac_surf_config *config,
 	unsigned bpe = surf->bpe;

 	if (surf->flags & RADEON_SURF_SCANOUT &&
-	    !(surf->flags & RADEON_SURF_FMASK) &&
 	    config->info.samples <= 1 &&
 	    surf->blk_w <= 2 && surf->blk_h == 1) {
 		/* subsampled */
@@ -537,9 +580,8 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,

 	compressed = surf->blk_w == 4 && surf->blk_h == 4;

-	/* MSAA and FMASK require 2D tiling. */
-	if (config->info.samples > 1 ||
-	    (surf->flags & RADEON_SURF_FMASK))
+	/* MSAA requires 2D tiling. */
+	if (config->info.samples > 1)
 		mode = RADEON_SURF_MODE_2D;

 	/* DB doesn't support linear layouts. */
@@ -582,13 +624,18 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 	}

 	AddrDccIn.numSamples = AddrSurfInfoIn.numSamples =
-		config->info.samples ? config->info.samples : 1;
+		MAX2(1, config->info.samples);
 	AddrSurfInfoIn.tileIndex = -1;

+	if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) {
+		AddrDccIn.numSamples = AddrSurfInfoIn.numFrags =
+			MAX2(1, config->info.color_samples);
+	}
+
 	/* Set the micro tile type. */
 	if (surf->flags & RADEON_SURF_SCANOUT)
 		AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
-	else if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK))
+	else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
 		AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
 	else
 		AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
@@ -596,7 +643,6 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 	AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
 	AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
 	AddrSurfInfoIn.flags.cube = config->is_cube;
-	AddrSurfInfoIn.flags.fmask = (surf->flags & RADEON_SURF_FMASK) != 0;
 	AddrSurfInfoIn.flags.display = get_display_flag(config, surf);
 	AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1;
 	AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
@@ -624,7 +670,7 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 		 config->info.levels == 1);

 	AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0;
-	AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth;
+	AddrSurfInfoIn.flags.compressZ = !!(surf->flags & RADEON_SURF_Z_OR_SBUFFER);

 	/* On CI/VI, the DB uses the same pitch and tile mode (except tilesplit)
 	 * for Z and stencil. This can cause a number of problems which we work
@@ -661,8 +707,6 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 	if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
 	    surf->u.legacy.bankw && surf->u.legacy.bankh &&
 	    surf->u.legacy.mtilea && surf->u.legacy.tile_split) {
-		assert(!(surf->flags & RADEON_SURF_FMASK));
-
 		/* If any of these parameters are incorrect, the calculation
 		 * will fail. */
 		AddrTileInfoIn.banks = surf->u.legacy.num_banks;
@@ -809,6 +853,67 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 		}
 	}

+	/* Compute FMASK (only for color surfaces with at least 2 samples). */
+	if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color) {
+		ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0};
+		ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
+		ADDR_TILEINFO fmask_tile_info = {0}; /* storage for fout.pTileInfo */
+
+		fin.size = sizeof(fin);
+		fout.size = sizeof(fout);
+
+		fin.tileMode = AddrSurfInfoOut.tileMode; /* FMASK reuses the color surface's tile mode and pitch */
+		fin.pitch = AddrSurfInfoOut.pitch;
+		fin.height = config->info.height;
+		fin.numSlices = AddrSurfInfoIn.numSlices;
+		fin.numSamples = AddrSurfInfoIn.numSamples;
+		fin.numFrags = AddrSurfInfoIn.numFrags;
+		fin.tileIndex = -1;
+		fout.pTileInfo = &fmask_tile_info;
+
+		r = AddrComputeFmaskInfo(addrlib, &fin, &fout);
+		if (r)
+			return r;
+
+		surf->fmask_size = fout.fmaskBytes;
+		surf->fmask_alignment = fout.baseAlign;
+		surf->fmask_tile_swizzle = 0; /* default; replaced below when a swizzle is computed */
+
+		surf->u.legacy.fmask.slice_tile_max =
+			(fout.pitch * fout.height) / 64;
+		if (surf->u.legacy.fmask.slice_tile_max) /* stored minus one, but never below 0 */
+			surf->u.legacy.fmask.slice_tile_max -= 1;
+
+		surf->u.legacy.fmask.tiling_index = fout.tileIndex;
+		surf->u.legacy.fmask.bankh = fout.pTileInfo->bankHeight;
+		surf->u.legacy.fmask.pitch_in_pixels = fout.pitch;
+
+		/* Compute tile swizzle for FMASK. */
+		if (config->info.fmask_surf_index &&
+		    !(surf->flags & RADEON_SURF_SHAREABLE)) {
+			ADDR_COMPUTE_BASE_SWIZZLE_INPUT xin = {0};
+			ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT xout = {0};
+
+			xin.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT);
+			xout.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT);
+
+			/* This counter starts from 1 instead of 0. */
+			xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index);
+			xin.tileIndex = fout.tileIndex;
+			xin.macroModeIndex = fout.macroModeIndex;
+			xin.pTileInfo = fout.pTileInfo;
+			xin.tileMode = fin.tileMode;
+
+			r = AddrComputeBaseSwizzle(addrlib, &xin, &xout); /* reuse the enclosing r instead of shadowing it */
+			if (r != ADDR_OK)
+				return r;
+
+			assert(xout.tileSwizzle <=
+			       u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8)); /* must fit the destination field */
+			surf->fmask_tile_swizzle = xout.tileSwizzle;
+		}
+	}
+
 	/* Recalculate the whole DCC miptree size including disabled levels.
 	 * This is what addrlib does, but calling addrlib would be a lot more
 	 * complicated.
@@ -829,8 +934,17 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 	/* Make sure HTILE covers the whole miptree, because the shader reads
 	 * TC-compatible HTILE even for levels where it's disabled by DB.
 	 */
-	if (surf->htile_size && config->info.levels > 1)
-		surf->htile_size *= 2;
+	if (surf->htile_size && config->info.levels > 1 &&
+	    surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) {
+		/* MSAA can't occur with levels > 1, so ignore the sample count. */
+		const unsigned total_pixels = surf->surf_size / surf->bpe;
+		const unsigned htile_block_size = 8 * 8;
+		const unsigned htile_element_size = 4;
+
+		surf->htile_size = (total_pixels / htile_block_size) *
+				   htile_element_size;
+		surf->htile_size = align(surf->htile_size, surf->htile_alignment);
+	}

 	surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
 	surf->is_displayable = surf->is_linear ||
@@ -1095,8 +1209,8 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,

 			surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode;
 			surf->u.gfx9.fmask.epitch = fout.pitch - 1;
-			surf->u.gfx9.fmask_size = fout.fmaskBytes;
-			surf->u.gfx9.fmask_alignment = fout.baseAlign;
+			surf->fmask_size = fout.fmaskBytes;
+			surf->fmask_alignment = fout.baseAlign;

 			/* Compute tile swizzle for the FMASK surface. */
 			if (config->info.fmask_surf_index &&
@@ -1122,8 +1236,8 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
 					return ret;

 				assert(xout.pipeBankXor <=
-				       u_bit_consecutive(0, sizeof(surf->u.gfx9.fmask_tile_swizzle) * 8));
-				surf->u.gfx9.fmask_tile_swizzle = xout.pipeBankXor;
+				       u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8));
+				surf->fmask_tile_swizzle = xout.pipeBankXor;
 			}
 		}

@@ -1135,7 +1249,7 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
 			cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT);
 			cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT);

-			if (in->numSamples) {
+			if (in->numSamples > 1) {
 				/* FMASK is always aligned. */
 				cin.cMaskFlags.pipeAligned = 1;
 				cin.cMaskFlags.rbAligned = 1;
@@ -1178,8 +1292,6 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 	ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
 	int r;

-	assert(!(surf->flags & RADEON_SURF_FMASK));
-
 	AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);

 	compressed = surf->blk_w == 4 && surf->blk_h == 4;
@@ -1217,6 +1329,10 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 			assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
 			AddrSurfInfoIn.format = ADDR_FMT_32_32;
 			break;
+		case 12:
+			assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+			AddrSurfInfoIn.format = ADDR_FMT_32_32_32;
+			break;
 		case 16:
 			assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
 			AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32;
@@ -1236,9 +1352,12 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 	AddrSurfInfoIn.flags.opt4space = 1;

 	AddrSurfInfoIn.numMipLevels = config->info.levels;
-	AddrSurfInfoIn.numSamples = config->info.samples ? config->info.samples : 1;
+	AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
 	AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples;

+	if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER))
+		AddrSurfInfoIn.numFrags = MAX2(1, config->info.color_samples);
+
 	/* GFX9 doesn't support 1D depth textures, so allocate all 1D textures
 	 * as 2D to avoid having shader variants for 1D vs 2D, so all shaders
 	 * must sample 1D textures as 2D. */
@@ -1291,12 +1410,12 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,

 	surf->num_dcc_levels = 0;
 	surf->surf_size = 0;
+	surf->fmask_size = 0;
 	surf->dcc_size = 0;
 	surf->htile_size = 0;
 	surf->htile_slice_size = 0;
 	surf->u.gfx9.surf_offset = 0;
 	surf->u.gfx9.stencil_offset = 0;
-	surf->u.gfx9.fmask_size = 0;
 	surf->u.gfx9.cmask_size = 0;

 	/* Calculate texture layout information. */
@@ -1391,7 +1510,7 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,

 	/* Temporary workaround to prevent VM faults and hangs. */
 	if (info->family == CHIP_VEGA12)
-		surf->u.gfx9.fmask_size *= 8;
+		surf->fmask_size *= 8;

 	return 0;
 }
@@ -1403,7 +1522,7 @@ int ac_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info,
 {
 	int r;

-	r = surf_config_sanity(config);
+	r = surf_config_sanity(config, surf->flags);
 	if (r)
 		return r;

--- a/src/amd/common/ac_surface.h
+++ b/src/amd/common/ac_surface.h
@@ -79,6 +79,13 @@ struct legacy_surf_level {
    enum radeon_surf_mode       mode:2;
 };

+struct legacy_surf_fmask { /* FMASK parameters, embedded in struct legacy_surf_layout */
+    unsigned slice_tile_max; /* max 4M */
+    uint8_t tiling_index;    /* max 31 */
+    uint8_t bankh;           /* max 8 */
+    uint16_t pitch_in_pixels; /* FMASK pitch as reported by addrlib */
+};
+
 struct legacy_surf_layout {
    unsigned                    bankw:4;  /* max 8 */
    unsigned                    bankh:4;  /* max 8 */
@@ -101,6 +108,7 @@ struct legacy_surf_layout {
    struct legacy_surf_level    stencil_level[RADEON_SURF_MAX_LEVELS];
    uint8_t                     tiling_index[RADEON_SURF_MAX_LEVELS];
    uint8_t                     stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
+    struct legacy_surf_fmask    fmask;
 };

 /* Same as addrlib - AddrResourceType. */
@@ -142,13 +150,9 @@ struct gfx9_surf_layout {
    uint16_t                    dcc_pitch_max;  /* (mip chain pitch - 1) */

    uint64_t                    stencil_offset; /* separate stencil */
-    uint64_t                    fmask_size;
    uint64_t                    cmask_size;

-    uint32_t                    fmask_alignment;
    uint32_t                    cmask_alignment;
-
-    uint8_t                     fmask_tile_swizzle;
 };

 struct radeon_surf {
@@ -188,8 +192,10 @@ struct radeon_surf {
     * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9
     */
    uint8_t                     tile_swizzle;
+    uint8_t                     fmask_tile_swizzle;

    uint64_t                    surf_size;
+    uint64_t                    fmask_size;
    /* DCC and HTILE are very small. */
    uint32_t                    dcc_size;
    uint32_t                    htile_size;
@@ -197,6 +203,7 @@ struct radeon_surf {
    uint32_t                    htile_slice_size;

    uint32_t                    surf_alignment;
+    uint32_t                    fmask_alignment;
    uint32_t                    dcc_alignment;
    uint32_t                    htile_alignment;

@@ -217,12 +224,13 @@ struct ac_surf_info {
 	uint32_t width;
 	uint32_t height;
 	uint32_t depth;
-	uint8_t samples;
+	uint8_t samples; /* For Z/S: samples; For color: FMASK coverage samples */
+	uint8_t color_samples; /* For color: color samples */
 	uint8_t levels;
 	uint8_t num_channels; /* heuristic for displayability */
 	uint16_t array_size;
 	uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. */
-	uint32_t *fmask_surf_index; /* GFX9+ */
+	uint32_t *fmask_surf_index;
 };

 struct ac_surf_config {
--- a/src/amd/common/gfx9d.h
+++ b/src/amd/common/gfx9d.h
@@ -1123,7 +1123,6 @@
 #define   S_030960_HW_USE_ONLY(x)                                     (((unsigned)(x) & 0x1) << 23)
 #define   G_030960_HW_USE_ONLY(x)                                     (((x) >> 23) & 0x1)
 #define   C_030960_HW_USE_ONLY                                        0xFF7FFFFF
-#define R_030964_VGT_OBJECT_ID                                          0x030964
 #define R_030968_VGT_INSTANCE_BASE_ID                                   0x030968
 #define R_030A00_PA_SU_LINE_STIPPLE_VALUE                               0x030A00
 #define   S_030A00_LINE_STIPPLE_VALUE(x)                              (((unsigned)(x) & 0xFFFFFF) << 0)
@@ -1195,19 +1194,6 @@
 #define   S_030E04_ADDRESS(x)                                         (((unsigned)(x) & 0xFF) << 0)
 #define   G_030E04_ADDRESS(x)                                         (((x) >> 0) & 0xFF)
 #define   C_030E04_ADDRESS                                            0xFFFFFF00
-#define R_030E08_TA_GRAD_ADJ_UCONFIG                                    0x030E08
-#define   S_030E08_GRAD_ADJ_0(x)                                      (((unsigned)(x) & 0xFF) << 0)
-#define   G_030E08_GRAD_ADJ_0(x)                                      (((x) >> 0) & 0xFF)
-#define   C_030E08_GRAD_ADJ_0                                         0xFFFFFF00
-#define   S_030E08_GRAD_ADJ_1(x)                                      (((unsigned)(x) & 0xFF) << 8)
-#define   G_030E08_GRAD_ADJ_1(x)                                      (((x) >> 8) & 0xFF)
-#define   C_030E08_GRAD_ADJ_1                                         0xFFFF00FF
-#define   S_030E08_GRAD_ADJ_2(x)                                      (((unsigned)(x) & 0xFF) << 16)
-#define   G_030E08_GRAD_ADJ_2(x)                                      (((x) >> 16) & 0xFF)
-#define   C_030E08_GRAD_ADJ_2                                         0xFF00FFFF
-#define   S_030E08_GRAD_ADJ_3(x)                                      (((unsigned)(x) & 0xFF) << 24)
-#define   G_030E08_GRAD_ADJ_3(x)                                      (((x) >> 24) & 0xFF)
-#define   C_030E08_GRAD_ADJ_3                                         0x00FFFFFF
 #define R_030F00_DB_OCCLUSION_COUNT0_LOW                                0x030F00
 #define R_008F00_SQ_BUF_RSRC_WORD0                                      0x008F00
 #define R_030F04_DB_OCCLUSION_COUNT0_HI                                 0x030F04
@@ -4084,10 +4070,6 @@
 #define   S_028060_DISALLOW_OVERFLOW(x)                               (((unsigned)(x) & 0x1) << 3)
 #define   G_028060_DISALLOW_OVERFLOW(x)                               (((x) >> 3) & 0x1)
 #define   C_028060_DISALLOW_OVERFLOW                                  0xFFFFFFF7
-#define R_028064_DB_RENDER_FILTER                                       0x028064
-#define   S_028064_PS_INVOKE_MASK(x)                                  (((unsigned)(x) & 0xFFFF) << 0)
-#define   G_028064_PS_INVOKE_MASK(x)                                  (((x) >> 0) & 0xFFFF)
-#define   C_028064_PS_INVOKE_MASK                                     0xFFFF0000
 #define R_028068_DB_Z_INFO2                                             0x028068
 #define   S_028068_EPITCH(x)                                          (((unsigned)(x) & 0xFFFF) << 0)
 #define   G_028068_EPITCH(x)                                          (((x) >> 0) & 0xFFFF)
@@ -4417,9 +4399,6 @@
 #define   S_02835C_NUM_RB_PER_SE(x)                                   (((unsigned)(x) & 0x03) << 5)
 #define   G_02835C_NUM_RB_PER_SE(x)                                   (((x) >> 5) & 0x03)
 #define   C_02835C_NUM_RB_PER_SE                                      0xFFFFFF9F
-#define   S_02835C_DISABLE_SRBSL_DB_OPTIMIZED_PACKING(x)              (((unsigned)(x) & 0x1) << 8)
-#define   G_02835C_DISABLE_SRBSL_DB_OPTIMIZED_PACKING(x)              (((x) >> 8) & 0x1)
-#define   C_02835C_DISABLE_SRBSL_DB_OPTIMIZED_PACKING                 0xFFFFFEFF
 #define R_028360_CP_PERFMON_CNTX_CNTL                                   0x028360
 #define   S_028360_PERFMON_ENABLE(x)                                  (((unsigned)(x) & 0x1) << 31)
 #define   G_028360_PERFMON_ENABLE(x)                                  (((x) >> 31) & 0x1)
@@ -4463,26 +4442,6 @@
 #define   S_0283A8_BOT_QTR(x)                                         (((unsigned)(x) & 0xFF) << 24)
 #define   G_0283A8_BOT_QTR(x)                                         (((x) >> 24) & 0xFF)
 #define   C_0283A8_BOT_QTR                                            0x00FFFFFF
-#define R_0283AC_PA_SC_FOV_WINDOW_LR                                    0x0283AC
-#define   S_0283AC_LEFT_EYE_FOV_LEFT(x)                               (((unsigned)(x) & 0xFF) << 0)
-#define   G_0283AC_LEFT_EYE_FOV_LEFT(x)                               (((x) >> 0) & 0xFF)
-#define   C_0283AC_LEFT_EYE_FOV_LEFT                                  0xFFFFFF00
-#define   S_0283AC_LEFT_EYE_FOV_RIGHT(x)                              (((unsigned)(x) & 0xFF) << 8)
-#define   G_0283AC_LEFT_EYE_FOV_RIGHT(x)                              (((x) >> 8) & 0xFF)
-#define   C_0283AC_LEFT_EYE_FOV_RIGHT                                 0xFFFF00FF
-#define   S_0283AC_RIGHT_EYE_FOV_LEFT(x)                              (((unsigned)(x) & 0xFF) << 16)
-#define   G_0283AC_RIGHT_EYE_FOV_LEFT(x)                              (((x) >> 16) & 0xFF)
-#define   C_0283AC_RIGHT_EYE_FOV_LEFT                                 0xFF00FFFF
-#define   S_0283AC_RIGHT_EYE_FOV_RIGHT(x)                             (((unsigned)(x) & 0xFF) << 24)
-#define   G_0283AC_RIGHT_EYE_FOV_RIGHT(x)                             (((x) >> 24) & 0xFF)
-#define   C_0283AC_RIGHT_EYE_FOV_RIGHT                                0x00FFFFFF
-#define R_0283B0_PA_SC_FOV_WINDOW_TB                                    0x0283B0
-#define   S_0283B0_FOV_TOP(x)                                         (((unsigned)(x) & 0xFF) << 0)
-#define   G_0283B0_FOV_TOP(x)                                         (((x) >> 0) & 0xFF)
-#define   C_0283B0_FOV_TOP                                            0xFFFFFF00
-#define   S_0283B0_FOV_BOT(x)                                         (((unsigned)(x) & 0xFF) << 8)
-#define   G_0283B0_FOV_BOT(x)                                         (((x) >> 8) & 0xFF)
-#define   C_0283B0_FOV_BOT                                            0xFFFF00FF
 #define R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX                           0x02840C
 #define R_028414_CB_BLEND_RED                                           0x028414
 #define R_028418_CB_BLEND_GREEN                                         0x028418
@@ -5772,9 +5731,6 @@
 #define   S_028830_RECTANGLE_FILTER_DISABLE(x)                        (((unsigned)(x) & 0x1) << 4)
 #define   G_028830_RECTANGLE_FILTER_DISABLE(x)                        (((x) >> 4) & 0x1)
 #define   C_028830_RECTANGLE_FILTER_DISABLE                           0xFFFFFFEF
-#define   S_028830_SRBSL_ENABLE(x)                                    (((unsigned)(x) & 0x1) << 5)
-#define   G_028830_SRBSL_ENABLE(x)                                    (((x) >> 5) & 0x1)
-#define   C_028830_SRBSL_ENABLE                                       0xFFFFFFDF
 #define R_028834_PA_CL_OBJPRIM_ID_CNTL                                  0x028834
 #define   S_028834_OBJ_ID_SEL(x)                                      (((unsigned)(x) & 0x1) << 0)
 #define   G_028834_OBJ_ID_SEL(x)                                      (((x) >> 0) & 0x1)
@@ -6273,10 +6229,6 @@
 #define   S_028A98_OBJECT_ID_INST_EN(x)                               (((unsigned)(x) & 0x1) << 3)
 #define   G_028A98_OBJECT_ID_INST_EN(x)                               (((x) >> 3) & 0x1)
 #define   C_028A98_OBJECT_ID_INST_EN                                  0xFFFFFFF7
-#define R_028A9C_VGT_INDEX_PAYLOAD_CNTL                                 0x028A9C
-#define   S_028A9C_COMPOUND_INDEX_EN(x)                               (((unsigned)(x) & 0x1) << 0)
-#define   G_028A9C_COMPOUND_INDEX_EN(x)                               (((x) >> 0) & 0x1)
-#define   C_028A9C_COMPOUND_INDEX_EN                                  0xFFFFFFFE
 #define R_028AA0_VGT_INSTANCE_STEP_RATE_0                               0x028AA0
 #define R_028AA4_VGT_INSTANCE_STEP_RATE_1                               0x028AA4
 #define R_028AAC_VGT_ESGS_RING_ITEMSIZE                                 0x028AAC
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -6892,34 +6892,22 @@
 #define   S_028808_ROP3(x)                                            (((unsigned)(x) & 0xFF) << 16)
 #define   G_028808_ROP3(x)                                            (((x) >> 16) & 0xFF)
 #define   C_028808_ROP3                                               0xFF00FFFF
-#define     V_028808_X_0X00                                         0x00
-#define     V_028808_X_0X05                                         0x05
-#define     V_028808_X_0X0A                                         0x0A
-#define     V_028808_X_0X0F                                         0x0F
-#define     V_028808_X_0X11                                         0x11
-#define     V_028808_X_0X22                                         0x22
-#define     V_028808_X_0X33                                         0x33
-#define     V_028808_X_0X44                                         0x44
-#define     V_028808_X_0X50                                         0x50
-#define     V_028808_X_0X55                                         0x55
-#define     V_028808_X_0X5A                                         0x5A
-#define     V_028808_X_0X5F                                         0x5F
-#define     V_028808_X_0X66                                         0x66
-#define     V_028808_X_0X77                                         0x77
-#define     V_028808_X_0X88                                         0x88
-#define     V_028808_X_0X99                                         0x99
-#define     V_028808_X_0XA0                                         0xA0
-#define     V_028808_X_0XA5                                         0xA5
-#define     V_028808_X_0XAA                                         0xAA
-#define     V_028808_X_0XAF                                         0xAF
-#define     V_028808_X_0XBB                                         0xBB
-#define     V_028808_X_0XCC                                         0xCC
-#define     V_028808_X_0XDD                                         0xDD
-#define     V_028808_X_0XEE                                         0xEE
-#define     V_028808_X_0XF0                                         0xF0
-#define     V_028808_X_0XF5                                         0xF5
-#define     V_028808_X_0XFA                                         0xFA
-#define     V_028808_X_0XFF                                         0xFF
+#define     V_028808_ROP3_CLEAR                                     0x00
+#define     V_028808_ROP3_NOR                                       0x11
+#define     V_028808_ROP3_AND_INVERTED                              0x22
+#define     V_028808_ROP3_COPY_INVERTED                             0x33
+#define     V_028808_ROP3_AND_REVERSE                               0x44
+#define     V_028808_ROP3_INVERT                                    0x55
+#define     V_028808_ROP3_XOR                                       0x66
+#define     V_028808_ROP3_NAND                                      0x77
+#define     V_028808_ROP3_AND                                       0x88
+#define     V_028808_ROP3_EQUIVALENT                                0x99
+#define     V_028808_ROP3_NO_OP                                     0xaa
+#define     V_028808_ROP3_OR_INVERTED                               0xbb
+#define     V_028808_ROP3_COPY                                      0xcc
+#define     V_028808_ROP3_OR_REVERSE                                0xdd
+#define     V_028808_ROP3_OR                                        0xee
+#define     V_028808_ROP3_SET                                       0xff
 #define R_02880C_DB_SHADER_CONTROL                                      0x02880C
 #define   S_02880C_Z_EXPORT_ENABLE(x)                                 (((unsigned)(x) & 0x1) << 0)
 #define   G_02880C_Z_EXPORT_ENABLE(x)                                 (((x) >> 0) & 0x1)
--- a/src/amd/vulkan/.gitignore
+++ b/src/amd/vulkan/.gitignore
@@ -2,6 +2,7 @@
 /radv_entrypoints.c
 /radv_entrypoints.h
 /radv_extensions.c
+/radv_extensions.h
 /radv_timestamp.h
 /dev_icd.json
 /vk_format_table.c
--- a/src/amd/vulkan/radv_android.c
+++ b/src/amd/vulkan/radv_android.c
@@ -122,7 +122,7 @@ radv_image_from_gralloc(VkDevice device_h,
 		return result;

 	if (gralloc_info->handle->numFds != 1) {
-		return vk_errorf(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR,
+		return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR,
 		                 "VkNativeBufferANDROID::handle::numFds is %d, "
 		                 "expected 1", gralloc_info->handle->numFds);
 	}
@@ -233,7 +233,7 @@ VkResult radv_GetSwapchainGrallocUsageANDROID(
 	result = radv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h,
 	                                                      &image_format_info, &image_format_props);
 	if (result != VK_SUCCESS) {
-		return vk_errorf(result,
+		return vk_errorf(device->instance, result,
 		                 "radv_GetPhysicalDeviceImageFormatProperties2 failed "
 		                 "inside %s", __func__);
 	}
@@ -252,7 +252,7 @@ VkResult radv_GetSwapchainGrallocUsageANDROID(
 	 * gralloc swapchains.
 	 */
 	if (imageUsage != 0) {
-	return vk_errorf(VK_ERROR_FORMAT_NOT_SUPPORTED,
+	return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
 	                "unsupported VkImageUsageFlags(0x%x) for gralloc "
 	                "swapchain", imageUsage);
 	}
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -226,7 +226,7 @@ static VkResult radv_create_cmd_buffer(
 	cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (cmd_buffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	cmd_buffer->device = device;
@@ -238,7 +238,7 @@ static VkResult radv_create_cmd_buffer(
 		cmd_buffer->queue_family_index = pool->queue_family_index;

 	} else {
-		/* Init the pool_link so we can safefly call list_del when we destroy
+		/* Init the pool_link so we can safely call list_del when we destroy
 		 * the command buffer
 		 */
 		list_inithead(&cmd_buffer->pool_link);
@@ -250,7 +250,7 @@ static VkResult radv_create_cmd_buffer(
 	cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
 	if (!cmd_buffer->cs) {
 		vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}

 	*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
@@ -347,7 +347,8 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
 				       new_size, 4096,
 				       RADEON_DOMAIN_GTT,
 				       RADEON_FLAG_CPU_ACCESS|
-				       RADEON_FLAG_NO_INTERPROCESS_SHARING);
+				       RADEON_FLAG_NO_INTERPROCESS_SHARING |
+				       RADEON_FLAG_32BIT);

 	if (!bo) {
 		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -559,20 +560,8 @@ radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
 		      gl_shader_stage stage,
 		      int idx)
 {
-	if (stage == MESA_SHADER_VERTEX) {
-		if (pipeline->shaders[MESA_SHADER_VERTEX])
-			return &pipeline->shaders[MESA_SHADER_VERTEX]->info.user_sgprs_locs.shader_data[idx];
-		if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
-			return &pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.user_sgprs_locs.shader_data[idx];
-		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
-			return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.user_sgprs_locs.shader_data[idx];
-	} else if (stage == MESA_SHADER_TESS_EVAL) {
-		if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
-			return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.user_sgprs_locs.shader_data[idx];
-		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
-			return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.user_sgprs_locs.shader_data[idx];
-	}
-	return &pipeline->shaders[stage]->info.user_sgprs_locs.shader_data[idx];
+	struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
+	return &shader->info.user_sgprs_locs.shader_data[idx];
 }

 static void
@@ -585,11 +574,54 @@ radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
 	uint32_t base_reg = pipeline->user_data_0[stage];
 	if (loc->sgpr_idx == -1)
 		return;
-	assert(loc->num_sgprs == 2);
+
+	assert(loc->num_sgprs == (HAVE_32BIT_POINTERS ? 1 : 2));
 	assert(!loc->indirect);
-	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 2);
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
+
+	radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
+				 base_reg + loc->sgpr_idx * 4, va, false);
+}
+
+static void
+radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_pipeline *pipeline,
+			      struct radv_descriptor_state *descriptors_state,
+			      gl_shader_stage stage)
+{
+	struct radv_device *device = cmd_buffer->device;
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	uint32_t sh_base = pipeline->user_data_0[stage];
+	struct radv_userdata_locations *locs =
+		&pipeline->shaders[stage]->info.user_sgprs_locs;
+	unsigned mask;
+
+	mask = descriptors_state->dirty & descriptors_state->valid;
+
+	for (int i = 0; i < MAX_SETS; i++) {
+		struct radv_userdata_info *loc = &locs->descriptor_sets[i];
+		if (loc->sgpr_idx != -1 && !loc->indirect)
+			continue;
+		mask &= ~(1 << i);
+	}
+
+	while (mask) {
+		int start, count;
+
+		u_bit_scan_consecutive_range(&mask, &start, &count);
+
+		struct radv_userdata_info *loc = &locs->descriptor_sets[start];
+		unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
+
+		radv_emit_shader_pointer_head(cs, sh_offset, count,
+					      HAVE_32BIT_POINTERS);
+		for (int i = 0; i < count; i++) {
+			struct radv_descriptor_set *set =
+				descriptors_state->sets[start + i];
+
+			radv_emit_shader_pointer_body(device, cs, set->va,
+						      HAVE_32BIT_POINTERS);
+		}
+	}
 }

 static void
@@ -867,14 +899,6 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
 {
 	uint32_t count = cmd_buffer->state.dynamic.scissor.count;

-	/* Vega10/Raven scissor bug workaround. This must be done before VPORT
-	 * scissor registers are changed. There is also a more efficient but
-	 * more involved alternative workaround.
-	 */
-	if (cmd_buffer->device->physical_device->has_scissor_bug) {
-		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
-		si_emit_cache_flush(cmd_buffer);
-	}
 	si_write_scissors(cmd_buffer->cs, 0, count,
 			  cmd_buffer->state.dynamic.scissor.scissors,
 			  cmd_buffer->state.dynamic.viewport.viewports,
@@ -1020,6 +1044,68 @@ radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
 	}
 }

+static void
+radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
+			     struct radv_ds_buffer_info *ds,
+			     struct radv_image *image, VkImageLayout layout,
+			     bool requires_cond_write)
+{
+	uint32_t db_z_info = ds->db_z_info;
+	uint32_t db_z_info_reg;
+
+	if (!radv_image_is_tc_compat_htile(image))
+		return;
+
+	if (!radv_layout_has_htile(image, layout,
+	                           radv_image_queue_family_mask(image,
+	                                                        cmd_buffer->queue_family_index,
+	                                                        cmd_buffer->queue_family_index))) {
+		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
+	}
+
+	db_z_info &= C_028040_ZRANGE_PRECISION;
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		db_z_info_reg = R_028038_DB_Z_INFO;
+	} else {
+		db_z_info_reg = R_028040_DB_Z_INFO;
+	}
+
+	/* When we don't know the last fast clear value, we need to emit a
+	 * conditional packet; otherwise we can update DB_Z_INFO directly.
+	 */
+	if (requires_cond_write) {
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_WRITE, 7, 0));
+
+		const uint32_t write_space = 0 << 8;	/* register */
+		const uint32_t poll_space = 1 << 4;	/* memory */
+		const uint32_t function = 3 << 0;	/* equal to the reference */
+		const uint32_t options = write_space | poll_space | function;
+		radeon_emit(cmd_buffer->cs, options);
+
+		/* poll address - location of the depth clear value */
+		uint64_t va = radv_buffer_get_va(image->bo);
+		va += image->offset + image->clear_value_offset;
+
+		/* In the presence of a stencil format, we have to adjust the base
+		 * address because the first value is the stencil clear value.
+		 */
+		if (vk_format_is_stencil(image->vk_format))
+			va += 4;
+
+		radeon_emit(cmd_buffer->cs, va);
+		radeon_emit(cmd_buffer->cs, va >> 32);
+
+		radeon_emit(cmd_buffer->cs, fui(0.0f));		 /* reference value */
+		radeon_emit(cmd_buffer->cs, (uint32_t)-1);	 /* comparison mask */
+		radeon_emit(cmd_buffer->cs, db_z_info_reg >> 2); /* write address low */
+		radeon_emit(cmd_buffer->cs, 0u);		 /* write address high */
+		radeon_emit(cmd_buffer->cs, db_z_info);
+	} else {
+		radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
+	}
+}
+
 static void
 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
 		      struct radv_ds_buffer_info *ds,
@@ -1078,20 +1164,71 @@ radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,

 	}

+	/* Update the ZRANGE_PRECISION value for the TC-compat bug. */
+	radv_update_zrange_precision(cmd_buffer, ds, image, layout, true);
+
 	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
 			       ds->pa_su_poly_offset_db_fmt_cntl);
 }

-void
-radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			  struct radv_image *image,
-			  VkClearDepthStencilValue ds_clear_value,
-			  VkImageAspectFlags aspects)
+/**
+ * Update the fast clear depth/stencil values if the image is bound as a
+ * depth/stencil buffer.
+ */
+static void
+radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
+				struct radv_image *image,
+				VkClearDepthStencilValue ds_clear_value,
+				VkImageAspectFlags aspects)
 {
+	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radv_attachment_info *att;
+	uint32_t att_idx;
+
+	if (!framebuffer || !subpass)
+		return;
+
+	att_idx = subpass->depth_stencil_attachment.attachment;
+	if (att_idx == VK_ATTACHMENT_UNUSED)
+		return;
+
+	att = &framebuffer->attachments[att_idx];
+	if (att->attachment->image != image)
+		return;
+
+	radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
+	radeon_emit(cs, ds_clear_value.stencil);
+	radeon_emit(cs, fui(ds_clear_value.depth));
+
+	/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
+	 * only needed when clearing Z to 0.0.
+	 */
+	if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+	    ds_clear_value.depth == 0.0) {
+		VkImageLayout layout = subpass->depth_stencil_attachment.layout;
+
+		radv_update_zrange_precision(cmd_buffer, &att->ds, image,
+					     layout, false);
+	}
+}
+
+/**
+ * Store the clear depth/stencil values in the image's metadata.
+ */
+void
+radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			   struct radv_image *image,
+			   VkClearDepthStencilValue ds_clear_value,
+			   VkImageAspectFlags aspects)
+{
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(image->bo);
-	va += image->offset + image->clear_value_offset;
 	unsigned reg_offset = 0, reg_count = 0;

+	va += image->offset + image->clear_value_offset;
+
 	assert(radv_image_has_htile(image));

 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
@@ -1103,33 +1240,35 @@ radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
 		++reg_count;

-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0));
-	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
-				    S_370_WR_CONFIRM(1) |
-				    S_370_ENGINE_SEL(V_370_PFP));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
+	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0));
+	radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+			S_370_WR_CONFIRM(1) |
+			S_370_ENGINE_SEL(V_370_PFP));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
-		radeon_emit(cmd_buffer->cs, ds_clear_value.stencil);
+		radeon_emit(cs, ds_clear_value.stencil);
 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
-		radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth));
+		radeon_emit(cs, fui(ds_clear_value.depth));

-	radeon_set_context_reg_seq(cmd_buffer->cs, R_028028_DB_STENCIL_CLEAR + 4 * reg_offset, reg_count);
-	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
-		radeon_emit(cmd_buffer->cs, ds_clear_value.stencil); /* R_028028_DB_STENCIL_CLEAR */
-	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
-		radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth)); /* R_02802C_DB_DEPTH_CLEAR */
+	radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value,
+				        aspects);
 }

+/**
+ * Load the clear depth/stencil values from the image's metadata.
+ */
 static void
-radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			   struct radv_image *image)
+radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			    struct radv_image *image)
 {
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
 	uint64_t va = radv_buffer_get_va(image->bo);
-	va += image->offset + image->clear_value_offset;
 	unsigned reg_offset = 0, reg_count = 0;

+	va += image->offset + image->clear_value_offset;
+
 	if (!radv_image_has_htile(image))
 		return;

@@ -1142,21 +1281,21 @@ radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
 		++reg_count;

-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
-	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
-				    (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-	radeon_emit(cmd_buffer->cs, (R_028028_DB_STENCIL_CLEAR + 4 * reg_offset) >> 2);
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+			COPY_DATA_DST_SEL(COPY_DATA_REG) |
+			(reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, (R_028028_DB_STENCIL_CLEAR + 4 * reg_offset) >> 2);
+	radeon_emit(cs, 0);

-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+	radeon_emit(cs, 0);
 }

 /*
- *with DCC some colors don't require CMASK elimiation before being
+ * With DCC some colors don't require CMASK elimination before being
 * used as a texture. This sets a predicate value to determine if the
 * cmask eliminate is required.
 */
@@ -1181,55 +1320,95 @@ radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
 	radeon_emit(cmd_buffer->cs, pred_val >> 32);
 }

-void
-radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			  struct radv_image *image,
-			  int idx,
-			  uint32_t color_values[2])
+/**
+ * Update the fast clear color values if the image is bound as a color buffer.
+ */
+static void
+radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
+				   struct radv_image *image,
+				   int cb_idx,
+				   uint32_t color_values[2])
 {
+	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radv_attachment_info *att;
+	uint32_t att_idx;
+
+	if (!framebuffer || !subpass)
+		return;
+
+	att_idx = subpass->color_attachments[cb_idx].attachment;
+	if (att_idx == VK_ATTACHMENT_UNUSED)
+		return;
+
+	att = &framebuffer->attachments[att_idx];
+	if (att->attachment->image != image)
+		return;
+
+	radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
+	radeon_emit(cs, color_values[0]);
+	radeon_emit(cs, color_values[1]);
+}
+
+/**
+ * Store the clear color values in the image's metadata.
+ */
+void
+radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_image *image,
+			      int cb_idx,
+			      uint32_t color_values[2])
+{
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(image->bo);
+
 	va += image->offset + image->clear_value_offset;

 	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));

-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
-	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
-				    S_370_WR_CONFIRM(1) |
-				    S_370_ENGINE_SEL(V_370_PFP));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-	radeon_emit(cmd_buffer->cs, color_values[0]);
-	radeon_emit(cmd_buffer->cs, color_values[1]);
+	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 0));
+	radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+			S_370_WR_CONFIRM(1) |
+			S_370_ENGINE_SEL(V_370_PFP));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, color_values[0]);
+	radeon_emit(cs, color_values[1]);

-	radeon_set_context_reg_seq(cmd_buffer->cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c, 2);
-	radeon_emit(cmd_buffer->cs, color_values[0]);
-	radeon_emit(cmd_buffer->cs, color_values[1]);
+	radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
+					   color_values);
 }

+/**
+ * Load the clear color values from the image's metadata.
+ */
 static void
-radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			   struct radv_image *image,
-			   int idx)
+radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			       struct radv_image *image,
+			       int cb_idx)
 {
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(image->bo);
+
 	va += image->offset + image->clear_value_offset;

 	if (!radv_image_has_cmask(image) && !radv_image_has_dcc(image))
 		return;

-	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c;
+	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;

-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
-	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
-				    COPY_DATA_COUNT_SEL);
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-	radeon_emit(cmd_buffer->cs, reg >> 2);
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
+	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+			COPY_DATA_DST_SEL(COPY_DATA_REG) |
+			COPY_DATA_COUNT_SEL);
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, reg >> 2);
+	radeon_emit(cs, 0);

-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
+	radeon_emit(cs, 0);
 }

 static void
@@ -1260,7 +1439,7 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
 		assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT);
 		radv_emit_fb_color_state(cmd_buffer, i, att, image, layout);

-		radv_load_color_clear_regs(cmd_buffer, image, i);
+		radv_load_color_clear_metadata(cmd_buffer, image, i);
 	}

 	if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
@@ -1282,7 +1461,7 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
 			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
 			cmd_buffer->state.offset_scale = att->ds.offset_scale;
 		}
-		radv_load_depth_clear_regs(cmd_buffer, image);
+		radv_load_ds_clear_metadata(cmd_buffer, image);
 	} else {
 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
@@ -1334,6 +1513,7 @@ radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)

 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
 {
+	bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
 	uint32_t pa_sc_mode_cntl_1 =
 		pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
@@ -1342,11 +1522,12 @@ void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
 	if(!cmd_buffer->state.active_occlusion_queries) {
 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
 			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
-			    pipeline->graphics.disable_out_of_order_rast_for_occlusion) {
+			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
+			    has_perfect_queries) {
 				/* Re-enable out-of-order rasterization if the
 				 * bound pipeline supports it and if it's has
-				 * been disabled before starting occlusion
-				 * queries.
+				 * been disabled before starting any perfect
+				 * occlusion queries.
 				 */
 				radeon_set_context_reg(cmd_buffer->cs,
 						       R_028A4C_PA_SC_MODE_CNTL_1,
@@ -1359,22 +1540,22 @@ void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
 	} else {
 		const struct radv_subpass *subpass = cmd_buffer->state.subpass;
 		uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
-		bool perfect = cmd_buffer->state.perfect_occlusion_queries_enabled;

 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
 			db_count_control =
-				S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+				S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
 				S_028004_SAMPLE_RATE(sample_rate) |
 				S_028004_ZPASS_ENABLE(1) |
 				S_028004_SLICE_EVEN_ENABLE(1) |
 				S_028004_SLICE_ODD_ENABLE(1);

 			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
-			    pipeline->graphics.disable_out_of_order_rast_for_occlusion) {
+			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
+			    has_perfect_queries) {
 				/* If the bound pipeline has enabled
 				 * out-of-order rasterization, we should
-				 * disable it before starting occlusion
-				 * queries.
+				 * disable it before starting any perfect
+				 * occlusion queries.
 				 */
 				pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;

@@ -1399,7 +1580,8 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
 	if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
 		radv_emit_viewport(cmd_buffer);

-	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
+	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
+	    !cmd_buffer->device->physical_device->has_scissor_bug)
 		radv_emit_scissor(cmd_buffer);

 	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
@@ -1425,48 +1607,6 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
 	cmd_buffer->state.dirty &= ~states;
 }

-static void
-emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
-				   struct radv_pipeline *pipeline,
-				   int idx,
-				   uint64_t va,
-				   gl_shader_stage stage)
-{
-	struct radv_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx];
-	uint32_t base_reg = pipeline->user_data_0[stage];
-
-	if (desc_set_loc->sgpr_idx == -1 || desc_set_loc->indirect)
-		return;
-
-	assert(!desc_set_loc->indirect);
-	assert(desc_set_loc->num_sgprs == 2);
-	radeon_set_sh_reg_seq(cmd_buffer->cs,
-			      base_reg + desc_set_loc->sgpr_idx * 4, 2);
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-}
-
-static void
-radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
-				  VkShaderStageFlags stages,
-				  struct radv_descriptor_set *set,
-				  unsigned idx)
-{
-	if (cmd_buffer->state.pipeline) {
-		radv_foreach_stage(stage, stages) {
-			if (cmd_buffer->state.pipeline->shaders[stage])
-				emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-								   idx, set->va,
-								   stage);
-		}
-	}
-
-	if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT))
-		emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.compute_pipeline,
-						   idx, set->va,
-						   MESA_SHADER_COMPUTE);
-}
-
 static void
 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
 			    VkPipelineBindPoint bind_point)
@@ -1548,7 +1688,6 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
 					 VK_PIPELINE_BIND_POINT_GRAPHICS;
 	struct radv_descriptor_state *descriptors_state =
 		radv_get_descriptors_state(cmd_buffer, bind_point);
-	unsigned i;

 	if (!descriptors_state->dirty)
 		return;
@@ -1565,13 +1704,25 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
 	                                                   cmd_buffer->cs,
 	                                                   MAX_SETS * MESA_SHADER_STAGES * 4);

-	for_each_bit(i, descriptors_state->dirty) {
-		struct radv_descriptor_set *set = descriptors_state->sets[i];
-		if (!(descriptors_state->valid & (1u << i)))
-			continue;
+	if (cmd_buffer->state.pipeline) {
+		radv_foreach_stage(stage, stages) {
+			if (!cmd_buffer->state.pipeline->shaders[stage])
+				continue;

-		radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i);
+			radv_emit_descriptor_pointers(cmd_buffer,
+						      cmd_buffer->state.pipeline,
+						      descriptors_state, stage);
+		}
 	}
+
+	if (cmd_buffer->state.compute_pipeline &&
+	    (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
+		radv_emit_descriptor_pointers(cmd_buffer,
+					      cmd_buffer->state.compute_pipeline,
+					      descriptors_state,
+					      MESA_SHADER_COMPUTE);
+	}
+
 	descriptors_state->dirty = 0;
 	descriptors_state->push_dirty = false;

@@ -1589,6 +1740,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
 					 ? cmd_buffer->state.compute_pipeline
 					 : cmd_buffer->state.pipeline;
 	struct radv_pipeline_layout *layout = pipeline->layout;
+	struct radv_shader_variant *shader, *prev_shader;
 	unsigned offset;
 	void *ptr;
 	uint64_t va;
@@ -1613,10 +1765,16 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 	                                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);

+	prev_shader = NULL;
 	radv_foreach_stage(stage, stages) {
-		if (pipeline->shaders[stage]) {
+		shader = radv_get_shader(pipeline, stage);
+
+		/* Avoid redundantly emitting the address for merged stages. */
+		if (shader && shader != prev_shader) {
 			radv_emit_userdata_address(cmd_buffer, pipeline, stage,
 						   AC_UD_PUSH_CONSTANTS, va);
+
+			prev_shader = shader;
 		}
 	}

@@ -1631,7 +1789,7 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 	if ((pipeline_is_dirty ||
 	    (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
 	    cmd_buffer->state.pipeline->vertex_elements.count &&
-	    radv_get_vertex_shader(cmd_buffer->state.pipeline)->info.info.vs.has_vertex_buffers) {
+	    radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
 		struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
 		unsigned vb_offset;
 		void *vb_ptr;
@@ -2405,7 +2563,7 @@ VkResult radv_EndCommandBuffer(
 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);

 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;

@@ -2519,18 +2677,6 @@ void radv_CmdSetViewport(
 	assert(firstViewport < MAX_VIEWPORTS);
 	assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);

-	if (cmd_buffer->device->physical_device->has_scissor_bug) {
-		/* Try to skip unnecessary PS partial flushes when the viewports
-		 * don't change.
-		 */
-		if (!(state->dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
-				      RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) &&
-		    !memcmp(state->dynamic.viewport.viewports + firstViewport,
-			    pViewports, viewportCount * sizeof(*pViewports))) {
-			return;
-		}
-	}
-
 	memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
 	       viewportCount * sizeof(*pViewports));

@@ -2550,18 +2696,6 @@ void radv_CmdSetScissor(
 	assert(firstScissor < MAX_SCISSORS);
 	assert(total_count >= 1 && total_count <= MAX_SCISSORS);

-	if (cmd_buffer->device->physical_device->has_scissor_bug) {
-		/* Try to skip unnecessary PS partial flushes when the scissors
-		 * don't change.
-		 */
-		if (!(state->dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
-				      RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) &&
-		    !memcmp(state->dynamic.scissor.scissors + firstScissor,
-			    pScissors, scissorCount * sizeof(*pScissors))) {
-			return;
-		}
-	}
-
 	memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
 	       scissorCount * sizeof(*pScissors));

@@ -2783,7 +2917,7 @@ VkResult radv_CreateCommandPool(
 	pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pool == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	if (pAllocator)
 		pool->alloc = *pAllocator;
@@ -2956,7 +3090,7 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
 	                              : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
-	bool draw_id_enable = radv_get_vertex_shader(cmd_buffer->state.pipeline)->info.info.vs.needs_draw_id;
+	bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id;
 	uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
 	assert(base_reg);

@@ -3141,10 +3275,55 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
 	}
 }

+/*
+ * Vega and raven have a bug which triggers if there are multiple context
+ * register contexts active at the same time with different scissor values.
+ *
+ * There are two possible workarounds:
+ * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
+ *    there is only ever 1 active set of scissor values at the same time.
+ *
+ * 2) Whenever the hardware switches contexts we have to set the scissor
+ *    registers again even if it is a noop. That way the new context gets
+ *    the correct scissor values.
+ *
+ * This implements option 2. radv_need_late_scissor_emission needs to
+ * return true on affected HW if radv_emit_all_graphics_states sets
+ * any context registers.
+ */
+static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
+                                            bool indexed_draw)
+{
+	struct radv_cmd_state *state = &cmd_buffer->state;
+
+	if (!cmd_buffer->device->physical_device->has_scissor_bug)
+		return false;
+
+	uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
+
+	/* Index & Vertex buffer don't change context regs, and pipeline is handled later. */
+	used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_PIPELINE);
+
+	/* Assume all state changes except these two can imply context rolls. */
+	if (cmd_buffer->state.dirty & used_states)
+		return true;
+
+	if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
+		return true;
+
+	if (indexed_draw && state->pipeline->graphics.prim_restart_enable &&
+	    (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
+		return true;
+
+	return false;
+}
+
 static void
 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
 			      const struct radv_draw_info *info)
 {
+	bool late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info->indexed);
+
 	if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
 	    cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
 		radv_emit_rbplus_state(cmd_buffer);
@@ -3174,6 +3353,9 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
 	radv_emit_draw_registers(cmd_buffer, info->indexed,
 				 info->instance_count > 1, info->indirect,
 				 info->indirect ? 0 : info->count);
+
+	if (late_scissor_emission)
+		radv_emit_scissor(cmd_buffer);
 }

 static void
@@ -3381,6 +3563,55 @@ void radv_CmdDrawIndexedIndirectCountAMD(
 	radv_draw(cmd_buffer, &info);
 }

+void radv_CmdDrawIndirectCountKHR(
+	VkCommandBuffer                             commandBuffer,
+	VkBuffer                                    _buffer,
+	VkDeviceSize                                offset,
+	VkBuffer                                    _countBuffer,
+	VkDeviceSize                                countBufferOffset,
+	uint32_t                                    maxDrawCount,
+	uint32_t                                    stride)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
+	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
+	struct radv_draw_info info = {};
+
+	info.count = maxDrawCount;
+	info.indirect = buffer;
+	info.indirect_offset = offset;
+	info.count_buffer = count_buffer;
+	info.count_buffer_offset = countBufferOffset;
+	info.stride = stride;
+
+	radv_draw(cmd_buffer, &info);
+}
+
+void radv_CmdDrawIndexedIndirectCountKHR(
+	VkCommandBuffer                             commandBuffer,
+	VkBuffer                                    _buffer,
+	VkDeviceSize                                offset,
+	VkBuffer                                    _countBuffer,
+	VkDeviceSize                                countBufferOffset,
+	uint32_t                                    maxDrawCount,
+	uint32_t                                    stride)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
+	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
+	struct radv_draw_info info = {};
+
+	info.indexed = true;
+	info.count = maxDrawCount;
+	info.indirect = buffer;
+	info.indirect_offset = offset;
+	info.count_buffer = count_buffer;
+	info.count_buffer_offset = countBufferOffset;
+	info.stride = stride;
+
+	radv_draw(cmd_buffer, &info);
+}
+
 struct radv_dispatch_info {
 	/**
 	 * Determine the layout of the grid (in block units) to be used.
@@ -3711,6 +3942,20 @@ static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
 					      size, clear_word);

 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
+
+	/* Initialize the depth clear registers and update the ZRANGE_PRECISION
+	 * value for the TC-compat bug (because ZRANGE_PRECISION is 1 by
+	 * default). This is only needed when clearing Z to 0.0f.
+	 */
+	if (radv_image_is_tc_compat_htile(image) && clear_word == 0) {
+		VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
+		VkClearDepthStencilValue value = {};
+
+		if (vk_format_is_stencil(image->vk_format))
+			aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+
+		radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects);
+	}
 }

 static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
--- a/src/amd/vulkan/radv_debug.c
+++ b/src/amd/vulkan/radv_debug.c
@@ -369,11 +369,9 @@ static void si_add_split_disasm(const char *disasm,
 }

 static void
-radv_dump_annotated_shader(struct radv_pipeline *pipeline,
-			   struct radv_shader_variant *shader,
-			   gl_shader_stage stage,
-			   struct ac_wave_info *waves, unsigned num_waves,
-			   FILE *f)
+radv_dump_annotated_shader(struct radv_shader_variant *shader,
+			   gl_shader_stage stage, struct ac_wave_info *waves,
+			   unsigned num_waves, FILE *f)
 {
 	uint64_t start_addr, end_addr;
 	unsigned i;
@@ -444,28 +442,22 @@ radv_dump_annotated_shader(struct radv_pipeline *pipeline,

 static void
 radv_dump_annotated_shaders(struct radv_pipeline *pipeline,
-			    struct radv_shader_variant *compute_shader,
-			    FILE *f)
+			    VkShaderStageFlagBits active_stages, FILE *f)
 {
 	struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
 	unsigned num_waves = ac_get_wave_info(waves);
-	unsigned mask;

 	fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
 		"\n\n", num_waves);

 	/* Dump annotated active graphics shaders. */
-	mask = pipeline->active_stages;
-	while (mask) {
-		int stage = u_bit_scan(&mask);
+	while (active_stages) {
+		int stage = u_bit_scan(&active_stages);

-		radv_dump_annotated_shader(pipeline, pipeline->shaders[stage],
+		radv_dump_annotated_shader(pipeline->shaders[stage],
 					   stage, waves, num_waves, f);
 	}

-	radv_dump_annotated_shader(pipeline, compute_shader,
-				   MESA_SHADER_COMPUTE, waves, num_waves, f);
-
 	/* Print waves executing shaders that are not currently bound. */
 	unsigned i;
 	bool found = false;
@@ -523,48 +515,51 @@ radv_dump_shader(struct radv_pipeline *pipeline,

 static void
 radv_dump_shaders(struct radv_pipeline *pipeline,
-		  struct radv_shader_variant *compute_shader, FILE *f)
+		  VkShaderStageFlagBits active_stages, FILE *f)
 {
-	unsigned mask;
-
 	/* Dump active graphics shaders. */
-	mask = pipeline->active_stages;
-	while (mask) {
-		int stage = u_bit_scan(&mask);
+	while (active_stages) {
+		int stage = u_bit_scan(&active_stages);

 		radv_dump_shader(pipeline, pipeline->shaders[stage], stage, f);
 	}
+}

-	radv_dump_shader(pipeline, compute_shader, MESA_SHADER_COMPUTE, f);
+static void
+radv_dump_pipeline_state(struct radv_pipeline *pipeline,
+			 VkShaderStageFlagBits active_stages, FILE *f)
+{
+	radv_dump_shaders(pipeline, active_stages, f);
+	radv_dump_annotated_shaders(pipeline, active_stages, f);
+	radv_dump_descriptors(pipeline, f);
 }

 static void
 radv_dump_graphics_state(struct radv_pipeline *graphics_pipeline,
 			 struct radv_pipeline *compute_pipeline, FILE *f)
 {
-	struct radv_shader_variant *compute_shader =
-		compute_pipeline ? compute_pipeline->shaders[MESA_SHADER_COMPUTE] : NULL;
+	VkShaderStageFlagBits active_stages;

-	if (!graphics_pipeline)
-		return;
+	if (graphics_pipeline) {
+		active_stages = graphics_pipeline->active_stages;
+		radv_dump_pipeline_state(graphics_pipeline, active_stages, f);
+	}

-	radv_dump_shaders(graphics_pipeline, compute_shader, f);
-	radv_dump_annotated_shaders(graphics_pipeline, compute_shader, f);
-	radv_dump_descriptors(graphics_pipeline, f);
+	if (compute_pipeline) {
+		active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
+		radv_dump_pipeline_state(compute_pipeline, active_stages, f);
+	}
 }

 static void
 radv_dump_compute_state(struct radv_pipeline *compute_pipeline, FILE *f)
 {
+	VkShaderStageFlagBits active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
+
 	if (!compute_pipeline)
 		return;

-	radv_dump_shaders(compute_pipeline,
-			  compute_pipeline->shaders[MESA_SHADER_COMPUTE], f);
-	radv_dump_annotated_shaders(compute_pipeline,
-				    compute_pipeline->shaders[MESA_SHADER_COMPUTE],
-				    f);
-	radv_dump_descriptors(compute_pipeline, f);
+	radv_dump_pipeline_state(compute_pipeline, active_stages, f);
 }

 static struct radv_pipeline *
@@ -643,11 +638,9 @@ radv_dump_device_name(struct radv_device *device, FILE *f)
 		snprintf(kernel_version, sizeof(kernel_version),
 			 " / %s", uname_data.release);

-	if (HAVE_LLVM > 0) {
-		snprintf(llvm_string, sizeof(llvm_string),
-			 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
-			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
-	}
+	snprintf(llvm_string, sizeof(llvm_string),
+		 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);

 	fprintf(f, "Device name: %s (%s DRM %i.%i.%i%s%s)\n\n",
 		chip_name, device->physical_device->name,
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -44,6 +44,11 @@ enum {
 	RADV_DEBUG_NO_SISCHED        = 0x4000,
 	RADV_DEBUG_PREOPTIR          = 0x8000,
 	RADV_DEBUG_NO_DYNAMIC_BOUNDS = 0x10000,
+	RADV_DEBUG_NO_OUT_OF_ORDER   = 0x20000,
+	RADV_DEBUG_INFO              = 0x40000,
+	RADV_DEBUG_ERRORS            = 0x80000,
+	RADV_DEBUG_STARTUP           = 0x100000,
+	RADV_DEBUG_CHECKIR           = 0x200000,
 };

 enum {
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -95,7 +95,7 @@ VkResult radv_CreateDescriptorSetLayout(
 	set_layout = vk_alloc2(&device->alloc, pAllocator, size, 8,
 				 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!set_layout)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	set_layout->flags = pCreateInfo->flags;

@@ -106,7 +106,7 @@ VkResult radv_CreateDescriptorSetLayout(
 	                                                                pCreateInfo->bindingCount);
 	if (!bindings) {
 		vk_free2(&device->alloc, pAllocator, set_layout);
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}

 	set_layout->binding_count = max_binding + 1;
@@ -322,7 +322,7 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device,

 /*
 * Pipeline layouts.  These have nothing to do with the pipeline.  They are
- * just muttiple descriptor set layouts pasted together
+ * just multiple descriptor set layouts pasted together.
 */

 VkResult radv_CreatePipelineLayout(
@@ -340,7 +340,7 @@ VkResult radv_CreatePipelineLayout(
 	layout = vk_alloc2(&device->alloc, pAllocator, sizeof(*layout), 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (layout == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	layout->num_sets = pCreateInfo->setLayoutCount;

@@ -412,7 +412,7 @@ radv_descriptor_set_create(struct radv_device *device,

 	if (pool->host_memory_base) {
 		if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
-			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);

 		set = (struct radv_descriptor_set*)pool->host_memory_ptr;
 		pool->host_memory_ptr += mem_size;
@@ -421,7 +421,7 @@ radv_descriptor_set_create(struct radv_device *device,
 		                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

 		if (!set)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}

 	memset(set, 0, mem_size);
@@ -437,7 +437,7 @@ radv_descriptor_set_create(struct radv_device *device,

 		if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) {
 			vk_free2(&device->alloc, NULL, set);
-			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 		}

 		/* try to allocate linearly first, so that we don't spend
@@ -466,7 +466,7 @@ radv_descriptor_set_create(struct radv_device *device,

 			if (pool->size - offset < layout_size) {
 				vk_free2(&device->alloc, NULL, set);
-				return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+				return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 			}
 			set->bo = pool->bo;
 			set->mapped_ptr = (uint32_t*)(pool->mapped_ptr + offset);
@@ -478,7 +478,7 @@ radv_descriptor_set_create(struct radv_device *device,
 			pool->entries[index].set = set;
 			pool->entry_count++;
 		} else
-			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 	}

 	if (layout->has_immutable_samplers) {
@@ -580,7 +580,7 @@ VkResult radv_CreateDescriptorPool(
 	pool = vk_alloc2(&device->alloc, pAllocator, size, 8,
 	                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!pool)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	memset(pool, 0, sizeof(*pool));

@@ -594,7 +594,8 @@ VkResult radv_CreateDescriptorPool(
 		pool->bo = device->ws->buffer_create(device->ws, bo_size, 32,
 						     RADEON_DOMAIN_VRAM,
 						     RADEON_FLAG_NO_INTERPROCESS_SHARING |
-						     RADEON_FLAG_READ_ONLY);
+						     RADEON_FLAG_READ_ONLY |
+						     RADEON_FLAG_32BIT);
 		pool->mapped_ptr = (uint8_t*)device->ws->buffer_map(pool->bo);
 	}
 	pool->size = bo_size;
@@ -995,7 +996,7 @@ VkResult radv_CreateDescriptorUpdateTemplate(VkDevice _device,

 	templ = vk_alloc2(&device->alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!templ)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	templ->entry_count = entry_count;
 	templ->bind_point = pCreateInfo->pipelineBindPoint;
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -108,12 +108,9 @@ radv_get_device_name(enum radeon_family family, char *name, size_t name_len)
 	default: chip_string = "AMD RADV unknown"; break;
 	}

-	if (HAVE_LLVM > 0) {
-		snprintf(llvm_string, sizeof(llvm_string),
-			 " (LLVM %i.%i.%i)", (HAVE_LLVM >> 8) & 0xff,
-			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
-	}
-
+	snprintf(llvm_string, sizeof(llvm_string),
+		 " (LLVM %i.%i.%i)", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
 	snprintf(name, name_len, "%s%s", chip_string, llvm_string);
 }

@@ -230,23 +227,38 @@ radv_physical_device_init(struct radv_physical_device *device,
 	int fd;

 	fd = open(path, O_RDWR | O_CLOEXEC);
-	if (fd < 0)
-		return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+	if (fd < 0) {
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Could not open device '%s'", path);
+
+		return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
+	}

 	version = drmGetVersion(fd);
 	if (!version) {
 		close(fd);
-		return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
+
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Could not get the kernel driver version for device '%s'", path);
+
+		return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
 				 "failed to get version %s: %m", path);
 	}

 	if (strcmp(version->name, "amdgpu")) {
 		drmFreeVersion(version);
 		close(fd);
+
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Device '%s' is not using the amdgpu kernel driver.", path);
+
 		return VK_ERROR_INCOMPATIBLE_DRIVER;
 	}
 	drmFreeVersion(version);

+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Found compatible device '%s'.", path);
+
 	device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	device->instance = instance;
 	assert(strlen(path) < ARRAY_SIZE(device->path));
@@ -255,7 +267,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 	device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags,
 					       instance->perftest_flags);
 	if (!device->ws) {
-		result = VK_ERROR_INCOMPATIBLE_DRIVER;
+		result = vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
 		goto fail;
 	}

@@ -268,7 +280,7 @@ radv_physical_device_init(struct radv_physical_device *device,

 	if (radv_device_get_cache_uuid(device->rad_info.family, device->cache_uuid)) {
 		device->ws->destroy(device->ws);
-		result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+		result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
 				   "cannot generate UUID");
 		goto fail;
 	}
@@ -278,7 +290,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 		(device->instance->perftest_flags & RADV_PERFTEST_SISCHED ? 0x1 : 0) |
 		(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH ? 0x2 : 0);

-	/* The gpu id is already embeded in the uuid so we just pass "radv"
+	/* The gpu id is already embedded in the uuid so we just pass "radv"
 	 * when creating the cache.
 	 */
 	char buf[VK_UUID_SIZE * 2 + 1];
@@ -300,7 +312,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 		                         device->rad_info.family == CHIP_RAVEN;
 	}

-	/* The mere presense of CLEAR_STATE in the IB causes random GPU hangs
+	/* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
 	 * on SI.
 	 */
 	device->has_clear_state = device->rad_info.chip_class >= CIK;
@@ -315,10 +327,10 @@ radv_physical_device_init(struct radv_physical_device *device,
 	device->has_out_of_order_rast = device->rad_info.chip_class >= VI &&
 					device->rad_info.max_se >= 2;
 	device->out_of_order_rast_allowed = device->has_out_of_order_rast &&
-					    (device->instance->perftest_flags & RADV_PERFTEST_OUT_OF_ORDER);
+					    !(device->instance->debug_flags & RADV_DEBUG_NO_OUT_OF_ORDER);

-	device->dcc_msaa_allowed = device->rad_info.chip_class == VI &&
-				   (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);
+	device->dcc_msaa_allowed =
+		(device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);

 	radv_physical_device_init_mem_types(device);
 	radv_fill_device_extension_table(device, &device->supported_extensions);
@@ -326,9 +338,13 @@ radv_physical_device_init(struct radv_physical_device *device,
 	result = radv_init_wsi(device);
 	if (result != VK_SUCCESS) {
 		device->ws->destroy(device->ws);
+		vk_error(instance, result);
 		goto fail;
 	}

+	if ((device->instance->debug_flags & RADV_DEBUG_INFO))
+		ac_print_gpu_info(&device->rad_info);
+
 	return VK_SUCCESS;

 fail:
@@ -390,6 +406,11 @@ static const struct debug_control radv_debug_options[] = {
 	{"nosisched", RADV_DEBUG_NO_SISCHED},
 	{"preoptir", RADV_DEBUG_PREOPTIR},
 	{"nodynamicbounds", RADV_DEBUG_NO_DYNAMIC_BOUNDS},
+	{"nooutoforder", RADV_DEBUG_NO_OUT_OF_ORDER},
+	{"info", RADV_DEBUG_INFO},
+	{"errors", RADV_DEBUG_ERRORS},
+	{"startup", RADV_DEBUG_STARTUP},
+	{"checkir", RADV_DEBUG_CHECKIR},
 	{NULL, 0}
 };

@@ -405,7 +426,6 @@ static const struct debug_control radv_perftest_options[] = {
 	{"sisched", RADV_PERFTEST_SISCHED},
 	{"localbos", RADV_PERFTEST_LOCAL_BOS},
 	{"binning", RADV_PERFTEST_BINNING},
-	{"outoforderrast", RADV_PERFTEST_OUT_OF_ORDER},
 	{"dccmsaa", RADV_PERFTEST_DCC_MSAA},
 	{NULL, 0}
 };
@@ -428,10 +448,12 @@ radv_handle_per_app_options(struct radv_instance *instance,

 	if (!strcmp(name, "Talos - Linux - 32bit") ||
 	    !strcmp(name, "Talos - Linux - 64bit")) {
-		/* Force enable LLVM sisched for Talos because it looks safe
-		 * and it gives few more FPS.
-		 */
-		instance->perftest_flags |= RADV_PERFTEST_SISCHED;
+		if (!(instance->debug_flags & RADV_DEBUG_NO_SISCHED)) {
+			/* Force enable LLVM sisched for Talos because it looks
+			 * safe and it gives a few more FPS.
+			 */
+			instance->perftest_flags |= RADV_PERFTEST_SISCHED;
+		}
 	}
 }

@@ -460,22 +482,13 @@ VkResult radv_CreateInstance(
 	    pCreateInfo->pApplicationInfo->apiVersion != 0) {
 		client_version = pCreateInfo->pApplicationInfo->apiVersion;
 	} else {
-		client_version = VK_MAKE_VERSION(1, 0, 0);
-	}
-
-	if (VK_MAKE_VERSION(1, 0, 0) > client_version ||
-	    client_version > VK_MAKE_VERSION(1, 1, 0xfff)) {
-		return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
-				 "Client requested version %d.%d.%d",
-				 VK_VERSION_MAJOR(client_version),
-				 VK_VERSION_MINOR(client_version),
-				 VK_VERSION_PATCH(client_version));
+		radv_EnumerateInstanceVersion(&client_version);
 	}

 	instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
 	if (!instance)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);

 	instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC;

@@ -487,13 +500,23 @@ VkResult radv_CreateInstance(
 	instance->apiVersion = client_version;
 	instance->physicalDeviceCount = -1;

+	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
+						   radv_debug_options);
+
+	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
+						   radv_perftest_options);
+
+
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Created an instance");
+
 	for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
 		const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i];
 		int index = radv_get_instance_extension_index(ext_name);

 		if (index < 0 || !radv_supported_instance_extensions.extensions[index]) {
 			vk_free2(&default_alloc, pAllocator, instance);
-			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+			return vk_error(NULL, VK_ERROR_EXTENSION_NOT_PRESENT); /* instance was freed above; never pass it on */
 		}

 		instance->enabled_extensions.extensions[index] = true;
@@ -502,29 +525,15 @@ VkResult radv_CreateInstance(
 	result = vk_debug_report_instance_init(&instance->debug_report_callbacks);
 	if (result != VK_SUCCESS) {
 		vk_free2(&default_alloc, pAllocator, instance);
-		return vk_error(result);
+		return vk_error(NULL, result); /* instance was freed above; never pass it on */
 	}

 	_mesa_locale_init();

 	VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));

-	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
-						   radv_debug_options);
-
-	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
-						   radv_perftest_options);
-
 	radv_handle_per_app_options(instance, pCreateInfo->pApplicationInfo);

-	if (instance->debug_flags & RADV_DEBUG_NO_SISCHED) {
-		/* Disable sisched when the user requests it, this is mostly
-		 * useful when the driver force-enable sisched for the given
-		 * application.
-		 */
-		instance->perftest_flags &= ~RADV_PERFTEST_SISCHED;
-	}
-
 	*pInstance = radv_instance_to_handle(instance);

 	return VK_SUCCESS;
@@ -563,8 +572,12 @@ radv_enumerate_devices(struct radv_instance *instance)
 	instance->physicalDeviceCount = 0;

 	max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
+
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Found %d drm nodes", max_devices);
+
 	if (max_devices < 1)
-		return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+		return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);

 	for (unsigned i = 0; i < (unsigned)max_devices; i++) {
 		if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
@@ -745,7 +758,7 @@ void radv_GetPhysicalDeviceFeatures2(
 		}
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: {
 			VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features =
-				(VkPhysicalDeviceDescriptorIndexingFeaturesEXT*)features;
+				(VkPhysicalDeviceDescriptorIndexingFeaturesEXT*)ext;
 			features->shaderInputAttachmentArrayDynamicIndexing = true;
 			features->shaderUniformTexelBufferArrayDynamicIndexing = true;
 			features->shaderStorageTexelBufferArrayDynamicIndexing = true;
@@ -865,7 +878,7 @@ void radv_GetPhysicalDeviceProperties(
 		.maxViewports                             = MAX_VIEWPORTS,
 		.maxViewportDimensions                    = { (1 << 14), (1 << 14) },
 		.viewportBoundsRange                      = { INT16_MIN, INT16_MAX },
-		.viewportSubPixelBits                     = 13, /* We take a float? */
+		.viewportSubPixelBits                     = 8,
 		.minMemoryMapAlignment                    = 4096, /* A page */
 		.minTexelBufferOffsetAlignment            = 1,
 		.minUniformBufferOffsetAlignment          = 4,
@@ -977,9 +990,12 @@ void radv_GetPhysicalDeviceProperties2(
 							VK_SUBGROUP_FEATURE_BASIC_BIT |
 							VK_SUBGROUP_FEATURE_BALLOT_BIT |
 							VK_SUBGROUP_FEATURE_QUAD_BIT |
-							VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
-							VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
 							VK_SUBGROUP_FEATURE_VOTE_BIT;
+			if (pdevice->rad_info.chip_class >= VI) {
+				properties->supportedOperations |=
+							VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
+							VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
+			}
 			properties->quadOperationsInAllStages = true;
 			break;
 		}
@@ -1258,7 +1274,7 @@ radv_queue_init(struct radv_device *device, struct radv_queue *queue,

 	queue->hw_ctx = device->ws->ctx_create(device->ws, queue->priority);
 	if (!queue->hw_ctx)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	return VK_SUCCESS;
 }
@@ -1353,36 +1369,8 @@ static void radv_bo_list_remove(struct radv_device *device,
 static void
 radv_device_init_gs_info(struct radv_device *device)
 {
-	switch (device->physical_device->rad_info.family) {
-	case CHIP_OLAND:
-	case CHIP_HAINAN:
-	case CHIP_KAVERI:
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_ICELAND:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-		device->gs_table_depth = 16;
-		return;
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-	case CHIP_VERDE:
-	case CHIP_BONAIRE:
-	case CHIP_HAWAII:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_RAVEN:
-		device->gs_table_depth = 32;
-		return;
-	default:
-		unreachable("unknown GPU");
-	}
+	device->gs_table_depth = ac_get_gs_table_depth(device->physical_device->rad_info.chip_class,
+						       device->physical_device->rad_info.family);
 }

 static int radv_get_device_extension_index(const char *name)
@@ -1415,7 +1403,7 @@ VkResult radv_CreateDevice(
 		unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
 		for (uint32_t i = 0; i < num_features; i++) {
 			if (enabled_feature[i] && !supported_feature[i])
-				return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+				return vk_error(physical_device->instance, VK_ERROR_FEATURE_NOT_PRESENT);
 		}
 	}

@@ -1423,7 +1411,7 @@ VkResult radv_CreateDevice(
 			    sizeof(*device), 8,
 			    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 	if (!device)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	device->instance = physical_device->instance;
@@ -1440,7 +1428,7 @@ VkResult radv_CreateDevice(
 		int index = radv_get_device_extension_index(ext_name);
 		if (index < 0 || !physical_device->supported_extensions.extensions[index]) {
 			vk_free(&device->alloc, device);
-			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+			return vk_error(physical_device->instance, VK_ERROR_EXTENSION_NOT_PRESENT);
 		}

 		device->enabled_extensions.extensions[index] = true;
@@ -1497,13 +1485,11 @@ VkResult radv_CreateDevice(
 	device->always_use_syncobj = device->physical_device->rad_info.has_syncobj_wait_for_submit;
 #endif

-	device->llvm_supports_spill = true;
-
 	/* The maximum number of scratch waves. Scratch space isn't divided
 	 * evenly between CUs. The number is only a function of the number of CUs.
 	 * We can decrease the constant to decrease the scratch buffer size.
 	 *
-	 * sctx->scratch_waves must be >= the maximum posible size of
+	 * sctx->scratch_waves must be >= the maximum possible size of
 	 * 1 threadgroup, so that the hw doesn't hang from being unable
 	 * to start any.
 	 *
@@ -1654,7 +1640,7 @@ VkResult radv_EnumerateInstanceLayerProperties(
 	}

 	/* None supported at this time */
-	return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+	return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
 }

 VkResult radv_EnumerateDeviceLayerProperties(
@@ -1668,7 +1654,7 @@ VkResult radv_EnumerateDeviceLayerProperties(
 	}

 	/* None supported at this time */
-	return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+	return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
 }

 void radv_GetDeviceQueue2(
@@ -1905,6 +1891,126 @@ radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buff
 	return hs_offchip_param;
 }

+static void
+radv_emit_gs_ring_sizes(struct radv_queue *queue, struct radeon_winsys_cs *cs,
+			struct radeon_winsys_bo *esgs_ring_bo,
+			uint32_t esgs_ring_size,
+			struct radeon_winsys_bo *gsvs_ring_bo,
+			uint32_t gsvs_ring_size)
+{ /* Emit the ESGS/GSVS ring sizes into cs; does nothing when both ring BOs are NULL. */
+	if (!esgs_ring_bo && !gsvs_ring_bo)
+		return;
+
+	if (esgs_ring_bo)
+		radv_cs_add_buffer(queue->device->ws, cs, esgs_ring_bo, 8);
+
+	if (gsvs_ring_bo)
+		radv_cs_add_buffer(queue->device->ws, cs, gsvs_ring_bo, 8);
+
+	if (queue->device->physical_device->rad_info.chip_class >= CIK) { /* CIK and newer: uconfig register variant */
+		radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+		radeon_emit(cs, esgs_ring_size >> 8); /* each size is written as size >> 8 */
+		radeon_emit(cs, gsvs_ring_size >> 8); /* written even when only the other ring BO is present */
+	} else { /* older chips: same two values through the config register variant */
+		radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+		radeon_emit(cs, esgs_ring_size >> 8);
+		radeon_emit(cs, gsvs_ring_size >> 8);
+	}
+}
+
+static void
+radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_winsys_cs *cs,
+			   unsigned hs_offchip_param, unsigned tf_ring_size,
+			   struct radeon_winsys_bo *tess_rings_bo)
+{ /* Emit the tess-factor ring size/base and the HS offchip param into cs; does nothing when tess_rings_bo is NULL. */
+	uint64_t tf_va;
+
+	if (!tess_rings_bo)
+		return;
+
+	tf_va = radv_buffer_get_va(tess_rings_bo);
+
+	radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo, 8);
+
+	if (queue->device->physical_device->rad_info.chip_class >= CIK) { /* CIK and newer: uconfig registers */
+		radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
+				       S_030938_SIZE(tf_ring_size / 4)); /* SIZE field gets tf_ring_size / 4 (presumably dwords - confirm) */
+		radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
+				       tf_va >> 8); /* base is written as VA >> 8 */
+		if (queue->device->physical_device->rad_info.chip_class >= GFX9) { /* GFX9+ only: extra BASE_HI write */
+			radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
+					       S_030944_BASE_HI(tf_va >> 40)); /* VA bits 40 and up */
+		}
+		radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM,
+				       hs_offchip_param);
+	} else { /* older chips: same size/base/param values through config registers, no BASE_HI write */
+		radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
+				      S_008988_SIZE(tf_ring_size / 4));
+		radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
+				      tf_va >> 8);
+		radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
+				     hs_offchip_param);
+	}
+}
+
+static void
+radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_winsys_cs *cs,
+			  struct radeon_winsys_bo *compute_scratch_bo)
+{ /* Emit the compute scratch BO address into cs; does nothing when compute_scratch_bo is NULL. */
+	uint64_t scratch_va;
+
+	if (!compute_scratch_bo)
+		return;
+
+	scratch_va = radv_buffer_get_va(compute_scratch_bo);
+
+	radv_cs_add_buffer(queue->device->ws, cs, compute_scratch_bo, 8);
+
+	radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); /* two-value sequence starting at COMPUTE_USER_DATA_0 */
+	radeon_emit(cs, scratch_va); /* first value: the VA (presumably truncated to 32 bits by radeon_emit - confirm) */
+	radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+			S_008F04_SWIZZLE_ENABLE(1)); /* second value: upper VA bits with SWIZZLE_ENABLE set */
+}
+
+static void
+radv_emit_global_shader_pointers(struct radv_queue *queue,
+				 struct radeon_winsys_cs *cs,
+				 struct radeon_winsys_bo *descriptor_bo)
+{ /* Emit descriptor_bo's VA as a shader pointer for every register in a per-generation list; does nothing when descriptor_bo is NULL. */
+	uint64_t va;
+
+	if (!descriptor_bo)
+		return;
+
+	va = radv_buffer_get_va(descriptor_bo);
+
+	radv_cs_add_buffer(queue->device->ws, cs, descriptor_bo, 8);
+
+	if (queue->device->physical_device->rad_info.chip_class >= GFX9) { /* GFX9+: four registers (PS, VS, GS/HS ADDR_LO) */
+		uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+				   R_00B130_SPI_SHADER_USER_DATA_VS_0,
+				   R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS,
+				   R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS};
+
+		for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+			radv_emit_shader_pointer(queue->device, cs, regs[i],
+						 va, true); /* same VA for every listed register */
+		}
+	} else { /* older chips: six USER_DATA_*_0 registers (PS, VS, GS, ES, HS, LS) */
+		uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+				   R_00B130_SPI_SHADER_USER_DATA_VS_0,
+				   R_00B230_SPI_SHADER_USER_DATA_GS_0,
+				   R_00B330_SPI_SHADER_USER_DATA_ES_0,
+				   R_00B430_SPI_SHADER_USER_DATA_HS_0,
+				   R_00B530_SPI_SHADER_USER_DATA_LS_0};
+
+		for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+			radv_emit_shader_pointer(queue->device, cs, regs[i],
+						 va, true);
+		}
+	}
+}
+
 static VkResult
 radv_get_preamble_cs(struct radv_queue *queue,
                     uint32_t scratch_size,
@@ -2059,18 +2165,6 @@ radv_get_preamble_cs(struct radv_queue *queue,
 		if (scratch_bo)
 			radv_cs_add_buffer(queue->device->ws, cs, scratch_bo, 8);

-		if (esgs_ring_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, esgs_ring_bo, 8);
-
-		if (gsvs_ring_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, gsvs_ring_bo, 8);
-
-		if (tess_rings_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo, 8);
-
-		if (descriptor_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, descriptor_bo, 8);
-
 		if (descriptor_bo != queue->descriptor_bo) {
 			uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);

@@ -2102,80 +2196,12 @@ radv_get_preamble_cs(struct radv_queue *queue,
 			radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 		}

-		if (esgs_ring_bo || gsvs_ring_bo) {
-			if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-				radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
-				radeon_emit(cs, esgs_ring_size >> 8);
-				radeon_emit(cs, gsvs_ring_size >> 8);
-			} else {
-				radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
-				radeon_emit(cs, esgs_ring_size >> 8);
-				radeon_emit(cs, gsvs_ring_size >> 8);
-			}
-		}
-
-		if (tess_rings_bo) {
-			uint64_t tf_va = radv_buffer_get_va(tess_rings_bo);
-			if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-				radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
-						       S_030938_SIZE(tess_factor_ring_size / 4));
-				radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
-						       tf_va >> 8);
-				if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
-					radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
-							       S_030944_BASE_HI(tf_va >> 40));
-				}
-				radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
-			} else {
-				radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
-						      S_008988_SIZE(tess_factor_ring_size / 4));
-				radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
-						      tf_va >> 8);
-				radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-						      hs_offchip_param);
-			}
-		}
-
-		if (descriptor_bo) {
-			uint64_t va = radv_buffer_get_va(descriptor_bo);
-			if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
-				uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-						R_00B130_SPI_SHADER_USER_DATA_VS_0,
-						R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS,
-						R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS};
-
-				for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-					radeon_set_sh_reg_seq(cs, regs[i], 2);
-					radeon_emit(cs, va);
-					radeon_emit(cs, va >> 32);
-				}
-			} else {
-				uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-						R_00B130_SPI_SHADER_USER_DATA_VS_0,
-						R_00B230_SPI_SHADER_USER_DATA_GS_0,
-						R_00B330_SPI_SHADER_USER_DATA_ES_0,
-						R_00B430_SPI_SHADER_USER_DATA_HS_0,
-						R_00B530_SPI_SHADER_USER_DATA_LS_0};
-
-				for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-					radeon_set_sh_reg_seq(cs, regs[i], 2);
-					radeon_emit(cs, va);
-					radeon_emit(cs, va >> 32);
-				}
-			}
-		}
-
-		if (compute_scratch_bo) {
-			uint64_t scratch_va = radv_buffer_get_va(compute_scratch_bo);
-			uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-			                 S_008F04_SWIZZLE_ENABLE(1);
-
-			radv_cs_add_buffer(queue->device->ws, cs, compute_scratch_bo, 8);
-
-			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
-			radeon_emit(cs, scratch_va);
-			radeon_emit(cs, rsrc1);
-		}
+		radv_emit_gs_ring_sizes(queue, cs, esgs_ring_bo, esgs_ring_size,
+					gsvs_ring_bo, gsvs_ring_size);
+		radv_emit_tess_factor_ring(queue, cs, hs_offchip_param,
+					   tess_factor_ring_size, tess_rings_bo);
+		radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
+		radv_emit_compute_scratch(queue, cs, compute_scratch_bo);

 		if (i == 0) {
 			si_cs_emit_cache_flush(cs,
@@ -2282,10 +2308,11 @@ fail:
 		queue->device->ws->buffer_destroy(gsvs_ring_bo);
 	if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo)
 		queue->device->ws->buffer_destroy(tess_rings_bo);
-	return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+	return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 }

-static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
+static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
+				      struct radv_winsys_sem_counts *counts,
 				      int num_sems,
 				      const VkSemaphore *sems,
 				      VkFence _fence,
@@ -2314,14 +2341,14 @@ static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
 	if (counts->syncobj_count) {
 		counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count);
 		if (!counts->syncobj)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}

 	if (counts->sem_count) {
 		counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count);
 		if (!counts->sem) {
 			free(counts->syncobj);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 	}

@@ -2350,7 +2377,8 @@ static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
 	return VK_SUCCESS;
 }

-void radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
+static void
+radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
 {
 	free(sem_info->wait.syncobj);
 	free(sem_info->wait.sem);
@@ -2373,20 +2401,22 @@ static void radv_free_temp_syncobjs(struct radv_device *device,
 	}
 }

-VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
-			     int num_wait_sems,
-			     const VkSemaphore *wait_sems,
-			     int num_signal_sems,
-			     const VkSemaphore *signal_sems,
-			     VkFence fence)
+static VkResult
+radv_alloc_sem_info(struct radv_instance *instance,
+		    struct radv_winsys_sem_info *sem_info,
+		    int num_wait_sems,
+		    const VkSemaphore *wait_sems,
+		    int num_signal_sems,
+		    const VkSemaphore *signal_sems,
+		    VkFence fence)
 {
 	VkResult ret;
 	memset(sem_info, 0, sizeof(*sem_info));

-	ret = radv_alloc_sem_counts(&sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
+	ret = radv_alloc_sem_counts(instance, &sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
 	if (ret)
 		return ret;
-	ret = radv_alloc_sem_counts(&sem_info->signal, num_signal_sems, signal_sems, fence, false);
+	ret = radv_alloc_sem_counts(instance, &sem_info->signal, num_signal_sems, signal_sems, fence, false);
 	if (ret)
 		radv_free_sem_info(sem_info);

@@ -2404,7 +2434,7 @@ static VkResult radv_signal_fence(struct radv_queue *queue,
 	VkResult result;
 	struct radv_winsys_sem_info sem_info;

-	result = radv_alloc_sem_info(&sem_info, 0, NULL, 0, NULL,
+	result = radv_alloc_sem_info(queue->device->instance, &sem_info, 0, NULL, 0, NULL,
 	                             radv_fence_to_handle(fence));
 	if (result != VK_SUCCESS)
 		return result;
@@ -2417,7 +2447,7 @@ static VkResult radv_signal_fence(struct radv_queue *queue,

 	/* TODO: find a better error */
 	if (ret)
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);

 	return VK_SUCCESS;
 }
@@ -2474,7 +2504,8 @@ VkResult radv_QueueSubmit(
 		uint32_t advance;
 		struct radv_winsys_sem_info sem_info;

-		result = radv_alloc_sem_info(&sem_info,
+		result = radv_alloc_sem_info(queue->device->instance,
+					     &sem_info,
 					     pSubmits[i].waitSemaphoreCount,
 					     pSubmits[i].pWaitSemaphores,
 					     pSubmits[i].signalSemaphoreCount,
@@ -2517,6 +2548,8 @@ VkResult radv_QueueSubmit(

 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
 			struct radeon_winsys_cs *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+			const struct radv_winsys_bo_list *bo_list = NULL;
+
 			advance = MIN2(max_cs_submission,
 				       pSubmits[i].commandBufferCount - j);

@@ -2526,12 +2559,14 @@ VkResult radv_QueueSubmit(
 			sem_info.cs_emit_wait = j == 0;
 			sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;

-			if (unlikely(queue->device->use_global_bo_list))
+			if (unlikely(queue->device->use_global_bo_list)) {
 				pthread_mutex_lock(&queue->device->bo_list.mutex);
+				bo_list = &queue->device->bo_list.list;
+			}

 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
 							advance, initial_preamble, continue_preamble_cs,
-							&sem_info, &queue->device->bo_list.list,
+							&sem_info, bo_list,
 							can_patch, base_fence);

 			if (unlikely(queue->device->use_global_bo_list))
@@ -2715,7 +2750,7 @@ static VkResult radv_alloc_memory(struct radv_device *device,
 	mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
 			  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (mem == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	if (wsi_info && wsi_info->implicit_sync)
 		flags |= RADEON_FLAG_IMPLICIT_SYNC;
@@ -2853,7 +2888,7 @@ VkResult radv_MapMemory(
 		return VK_SUCCESS;
 	}

-	return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+	return vk_error(device->instance, VK_ERROR_MEMORY_MAP_FAILED);
 }

 void radv_UnmapMemory(
@@ -3127,7 +3162,8 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
 		}

 		VkResult result;
-		result = radv_alloc_sem_info(&sem_info,
+		result = radv_alloc_sem_info(queue->device->instance,
+					     &sem_info,
 					     pBindInfo[i].waitSemaphoreCount,
 					     pBindInfo[i].pWaitSemaphores,
 					     pBindInfo[i].signalSemaphoreCount,
@@ -3178,7 +3214,7 @@ VkResult radv_CreateFence(
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

 	if (!fence)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	fence->submitted = false;
 	fence->signalled = !!(pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT);
@@ -3187,7 +3223,7 @@ VkResult radv_CreateFence(
 		int ret = device->ws->create_syncobj(device->ws, &fence->syncobj);
 		if (ret) {
 			vk_free2(&device->alloc, pAllocator, fence);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
 			device->ws->signal_syncobj(device->ws, fence->syncobj);
@@ -3197,7 +3233,7 @@ VkResult radv_CreateFence(
 		fence->fence = device->ws->create_fence();
 		if (!fence->fence) {
 			vk_free2(&device->alloc, pAllocator, fence);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		fence->syncobj = 0;
 	}
@@ -3268,7 +3304,7 @@ VkResult radv_WaitForFences(
 	if (device->always_use_syncobj) {
 		uint32_t *handles = malloc(sizeof(uint32_t) * fenceCount);
 		if (!handles)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 		for (uint32_t i = 0; i < fenceCount; ++i) {
 			RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
@@ -3287,7 +3323,7 @@ VkResult radv_WaitForFences(
 			uint32_t wait_count = 0;
 			struct radeon_winsys_fence **fences = malloc(sizeof(struct radeon_winsys_fence *) * fenceCount);
 			if (!fences)
-				return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+				return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 			for (uint32_t i = 0; i < fenceCount; ++i) {
 				RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
@@ -3426,7 +3462,7 @@ VkResult radv_CreateSemaphore(
 					       sizeof(*sem), 8,
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sem)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	sem->temp_syncobj = 0;
 	/* create a syncobject if we are going to export this semaphore */
@@ -3435,14 +3471,14 @@ VkResult radv_CreateSemaphore(
 		int ret = device->ws->create_syncobj(device->ws, &sem->syncobj);
 		if (ret) {
 			vk_free2(&device->alloc, pAllocator, sem);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		sem->sem = NULL;
 	} else {
 		sem->sem = device->ws->create_sem(device->ws);
 		if (!sem->sem) {
 			vk_free2(&device->alloc, pAllocator, sem);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		sem->syncobj = 0;
 	}
@@ -3480,14 +3516,14 @@ VkResult radv_CreateEvent(
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

 	if (!event)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	event->bo = device->ws->buffer_create(device->ws, 8, 8,
 					      RADEON_DOMAIN_GTT,
 					      RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING);
 	if (!event->bo) {
 		vk_free2(&device->alloc, pAllocator, event);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}

 	event->map = (uint64_t*)device->ws->buffer_map(event->bo);
@@ -3556,7 +3592,7 @@ VkResult radv_CreateBuffer(
 	buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (buffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	buffer->size = pCreateInfo->size;
 	buffer->usage = pCreateInfo->usage;
@@ -3573,7 +3609,7 @@ VkResult radv_CreateBuffer(
 		                                       4096, 0, RADEON_FLAG_VIRTUAL);
 		if (!buffer->bo) {
 			vk_free2(&device->alloc, pAllocator, buffer);
-			return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 		}
 	}

@@ -3934,7 +3970,8 @@ radv_initialise_ds_surface(struct radv_device *device,
 		ds->db_z_info = S_028038_FORMAT(format) |
 			S_028038_NUM_SAMPLES(util_logbase2(iview->image->info.samples)) |
 			S_028038_SW_MODE(iview->image->surface.u.gfx9.surf.swizzle_mode) |
-			S_028038_MAXMIP(iview->image->info.levels - 1);
+			S_028038_MAXMIP(iview->image->info.levels - 1) |
+			S_028038_ZRANGE_PRECISION(1);
 		ds->db_stencil_info = S_02803C_FORMAT(stencil_format) |
 			S_02803C_SW_MODE(iview->image->surface.u.gfx9.stencil.swizzle_mode);

@@ -4060,7 +4097,7 @@ VkResult radv_CreateFramebuffer(
 	framebuffer = vk_alloc2(&device->alloc, pAllocator, size, 8,
 				  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (framebuffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	framebuffer->attachment_count = pCreateInfo->attachmentCount;
 	framebuffer->width = pCreateInfo->width;
@@ -4278,7 +4315,7 @@ VkResult radv_CreateSampler(
 	sampler = vk_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sampler)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	radv_init_sampler(device, sampler, pCreateInfo);
 	*pSampler = radv_sampler_to_handle(sampler);
@@ -4360,7 +4397,7 @@ VkResult radv_GetMemoryFdKHR(VkDevice _device,

 	bool ret = radv_get_memory_fd(device, memory, pFD);
 	if (ret == false)
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	return VK_SUCCESS;
 }

@@ -4369,6 +4406,8 @@ VkResult radv_GetMemoryFdPropertiesKHR(VkDevice _device,
 				       int fd,
 				       VkMemoryFdPropertiesKHR *pMemoryFdProperties)
 {
+   RADV_FROM_HANDLE(radv_device, device, _device);
+
   switch (handleType) {
   case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT:
      pMemoryFdProperties->memoryTypeBits = (1 << RADV_MEM_TYPE_COUNT) - 1;
@@ -4382,7 +4421,7 @@ VkResult radv_GetMemoryFdPropertiesKHR(VkDevice _device,
       *
       * So opaque handle types fall into the default "unsupported" case.
       */
-      return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+      return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
   }
 }

@@ -4393,7 +4432,7 @@ static VkResult radv_import_opaque_fd(struct radv_device *device,
 	uint32_t syncobj_handle = 0;
 	int ret = device->ws->import_syncobj(device->ws, fd, &syncobj_handle);
 	if (ret != 0)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);

 	if (*syncobj)
 		device->ws->destroy_syncobj(device->ws, *syncobj);
@@ -4414,7 +4453,7 @@ static VkResult radv_import_sync_fd(struct radv_device *device,
 	if (!syncobj_handle) {
 		int ret = device->ws->create_syncobj(device->ws, &syncobj_handle);
 		if (ret) {
-			return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+			return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 		}
 	}

@@ -4423,7 +4462,7 @@ static VkResult radv_import_sync_fd(struct radv_device *device,
 	} else {
 		int ret = device->ws->import_syncobj_from_sync_file(device->ws, syncobj_handle, fd);
 	if (ret != 0)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	}

 	*syncobj = syncobj_handle;
@@ -4490,7 +4529,7 @@ VkResult radv_GetSemaphoreFdKHR(VkDevice _device,
 	}

 	if (ret)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	return VK_SUCCESS;
 }

@@ -4579,7 +4618,7 @@ VkResult radv_GetFenceFdKHR(VkDevice _device,
 	}

 	if (ret)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	return VK_SUCCESS;
 }

--- a/src/amd/vulkan/radv_entrypoints_gen.py
+++ b/src/amd/vulkan/radv_entrypoints_gen.py
@@ -116,7 +116,7 @@ struct string_map_entry {
   uint32_t num;
 };

-/* We use a big string constant to avoid lots of reloctions from the entry
+/* We use a big string constant to avoid lots of relocations from the entry
 * point table to lots of little strings. The entries in the entry point table
 * store the index into this big string.
 */
@@ -205,7 +205,7 @@ radv_entrypoint_is_enabled(int index, uint32_t core_version,
   % if not e.device_command:
      if (device) return false;
   % endif
-   % if e.name == 'vkCreateInstance' or e.name == 'vkEnumerateInstanceExtensionProperties' or e.name == 'vkEnumerateInstanceLayerProperties':
+   % if e.name == 'vkCreateInstance' or e.name == 'vkEnumerateInstanceExtensionProperties' or e.name == 'vkEnumerateInstanceLayerProperties' or e.name == 'vkEnumerateInstanceVersion':
      return !device;
   % elif e.core_version:
      return instance && ${e.core_version.c_vk_version()} <= core_version;
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -56,6 +56,7 @@ EXTENSIONS = [
    Extension('VK_KHR_descriptor_update_template',        1, True),
    Extension('VK_KHR_device_group',                      1, True),
    Extension('VK_KHR_device_group_creation',             1, True),
+    Extension('VK_KHR_draw_indirect_count',               1, True),
    Extension('VK_KHR_external_fence',                    1, 'device->rad_info.has_syncobj_wait_for_submit'),
    Extension('VK_KHR_external_fence_capabilities',       1, True),
    Extension('VK_KHR_external_fence_fd',                 1, 'device->rad_info.has_syncobj_wait_for_submit'),
@@ -195,9 +196,9 @@ struct radv_device_extension_table {
   };
 };

-const VkExtensionProperties radv_instance_extensions[RADV_INSTANCE_EXTENSION_COUNT];
-const VkExtensionProperties radv_device_extensions[RADV_DEVICE_EXTENSION_COUNT];
-const struct radv_instance_extension_table radv_supported_instance_extensions;
+extern const VkExtensionProperties radv_instance_extensions[RADV_INSTANCE_EXTENSION_COUNT];
+extern const VkExtensionProperties radv_device_extensions[RADV_DEVICE_EXTENSION_COUNT];
+extern const struct radv_instance_extension_table radv_supported_instance_extensions;


 struct radv_physical_device;
--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@@ -321,10 +321,8 @@ uint32_t radv_translate_tex_dataformat(VkFormat format,
 			return V_008F14_IMG_DATA_FORMAT_32;
 		case 2:
 			return V_008F14_IMG_DATA_FORMAT_32_32;
-#if 0 /* Not supported for render targets */
 		case 3:
 			return V_008F14_IMG_DATA_FORMAT_32_32_32;
-#endif
 		case 4:
 			return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
 		}
@@ -638,13 +636,17 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
 				tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
 			}
 		}
-		if (tiled && util_is_power_of_two_or_zero(vk_format_get_blocksize(format)) && !scaled) {
+		if (tiled && !scaled) {
 			tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR |
 			         VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR;
 		}
+
+		/* Tiled formatting does not support NPOT pixel sizes */
+		if (!util_is_power_of_two_or_zero(vk_format_get_blocksize(format)))
+			tiled = 0;
 	}

-	if (linear && util_is_power_of_two_or_zero(vk_format_get_blocksize(format)) && !scaled) {
+	if (linear && !scaled) {
 		linear |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR |
 		          VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR;
 	}
@@ -655,6 +657,25 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
 		tiled |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
 	}

+	switch(format) {
+	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
+	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
+	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
+	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
+	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
+		if (physical_device->rad_info.chip_class <= VI &&
+		    physical_device->rad_info.family != CHIP_STONEY) {
+			buffer &= ~(VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT |
+			            VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT);
+			linear = 0;
+			tiled = 0;
+		}
+		break;
+	default:
+		break;
+	}
+
 	out_properties->linearTilingFeatures = linear;
 	out_properties->optimalTilingFeatures = tiled;
 	out_properties->bufferFeatures = buffer;
@@ -859,194 +880,87 @@ bool radv_format_pack_clear_color(VkFormat format,
 				  uint32_t clear_vals[2],
 				  VkClearColorValue *value)
 {
-	uint8_t r = 0, g = 0, b = 0, a = 0;
 	const struct vk_format_description *desc = vk_format_description(format);

-	if (vk_format_get_component_bits(format, VK_FORMAT_COLORSPACE_RGB, 0) <= 8) {
-		if (desc->colorspace == VK_FORMAT_COLORSPACE_RGB) {
-			r = float_to_ubyte(value->float32[0]);
-			g = float_to_ubyte(value->float32[1]);
-			b = float_to_ubyte(value->float32[2]);
-			a = float_to_ubyte(value->float32[3]);
-		} else if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) {
-			r = util_format_linear_float_to_srgb_8unorm(value->float32[0]);
-			g = util_format_linear_float_to_srgb_8unorm(value->float32[1]);
-			b = util_format_linear_float_to_srgb_8unorm(value->float32[2]);
-			a = float_to_ubyte(value->float32[3]);
-		}
-	}
-	switch (format) {
-	case VK_FORMAT_R8_UNORM:
-	case VK_FORMAT_R8_SRGB:
-		clear_vals[0] = r;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8_UNORM:
-	case VK_FORMAT_R8G8_SRGB:
-		clear_vals[0] = r | g << 8;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8B8A8_SRGB:
-	case VK_FORMAT_R8G8B8A8_UNORM:
-		clear_vals[0] = r | g << 8 | b << 16 | a << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_B8G8R8A8_SRGB:
-	case VK_FORMAT_B8G8R8A8_UNORM:
-		clear_vals[0] = b | g << 8 | r << 16 | a << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-		clear_vals[0] = r | g << 8 | b << 16 | a << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8_UINT:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8_SINT:
-		clear_vals[0] = value->int32[0] & 0xff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16_UINT:
-		clear_vals[0] = value->uint32[0] & 0xffff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8_UINT:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[0] |= (value->uint32[1] & 0xff) << 8;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8_SINT:
-		clear_vals[0] = value->int32[0] & 0xff;
-		clear_vals[0] |= (value->int32[1] & 0xff) << 8;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8B8A8_UINT:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[0] |= (value->uint32[1] & 0xff) << 8;
-		clear_vals[0] |= (value->uint32[2] & 0xff) << 16;
-		clear_vals[0] |= (value->uint32[3] & 0xff) << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8B8A8_SINT:
-		clear_vals[0] = value->int32[0] & 0xff;
-		clear_vals[0] |= (value->int32[1] & 0xff) << 8;
-		clear_vals[0] |= (value->int32[2] & 0xff) << 16;
-		clear_vals[0] |= (value->int32[3] & 0xff) << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[0] |= (value->uint32[1] & 0xff) << 8;
-		clear_vals[0] |= (value->uint32[2] & 0xff) << 16;
-		clear_vals[0] |= (value->uint32[3] & 0xff) << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16_UINT:
-		clear_vals[0] = value->uint32[0] & 0xffff;
-		clear_vals[0] |= (value->uint32[1] & 0xffff) << 16;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16B16A16_UINT:
-		clear_vals[0] = value->uint32[0] & 0xffff;
-		clear_vals[0] |= (value->uint32[1] & 0xffff) << 16;
-		clear_vals[1] = value->uint32[2] & 0xffff;
-		clear_vals[1] |= (value->uint32[3] & 0xffff) << 16;
-		break;
-	case VK_FORMAT_R32_UINT:
-		clear_vals[0] = value->uint32[0];
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R32G32_UINT:
-		clear_vals[0] = value->uint32[0];
-		clear_vals[1] = value->uint32[1];
-		break;
-	case VK_FORMAT_R32_SINT:
-		clear_vals[0] = value->int32[0];
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16_SFLOAT:
-		clear_vals[0] = util_float_to_half(value->float32[0]);
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16_SFLOAT:
-		clear_vals[0] = util_float_to_half(value->float32[0]);
-		clear_vals[0] |= (uint32_t)util_float_to_half(value->float32[1]) << 16;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16B16A16_SFLOAT:
-		clear_vals[0] = util_float_to_half(value->float32[0]);
-		clear_vals[0] |= (uint32_t)util_float_to_half(value->float32[1]) << 16;
-		clear_vals[1] = util_float_to_half(value->float32[2]);
-		clear_vals[1] |= (uint32_t)util_float_to_half(value->float32[3]) << 16;
-		break;
-	case VK_FORMAT_R16_UNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16_UNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0xffff)) << 16;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16B16A16_UNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0xffff)) << 16;
-		clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], 0.0f, 1.0f) * 0xffff)) << 16;
-		break;
-	case VK_FORMAT_R16G16B16A16_SNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], -1.0f, 1.0f) * 0x7fff)) & 0xffff;
-		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], -1.0f, 1.0f) * 0x7fff)) << 16;
-		clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], -1.0f, 1.0f) * 0x7fff)) & 0xffff;
-		clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], -1.0f, 1.0f) * 0x7fff)) << 16;
-		break;
-	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0x3ff)) & 0x3ff;
-		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0x3ff)) & 0x3ff) << 10;
-		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[2], 0.0f, 1.0f) * 0x3ff)) & 0x3ff) << 20;
-		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[3], 0.0f, 1.0f) * 0x3)) & 0x3) << 30;
-		clear_vals[1] = 0;
-		return true;
-	case VK_FORMAT_R32G32_SFLOAT:
-		clear_vals[0] = fui(value->float32[0]);
-		clear_vals[1] = fui(value->float32[1]);
-		break;
-	case VK_FORMAT_R32_SFLOAT:
-		clear_vals[1] = 0;
-		clear_vals[0] = fui(value->float32[0]);
-		break;
-	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+	if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
 		clear_vals[0] = float3_to_r11g11b10f(value->float32);
 		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R32G32B32A32_SFLOAT:
-		if (value->float32[0] != value->float32[1] ||
-		    value->float32[0] != value->float32[2])
-			return false;
-		clear_vals[0] = fui(value->float32[0]);
-		clear_vals[1] = fui(value->float32[3]);
-		break;
-	case VK_FORMAT_R32G32B32A32_UINT:
-		if (value->uint32[0] != value->uint32[1] ||
-		    value->uint32[0] != value->uint32[2])
-			return false;
-		clear_vals[0] = value->uint32[0];
-		clear_vals[1] = value->uint32[3];
-		break;
-	case VK_FORMAT_R32G32B32A32_SINT:
-		if (value->int32[0] != value->int32[1] ||
-		    value->int32[0] != value->int32[2])
-			return false;
-		clear_vals[0] = value->int32[0];
-		clear_vals[1] = value->int32[3];
-		break;
-	default:
-		fprintf(stderr, "failed to fast clear %d\n", format);
+		return true;
+	}
+
+	if (desc->layout != VK_FORMAT_LAYOUT_PLAIN) {
+		fprintf(stderr, "failed to fast clear for non-plain format %d\n", format);
 		return false;
 	}
+
+	if (!util_is_power_of_two_or_zero(desc->block.bits)) {
+		fprintf(stderr, "failed to fast clear for NPOT format %d\n", format);
+		return false;
+	}
+
+	if (desc->block.bits > 64) {
+		/*
+		 * We have a 128-bit format, check if the first 3 components are the same.
+		 * Every element has to be 32 bits since we don't support 64-bit formats,
+		 * and we can skip swizzling checks as alpha always comes last for these and
+		 * we do not care about the rest as they have to be the same.
+		 */
+		if (desc->channel[0].type == VK_FORMAT_TYPE_FLOAT) {
+			if (value->float32[0] != value->float32[1] ||
+			    value->float32[0] != value->float32[2])
+				return false;
+		} else {
+			if (value->uint32[0] != value->uint32[1] ||
+			    value->uint32[0] != value->uint32[2])
+				return false;
+		}
+		clear_vals[0] = value->uint32[0];
+		clear_vals[1] = value->uint32[3];
+		return true;
+	}
+	uint64_t clear_val = 0;
+
+	for (unsigned c = 0; c < 4; ++c) {
+		if (desc->swizzle[c] >= 4)
+			continue;
+
+		const struct vk_format_channel_description *channel = &desc->channel[desc->swizzle[c]];
+		assert(channel->size);
+
+		uint64_t v = 0;
+		if (channel->pure_integer) {
+			v = value->uint32[c]  & ((1ULL << channel->size) - 1);
+		} else if (channel->normalized) {
+			if (channel->type == VK_FORMAT_TYPE_UNSIGNED &&
+			    desc->swizzle[c] < 3 &&
+			    desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) {
+				assert(channel->size == 8);
+
+				v = util_format_linear_float_to_srgb_8unorm(value->float32[c]);
+			} else if (channel->type == VK_FORMAT_TYPE_UNSIGNED) {
+				v = MAX2(MIN2(value->float32[c], 1.0f), 0.0f) * ((1ULL << channel->size) - 1);
+			} else  {
+				v = MAX2(MIN2(value->float32[c], 1.0f), -1.0f) * ((1ULL << (channel->size - 1)) - 1);
+			}
+		} else if (channel->type == VK_FORMAT_TYPE_FLOAT) {
+			if (channel->size == 32) {
+				memcpy(&v, &value->float32[c], 4);
+			} else if(channel->size == 16) {
+				v = util_float_to_half(value->float32[c]);
+			} else {
+				fprintf(stderr, "failed to fast clear for unhandled float size in format %d\n", format);
+				return false;
+			}
+		} else {
+			fprintf(stderr, "failed to fast clear for unhandled component type in format %d\n", format);
+			return false;
+		}
+		clear_val |= (v & ((1ULL << channel->size) - 1)) << channel->shift;
+	}
+
+	clear_vals[0] = clear_val;
+	clear_vals[1] = clear_val >> 32;
+
 	return true;
 }

@@ -1306,7 +1220,7 @@ VkResult radv_GetPhysicalDeviceImageFormatProperties2(
 			 *    vkGetPhysicalDeviceImageFormatProperties2KHR returns
 			 *    VK_ERROR_FORMAT_NOT_SUPPORTED.
 			 */
-			result = vk_errorf(VK_ERROR_FORMAT_NOT_SUPPORTED,
+			result = vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
 					   "unsupported VkExternalMemoryTypeFlagBitsKHR 0x%x",
 					   external_info->handleType);
 			goto fail;
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -110,6 +110,8 @@ radv_use_dcc_for_image(struct radv_device *device,
 {
 	bool dcc_compatible_formats;
 	bool blendable;
+	bool shareable = vk_find_struct_const(pCreateInfo->pNext,
+	                                      EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR) != NULL;

 	/* DCC (Delta Color Compression) is only available for GFX8+. */
 	if (device->physical_device->rad_info.chip_class < VI)
@@ -118,6 +120,11 @@ radv_use_dcc_for_image(struct radv_device *device,
 	if (device->instance->debug_flags & RADV_DEBUG_NO_DCC)
 		return false;

+	/* FIXME: DCC is broken for shareable images starting with GFX9 */
+	if (device->physical_device->rad_info.chip_class >= GFX9 &&
+	    shareable)
+		return false;
+
 	/* TODO: Enable DCC for storage images. */
 	if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT) ||
 	    (pCreateInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT_KHR))
@@ -133,12 +140,12 @@ radv_use_dcc_for_image(struct radv_device *device,
 	if (create_info->scanout)
 		return false;

-	/* FIXME: DCC for MSAA with 4x and 8x samples doesn't work yet. */
-	if (pCreateInfo->samples > 2)
-		return false;
-
-	/* TODO: Enable DCC for MSAA textures. */
-	if (!device->physical_device->dcc_msaa_allowed)
+	/* FIXME: DCC for MSAA with 4x and 8x samples doesn't work yet, while
+	 * 2x can be enabled with an option.
+	 */
+	if (pCreateInfo->samples > 2 ||
+	    (pCreateInfo->samples == 2 &&
+	     !device->physical_device->dcc_msaa_allowed))
 		return false;

 	/* Determine if the formats are DCC compatible. */
@@ -414,7 +421,7 @@ static unsigned radv_tex_dim(VkImageType image_type, VkImageViewType view_type,
 		else
 			return V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
 	default:
-		unreachable("illegale image type");
+		unreachable("illegal image type");
 	}
 }

@@ -534,7 +541,7 @@ si_make_texture_descriptor(struct radv_device *device,
 	if (device->physical_device->rad_info.chip_class >= GFX9) {
 		unsigned bc_swizzle = gfx9_border_color_swizzle(swizzle);

-		/* Depth is the the last accessible layer on Gfx9.
+		/* Depth is the last accessible layer on Gfx9.
 		 * The hw doesn't need to know the total number of layers.
 		 */
 		if (type == V_008F1C_SQ_RSRC_IMG_3D)
@@ -619,7 +626,7 @@ si_make_texture_descriptor(struct radv_device *device,
 			S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
-			S_008F1C_TYPE(radv_tex_dim(image->type, view_type, 1, 0, false, false));
+			S_008F1C_TYPE(radv_tex_dim(image->type, view_type, image->info.array_size, 0, false, false));
 		fmask_state[4] = 0;
 		fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
 		fmask_state[6] = 0;
@@ -726,56 +733,20 @@ radv_image_get_fmask_info(struct radv_device *device,
 			  unsigned nr_samples,
 			  struct radv_fmask_info *out)
 {
-	/* FMASK is allocated like an ordinary texture. */
-	struct radeon_surf fmask = {};
-	struct ac_surf_info info = image->info;
-	memset(out, 0, sizeof(*out));
-
 	if (device->physical_device->rad_info.chip_class >= GFX9) {
-		out->alignment = image->surface.u.gfx9.fmask_alignment;
-		out->size = image->surface.u.gfx9.fmask_size;
+		out->alignment = image->surface.fmask_alignment;
+		out->size = image->surface.fmask_size;
+		out->tile_swizzle = image->surface.fmask_tile_swizzle;
 		return;
 	}

-	fmask.blk_w = image->surface.blk_w;
-	fmask.blk_h = image->surface.blk_h;
-	info.samples = 1;
-	fmask.flags = image->surface.flags | RADEON_SURF_FMASK;
-
-	if (!image->shareable)
-		info.surf_index = &device->fmask_mrt_offset_counter;
-
-	/* Force 2D tiling if it wasn't set. This may occur when creating
-	 * FMASK for MSAA resolve on R6xx. On R6xx, the single-sample
-	 * destination buffer must have an FMASK too. */
-	fmask.flags = RADEON_SURF_CLR(fmask.flags, MODE);
-	fmask.flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE);
-
-	switch (nr_samples) {
-	case 2:
-	case 4:
-		fmask.bpe = 1;
-		break;
-	case 8:
-		fmask.bpe = 4;
-		break;
-	default:
-		return;
-	}
-
-	device->ws->surface_init(device->ws, &info, &fmask);
-	assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
-
-	out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64;
-	if (out->slice_tile_max)
-		out->slice_tile_max -= 1;
-
-	out->tile_mode_index = fmask.u.legacy.tiling_index[0];
-	out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x;
-	out->bank_height = fmask.u.legacy.bankh;
-	out->tile_swizzle = fmask.tile_swizzle;
-	out->alignment = MAX2(256, fmask.surf_alignment);
-	out->size = fmask.surf_size;
+	out->slice_tile_max = image->surface.u.legacy.fmask.slice_tile_max;
+	out->tile_mode_index = image->surface.u.legacy.fmask.tiling_index;
+	out->pitch_in_pixels = image->surface.u.legacy.fmask.pitch_in_pixels;
+	out->bank_height = image->surface.u.legacy.fmask.bankh;
+	out->tile_swizzle = image->surface.fmask_tile_swizzle;
+	out->alignment = image->surface.fmask_alignment;
+	out->size = image->surface.fmask_size;

 	assert(!out->tile_swizzle || !image->shareable);
 }
@@ -959,16 +930,17 @@ radv_image_create(VkDevice _device,
 	image = vk_zalloc2(&device->alloc, alloc, sizeof(*image), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!image)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	image->type = pCreateInfo->imageType;
 	image->info.width = pCreateInfo->extent.width;
 	image->info.height = pCreateInfo->extent.height;
 	image->info.depth = pCreateInfo->extent.depth;
 	image->info.samples = pCreateInfo->samples;
+	image->info.color_samples = pCreateInfo->samples;
 	image->info.array_size = pCreateInfo->arrayLayers;
 	image->info.levels = pCreateInfo->mipLevels;
-	image->info.num_channels = 4; /* TODO: set this correctly */
+	image->info.num_channels = vk_format_get_nr_components(pCreateInfo->format);

 	image->vk_format = pCreateInfo->format;
 	image->tiling = pCreateInfo->tiling;
@@ -1043,7 +1015,7 @@ radv_image_create(VkDevice _device,
 		                                      0, RADEON_FLAG_VIRTUAL);
 		if (!image->bo) {
 			vk_free2(&device->alloc, alloc, image);
-			return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 		}
 	}

@@ -1358,7 +1330,7 @@ radv_CreateImageView(VkDevice _device,
 	view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (view == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	radv_image_view_init(view, device, pCreateInfo);

@@ -1406,7 +1378,7 @@ radv_CreateBufferView(VkDevice _device,
 	view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!view)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	radv_buffer_view_init(view, device, pCreateInfo);

--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -199,10 +199,6 @@ void radv_decompress_resolve_src(struct radv_cmd_buffer *cmd_buffer,
 				 uint32_t region_count,
 				 const VkImageResolve *regions);

-void radv_blit_to_prime_linear(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       struct radv_image *linear_image);
-
 uint32_t radv_clear_cmask(struct radv_cmd_buffer *cmd_buffer,
 			  struct radv_image *image, uint32_t value);
 uint32_t radv_clear_dcc(struct radv_cmd_buffer *cmd_buffer,
--- a/src/amd/vulkan/radv_meta_blit2d.c
+++ b/src/amd/vulkan/radv_meta_blit2d.c
@@ -100,7 +100,8 @@ blit2d_bind_src(struct radv_cmd_buffer *cmd_buffer,
                struct radv_meta_blit2d_buffer *src_buf,
                struct blit2d_src_temps *tmp,
                enum blit2d_src_type src_type, VkFormat depth_format,
-                VkImageAspectFlagBits aspects)
+                VkImageAspectFlagBits aspects,
+                uint32_t log2_samples)
 {
 	struct radv_device *device = cmd_buffer->device;

@@ -108,7 +109,7 @@ blit2d_bind_src(struct radv_cmd_buffer *cmd_buffer,
 		create_bview(cmd_buffer, src_buf, &tmp->bview, depth_format);

 		radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS,
-					      device->meta_state.blit2d.p_layouts[src_type],
+					      device->meta_state.blit2d[log2_samples].p_layouts[src_type],
 					      0, /* set */
 					      1, /* descriptorWriteCount */
 					      (VkWriteDescriptorSet[]) {
@@ -123,7 +124,7 @@ blit2d_bind_src(struct radv_cmd_buffer *cmd_buffer,
 					      });

 		radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
-				      device->meta_state.blit2d.p_layouts[src_type],
+				      device->meta_state.blit2d[log2_samples].p_layouts[src_type],
 				      VK_SHADER_STAGE_FRAGMENT_BIT, 16, 4,
 				      &src_buf->pitch);
 	} else {
@@ -131,12 +132,12 @@ blit2d_bind_src(struct radv_cmd_buffer *cmd_buffer,

 		if (src_type == BLIT2D_SRC_TYPE_IMAGE_3D)
 			radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
-					      device->meta_state.blit2d.p_layouts[src_type],
+					      device->meta_state.blit2d[log2_samples].p_layouts[src_type],
 					      VK_SHADER_STAGE_FRAGMENT_BIT, 16, 4,
 					      &src_img->layer);

 		radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS,
-					      device->meta_state.blit2d.p_layouts[src_type],
+					      device->meta_state.blit2d[log2_samples].p_layouts[src_type],
 					      0, /* set */
 					      1, /* descriptorWriteCount */
 					      (VkWriteDescriptorSet[]) {
@@ -190,10 +191,11 @@ blit2d_bind_dst(struct radv_cmd_buffer *cmd_buffer,

 static void
 bind_pipeline(struct radv_cmd_buffer *cmd_buffer,
-              enum blit2d_src_type src_type, unsigned fs_key)
+              enum blit2d_src_type src_type, unsigned fs_key,
+              uint32_t log2_samples)
 {
 	VkPipeline pipeline =
-		cmd_buffer->device->meta_state.blit2d.pipelines[src_type][fs_key];
+		cmd_buffer->device->meta_state.blit2d[log2_samples].pipelines[src_type][fs_key];

 	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
 			     VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
@@ -201,10 +203,11 @@ bind_pipeline(struct radv_cmd_buffer *cmd_buffer,

 static void
 bind_depth_pipeline(struct radv_cmd_buffer *cmd_buffer,
-		    enum blit2d_src_type src_type)
+		    enum blit2d_src_type src_type,
+		    uint32_t log2_samples)
 {
 	VkPipeline pipeline =
-		cmd_buffer->device->meta_state.blit2d.depth_only_pipeline[src_type];
+		cmd_buffer->device->meta_state.blit2d[log2_samples].depth_only_pipeline[src_type];

 	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
 			     VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
@@ -212,10 +215,11 @@ bind_depth_pipeline(struct radv_cmd_buffer *cmd_buffer,

 static void
 bind_stencil_pipeline(struct radv_cmd_buffer *cmd_buffer,
-		      enum blit2d_src_type src_type)
+		      enum blit2d_src_type src_type,
+		      uint32_t log2_samples)
 {
 	VkPipeline pipeline =
-		cmd_buffer->device->meta_state.blit2d.stencil_only_pipeline[src_type];
+		cmd_buffer->device->meta_state.blit2d[log2_samples].stencil_only_pipeline[src_type];

 	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
 			     VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
@@ -227,7 +231,8 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 			    struct radv_meta_blit2d_buffer *src_buf,
 			    struct radv_meta_blit2d_surf *dst,
 			    unsigned num_rects,
-			    struct radv_meta_blit2d_rect *rects, enum blit2d_src_type src_type)
+			    struct radv_meta_blit2d_rect *rects, enum blit2d_src_type src_type,
+			    uint32_t log2_samples)
 {
 	struct radv_device *device = cmd_buffer->device;

@@ -241,7 +246,7 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 			else if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
 				depth_format = vk_format_depth_only(dst->image->vk_format);
 			struct blit2d_src_temps src_temps;
-			blit2d_bind_src(cmd_buffer, src_img, src_buf, &src_temps, src_type, depth_format, aspect_mask);
+			blit2d_bind_src(cmd_buffer, src_img, src_buf, &src_temps, src_type, depth_format, aspect_mask, log2_samples);

 			struct blit2d_dst_temps dst_temps;
 			blit2d_bind_dst(cmd_buffer, dst, rects[r].dst_x + rects[r].width,
@@ -255,7 +260,7 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 			};

 			radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
-					device->meta_state.blit2d.p_layouts[src_type],
+					device->meta_state.blit2d[log2_samples].p_layouts[src_type],
 					VK_SHADER_STAGE_VERTEX_BIT, 0, 16,
 					vertex_push_constants);

@@ -266,7 +271,7 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 				radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
 							&(VkRenderPassBeginInfo) {
 								.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-									.renderPass = device->meta_state.blit2d.render_passes[fs_key][dst_layout],
+									.renderPass = device->meta_state.blit2d_render_passes[fs_key][dst_layout],
 									.framebuffer = dst_temps.fb,
 									.renderArea = {
 									.offset = { rects[r].dst_x, rects[r].dst_y, },
@@ -277,13 +282,13 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 										}, VK_SUBPASS_CONTENTS_INLINE);


-				bind_pipeline(cmd_buffer, src_type, fs_key);
+				bind_pipeline(cmd_buffer, src_type, fs_key, log2_samples);
 			} else if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
 				enum radv_blit_ds_layout ds_layout = radv_meta_blit_ds_to_type(dst->current_layout);
 				radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
 							&(VkRenderPassBeginInfo) {
 								.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-									.renderPass = device->meta_state.blit2d.depth_only_rp[ds_layout],
+									.renderPass = device->meta_state.blit2d_depth_only_rp[ds_layout],
 									.framebuffer = dst_temps.fb,
 									.renderArea = {
 									.offset = { rects[r].dst_x, rects[r].dst_y, },
@@ -294,14 +299,14 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 										}, VK_SUBPASS_CONTENTS_INLINE);


-				bind_depth_pipeline(cmd_buffer, src_type);
+				bind_depth_pipeline(cmd_buffer, src_type, log2_samples);

 			} else if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
 				enum radv_blit_ds_layout ds_layout = radv_meta_blit_ds_to_type(dst->current_layout);
 				radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
 							&(VkRenderPassBeginInfo) {
 								.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-									.renderPass = device->meta_state.blit2d.stencil_only_rp[ds_layout],
+									.renderPass = device->meta_state.blit2d_stencil_only_rp[ds_layout],
 									.framebuffer = dst_temps.fb,
 									.renderArea = {
 									.offset = { rects[r].dst_x, rects[r].dst_y, },
@@ -312,7 +317,7 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 										}, VK_SUBPASS_CONTENTS_INLINE);


-				bind_stencil_pipeline(cmd_buffer, src_type);
+				bind_stencil_pipeline(cmd_buffer, src_type, log2_samples);
 			} else
 				unreachable("Processing blit2d with multiple aspects.");

@@ -332,7 +337,24 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,



-			radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
+			if (log2_samples > 0) {
+				for (uint32_t sample = 0; sample < src_img->image->info.samples; sample++) {
+					uint32_t sample_mask = 1 << sample;
+					radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+							      device->meta_state.blit2d[log2_samples].p_layouts[src_type],
+							      VK_SHADER_STAGE_FRAGMENT_BIT, 20, 4,
+							      &sample);
+
+					radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+							      device->meta_state.blit2d[log2_samples].p_layouts[src_type],
+							      VK_SHADER_STAGE_FRAGMENT_BIT, 24, 4,
+							      &sample_mask);
+
+					radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
+				}
+			}
+			else
+				radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
 			radv_CmdEndRenderPass(radv_cmd_buffer_to_handle(cmd_buffer));

 			/* At the point where we emit the draw call, all data from the
@@ -358,7 +380,8 @@ radv_meta_blit2d(struct radv_cmd_buffer *cmd_buffer,
 	enum blit2d_src_type src_type = src_buf ? BLIT2D_SRC_TYPE_BUFFER :
 		use_3d ? BLIT2D_SRC_TYPE_IMAGE_3D : BLIT2D_SRC_TYPE_IMAGE;
 	radv_meta_blit2d_normal_dst(cmd_buffer, src_img, src_buf, dst,
-				    num_rects, rects, src_type);
+				    num_rects, rects, src_type,
+				    src_img ? util_logbase2(src_img->image->info.samples) : 0);
 }

 static nir_shader *
@@ -421,13 +444,14 @@ build_nir_vertex_shader(void)

 typedef nir_ssa_def* (*texel_fetch_build_func)(struct nir_builder *,
                                               struct radv_device *,
-                                               nir_ssa_def *, bool);
+                                               nir_ssa_def *, bool, bool);

 static nir_ssa_def *
 build_nir_texel_fetch(struct nir_builder *b, struct radv_device *device,
-                      nir_ssa_def *tex_pos, bool is_3d)
+                      nir_ssa_def *tex_pos, bool is_3d, bool is_multisampled)
 {
-	enum glsl_sampler_dim dim = is_3d ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;
+	enum glsl_sampler_dim dim =
+		is_3d ? GLSL_SAMPLER_DIM_3D : is_multisampled ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D;
 	const struct glsl_type *sampler_type =
 		glsl_sampler_type(dim, false, false, GLSL_TYPE_UINT);
 	nir_variable *sampler = nir_variable_create(b->shader, nir_var_uniform,
@@ -436,6 +460,7 @@ build_nir_texel_fetch(struct nir_builder *b, struct radv_device *device,
 	sampler->data.binding = 0;

 	nir_ssa_def *tex_pos_3d = NULL;
+	nir_intrinsic_instr *sample_idx = NULL;
 	if (is_3d) {
 		nir_intrinsic_instr *layer = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
 		nir_intrinsic_set_base(layer, 16);
@@ -451,13 +476,26 @@ build_nir_texel_fetch(struct nir_builder *b, struct radv_device *device,
 		chans[2] = &layer->dest.ssa;
 		tex_pos_3d = nir_vec(b, chans, 3);
 	}
-	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
+	if (is_multisampled) {
+		sample_idx = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
+		nir_intrinsic_set_base(sample_idx, 20);
+		nir_intrinsic_set_range(sample_idx, 4);
+		sample_idx->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
+		sample_idx->num_components = 1;
+		nir_ssa_dest_init(&sample_idx->instr, &sample_idx->dest, 1, 32, "sample_idx");
+		nir_builder_instr_insert(b, &sample_idx->instr);
+	}
+	nir_tex_instr *tex = nir_tex_instr_create(b->shader, is_multisampled ? 3 : 2);
 	tex->sampler_dim = dim;
-	tex->op = nir_texop_txf;
+	tex->op = is_multisampled ? nir_texop_txf_ms : nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(is_3d ? tex_pos_3d : tex_pos);
-	tex->src[1].src_type = nir_tex_src_lod;
-	tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
+	tex->src[1].src_type = is_multisampled ? nir_tex_src_ms_index : nir_tex_src_lod;
+	tex->src[1].src = nir_src_for_ssa(is_multisampled ? &sample_idx->dest.ssa : nir_imm_int(b, 0));
+	if (is_multisampled) {
+		tex->src[2].src_type = nir_tex_src_lod;
+		tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0));
+	}
 	tex->dest_type = nir_type_uint;
 	tex->is_array = false;
 	tex->coord_components = is_3d ? 3 : 2;
@@ -473,7 +511,7 @@ build_nir_texel_fetch(struct nir_builder *b, struct radv_device *device,

 static nir_ssa_def *
 build_nir_buffer_fetch(struct nir_builder *b, struct radv_device *device,
-		       nir_ssa_def *tex_pos, bool is_3d)
+		       nir_ssa_def *tex_pos, bool is_3d, bool is_multisampled)
 {
 	const struct glsl_type *sampler_type =
 		glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
@@ -519,9 +557,31 @@ static const VkPipelineVertexInputStateCreateInfo normal_vi_create_info = {
 	.vertexAttributeDescriptionCount = 0,
 };

+static void
+build_nir_store_sample_mask(struct nir_builder *b)
+{
+	nir_intrinsic_instr *sample_mask = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
+	nir_intrinsic_set_base(sample_mask, 24);
+	nir_intrinsic_set_range(sample_mask, 4);
+	sample_mask->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
+	sample_mask->num_components = 1;
+	nir_ssa_dest_init(&sample_mask->instr, &sample_mask->dest, 1, 32, "sample_mask");
+	nir_builder_instr_insert(b, &sample_mask->instr);
+
+	const struct glsl_type *sample_mask_out_type = glsl_uint_type();
+
+	nir_variable *sample_mask_out =
+		nir_variable_create(b->shader, nir_var_shader_out,
+				    sample_mask_out_type, "sample_mask_out");
+	sample_mask_out->data.location = FRAG_RESULT_SAMPLE_MASK;
+
+	nir_store_var(b, sample_mask_out, &sample_mask->dest.ssa, 0x1);
+}
+
 static nir_shader *
 build_nir_copy_fragment_shader(struct radv_device *device,
-                               texel_fetch_build_func txf_func, const char* name, bool is_3d)
+                               texel_fetch_build_func txf_func, const char* name, bool is_3d,
+                               bool is_multisampled)
 {
 	const struct glsl_type *vec4 = glsl_vec4_type();
 	const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
@@ -538,11 +598,15 @@ build_nir_copy_fragment_shader(struct radv_device *device,
 						      vec4, "f_color");
 	color_out->data.location = FRAG_RESULT_DATA0;

+	if (is_multisampled) {
+		build_nir_store_sample_mask(&b);
+	}
+
 	nir_ssa_def *pos_int = nir_f2i32(&b, nir_load_var(&b, tex_pos_in));
 	unsigned swiz[4] = { 0, 1 };
 	nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);

-	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d);
+	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d, is_multisampled);
 	nir_store_var(&b, color_out, color, 0xf);

 	return b.shader;
@@ -550,7 +614,8 @@ build_nir_copy_fragment_shader(struct radv_device *device,

 static nir_shader *
 build_nir_copy_fragment_shader_depth(struct radv_device *device,
-				     texel_fetch_build_func txf_func, const char* name, bool is_3d)
+				     texel_fetch_build_func txf_func, const char* name, bool is_3d,
+				     bool is_multisampled)
 {
 	const struct glsl_type *vec4 = glsl_vec4_type();
 	const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
@@ -567,11 +632,15 @@ build_nir_copy_fragment_shader_depth(struct radv_device *device,
 						      vec4, "f_color");
 	color_out->data.location = FRAG_RESULT_DEPTH;

+	if (is_multisampled) {
+		build_nir_store_sample_mask(&b);
+	}
+
 	nir_ssa_def *pos_int = nir_f2i32(&b, nir_load_var(&b, tex_pos_in));
 	unsigned swiz[4] = { 0, 1 };
 	nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);

-	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d);
+	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d, is_multisampled);
 	nir_store_var(&b, color_out, color, 0x1);

 	return b.shader;
@@ -579,7 +648,8 @@ build_nir_copy_fragment_shader_depth(struct radv_device *device,

 static nir_shader *
 build_nir_copy_fragment_shader_stencil(struct radv_device *device,
-				       texel_fetch_build_func txf_func, const char* name, bool is_3d)
+				       texel_fetch_build_func txf_func, const char* name, bool is_3d,
+				       bool is_multisampled)
 {
 	const struct glsl_type *vec4 = glsl_vec4_type();
 	const struct glsl_type *vec2 = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
@@ -596,11 +666,15 @@ build_nir_copy_fragment_shader_stencil(struct radv_device *device,
 						      vec4, "f_color");
 	color_out->data.location = FRAG_RESULT_STENCIL;

+	if (is_multisampled) {
+		build_nir_store_sample_mask(&b);
+	}
+
 	nir_ssa_def *pos_int = nir_f2i32(&b, nir_load_var(&b, tex_pos_in));
 	unsigned swiz[4] = { 0, 1 };
 	nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);

-	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d);
+	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d, is_multisampled);
 	nir_store_var(&b, color_out, color, 0x1);

 	return b.shader;
@@ -614,45 +688,48 @@ radv_device_finish_meta_blit2d_state(struct radv_device *device)
 	for(unsigned j = 0; j < NUM_META_FS_KEYS; ++j) {
 		for (unsigned k = 0; k < RADV_META_DST_LAYOUT_COUNT; ++k) {
 			radv_DestroyRenderPass(radv_device_to_handle(device),
-			                       state->blit2d.render_passes[j][k],
-			                       &state->alloc);
+					       state->blit2d_render_passes[j][k],
+					       &state->alloc);
 		}
 	}

 	for (enum radv_blit_ds_layout j = RADV_BLIT_DS_LAYOUT_TILE_ENABLE; j < RADV_BLIT_DS_LAYOUT_COUNT; j++) {
 		radv_DestroyRenderPass(radv_device_to_handle(device),
-				       state->blit2d.depth_only_rp[j], &state->alloc);
+				       state->blit2d_depth_only_rp[j], &state->alloc);
 		radv_DestroyRenderPass(radv_device_to_handle(device),
-				       state->blit2d.stencil_only_rp[j], &state->alloc);
+				       state->blit2d_stencil_only_rp[j], &state->alloc);
 	}

-	for (unsigned src = 0; src < BLIT2D_NUM_SRC_TYPES; src++) {
-		radv_DestroyPipelineLayout(radv_device_to_handle(device),
-					   state->blit2d.p_layouts[src],
-					   &state->alloc);
-		radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
-						state->blit2d.ds_layouts[src],
-						&state->alloc);
+	for (unsigned log2_samples = 0; log2_samples < 1 + MAX_SAMPLES_LOG2; ++log2_samples) {
+		for (unsigned src = 0; src < BLIT2D_NUM_SRC_TYPES; src++) {
+			radv_DestroyPipelineLayout(radv_device_to_handle(device),
+						   state->blit2d[log2_samples].p_layouts[src],
+						   &state->alloc);
+			radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
+							state->blit2d[log2_samples].ds_layouts[src],
+							&state->alloc);
+
+			for (unsigned j = 0; j < NUM_META_FS_KEYS; ++j) {
+				radv_DestroyPipeline(radv_device_to_handle(device),
+						     state->blit2d[log2_samples].pipelines[src][j],
+						     &state->alloc);
+			}

-		for (unsigned j = 0; j < NUM_META_FS_KEYS; ++j) {
 			radv_DestroyPipeline(radv_device_to_handle(device),
-					     state->blit2d.pipelines[src][j],
+					     state->blit2d[log2_samples].depth_only_pipeline[src],
+					     &state->alloc);
+			radv_DestroyPipeline(radv_device_to_handle(device),
+					     state->blit2d[log2_samples].stencil_only_pipeline[src],
 					     &state->alloc);
 		}
-
-		radv_DestroyPipeline(radv_device_to_handle(device),
-				     state->blit2d.depth_only_pipeline[src],
-				     &state->alloc);
-		radv_DestroyPipeline(radv_device_to_handle(device),
-				     state->blit2d.stencil_only_pipeline[src],
-				     &state->alloc);
 	}
 }

 static VkResult
 blit2d_init_color_pipeline(struct radv_device *device,
 			   enum blit2d_src_type src_type,
-			   VkFormat format)
+			   VkFormat format,
+			   uint32_t log2_samples)
 {
 	VkResult result;
 	unsigned fs_key = radv_format_meta_fs_key(format);
@@ -681,7 +758,7 @@ blit2d_init_color_pipeline(struct radv_device *device,
 	struct radv_shader_module fs = { .nir = NULL };


-	fs.nir = build_nir_copy_fragment_shader(device, src_func, name, src_type == BLIT2D_SRC_TYPE_IMAGE_3D);
+	fs.nir = build_nir_copy_fragment_shader(device, src_func, name, src_type == BLIT2D_SRC_TYPE_IMAGE_3D, log2_samples > 0);
 	vi_create_info = &normal_vi_create_info;

 	struct radv_shader_module vs = {
@@ -705,7 +782,7 @@ blit2d_init_color_pipeline(struct radv_device *device,
 	};

 	for (unsigned dst_layout = 0; dst_layout < RADV_META_DST_LAYOUT_COUNT; ++dst_layout) {
-		if (!device->meta_state.blit2d.render_passes[fs_key][dst_layout]) {
+		if (!device->meta_state.blit2d_render_passes[fs_key][dst_layout]) {
 			VkImageLayout layout = radv_meta_dst_layout_to_layout(dst_layout);

 			result = radv_CreateRenderPass(radv_device_to_handle(device),
@@ -737,7 +814,7 @@ blit2d_init_color_pipeline(struct radv_device *device,
 						.pPreserveAttachments = (uint32_t[]) { 0 },
 						},
 						.dependencyCount = 0,
-					}, &device->meta_state.alloc, &device->meta_state.blit2d.render_passes[fs_key][dst_layout]);
+					}, &device->meta_state.alloc, &device->meta_state.blit2d_render_passes[fs_key][dst_layout]);
 		}
 	}

@@ -765,7 +842,7 @@ blit2d_init_color_pipeline(struct radv_device *device,
 		},
 		.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
-			.rasterizationSamples = 1,
+			.rasterizationSamples = 1 << log2_samples,
 			.sampleShadingEnable = false,
 			.pSampleMask = (VkSampleMask[]) { UINT32_MAX },
 		},
@@ -796,8 +873,8 @@ blit2d_init_color_pipeline(struct radv_device *device,
 			},
 		},
 		.flags = 0,
-		.layout = device->meta_state.blit2d.p_layouts[src_type],
-		.renderPass = device->meta_state.blit2d.render_passes[fs_key][0],
+		.layout = device->meta_state.blit2d[log2_samples].p_layouts[src_type],
+		.renderPass = device->meta_state.blit2d_render_passes[fs_key][0],
 		.subpass = 0,
 	};

@@ -809,7 +886,7 @@ blit2d_init_color_pipeline(struct radv_device *device,
 					       radv_pipeline_cache_to_handle(&device->meta_state.cache),
 					       &vk_pipeline_info, &radv_pipeline_info,
 					       &device->meta_state.alloc,
-					       &device->meta_state.blit2d.pipelines[src_type][fs_key]);
+					       &device->meta_state.blit2d[log2_samples].pipelines[src_type][fs_key]);


 	ralloc_free(vs.nir);
@@ -820,7 +897,8 @@ blit2d_init_color_pipeline(struct radv_device *device,

 static VkResult
 blit2d_init_depth_only_pipeline(struct radv_device *device,
-				enum blit2d_src_type src_type)
+				enum blit2d_src_type src_type,
+				uint32_t log2_samples)
 {
 	VkResult result;
 	const char *name;
@@ -847,7 +925,7 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,
 	const VkPipelineVertexInputStateCreateInfo *vi_create_info;
 	struct radv_shader_module fs = { .nir = NULL };

-	fs.nir = build_nir_copy_fragment_shader_depth(device, src_func, name, src_type == BLIT2D_SRC_TYPE_IMAGE_3D);
+	fs.nir = build_nir_copy_fragment_shader_depth(device, src_func, name, src_type == BLIT2D_SRC_TYPE_IMAGE_3D, log2_samples > 0);
 	vi_create_info = &normal_vi_create_info;

 	struct radv_shader_module vs = {
@@ -871,7 +949,7 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,
 	};

 	for (enum radv_blit_ds_layout ds_layout = RADV_BLIT_DS_LAYOUT_TILE_ENABLE; ds_layout < RADV_BLIT_DS_LAYOUT_COUNT; ds_layout++) {
-		if (!device->meta_state.blit2d.depth_only_rp[ds_layout]) {
+		if (!device->meta_state.blit2d_depth_only_rp[ds_layout]) {
 			VkImageLayout layout = radv_meta_blit_ds_to_layout(ds_layout);
 			result = radv_CreateRenderPass(radv_device_to_handle(device),
 						       &(VkRenderPassCreateInfo) {
@@ -899,7 +977,7 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,
 								       .pPreserveAttachments = (uint32_t[]) { 0 },
 							       },
 							       .dependencyCount = 0,
-							}, &device->meta_state.alloc, &device->meta_state.blit2d.depth_only_rp[ds_layout]);
+							}, &device->meta_state.alloc, &device->meta_state.blit2d_depth_only_rp[ds_layout]);
 		}
 	}

@@ -927,7 +1005,7 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,
 		},
 		.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
-			.rasterizationSamples = 1,
+			.rasterizationSamples = 1 << log2_samples,
 			.sampleShadingEnable = false,
 			.pSampleMask = (VkSampleMask[]) { UINT32_MAX },
 		},
@@ -958,8 +1036,8 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,
 			},
 		},
 		.flags = 0,
-		.layout = device->meta_state.blit2d.p_layouts[src_type],
-		.renderPass = device->meta_state.blit2d.depth_only_rp[0],
+		.layout = device->meta_state.blit2d[log2_samples].p_layouts[src_type],
+		.renderPass = device->meta_state.blit2d_depth_only_rp[0],
 		.subpass = 0,
 	};

@@ -971,7 +1049,7 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,
 					       radv_pipeline_cache_to_handle(&device->meta_state.cache),
 					       &vk_pipeline_info, &radv_pipeline_info,
 					       &device->meta_state.alloc,
-					       &device->meta_state.blit2d.depth_only_pipeline[src_type]);
+					       &device->meta_state.blit2d[log2_samples].depth_only_pipeline[src_type]);


 	ralloc_free(vs.nir);
@@ -982,7 +1060,8 @@ blit2d_init_depth_only_pipeline(struct radv_device *device,

 static VkResult
 blit2d_init_stencil_only_pipeline(struct radv_device *device,
-				  enum blit2d_src_type src_type)
+				  enum blit2d_src_type src_type,
+				  uint32_t log2_samples)
 {
 	VkResult result;
 	const char *name;
@@ -1009,7 +1088,7 @@ blit2d_init_stencil_only_pipeline(struct radv_device *device,
 	const VkPipelineVertexInputStateCreateInfo *vi_create_info;
 	struct radv_shader_module fs = { .nir = NULL };

-	fs.nir = build_nir_copy_fragment_shader_stencil(device, src_func, name, src_type == BLIT2D_SRC_TYPE_IMAGE_3D);
+	fs.nir = build_nir_copy_fragment_shader_stencil(device, src_func, name, src_type == BLIT2D_SRC_TYPE_IMAGE_3D, log2_samples > 0);
 	vi_create_info = &normal_vi_create_info;

 	struct radv_shader_module vs = {
@@ -1033,7 +1112,7 @@ blit2d_init_stencil_only_pipeline(struct radv_device *device,
 	};

 	for (enum radv_blit_ds_layout ds_layout = RADV_BLIT_DS_LAYOUT_TILE_ENABLE; ds_layout < RADV_BLIT_DS_LAYOUT_COUNT; ds_layout++) {
-		if (!device->meta_state.blit2d.stencil_only_rp[ds_layout]) {
+		if (!device->meta_state.blit2d_stencil_only_rp[ds_layout]) {
 			VkImageLayout layout = radv_meta_blit_ds_to_layout(ds_layout);
 			result = radv_CreateRenderPass(radv_device_to_handle(device),
 						       &(VkRenderPassCreateInfo) {
@@ -1061,7 +1140,7 @@ blit2d_init_stencil_only_pipeline(struct radv_device *device,
 								       .pPreserveAttachments = (uint32_t[]) { 0 },
 							       },
 							       .dependencyCount = 0,
-						       }, &device->meta_state.alloc, &device->meta_state.blit2d.stencil_only_rp[ds_layout]);
+						       }, &device->meta_state.alloc, &device->meta_state.blit2d_stencil_only_rp[ds_layout]);
 		}
 	}

@@ -1089,7 +1168,7 @@ blit2d_init_stencil_only_pipeline(struct radv_device *device,
 		},
 		.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
-			.rasterizationSamples = 1,
+			.rasterizationSamples = 1 << log2_samples,
 			.sampleShadingEnable = false,
 			.pSampleMask = (VkSampleMask[]) { UINT32_MAX },
 		},
@@ -1136,8 +1215,8 @@ blit2d_init_stencil_only_pipeline(struct radv_device *device,
 			},
 		},
 		.flags = 0,
-		.layout = device->meta_state.blit2d.p_layouts[src_type],
-		.renderPass = device->meta_state.blit2d.stencil_only_rp[0],
+		.layout = device->meta_state.blit2d[log2_samples].p_layouts[src_type],
+		.renderPass = device->meta_state.blit2d_stencil_only_rp[0],
 		.subpass = 0,
 	};

@@ -1149,7 +1228,7 @@ blit2d_init_stencil_only_pipeline(struct radv_device *device,
 					       radv_pipeline_cache_to_handle(&device->meta_state.cache),
 					       &vk_pipeline_info, &radv_pipeline_info,
 					       &device->meta_state.alloc,
-					       &device->meta_state.blit2d.stencil_only_pipeline[src_type]);
+					       &device->meta_state.blit2d[log2_samples].stencil_only_pipeline[src_type]);


 	ralloc_free(vs.nir);
@@ -1175,15 +1254,16 @@ static VkFormat pipeline_formats[] = {

 static VkResult
 meta_blit2d_create_pipe_layout(struct radv_device *device,
-			       int idx)
+			       int idx,
+			       uint32_t log2_samples)
 {
 	VkResult result;
 	VkDescriptorType desc_type = (idx == BLIT2D_SRC_TYPE_BUFFER) ? VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
 	const VkPushConstantRange push_constant_ranges[] = {
 		{VK_SHADER_STAGE_VERTEX_BIT, 0, 16},
-		{VK_SHADER_STAGE_FRAGMENT_BIT, 16, 4},
+		{VK_SHADER_STAGE_FRAGMENT_BIT, 16, 12},
 	};
-	int num_push_constant_range = (idx != BLIT2D_SRC_TYPE_IMAGE) ? 2 : 1;
+	int num_push_constant_range = (idx != BLIT2D_SRC_TYPE_IMAGE || log2_samples > 0) ? 2 : 1;

 	result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
 						&(VkDescriptorSetLayoutCreateInfo) {
@@ -1199,7 +1279,7 @@ meta_blit2d_create_pipe_layout(struct radv_device *device,
 								.pImmutableSamplers = NULL
 							},
 							}
-						}, &device->meta_state.alloc, &device->meta_state.blit2d.ds_layouts[idx]);
+						}, &device->meta_state.alloc, &device->meta_state.blit2d[log2_samples].ds_layouts[idx]);
 	if (result != VK_SUCCESS)
 		goto fail;

@@ -1207,11 +1287,11 @@ meta_blit2d_create_pipe_layout(struct radv_device *device,
 					   &(VkPipelineLayoutCreateInfo) {
 						   .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
 							   .setLayoutCount = 1,
-							   .pSetLayouts = &device->meta_state.blit2d.ds_layouts[idx],
+							   .pSetLayouts = &device->meta_state.blit2d[log2_samples].ds_layouts[idx],
 							   .pushConstantRangeCount = num_push_constant_range,
 							   .pPushConstantRanges = push_constant_ranges,
 							   },
-					   &device->meta_state.alloc, &device->meta_state.blit2d.p_layouts[idx]);
+					   &device->meta_state.alloc, &device->meta_state.blit2d[log2_samples].p_layouts[idx]);
 	if (result != VK_SUCCESS)
 		goto fail;
 	return VK_SUCCESS;
@@ -1225,27 +1305,33 @@ radv_device_init_meta_blit2d_state(struct radv_device *device)
 	VkResult result;
 	bool create_3d = device->physical_device->rad_info.chip_class >= GFX9;

-	for (unsigned src = 0; src < BLIT2D_NUM_SRC_TYPES; src++) {
-		if (src == BLIT2D_SRC_TYPE_IMAGE_3D && !create_3d)
-			continue;
+	for (unsigned log2_samples = 0; log2_samples < 1 + MAX_SAMPLES_LOG2; log2_samples++) {
+		for (unsigned src = 0; src < BLIT2D_NUM_SRC_TYPES; src++) {
+			if (src == BLIT2D_SRC_TYPE_IMAGE_3D && !create_3d)
+				continue;

-		result = meta_blit2d_create_pipe_layout(device, src);
-		if (result != VK_SUCCESS)
-			goto fail;
+			/* Don't need to handle copies between buffers and multisample images. */
+			if (src == BLIT2D_SRC_TYPE_BUFFER && log2_samples > 0)
+				continue;

-		for (unsigned j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) {
-			result = blit2d_init_color_pipeline(device, src, pipeline_formats[j]);
+			result = meta_blit2d_create_pipe_layout(device, src, log2_samples);
+			if (result != VK_SUCCESS)
+				goto fail;
+
+			for (unsigned j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) {
+				result = blit2d_init_color_pipeline(device, src, pipeline_formats[j], log2_samples);
+				if (result != VK_SUCCESS)
+					goto fail;
+			}
+
+			result = blit2d_init_depth_only_pipeline(device, src, log2_samples);
+			if (result != VK_SUCCESS)
+				goto fail;
+
+			result = blit2d_init_stencil_only_pipeline(device, src, log2_samples);
 			if (result != VK_SUCCESS)
 				goto fail;
 		}
-
-		result = blit2d_init_depth_only_pipeline(device, src);
-		if (result != VK_SUCCESS)
-			goto fail;
-
-		result = blit2d_init_stencil_only_pipeline(device, src);
-		if (result != VK_SUCCESS)
-			goto fail;
 	}

 	return VK_SUCCESS;
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -645,7 +645,8 @@ emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer,
 	if (depth_view_can_fast_clear(cmd_buffer, iview, aspects,
 	                              subpass->depth_stencil_attachment.layout,
 	                              clear_rect, clear_value))
-		radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects);
+		radv_set_ds_clear_metadata(cmd_buffer, iview->image,
+					   clear_value, aspects);

 	radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) {
 			.x = clear_rect->rect.offset.x,
@@ -717,6 +718,14 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 	if ((clear_value.depth != 0.0 && clear_value.depth != 1.0) || !(aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
 		goto fail;

+	/* GFX8 only supports 32-bit depth surfaces but we can enable TC-compat
+	 * HTILE for 16-bit surfaces if no Z planes are compressed. However,
+	 * fast HTILE clears don't seem to work for them, so skip the fast clear.
+	 */
+	if (cmd_buffer->device->physical_device->rad_info.chip_class == VI &&
+	    iview->image->vk_format == VK_FORMAT_D16_UNORM)
+		goto fail;
+
 	if (vk_format_aspects(iview->image->vk_format) & VK_IMAGE_ASPECT_STENCIL_BIT) {
 		if (clear_value.stencil != 0 || !(aspects & VK_IMAGE_ASPECT_STENCIL_BIT))
 			goto fail;
@@ -736,7 +745,7 @@ emit_fast_htile_clear(struct radv_cmd_buffer *cmd_buffer,
 				      iview->image->offset + iview->image->htile_offset,
 				      iview->image->surface.htile_size, clear_word);

-	radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects);
+	radv_set_ds_clear_metadata(cmd_buffer, iview->image, clear_value, aspects);
 	if (post_flush) {
 		*post_flush |= flush_bits;
 	} else {
@@ -1011,8 +1020,6 @@ emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
 	if (iview->image->info.levels > 1)
 		goto fail;

-	if (iview->image->surface.is_linear)
-		goto fail;
 	if (!radv_image_extent_compare(iview->image, &iview->extent))
 		goto fail;

@@ -1035,7 +1042,7 @@ emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
 		goto fail;

 	/* DCC */
-	ret = radv_format_pack_clear_color(iview->image->vk_format,
+	ret = radv_format_pack_clear_color(iview->vk_format,
 					   clear_color, &clear_value);
 	if (ret == false)
 		goto fail;
@@ -1056,7 +1063,7 @@ emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
 		bool can_avoid_fast_clear_elim;
 		bool need_decompress_pass = false;

-		vi_get_fast_clear_parameters(iview->image->vk_format,
+		vi_get_fast_clear_parameters(iview->vk_format,
 					     &clear_value, &reset_value,
 					     &can_avoid_fast_clear_elim);

@@ -1097,7 +1104,8 @@ emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
 		cmd_buffer->state.flush_bits |= flush_bits;
 	}

-	radv_set_color_clear_regs(cmd_buffer, iview->image, subpass_att, clear_color);
+	radv_set_color_clear_metadata(cmd_buffer, iview->image, subpass_att,
+				      clear_color);

 	return true;
 fail:
--- a/src/amd/vulkan/radv_meta_copy.c
+++ b/src/amd/vulkan/radv_meta_copy.c
@@ -72,6 +72,7 @@ vk_format_for_size(int bs)
 	case 2: return VK_FORMAT_R8G8_UINT;
 	case 4: return VK_FORMAT_R8G8B8A8_UINT;
 	case 8: return VK_FORMAT_R16G16B16A16_UINT;
+	case 12: return VK_FORMAT_R32G32B32_UINT;
 	case 16: return VK_FORMAT_R32G32B32A32_UINT;
 	default:
 		unreachable("Invalid format block size");
@@ -93,6 +94,8 @@ blit_surf_for_image_level_layer(struct radv_image *image,
 	    !(radv_image_is_tc_compat_htile(image)))
 		format = vk_format_for_size(vk_format_get_blocksize(format));

+	format = vk_format_no_srgb(format);
+
 	return (struct radv_meta_blit2d_surf) {
 		.format = format,
 		.bs = vk_format_get_blocksize(format),
@@ -483,24 +486,3 @@ void radv_CmdCopyImage(
 			dest_image, destImageLayout,
 			regionCount, pRegions);
 }
-
-void radv_blit_to_prime_linear(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       struct radv_image *linear_image)
-{
-	struct VkImageCopy image_copy = { 0 };
-
-	image_copy.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-	image_copy.srcSubresource.layerCount = 1;
-
-	image_copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-	image_copy.dstSubresource.layerCount = 1;
-
-	image_copy.extent.width = image->info.width;
-	image_copy.extent.height = image->info.height;
-	image_copy.extent.depth = 1;
-
-	meta_copy_image(cmd_buffer, image, VK_IMAGE_LAYOUT_GENERAL, linear_image,
-			VK_IMAGE_LAYOUT_GENERAL,
-			1, &image_copy);
-}
--- a/src/amd/vulkan/radv_meta_resolve.c
+++ b/src/amd/vulkan/radv_meta_resolve.c
@@ -358,6 +358,8 @@ static void radv_pick_resolve_method_images(struct radv_image *src_image,
 		*method = RESOLVE_COMPUTE;
 	else if (vk_format_is_int(src_image->vk_format))
 		*method = RESOLVE_COMPUTE;
+	else if (src_image->info.array_size > 1)
+		*method = RESOLVE_COMPUTE;
 	
 	if (radv_layout_dcc_compressed(dest_image, dest_image_layout, queue_mask)) {
 		*method = RESOLVE_FRAGMENT;
@@ -695,7 +697,7 @@ radv_decompress_resolve_subpass_src(struct radv_cmd_buffer *cmd_buffer)
 		VkImageResolve region = {};
 		region.srcSubresource.baseArrayLayer = 0;
 		region.srcSubresource.mipLevel = 0;
-		region.srcSubresource.layerCount = 1;
+		region.srcSubresource.layerCount = src_image->info.array_size;

 		radv_decompress_resolve_src(cmd_buffer, src_image,
 					    src_att.layout, 1, &region);
--- a/src/amd/vulkan/radv_meta_resolve_cs.c
+++ b/src/amd/vulkan/radv_meta_resolve_cs.c
@@ -508,12 +508,48 @@ radv_cmd_buffer_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer)
 		if (dest_att.attachment == VK_ATTACHMENT_UNUSED)
 			continue;

-		emit_resolve(cmd_buffer,
-			     src_iview,
-			     dst_iview,
-			     &(VkOffset2D) { 0, 0 },
-			     &(VkOffset2D) { 0, 0 },
-			     &(VkExtent2D) { fb->width, fb->height });
+		struct radv_image *src_image = src_iview->image;
+		struct radv_image *dst_image = dst_iview->image;
+		for (uint32_t layer = 0; layer < src_image->info.array_size; layer++) {
+
+			struct radv_image_view tsrc_iview;
+			radv_image_view_init(&tsrc_iview, cmd_buffer->device,
+					     &(VkImageViewCreateInfo) {
+						     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+							     .image = radv_image_to_handle(src_image),
+							     .viewType = radv_meta_get_view_type(src_image),
+							     .format = src_image->vk_format,
+							     .subresourceRange = {
+							     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+							     .baseMipLevel = src_iview->base_mip,
+							     .levelCount = 1,
+							     .baseArrayLayer = layer,
+							     .layerCount = 1,
+						     },
+					     });
+
+			struct radv_image_view tdst_iview;
+			radv_image_view_init(&tdst_iview, cmd_buffer->device,
+					     &(VkImageViewCreateInfo) {
+						     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+							     .image = radv_image_to_handle(dst_image),
+							     .viewType = radv_meta_get_view_type(dst_image),
+							     .format = vk_to_non_srgb_format(dst_image->vk_format),
+							     .subresourceRange = {
+							     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+							     .baseMipLevel = dst_iview->base_mip,
+							     .levelCount = 1,
+							     .baseArrayLayer = layer,
+							     .layerCount = 1,
+						     },
+					     });
+			emit_resolve(cmd_buffer,
+				     &tsrc_iview,
+				     &tdst_iview,
+				     &(VkOffset2D) { 0, 0 },
+				     &(VkOffset2D) { 0, 0 },
+				     &(VkExtent2D) { fb->width, fb->height });
+		}
 	}

 	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -124,6 +124,98 @@ radv_shader_context_from_abi(struct ac_shader_abi *abi)
 	return container_of(abi, ctx, abi);
 }

+struct ac_build_if_state
+{
+	struct radv_shader_context *ctx;
+	LLVMValueRef condition;
+	LLVMBasicBlockRef entry_block;
+	LLVMBasicBlockRef true_block;
+	LLVMBasicBlockRef false_block;
+	LLVMBasicBlockRef merge_block;
+};
+
+static LLVMBasicBlockRef
+ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name)
+{
+	LLVMBasicBlockRef current_block;
+	LLVMBasicBlockRef next_block;
+	LLVMBasicBlockRef new_block;
+
+	/* get current basic block */
+	current_block = LLVMGetInsertBlock(ctx->ac.builder);
+
+	/* check if there's another block after this one */
+	next_block = LLVMGetNextBasicBlock(current_block);
+	if (next_block) {
+		/* insert the new block before the next block */
+		new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
+	}
+	else {
+		/* append new block after current block */
+		LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+		new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
+	}
+	return new_block;
+}
+
+static void
+ac_nir_build_if(struct ac_build_if_state *ifthen,
+		struct radv_shader_context *ctx,
+		LLVMValueRef condition)
+{
+	LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder);
+
+	memset(ifthen, 0, sizeof *ifthen);
+	ifthen->ctx = ctx;
+	ifthen->condition = condition;
+	ifthen->entry_block = block;
+
+	/* create endif/merge basic block for the phi functions */
+	ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
+
+	/* create/insert true_block before merge_block */
+	ifthen->true_block =
+		LLVMInsertBasicBlockInContext(ctx->context,
+					      ifthen->merge_block,
+					      "if-true-block");
+
+	/* successive code goes into the true block */
+	LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block);
+}
+
+/**
+ * End a conditional.
+ */
+static void
+ac_nir_build_endif(struct ac_build_if_state *ifthen)
+{
+	LLVMBuilderRef builder = ifthen->ctx->ac.builder;
+
+	/* Insert branch to the merge block from current block */
+	LLVMBuildBr(builder, ifthen->merge_block);
+
+	/*
+	 * Now patch in the various branch instructions.
+	 */
+
+	/* Insert the conditional branch instruction at the end of entry_block */
+	LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
+	if (ifthen->false_block) {
+		/* we have an else clause */
+		LLVMBuildCondBr(builder, ifthen->condition,
+				ifthen->true_block, ifthen->false_block);
+	}
+	else {
+		/* no else clause */
+		LLVMBuildCondBr(builder, ifthen->condition,
+				ifthen->true_block, ifthen->merge_block);
+	}
+
+	/* Resume building code at end of the ifthen->merge_block */
+	LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
+}
+
+
 static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
 {
 	switch (ctx->stage) {
@@ -388,7 +480,7 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
                     unsigned num_return_elems,
 		     struct arg_info *args,
 		     unsigned max_workgroup_size,
-		     bool unsafe_math)
+		     const struct radv_nir_compiler_options *options)
 {
 	LLVMTypeRef main_function_type, ret_type;
 	LLVMBasicBlockRef main_function_body;
@@ -419,12 +511,18 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
 		}
 	}

+	if (options->address32_hi) {
+		ac_llvm_add_target_dep_function_attr(main_function,
+						     "amdgpu-32bit-address-high-bits",
+						     options->address32_hi);
+	}
+
 	if (max_workgroup_size) {
 		ac_llvm_add_target_dep_function_attr(main_function,
 						     "amdgpu-max-work-group-size",
 						     max_workgroup_size);
 	}
-	if (unsafe_math) {
+	if (options->unsafe_math) {
 		/* These were copied from some LLVM test. */
 		LLVMAddTargetDependentFunctionAttr(main_function,
 						   "less-precise-fpmad",
@@ -468,6 +566,15 @@ set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx,
 	set_loc(ud_info, sgpr_idx, num_sgprs, 0);
 }

+static void
+set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx)
+{
+	bool use_32bit_pointers = HAVE_32BIT_POINTERS &&
+				  idx != AC_UD_SCRATCH_RING_OFFSETS;
+
+	set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
+}
+
 static void
 set_loc_desc(struct radv_shader_context *ctx, int idx,  uint8_t *sgpr_idx,
 	     uint32_t indirect_offset)
@@ -476,12 +583,11 @@ set_loc_desc(struct radv_shader_context *ctx, int idx,  uint8_t *sgpr_idx,
 		&ctx->shader_info->user_sgprs_locs.descriptor_sets[idx];
 	assert(ud_info);

-	set_loc(ud_info, sgpr_idx, 2, indirect_offset);
+	set_loc(ud_info, sgpr_idx, HAVE_32BIT_POINTERS ? 1 : 2, indirect_offset);
 }

 struct user_sgpr_info {
 	bool need_ring_offsets;
-	uint8_t sgpr_count;
 	bool indirect_all_descriptor_sets;
 };

@@ -514,7 +620,8 @@ count_vs_user_sgprs(struct radv_shader_context *ctx)
 {
 	uint8_t count = 0;

-	count += ctx->shader_info->info.vs.has_vertex_buffers ? 2 : 0;
+	if (ctx->shader_info->info.vs.has_vertex_buffers)
+		count += HAVE_32BIT_POINTERS ? 1 : 2;
 	count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2;

 	return count;
@@ -527,6 +634,8 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
 				bool needs_view_index,
 				struct user_sgpr_info *user_sgpr_info)
 {
+	uint8_t user_sgpr_count = 0;
+
 	memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));

 	/* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
@@ -543,25 +652,25 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,

 	/* 2 user sgprs will nearly always be allocated for scratch/rings */
 	if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
-		user_sgpr_info->sgpr_count += 2;
+		user_sgpr_count += 2;
 	}

 	switch (stage) {
 	case MESA_SHADER_COMPUTE:
 		if (ctx->shader_info->info.cs.uses_grid_size)
-			user_sgpr_info->sgpr_count += 3;
+			user_sgpr_count += 3;
 		break;
 	case MESA_SHADER_FRAGMENT:
-		user_sgpr_info->sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
+		user_sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
 		break;
 	case MESA_SHADER_VERTEX:
 		if (!ctx->is_gs_copy_shader)
-			user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
+			user_sgpr_count += count_vs_user_sgprs(ctx);
 		break;
 	case MESA_SHADER_TESS_CTRL:
 		if (has_previous_stage) {
 			if (previous_stage == MESA_SHADER_VERTEX)
-				user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
+				user_sgpr_count += count_vs_user_sgprs(ctx);
 		}
 		break;
 	case MESA_SHADER_TESS_EVAL:
@@ -569,7 +678,7 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
 	case MESA_SHADER_GEOMETRY:
 		if (has_previous_stage) {
 			if (previous_stage == MESA_SHADER_VERTEX) {
-				user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
+				user_sgpr_count += count_vs_user_sgprs(ctx);
 			}
 		}
 		break;
@@ -578,19 +687,18 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
 	}

 	if (needs_view_index)
-		user_sgpr_info->sgpr_count++;
+		user_sgpr_count++;

 	if (ctx->shader_info->info.loads_push_constants)
-		user_sgpr_info->sgpr_count += 2;
+		user_sgpr_count += HAVE_32BIT_POINTERS ? 1 : 2;

 	uint32_t available_sgprs = ctx->options->chip_class >= GFX9 ? 32 : 16;
-	uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;
+	uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
+	uint32_t num_desc_set =
+		util_bitcount(ctx->shader_info->info.desc_set_used_mask);

-	if (remaining_sgprs / 2 < util_bitcount(ctx->shader_info->info.desc_set_used_mask)) {
-		user_sgpr_info->sgpr_count += 2;
+	if (remaining_sgprs / (HAVE_32BIT_POINTERS ? 1 : 2) < num_desc_set) {
 		user_sgpr_info->indirect_all_descriptor_sets = true;
-	} else {
-		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
 	}
 }

@@ -603,7 +711,7 @@ declare_global_input_sgprs(struct radv_shader_context *ctx,
 			   struct arg_info *args,
 			   LLVMValueRef *desc_sets)
 {
-	LLVMTypeRef type = ac_array_in_const_addr_space(ctx->ac.i8);
+	LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8);
 	unsigned num_sets = ctx->options->layout ?
 			    ctx->options->layout->num_sets : 0;
 	unsigned stage_mask = 1 << stage;
@@ -621,7 +729,7 @@ declare_global_input_sgprs(struct radv_shader_context *ctx,
 			}
 		}
 	} else {
-		add_array_arg(args, ac_array_in_const_addr_space(type), desc_sets);
+		add_array_arg(args, ac_array_in_const32_addr_space(type), desc_sets);
 	}

 	if (ctx->shader_info->info.loads_push_constants) {
@@ -641,7 +749,8 @@ declare_vs_specific_input_sgprs(struct radv_shader_context *ctx,
 	    (stage == MESA_SHADER_VERTEX ||
 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
-			add_arg(args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32),
+			add_arg(args, ARG_SGPR,
+				ac_array_in_const32_addr_space(ctx->ac.v4i32),
 				&ctx->vertex_buffers);
 		}
 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
@@ -699,8 +808,8 @@ set_global_input_locs(struct radv_shader_context *ctx, gl_shader_stage stage,
 				ctx->descriptor_sets[i] = NULL;
 		}
 	} else {
-		set_loc_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
-			       user_sgpr_idx, 2);
+		set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
+			           user_sgpr_idx);

 		for (unsigned i = 0; i < num_sets; ++i) {
 			if ((ctx->shader_info->info.desc_set_used_mask & (1 << i)) &&
@@ -718,7 +827,7 @@ set_global_input_locs(struct radv_shader_context *ctx, gl_shader_stage stage,
 	}

 	if (ctx->shader_info->info.loads_push_constants) {
-		set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
+		set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
 	}
 }

@@ -732,8 +841,8 @@ set_vs_specific_input_locs(struct radv_shader_context *ctx,
 	    (stage == MESA_SHADER_VERTEX ||
 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
-			set_loc_shader(ctx, AC_UD_VS_VERTEX_BUFFERS,
-				       user_sgpr_idx, 2);
+			set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS,
+					   user_sgpr_idx);
 		}

 		unsigned vs_num = 2;
@@ -759,7 +868,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
 		calling_conv = RADEON_LLVM_AMDGPU_GS;
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		calling_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS : RADEON_LLVM_AMDGPU_VS;
+		calling_conv = RADEON_LLVM_AMDGPU_HS;
 		break;
 	case MESA_SHADER_FRAGMENT:
 		calling_conv = RADEON_LLVM_AMDGPU_PS;
@@ -1014,8 +1123,7 @@ static void create_function(struct radv_shader_context *ctx,

 	ctx->main_function = create_llvm_function(
 	    ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args,
-	    ctx->max_workgroup_size,
-	    ctx->options->unsafe_math);
+	    ctx->max_workgroup_size, ctx->options);
 	set_llvm_calling_convention(ctx->main_function, stage);


@@ -1032,8 +1140,8 @@ static void create_function(struct radv_shader_context *ctx,
 	user_sgpr_idx = 0;

 	if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
-		set_loc_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS,
-			       &user_sgpr_idx, 2);
+		set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS,
+				   &user_sgpr_idx);
 		if (ctx->options->supports_spill) {
 			ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
 							       LLVMPointerType(ctx->ac.i8, AC_CONST_ADDR_SPACE),
@@ -1592,6 +1700,8 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
 	/* loop num outputs */
 	idx = 0;
 	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+		unsigned output_usage_mask =
+			ctx->shader_info->info.gs.output_usage_mask[i];
 		LLVMValueRef *out_ptr = &addrs[i * 4];
 		int length = 4;
 		int slot = idx;
@@ -1605,8 +1715,13 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
 			length = ctx->num_output_clips + ctx->num_output_culls;
 			if (length > 4)
 				slot_inc = 2;
+			output_usage_mask = (1 << length) - 1;
 		}
+
 		for (unsigned j = 0; j < length; j++) {
+			if (!(output_usage_mask & (1 << j)))
+				continue;
+
 			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
 							     out_ptr[j], "");
 			LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, (slot * 4 + j) * ctx->gs_max_out_vertices, false);
@@ -1768,11 +1883,53 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
 	index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");

 	list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0));
-	list = LLVMBuildPointerCast(builder, list, ac_array_in_const_addr_space(type), "");
+	list = LLVMBuildPointerCast(builder, list,
+				    ac_array_in_const32_addr_space(type), "");

 	return ac_build_load_to_sgpr(&ctx->ac, list, index);
 }

+/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW,
+ * so we may need to fix it up. */
+static LLVMValueRef
+adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
+                          unsigned adjustment,
+                          LLVMValueRef alpha)
+{
+	if (adjustment == RADV_ALPHA_ADJUST_NONE)
+		return alpha;
+
+	LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
+
+	if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
+		alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
+	else
+		alpha = ac_to_integer(&ctx->ac, alpha);
+
+	/* For the integer-like cases, do a natural sign extension.
+	 *
+	 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+	 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+	 * exponent.
+	 */
+	alpha = LLVMBuildShl(ctx->ac.builder, alpha,
+	                     adjustment == RADV_ALPHA_ADJUST_SNORM ?
+	                     LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
+	alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
+
+	/* Convert back to the right type. */
+	if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
+		LLVMValueRef clamp;
+		LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
+		alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
+		clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
+		alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
+	} else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
+		alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
+	}
+
+	return alpha;
+}

 static void
 handle_vs_input_decl(struct radv_shader_context *ctx,
@@ -1783,18 +1940,19 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 	LLVMValueRef t_list;
 	LLVMValueRef input;
 	LLVMValueRef buffer_index;
-	int index = variable->data.location - VERT_ATTRIB_GENERIC0;
-	int idx = variable->data.location;
 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
 	uint8_t input_usage_mask =
 		ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
 	unsigned num_channels = util_last_bit(input_usage_mask);

-	variable->data.driver_location = idx * 4;
+	variable->data.driver_location = variable->data.location * 4;

-	for (unsigned i = 0; i < attrib_count; ++i, ++idx) {
-		if (ctx->options->key.vs.instance_rate_inputs & (1u << (index + i))) {
-			uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[index + i];
+	for (unsigned i = 0; i < attrib_count; ++i) {
+		LLVMValueRef output[4];
+		unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
+
+		if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
+			uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];

 			if (divisor) {
 				buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.instance_id,
@@ -1818,7 +1976,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 		} else
 			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
 			                            ctx->abi.base_vertex, "");
-		t_offset = LLVMConstInt(ctx->ac.i32, index + i, false);
+		t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false);

 		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

@@ -1831,9 +1989,15 @@ handle_vs_input_decl(struct radv_shader_context *ctx,

 		for (unsigned chan = 0; chan < 4; chan++) {
 			LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
-			ctx->inputs[ac_llvm_reg_index_soa(idx, chan)] =
-				ac_to_integer(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder,
-							input, llvm_chan, ""));
+			output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
+		}
+
+		unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3;
+		output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);
+
+		for (unsigned chan = 0; chan < 4; chan++) {
+			ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] =
+				ac_to_integer(&ctx->ac, output[chan]);
 		}
 	}
 }
@@ -1929,9 +2093,6 @@ static void
 prepare_interp_optimize(struct radv_shader_context *ctx,
                        struct nir_shader *nir)
 {
-	if (!ctx->options->key.fs.multisample)
-		return;
-
 	bool uses_center = false;
 	bool uses_centroid = false;
 	nir_foreach_variable(variable, &nir->inputs) {
@@ -2353,10 +2514,9 @@ handle_vs_outputs_post(struct radv_shader_context *ctx,
 			output_usage_mask =
 				ctx->shader_info->info.tes.output_usage_mask[i];
 		} else {
-			/* Enable all channels for the GS copy shader because
-			 * we don't know the output usage mask currently.
-			 */
-			output_usage_mask = 0xf;
+			assert(ctx->is_gs_copy_shader);
+			output_usage_mask =
+				ctx->shader_info->info.gs.output_usage_mask[i];
 		}

 		radv_export_param(ctx, param_count, values, output_usage_mask);
@@ -2436,14 +2596,26 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
 	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
 		LLVMValueRef dw_addr = NULL;
 		LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
+		unsigned output_usage_mask;
 		int param_index;
 		int length = 4;

 		if (!(ctx->output_mask & (1ull << i)))
 			continue;

-		if (i == VARYING_SLOT_CLIP_DIST0)
+		if (ctx->stage == MESA_SHADER_VERTEX) {
+			output_usage_mask =
+				ctx->shader_info->info.vs.output_usage_mask[i];
+		} else {
+			assert(ctx->stage == MESA_SHADER_TESS_EVAL);
+			output_usage_mask =
+				ctx->shader_info->info.tes.output_usage_mask[i];
+		}
+
+		if (i == VARYING_SLOT_CLIP_DIST0) {
 			length = ctx->num_output_clips + ctx->num_output_culls;
+			output_usage_mask = (1 << length) - 1;
+		}

 		param_index = shader_io_get_unique_index(i);

@@ -2452,14 +2624,22 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
 			                       LLVMConstInt(ctx->ac.i32, param_index * 4, false),
 			                       "");
 		}
+
 		for (j = 0; j < length; j++) {
+			if (!(output_usage_mask & (1 << j)))
+				continue;
+
 			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
 			out_val = LLVMBuildBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

 			if (ctx->ac.chip_class  >= GFX9) {
-				ac_lds_store(&ctx->ac, dw_addr,
+				LLVMValueRef dw_addr_offset =
+					LLVMBuildAdd(ctx->ac.builder, dw_addr,
+						     LLVMConstInt(ctx->ac.i32,
+								  j, false), "");
+
+				ac_lds_store(&ctx->ac, dw_addr_offset,
 					     LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""));
-				dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, "");
 			} else {
 				ac_build_buffer_store_dword(&ctx->ac,
 				                            ctx->esgs_ring,
@@ -2502,97 +2682,6 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
 	}
 }

-struct ac_build_if_state
-{
-	struct radv_shader_context *ctx;
-	LLVMValueRef condition;
-	LLVMBasicBlockRef entry_block;
-	LLVMBasicBlockRef true_block;
-	LLVMBasicBlockRef false_block;
-	LLVMBasicBlockRef merge_block;
-};
-
-static LLVMBasicBlockRef
-ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name)
-{
-	LLVMBasicBlockRef current_block;
-	LLVMBasicBlockRef next_block;
-	LLVMBasicBlockRef new_block;
-
-	/* get current basic block */
-	current_block = LLVMGetInsertBlock(ctx->ac.builder);
-
-	/* chqeck if there's another block after this one */
-	next_block = LLVMGetNextBasicBlock(current_block);
-	if (next_block) {
-		/* insert the new block before the next block */
-		new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
-	}
-	else {
-		/* append new block after current block */
-		LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
-		new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
-	}
-	return new_block;
-}
-
-static void
-ac_nir_build_if(struct ac_build_if_state *ifthen,
-		struct radv_shader_context *ctx,
-		LLVMValueRef condition)
-{
-	LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder);
-
-	memset(ifthen, 0, sizeof *ifthen);
-	ifthen->ctx = ctx;
-	ifthen->condition = condition;
-	ifthen->entry_block = block;
-
-	/* create endif/merge basic block for the phi functions */
-	ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
-
-	/* create/insert true_block before merge_block */
-	ifthen->true_block =
-		LLVMInsertBasicBlockInContext(ctx->context,
-					      ifthen->merge_block,
-					      "if-true-block");
-
-	/* successive code goes into the true block */
-	LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block);
-}
-
-/**
- * End a conditional.
- */
-static void
-ac_nir_build_endif(struct ac_build_if_state *ifthen)
-{
-	LLVMBuilderRef builder = ifthen->ctx->ac.builder;
-
-	/* Insert branch to the merge block from current block */
-	LLVMBuildBr(builder, ifthen->merge_block);
-
-	/*
-	 * Now patch in the various branch instructions.
-	 */
-
-	/* Insert the conditional branch instruction at the end of entry_block */
-	LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
-	if (ifthen->false_block) {
-		/* we have an else clause */
-		LLVMBuildCondBr(builder, ifthen->condition,
-				ifthen->true_block, ifthen->false_block);
-	}
-	else {
-		/* no else clause */
-		LLVMBuildCondBr(builder, ifthen->condition,
-				ifthen->true_block, ifthen->merge_block);
-	}
-
-	/* Resume building code at end of the ifthen->merge_block */
-	LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
-}
-
 static void
 write_tess_factors(struct radv_shader_context *ctx)
 {
@@ -2647,7 +2736,7 @@ write_tess_factors(struct radv_shader_context *ctx)
 		outer[i] = LLVMGetUndef(ctx->ac.i32);
 	}

-	// LINES reverseal
+	// LINES reversal
 	if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
 		outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
 		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
@@ -2878,13 +2967,17 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
 	}
 }

-static void ac_llvm_finalize_module(struct radv_shader_context *ctx)
+static void ac_llvm_finalize_module(struct radv_shader_context *ctx,
+				    const struct radv_nir_compiler_options *options)
 {
 	LLVMPassManagerRef passmgr;
 	/* Create the pass manager */
 	passmgr = LLVMCreateFunctionPassManagerForModule(
 							ctx->ac.module);

+	if (options->check_ir)
+		LLVMAddVerifierPass(passmgr);
+
 	/* This pass should eliminate all the load and store instructions */
 	LLVMAddPromoteMemoryToRegisterPass(passmgr);

@@ -2893,6 +2986,8 @@ static void ac_llvm_finalize_module(struct radv_shader_context *ctx)
 	LLVMAddLICMPass(passmgr);
 	LLVMAddAggressiveDCEPass(passmgr);
 	LLVMAddCFGSimplificationPass(passmgr);
+	/* This is recommended by the instruction combining pass. */
+	LLVMAddEarlyCSEMemSSAPass(passmgr);
 	LLVMAddInstructionCombiningPass(passmgr);

 	/* Run the pass */
@@ -2942,9 +3037,16 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
 static void
 ac_setup_rings(struct radv_shader_context *ctx)
 {
-	if ((ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) ||
-	    (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->options->key.tes.as_es)) {
-		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_VS, false));
+	if (ctx->options->chip_class <= VI &&
+	    (ctx->stage == MESA_SHADER_GEOMETRY ||
+	     ctx->options->key.vs.as_es || ctx->options->key.tes.as_es)) {
+		unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
+								   : RING_ESGS_VS;
+		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
+
+		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac,
+						       ctx->ring_offsets,
+						       offset);
 	}

 	if (ctx->is_gs_copy_shader) {
@@ -2955,7 +3057,6 @@ ac_setup_rings(struct radv_shader_context *ctx)
 		uint32_t num_entries = 64;
 		LLVMValueRef gsvs_ring_stride = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size, false);
 		LLVMValueRef gsvs_ring_desc = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size << 16, false);
-		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_GS, false));
 		ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));

 		ctx->gsvs_ring = LLVMBuildBitCast(ctx->ac.builder, ctx->gsvs_ring, ctx->ac.v4i32, "");
@@ -3006,7 +3107,6 @@ static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
 	LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
 	                                      ctx->ac.i32_0, "");
 	ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, "");
-	ctx->vs_prim_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.vertex_id, ctx->vs_prim_id, "");
 	ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, "");
 	ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, "");
 }
@@ -3202,7 +3302,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 	if (options->dump_preoptir)
 		ac_dump_module(ctx.ac.module);

-	ac_llvm_finalize_module(&ctx);
+	ac_llvm_finalize_module(&ctx, options);

 	if (shader_count == 1)
 		ac_nir_eliminate_const_vs_outputs(&ctx);
@@ -3500,6 +3600,8 @@ radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
 	ctx.ac.builder = ac_create_builder(ctx.context, float_mode);
 	ctx.stage = MESA_SHADER_VERTEX;

+	radv_nir_shader_info_pass(geom_shader, options, &shader_info->info);
+
 	create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);

 	ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
@@ -3518,7 +3620,7 @@ radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,

 	LLVMBuildRetVoid(ctx.ac.builder);

-	ac_llvm_finalize_module(&ctx);
+	ac_llvm_finalize_module(&ctx, options);

 	ac_compile_llvm_module(tm, ctx.ac.module, binary, config, shader_info,
 			       MESA_SHADER_VERTEX, options);
--- a/src/amd/vulkan/radv_pass.c
+++ b/src/amd/vulkan/radv_pass.c
@@ -50,7 +50,7 @@ VkResult radv_CreateRenderPass(
 	pass = vk_alloc2(&device->alloc, pAllocator, size, 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pass == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	memset(pass, 0, size);
 	pass->attachment_count = pCreateInfo->attachmentCount;
@@ -87,8 +87,8 @@ VkResult radv_CreateRenderPass(
 		subpass_attachment_count +=
 			desc->inputAttachmentCount +
 			desc->colorAttachmentCount +
-			/* Count colorAttachmentCount again for resolve_attachments */
-			desc->colorAttachmentCount;
+			(desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
+			(desc->pDepthStencilAttachment != NULL);
 	}

 	if (subpass_attachment_count) {
@@ -98,7 +98,7 @@ VkResult radv_CreateRenderPass(
 				    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 		if (pass->subpass_attachments == NULL) {
 			vk_free2(&device->alloc, pAllocator, pass);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 	} else
 		pass->subpass_attachments = NULL;
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -174,13 +174,54 @@ radv_pipeline_scratch_init(struct radv_device *device,
 	if (scratch_bytes_per_wave && max_waves < min_waves) {
 		/* Not really true at this moment, but will be true on first
 		 * execution. Avoid having hanging shaders. */
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 	pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
 	pipeline->max_waves = max_waves;
 	return VK_SUCCESS;
 }

+static uint32_t si_translate_blend_logic_op(VkLogicOp op)
+{
+	switch (op) {
+	case VK_LOGIC_OP_CLEAR:
+		return V_028808_ROP3_CLEAR;
+	case VK_LOGIC_OP_AND:
+		return V_028808_ROP3_AND;
+	case VK_LOGIC_OP_AND_REVERSE:
+		return V_028808_ROP3_AND_REVERSE;
+	case VK_LOGIC_OP_COPY:
+		return V_028808_ROP3_COPY;
+	case VK_LOGIC_OP_AND_INVERTED:
+		return V_028808_ROP3_AND_INVERTED;
+	case VK_LOGIC_OP_NO_OP:
+		return V_028808_ROP3_NO_OP;
+	case VK_LOGIC_OP_XOR:
+		return V_028808_ROP3_XOR;
+	case VK_LOGIC_OP_OR:
+		return V_028808_ROP3_OR;
+	case VK_LOGIC_OP_NOR:
+		return V_028808_ROP3_NOR;
+	case VK_LOGIC_OP_EQUIVALENT:
+		return V_028808_ROP3_EQUIVALENT;
+	case VK_LOGIC_OP_INVERT:
+		return V_028808_ROP3_INVERT;
+	case VK_LOGIC_OP_OR_REVERSE:
+		return V_028808_ROP3_OR_REVERSE;
+	case VK_LOGIC_OP_COPY_INVERTED:
+		return V_028808_ROP3_COPY_INVERTED;
+	case VK_LOGIC_OP_OR_INVERTED:
+		return V_028808_ROP3_OR_INVERTED;
+	case VK_LOGIC_OP_NAND:
+		return V_028808_ROP3_NAND;
+	case VK_LOGIC_OP_SET:
+		return V_028808_ROP3_SET;
+	default:
+		unreachable("Unhandled logic op");
+	}
+}
+
+
 static uint32_t si_translate_blend_function(VkBlendOp op)
 {
 	switch (op) {
@@ -463,6 +504,7 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
 	unsigned col_format = 0;
+	unsigned num_targets;

 	for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
 		unsigned cf;
@@ -482,6 +524,16 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
 		col_format |= cf << (4 * i);
 	}

+	/* If the i-th target format is set, all previous target formats must
+	 * be non-zero to avoid hangs.
+	 */
+	num_targets = (util_last_bit(col_format) + 3) / 4;
+	for (unsigned i = 0; i < num_targets; i++) {
+		if (!(col_format & (0xf << (i * 4)))) {
+			col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
+		}
+	}
+
 	blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);

 	if (blend->mrt0_is_dual_src)
@@ -570,7 +622,7 @@ radv_blend_check_commutativity(struct radv_blend_state *blend,
 		(1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA);

 	if (dst == VK_BLEND_FACTOR_ONE &&
-	    (src_allowed && (1u << src))) {
+	    (src_allowed & (1u << src))) {
 		/* Addition is commutative, but floating point addition isn't
 		 * associative: subtle changes can be introduced via different
 		 * rounding. Be conservative, only enable for min and max.
@@ -600,9 +652,9 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
 	}
 	blend.cb_color_control = 0;
 	if (vkblend->logicOpEnable)
-		blend.cb_color_control |= S_028808_ROP3(vkblend->logicOp | (vkblend->logicOp << 4));
+		blend.cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp));
 	else
-		blend.cb_color_control |= S_028808_ROP3(0xcc);
+		blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);

 	blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
 		S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
@@ -1542,21 +1594,25 @@ static void si_multiwave_lds_size_workaround(struct radv_device *device,
 }

 struct radv_shader_variant *
-radv_get_vertex_shader(struct radv_pipeline *pipeline)
+radv_get_shader(struct radv_pipeline *pipeline,
+		gl_shader_stage stage)
 {
-	if (pipeline->shaders[MESA_SHADER_VERTEX])
-		return pipeline->shaders[MESA_SHADER_VERTEX];
-	if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
-		return pipeline->shaders[MESA_SHADER_TESS_CTRL];
-	return pipeline->shaders[MESA_SHADER_GEOMETRY];
-}
-
-static struct radv_shader_variant *
-radv_get_tess_eval_shader(struct radv_pipeline *pipeline)
-{
-	if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
-		return pipeline->shaders[MESA_SHADER_TESS_EVAL];
-	return pipeline->shaders[MESA_SHADER_GEOMETRY];
+	if (stage == MESA_SHADER_VERTEX) {
+		if (pipeline->shaders[MESA_SHADER_VERTEX])
+			return pipeline->shaders[MESA_SHADER_VERTEX];
+		if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
+			return pipeline->shaders[MESA_SHADER_TESS_CTRL];
+		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+			return pipeline->shaders[MESA_SHADER_GEOMETRY];
+	} else if (stage == MESA_SHADER_TESS_EVAL) {
+		if (!radv_pipeline_has_tess(pipeline))
+			return NULL;
+		if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
+			return pipeline->shaders[MESA_SHADER_TESS_EVAL];
+		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
+			return pipeline->shaders[MESA_SHADER_GEOMETRY];
+	}
+	return pipeline->shaders[stage];
 }

 static struct radv_tessellation_state
@@ -1591,7 +1647,7 @@ calculate_tess_state(struct radv_pipeline *pipeline,
 		S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
 	tess.num_patches = num_patches;

-	struct radv_shader_variant *tes = radv_get_tess_eval_shader(pipeline);
+	struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL);
 	unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;

 	switch (tes->info.tes.primitive_mode) {
@@ -1724,13 +1780,13 @@ radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders)
 				ac_lower_indirect_derefs(ordered_shaders[i],
 				                         pipeline->device->physical_device->rad_info.chip_class);
 			}
-			radv_optimize_nir(ordered_shaders[i]);
+			radv_optimize_nir(ordered_shaders[i], false);

 			if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) {
 				ac_lower_indirect_derefs(ordered_shaders[i - 1],
 				                         pipeline->device->physical_device->rad_info.chip_class);
 			}
-			radv_optimize_nir(ordered_shaders[i - 1]);
+			radv_optimize_nir(ordered_shaders[i - 1], false);
 		}
 	}
 }
@@ -1750,6 +1806,9 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 	struct radv_pipeline_key key;
 	memset(&key, 0, sizeof(key));

+	if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
+		key.optimisations_disabled = 1;
+
 	key.has_multiview_view_index = has_view_index;

 	uint32_t binding_input_rate = 0;
@@ -1769,13 +1828,36 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 	}

 	for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
-		unsigned binding;
-		binding = input_state->pVertexAttributeDescriptions[i].binding;
+		unsigned location = input_state->pVertexAttributeDescriptions[i].location;
+		unsigned binding = input_state->pVertexAttributeDescriptions[i].binding;
 		if (binding_input_rate & (1u << binding)) {
-			unsigned location = input_state->pVertexAttributeDescriptions[i].location;
 			key.instance_rate_inputs |= 1u << location;
 			key.instance_rate_divisors[location] = instance_rate_divisors[binding];
 		}
+
+		if (pipeline->device->physical_device->rad_info.chip_class <= VI &&
+		    pipeline->device->physical_device->rad_info.family != CHIP_STONEY) {
+			VkFormat format = input_state->pVertexAttributeDescriptions[i].format;
+			uint64_t adjust;
+			switch(format) {
+			case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
+			case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
+				adjust = RADV_ALPHA_ADJUST_SNORM;
+				break;
+			case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
+			case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
+				adjust = RADV_ALPHA_ADJUST_SSCALED;
+				break;
+			case VK_FORMAT_A2R10G10B10_SINT_PACK32:
+			case VK_FORMAT_A2B10G10R10_SINT_PACK32:
+				adjust = RADV_ALPHA_ADJUST_SINT;
+				break;
+			default:
+				adjust = 0;
+				break;
+			}
+			key.vertex_alpha_adjust |= adjust << (2 * location);
+		}
 	}

 	if (pCreateInfo->pTessellationState)
@@ -1786,7 +1868,6 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
 	    pCreateInfo->pMultisampleState->rasterizationSamples > 1) {
 		uint32_t num_samples = pCreateInfo->pMultisampleState->rasterizationSamples;
 		uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo->pMultisampleState);
-		key.multisample = true;
 		key.log2_num_samples = util_logbase2(num_samples);
 		key.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
 	}
@@ -1804,6 +1885,7 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
                      nir_shader **nir)
 {
 	keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs;
+	keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust;
 	for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i)
 		keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i];

@@ -1826,7 +1908,6 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
 	for(int i = 0; i < MESA_SHADER_STAGES; ++i)
 		keys[i].has_multiview_view_index = key->has_multiview_view_index;

-	keys[MESA_SHADER_FRAGMENT].fs.multisample = key->multisample;
 	keys[MESA_SHADER_FRAGMENT].fs.col_format = key->col_format;
 	keys[MESA_SHADER_FRAGMENT].fs.is_int8 = key->is_int8;
 	keys[MESA_SHADER_FRAGMENT].fs.is_int10 = key->is_int10;
@@ -1878,7 +1959,8 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
                         struct radv_device *device,
                         struct radv_pipeline_cache *cache,
                         struct radv_pipeline_key key,
-                         const VkPipelineShaderStageCreateInfo **pStages)
+                         const VkPipelineShaderStageCreateInfo **pStages,
+                         const VkPipelineCreateFlags flags)
 {
 	struct radv_shader_module fs_m = {0};
 	struct radv_shader_module *modules[MESA_SHADER_STAGES] = { 0, };
@@ -1895,6 +1977,8 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 				_mesa_sha1_compute(modules[i]->nir->info.name,
 				                   strlen(modules[i]->nir->info.name),
 				                   modules[i]->sha1);
+
+			pipeline->active_stages |= mesa_to_vk_shader_stage(i);
 		}
 	}

@@ -1910,10 +1994,6 @@ void radv_create_shaders(struct radv_pipeline *pipeline,

 	if (radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders) &&
 	    (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader)) {
-		for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
-			if (pipeline->shaders[i])
-				pipeline->active_stages |= mesa_to_vk_shader_stage(i);
-		}
 		return;
 	}

@@ -1944,8 +2024,8 @@ void radv_create_shaders(struct radv_pipeline *pipeline,

 		nir[i] = radv_shader_compile_to_nir(device, modules[i],
 						    stage ? stage->pName : "main", i,
-						    stage ? stage->pSpecializationInfo : NULL);
-		pipeline->active_stages |= mesa_to_vk_shader_stage(i);
+						    stage ? stage->pSpecializationInfo : NULL,
+						    flags);

 		/* We don't want to alter meta shaders IR directly so clone it
 		 * first.
@@ -1963,8 +2043,10 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 			if (i != last)
 				mask = mask | nir_var_shader_out;

-			nir_lower_io_to_scalar_early(nir[i], mask);
-			radv_optimize_nir(nir[i]);
+			if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) {
+				nir_lower_io_to_scalar_early(nir[i], mask);
+				radv_optimize_nir(nir[i], false);
+			}
 		}
 	}

@@ -1973,10 +2055,11 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 		merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
 	}

-	radv_link_shaders(pipeline, nir);
+	if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
+		radv_link_shaders(pipeline, nir);

 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
-		if (modules[i] && radv_can_dump_shader(device, modules[i]))
+		if (radv_can_dump_shader(device, modules[i], false))
 			nir_print_shader(nir[i], stderr);
 	}

@@ -3076,7 +3159,7 @@ radv_pipeline_generate_vgt_vertex_reuse(struct radeon_winsys_cs *cs,

 	unsigned vtx_reuse_depth = 30;
 	if (radv_pipeline_has_tess(pipeline) &&
-	    radv_get_tess_eval_shader(pipeline)->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) {
+	    radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) {
 		vtx_reuse_depth = 14;
 	}
 	radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
@@ -3206,8 +3289,9 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline,
 		}
 	}
 	/* GS requirement. */
-	if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
-		ia_multi_vgt_param.partial_es_wave = true;
+	if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= VI)
+		if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
+			ia_multi_vgt_param.partial_es_wave = true;

 	ia_multi_vgt_param.wd_switch_on_eop = false;
 	if (device->physical_device->rad_info.chip_class >= CIK) {
@@ -3236,7 +3320,7 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline,
 	if (radv_pipeline_has_tess(pipeline)) {
 		/* SWITCH_ON_EOI must be set if PrimID is used. */
 		if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.info.uses_prim_id ||
-		    radv_get_tess_eval_shader(pipeline)->info.info.uses_prim_id)
+		    radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.info.uses_prim_id)
 			ia_multi_vgt_param.ia_switch_on_eoi = true;
 	}

@@ -3348,7 +3432,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,

 	radv_create_shaders(pipeline, device, cache, 
 	                    radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend, has_view_index),
-	                    pStages);
+	                    pStages, pCreateInfo->flags);

 	pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 	radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo);
@@ -3426,7 +3510,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 	if (loc->sgpr_idx != -1) {
 		pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX];
 		pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4;
-		if (radv_get_vertex_shader(pipeline)->info.info.vs.needs_draw_id)
+		if (radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id)
 			pipeline->graphics.vtx_emit_num = 3;
 		else
 			pipeline->graphics.vtx_emit_num = 2;
@@ -3455,7 +3539,7 @@ radv_graphics_pipeline_create(
 	pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pipeline == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	result = radv_pipeline_init(pipeline, device, cache,
 				    pCreateInfo, extra, pAllocator);
@@ -3574,14 +3658,14 @@ static VkResult radv_compute_pipeline_create(
 	pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pipeline == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	pipeline->device = device;
 	pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
 	assert(pipeline->layout);

 	pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
-	radv_create_shaders(pipeline, device, cache, (struct radv_pipeline_key) {0}, pStages);
+	radv_create_shaders(pipeline, device, cache, (struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags);

 	pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
 	pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -206,7 +206,7 @@ radv_pipeline_cache_grow(struct radv_pipeline_cache *cache)

 	table = malloc(byte_size);
 	if (table == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	cache->hash_table = table;
 	cache->table_size = table_size;
@@ -515,7 +515,7 @@ VkResult radv_CreatePipelineCache(
 			    sizeof(*cache), 8,
 			    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (cache == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	if (pAllocator)
 		cache->alloc = *pAllocator;
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -57,8 +57,10 @@
 #include "ac_nir_to_llvm.h"
 #include "ac_gpu_info.h"
 #include "ac_surface.h"
+#include "ac_llvm_build.h"
 #include "radv_descriptor_set.h"
 #include "radv_extensions.h"
+#include "radv_cs.h"

 #include <llvm-c/TargetMachine.h>

@@ -215,20 +217,19 @@ radv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask)
 * propagating errors. Might be useful to plug in a stack trace here.
 */

-VkResult __vk_errorf(VkResult error, const char *file, int line, const char *format, ...);
+struct radv_instance;

-#ifdef DEBUG
-#define vk_error(error) __vk_errorf(error, __FILE__, __LINE__, NULL);
-#define vk_errorf(error, format, ...) __vk_errorf(error, __FILE__, __LINE__, format, ## __VA_ARGS__);
-#else
-#define vk_error(error) error
-#define vk_errorf(error, format, ...) error
-#endif
+VkResult __vk_errorf(struct radv_instance *instance, VkResult error, const char *file, int line, const char *format, ...);
+
+#define vk_error(instance, error) __vk_errorf(instance, error, __FILE__, __LINE__, NULL);
+#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__);

 void __radv_finishme(const char *file, int line, const char *format, ...)
 	radv_printflike(3, 4);
 void radv_loge(const char *format, ...) radv_printflike(1, 2);
 void radv_loge_v(const char *format, va_list va);
+void radv_logi(const char *format, ...) radv_printflike(1, 2);
+void radv_logi_v(const char *format, va_list va);

 /**
 * Print a FINISHME message, including its source location.
@@ -352,14 +353,15 @@ struct radv_pipeline_cache {
 struct radv_pipeline_key {
 	uint32_t instance_rate_inputs;
 	uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
+	uint64_t vertex_alpha_adjust; /* copied into the vertex stage's vs.alpha_adjust variant key */
 	unsigned tess_input_vertices;
 	uint32_t col_format;
 	uint32_t is_int8;
 	uint32_t is_int10;
 	uint8_t log2_ps_iter_samples;
 	uint8_t log2_num_samples;
-	uint32_t multisample : 1;
 	uint32_t has_multiview_view_index : 1;
+	uint32_t optimisations_disabled : 1; /* NOTE(review): presumably tracks VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT - confirm */
 };

 void
@@ -465,18 +467,18 @@ struct radv_meta_state {
 	} blit;

 	struct {
-		VkRenderPass render_passes[NUM_META_FS_KEYS][RADV_META_DST_LAYOUT_COUNT];
+		VkPipelineLayout p_layouts[5];
+		VkDescriptorSetLayout ds_layouts[5];
+		VkPipeline pipelines[5][NUM_META_FS_KEYS];

-		VkPipelineLayout p_layouts[3];
-		VkDescriptorSetLayout ds_layouts[3];
-		VkPipeline pipelines[3][NUM_META_FS_KEYS];
+		VkPipeline depth_only_pipeline[5];

-		VkRenderPass depth_only_rp[RADV_BLIT_DS_LAYOUT_COUNT];
-		VkPipeline depth_only_pipeline[3];
+		VkPipeline stencil_only_pipeline[5];
+	} blit2d[1 + MAX_SAMPLES_LOG2];

-		VkRenderPass stencil_only_rp[RADV_BLIT_DS_LAYOUT_COUNT];
-		VkPipeline stencil_only_pipeline[3];
-	} blit2d;
+	VkRenderPass blit2d_render_passes[NUM_META_FS_KEYS][RADV_META_DST_LAYOUT_COUNT];
+	VkRenderPass blit2d_depth_only_rp[RADV_BLIT_DS_LAYOUT_COUNT];
+	VkRenderPass blit2d_stencil_only_rp[RADV_BLIT_DS_LAYOUT_COUNT];

 	struct {
 		VkPipelineLayout                          img_p_layout;
@@ -622,7 +624,6 @@ struct radv_device {
 	struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];

 	bool always_use_syncobj;
-	bool llvm_supports_spill;
 	bool has_distributed_tess;
 	bool pbb_allowed;
 	bool dfsm_allowed;
@@ -1108,14 +1109,17 @@ void radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer);
 void radv_cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples);
 unsigned radv_cayman_get_maxdist(int log_samples);
 void radv_device_init_msaa(struct radv_device *device);
-void radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       VkClearDepthStencilValue ds_clear_value,
-			       VkImageAspectFlags aspects);
-void radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       int idx,
-			       uint32_t color_values[2]);
+
+void radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+				struct radv_image *image,
+				VkClearDepthStencilValue ds_clear_value,
+				VkImageAspectFlags aspects);
+
+void radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+				   struct radv_image *image,
+				   int cb_idx,
+				   uint32_t color_values[2]);
+
 void radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
 				       struct radv_image *image,
 				       bool value);
@@ -1127,6 +1131,41 @@ bool radv_get_memory_fd(struct radv_device *device,
 			struct radv_device_memory *memory,
 			int *pFD);

+static inline void
+radv_emit_shader_pointer_head(struct radeon_winsys_cs *cs,
+			      unsigned sh_offset, unsigned pointer_count,
+			      bool use_32bit_pointers)
+{ /* Emits the PKT3_SET_SH_REG header and the register offset for pointer_count pointers. */
+	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count * (use_32bit_pointers ? 1 : 2), 0)); /* count is doubled unless use_32bit_pointers */
+	radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); /* offset relative to SI_SH_REG_OFFSET, divided by 4 */
+}
+/* Emits one pointer value: va, followed by va >> 32 unless use_32bit_pointers is set. */
+static inline void
+radv_emit_shader_pointer_body(struct radv_device *device,
+			      struct radeon_winsys_cs *cs,
+			      uint64_t va, bool use_32bit_pointers)
+{
+	radeon_emit(cs, va);
+	/* 32-bit mode emits nothing more; it only asserts that va is 0 or has address32_hi as its high half. */
+	if (use_32bit_pointers) {
+		assert(va == 0 ||
+		       (va >> 32) == device->physical_device->rad_info.address32_hi);
+	} else {
+		radeon_emit(cs, va >> 32);
+	}
+}
+/* Emits the head and the body for a single pointer at sh_offset. */
+static inline void
+radv_emit_shader_pointer(struct radv_device *device,
+			 struct radeon_winsys_cs *cs,
+			 uint32_t sh_offset, uint64_t va, bool global)
+{
+	bool use_32bit_pointers = HAVE_32BIT_POINTERS && !global;
+	/* A global pointer therefore never takes the 32-bit form. */
+	radv_emit_shader_pointer_head(cs, sh_offset, 1, use_32bit_pointers);
+	radv_emit_shader_pointer_body(device, cs, va, use_32bit_pointers);
+}
+
 static inline struct radv_descriptor_state *
 radv_get_descriptors_state(struct radv_cmd_buffer *cmd_buffer,
 			   VkPipelineBindPoint bind_point)
@@ -1279,7 +1318,8 @@ struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
 						 gl_shader_stage stage,
 						 int idx);

-struct radv_shader_variant *radv_get_vertex_shader(struct radv_pipeline *pipeline);
+struct radv_shader_variant *radv_get_shader(struct radv_pipeline *pipeline,
+					    gl_shader_stage stage);

 struct radv_graphics_pipeline_create_info {
 	bool use_rectlist;
@@ -1706,14 +1746,6 @@ struct radv_semaphore {
 	uint32_t temp_syncobj;
 };

-VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
-			     int num_wait_sems,
-			     const VkSemaphore *wait_sems,
-			     int num_signal_sems,
-			     const VkSemaphore *signal_sems,
-			     VkFence fence);
-void radv_free_sem_info(struct radv_winsys_sem_info *sem_info);
-
 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
 			     VkPipelineBindPoint bind_point,
 			     struct radv_descriptor_set *set,
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -753,7 +753,7 @@ VkResult radv_CreateQueryPool(
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

 	if (!pool)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);


 	switch(pCreateInfo->queryType) {
@@ -783,7 +783,7 @@ VkResult radv_CreateQueryPool(

 	if (!pool->bo) {
 		vk_free2(&device->alloc, pAllocator, pool);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}

 	pool->ptr = device->ws->buffer_map(pool->bo);
@@ -791,7 +791,7 @@ VkResult radv_CreateQueryPool(
 	if (!pool->ptr) {
 		device->ws->buffer_destroy(pool->bo);
 		vk_free2(&device->alloc, pAllocator, pool);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 	memset(pool->ptr, 0, pool->size);

@@ -1140,12 +1140,12 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,

 		cmd_buffer->state.active_occlusion_queries--;
 		if (cmd_buffer->state.active_occlusion_queries == 0) {
+			radv_set_db_count_control(cmd_buffer);
+
 			/* Reset the perfect occlusion queries hint now that no
 			 * queries are active.
 			 */
 			cmd_buffer->state.perfect_occlusion_queries_enabled = false;
-
-			radv_set_db_count_control(cmd_buffer);
 		}

 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -1204,25 +1204,6 @@ void radv_CmdBeginQuery(
 	va += pool->stride * query;

 	emit_begin_query(cmd_buffer, va, pool->type, flags);
-
-	/*
-	 * For multiview we have to emit a query for each bit in the mask,
-	 * however the first query we emit will get the totals for all the
-	 * operations, so we don't want to get a real value in the other
-	 * queries. This emits a fake begin/end sequence so the waiting
-	 * code gets a completed query value and doesn't hang, but the
-	 * query returns 0.
-	 */
-	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
-		uint64_t avail_va = va + pool->availability_offset + 4 * query;
-
-		for (unsigned i = 0; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
-			va += pool->stride;
-			avail_va += 4;
-			emit_begin_query(cmd_buffer, va, pool->type, flags);
-			emit_end_query(cmd_buffer, va, avail_va, pool->type);
-		}
-	}
 }


@@ -1241,6 +1222,26 @@ void radv_CmdEndQuery(
 	 * currently be active, which means the BO is already in the list.
 	 */
 	emit_end_query(cmd_buffer, va, avail_va, pool->type);
+
+	/*
+	 * For multiview we have to emit a query for each bit in the mask,
+	 * however the first query we emit will get the totals for all the
+	 * operations, so we don't want to get a real value in the other
+	 * queries. This emits a fake begin/end sequence so the waiting
+	 * code gets a completed query value and doesn't hang, but the
+	 * query returns 0.
+	 */
+	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+		uint64_t avail_va = va + pool->availability_offset + 4 * query;
+
+
+		for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
+			va += pool->stride;
+			avail_va += 4;
+			emit_begin_query(cmd_buffer, va, pool->type, 0);
+			emit_end_query(cmd_buffer, va, avail_va, pool->type);
+		}
+	}
 }

 void radv_CmdWriteTimestamp(
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -57,6 +57,7 @@ enum radeon_bo_flag { /* bitfield */
 	RADEON_FLAG_IMPLICIT_SYNC = (1 << 5),
 	RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 6),
 	RADEON_FLAG_READ_ONLY =     (1 << 7),
+	RADEON_FLAG_32BIT =         (1 << 8),
 };

 enum radeon_bo_usage { /* bitfield */
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -36,6 +36,7 @@

 #include <llvm-c/Core.h>
 #include <llvm-c/TargetMachine.h>
+#include <llvm-c/Support.h>

 #include "sid.h"
 #include "gfx9d.h"
@@ -89,7 +90,7 @@ VkResult radv_CreateShaderModule(
 			     sizeof(*module) + pCreateInfo->codeSize, 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (module == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

 	module->nir = NULL;
 	module->size = pCreateInfo->codeSize;
@@ -117,7 +118,7 @@ void radv_DestroyShaderModule(
 }

 void
-radv_optimize_nir(struct nir_shader *shader)
+radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively)
 {
        bool progress;

@@ -125,7 +126,7 @@ radv_optimize_nir(struct nir_shader *shader)
                progress = false;

                NIR_PASS_V(shader, nir_lower_vars_to_ssa);
-		NIR_PASS_V(shader, nir_lower_64bit_pack);
+		NIR_PASS_V(shader, nir_lower_pack);
                NIR_PASS_V(shader, nir_lower_alu_to_scalar);
                NIR_PASS_V(shader, nir_lower_phis_to_scalar);

@@ -149,7 +150,7 @@ radv_optimize_nir(struct nir_shader *shader)
                if (shader->options->max_unroll_iterations) {
                        NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
                }
-        } while (progress);
+        } while (progress && !optimize_conservatively);

        NIR_PASS(progress, shader, nir_opt_shrink_load);
        NIR_PASS(progress, shader, nir_opt_move_load_ubo);
@@ -160,12 +161,9 @@ radv_shader_compile_to_nir(struct radv_device *device,
 			   struct radv_shader_module *module,
 			   const char *entrypoint_name,
 			   gl_shader_stage stage,
-			   const VkSpecializationInfo *spec_info)
+			   const VkSpecializationInfo *spec_info,
+			   const VkPipelineCreateFlags flags)
 {
-	if (strcmp(entrypoint_name, "main") != 0) {
-		radv_finishme("Multiple shaders per module not really supported");
-	}
-
 	nir_shader *nir;
 	nir_function *entry_point;
 	if (module->nir) {
@@ -280,7 +278,20 @@ radv_shader_compile_to_nir(struct radv_device *device,
 	nir_lower_tex(nir, &tex_options);

 	nir_lower_vars_to_ssa(nir);
+
+	if (nir->info.stage == MESA_SHADER_VERTEX ||
+	    nir->info.stage == MESA_SHADER_GEOMETRY) {
+		NIR_PASS_V(nir, nir_lower_io_to_temporaries,
+			   nir_shader_get_entrypoint(nir), true, true);
+	} else if (nir->info.stage == MESA_SHADER_TESS_EVAL||
+		   nir->info.stage == MESA_SHADER_FRAGMENT) {
+		NIR_PASS_V(nir, nir_lower_io_to_temporaries,
+			   nir_shader_get_entrypoint(nir), true, false);
+	}
+
+	nir_split_var_copies(nir);
 	nir_lower_var_copies(nir);
+
 	nir_lower_global_vars_to_local(nir);
 	nir_remove_dead_variables(nir, nir_var_local);
 	nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) {
@@ -293,7 +304,8 @@ radv_shader_compile_to_nir(struct radv_device *device,
 			.lower_vote_eq_to_ballot = 1,
 		});

-	radv_optimize_nir(nir);
+	if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
+		radv_optimize_nir(nir, false);

 	/* Indirect lowering must be called after the radv_optimize_nir() loop
 	 * has been called at least once. Otherwise indirect lowering can
@@ -301,7 +313,7 @@ radv_shader_compile_to_nir(struct radv_device *device,
 	 * considered too large for unrolling.
 	 */
 	ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class);
-	radv_optimize_nir(nir);
+	radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT);

 	return nir;
 }
@@ -371,16 +383,14 @@ radv_fill_shader_variant(struct radv_device *device,
 			 gl_shader_stage stage)
 {
 	bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
+	struct radv_shader_info *info = &variant->info.info;
 	unsigned vgpr_comp_cnt = 0;

-	if (scratch_enabled && !device->llvm_supports_spill)
-		radv_finishme("shader scratch support only available with LLVM 4.0");
-
 	variant->code_size = binary->code_size;
 	variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) |
-			S_00B12C_SCRATCH_EN(scratch_enabled);
+			 S_00B12C_SCRATCH_EN(scratch_enabled);

-	variant->rsrc1 =  S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) |
+	variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) |
 		S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) |
 		S_00B848_DX10_CLAMP(1) |
 		S_00B848_FLOAT_MODE(variant->config.float_mode);
@@ -391,10 +401,11 @@ radv_fill_shader_variant(struct radv_device *device,
 		variant->rsrc2 |= S_00B12C_OC_LDS_EN(1);
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		if (device->physical_device->rad_info.chip_class >= GFX9)
+		if (device->physical_device->rad_info.chip_class >= GFX9) {
 			vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt;
-		else
+		} else {
 			variant->rsrc2 |= S_00B12C_OC_LDS_EN(1);
+		}
 		break;
 	case MESA_SHADER_VERTEX:
 	case MESA_SHADER_GEOMETRY:
@@ -402,8 +413,7 @@ radv_fill_shader_variant(struct radv_device *device,
 		break;
 	case MESA_SHADER_FRAGMENT:
 		break;
-	case MESA_SHADER_COMPUTE: {
-		struct radv_shader_info *info = &variant->info.info;
+	case MESA_SHADER_COMPUTE:
 		variant->rsrc2 |=
 			S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) |
 			S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) |
@@ -413,7 +423,6 @@ radv_fill_shader_variant(struct radv_device *device,
 			S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
 			S_00B84C_LDS_SIZE(variant->config.lds_size);
 		break;
-	}
 	default:
 		unreachable("unsupported shader type");
 		break;
@@ -421,7 +430,6 @@ radv_fill_shader_variant(struct radv_device *device,

 	if (device->physical_device->rad_info.chip_class >= GFX9 &&
 	    stage == MESA_SHADER_GEOMETRY) {
-		struct radv_shader_info *info = &variant->info.info;
 		unsigned es_type = variant->info.gs.es_type;
 		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;

@@ -436,28 +444,106 @@ radv_fill_shader_variant(struct radv_device *device,
 		/* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
 		 * VGPR[0:4] are always loaded.
 		 */
-		if (info->uses_invocation_id)
+		if (info->uses_invocation_id) {
 			gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
-		else if (info->uses_prim_id)
+		} else if (info->uses_prim_id) {
 			gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
-		else if (variant->info.gs.vertices_in >= 3)
+		} else if (variant->info.gs.vertices_in >= 3) {
 			gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
-		else
+		} else {
 			gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+		}

 		variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
 		variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
 		                  S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL);
 	} else if (device->physical_device->rad_info.chip_class >= GFX9 &&
-	    stage == MESA_SHADER_TESS_CTRL)
+		   stage == MESA_SHADER_TESS_CTRL) {
 		variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
-	else
+	} else {
 		variant->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
+	}

 	void *ptr = radv_alloc_shader_memory(device, variant);
 	memcpy(ptr, binary->code, binary->code_size);
 }

+static void radv_init_llvm_target(void)
+{
+	/* Registers the AMDGPU LLVM components, then sets global LLVM options. */
+	LLVMInitializeAMDGPUTargetInfo();
+	LLVMInitializeAMDGPUTarget();
+	LLVMInitializeAMDGPUTargetMC();
+	LLVMInitializeAMDGPUAsmPrinter();
+	/* For inline assembly. */
+	LLVMInitializeAMDGPUAsmParser();
+
+	/* Workaround for bug in llvm 4.0 that causes image intrinsics
+	 * to disappear.
+	 * https://reviews.llvm.org/D26348
+	 *
+	 * Workaround for bug in llvm that causes the GPU to hang in presence
+	 * of nested loops because there is an exec mask issue. The proper
+	 * solution is to fix LLVM but this might require a bunch of work.
+	 * https://bugs.llvm.org/show_bug.cgi?id=37744
+	 *
+	 * "mesa" is the prefix for error messages.
+	 */
+	const char *argv[3] = { "mesa", "-simplifycfg-sink-common=false",
+				"-amdgpu-skip-threshold=1" };
+	LLVMParseCommandLineOptions(3, argv, NULL);
+}
+
+static once_flag radv_init_llvm_target_once_flag = ONCE_FLAG_INIT;
+
+static LLVMTargetRef radv_get_llvm_target(const char *triple)
+{
+	/* Returns the LLVM target for 'triple', or NULL after logging to stderr. */
+	LLVMTargetRef target = NULL;
+	char *err_message = NULL;
+
+	call_once(&radv_init_llvm_target_once_flag, radv_init_llvm_target);
+
+	if (LLVMGetTargetFromTriple(triple, &target, &err_message)) {
+		/* Always terminate the line, even when LLVM gives no message. */
+		fprintf(stderr, "Cannot find target for triple %s %s\n",
+			triple, err_message ? err_message : "");
+		LLVMDisposeMessage(err_message);
+		return NULL;
+	}
+	return target;
+}
+/* Creates an LLVM target machine for 'family'; the triple used is stored in *out_triple when out_triple is non-NULL. */
+static LLVMTargetMachineRef radv_create_target_machine(enum radeon_family family,
+						       enum ac_target_machine_options tm_options,
+						       const char **out_triple)
+{
+	assert(family >= CHIP_TAHITI);
+	char features[256];
+	const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--";
+	LLVMTargetRef target = radv_get_llvm_target(triple);
+	/* NOTE(review): radv_get_llvm_target() returns NULL on failure; target is passed on unchecked below. */
+	snprintf(features, sizeof(features),
+		 "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s%s",
+		 tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "",
+		 tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "",
+		 tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "",
+		 tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "");
+
+	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
+	                             target,
+	                             triple,
+	                             ac_get_llvm_processor_name(family),
+				     features,
+	                             LLVMCodeGenLevelDefault,
+	                             LLVMRelocDefault,
+	                             LLVMCodeModelDefault);
+
+	if (out_triple)
+		*out_triple = triple;
+	return tm;
+}
+
 static struct radv_shader_variant *
 shader_variant_create(struct radv_device *device,
 		      struct radv_shader_module *module,
@@ -481,17 +567,19 @@ shader_variant_create(struct radv_device *device,

 	options->family = chip_family;
 	options->chip_class = device->physical_device->rad_info.chip_class;
-	options->dump_shader = radv_can_dump_shader(device, module);
+	options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader);
 	options->dump_preoptir = options->dump_shader &&
 				 device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
 	options->record_llvm_ir = device->keep_shader_info;
+	options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
 	options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
+	options->address32_hi = device->physical_device->rad_info.address32_hi;

 	if (options->supports_spill)
 		tm_options |= AC_TM_SUPPORTS_SPILL;
 	if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED)
 		tm_options |= AC_TM_SISCHED;
-	tm = ac_create_target_machine(chip_family, tm_options);
+	tm = radv_create_target_machine(chip_family, tm_options, NULL);

 	if (gs_copy_shader) {
 		assert(shader_count == 1);
@@ -551,7 +639,7 @@ radv_shader_variant_create(struct radv_device *device,
 		options.key = *key;

 	options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH);
-	options.supports_spill = device->llvm_supports_spill;
+	options.supports_spill = true;

 	return shader_variant_create(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage,
 				     &options, false, code_out, code_size_out);
@@ -615,17 +703,7 @@ generate_shader_stats(struct radv_device *device,
 	unsigned max_simd_waves;
 	unsigned lds_per_wave = 0;

-	switch (device->physical_device->rad_info.family) {
-	/* These always have 8 waves: */
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-		max_simd_waves = 8;
-		break;
-	default:
-		max_simd_waves = 10;
-	}
+	max_simd_waves = ac_get_max_simd_waves(device->physical_device->rad_info.family);

 	conf = &variant->config;

@@ -710,7 +788,7 @@ radv_GetShaderInfoAMD(VkDevice _device,
 	/* Spec doesn't indicate what to do if the stage is invalid, so just
 	 * return no info for this. */
 	if (!variant)
-		return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+		return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT);

 	switch (infoType) {
 	case VK_SHADER_INFO_TYPE_STATISTICS_AMD:
@@ -731,7 +809,7 @@ radv_GetShaderInfoAMD(VkDevice _device,
 				unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2];

 				statistics.numAvailableVgprs = statistics.numPhysicalVgprs /
-							       ceil(workgroup_size / statistics.numPhysicalVgprs);
+							       ceil((double)workgroup_size / statistics.numPhysicalVgprs);

 				statistics.computeWorkGroupSize[0] = local_size[0];
 				statistics.computeWorkGroupSize[1] = local_size[1];
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -55,9 +55,21 @@ struct radv_shader_module {
 	char data[0];
 };

+enum { /* NOTE(review): presumably the values stored in radv_vs_variant_key::alpha_adjust - confirm */
+	RADV_ALPHA_ADJUST_NONE = 0,
+	RADV_ALPHA_ADJUST_SNORM = 1,
+	RADV_ALPHA_ADJUST_SINT = 2,
+	RADV_ALPHA_ADJUST_SSCALED = 3,
+};
+
 struct radv_vs_variant_key {
 	uint32_t instance_rate_inputs;
 	uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
+
+	/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW,
+	 * so we may need to fix it up. */
+	uint64_t alpha_adjust;
+
 	uint32_t as_es:1;
 	uint32_t as_ls:1;
 	uint32_t export_prim_id:1;
@@ -86,7 +98,6 @@ struct radv_fs_variant_key {
 	uint8_t log2_num_samples;
 	uint32_t is_int8;
 	uint32_t is_int10;
-	uint32_t multisample : 1;
 };

 struct radv_shader_variant_key {
@@ -108,9 +119,11 @@ struct radv_nir_compiler_options {
 	bool dump_shader;
 	bool dump_preoptir;
 	bool record_llvm_ir;
+	bool check_ir;
 	enum radeon_family family;
 	enum chip_class chip_class;
 	uint32_t tess_offchip_block_dw_size;
+	uint32_t address32_hi;
 };

 enum radv_ud_index {
@@ -145,6 +158,9 @@ struct radv_shader_info {
 		bool needs_draw_id;
 		bool needs_instance_id;
 	} vs;
+	struct {
+		uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
+	} gs;
 	struct {
 		uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
 	} tes;
@@ -282,14 +298,15 @@ struct radv_shader_slab {
 };

 void
-radv_optimize_nir(struct nir_shader *shader);
+radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively);

 nir_shader *
 radv_shader_compile_to_nir(struct radv_device *device,
 			   struct radv_shader_module *module,
 			   const char *entrypoint_name,
 			   gl_shader_stage stage,
-			   const VkSpecializationInfo *spec_info);
+			   const VkSpecializationInfo *spec_info,
+			   const VkPipelineCreateFlags flags);

 void *
 radv_alloc_shader_memory(struct radv_device *device,
@@ -328,11 +345,14 @@ radv_shader_dump_stats(struct radv_device *device,

 static inline bool
 radv_can_dump_shader(struct radv_device *device,
-		     struct radv_shader_module *module)
+		     struct radv_shader_module *module,
+		     bool is_gs_copy_shader)
 {
+	if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
+		return false;
+
 	/* Only dump non-meta shaders, useful for debugging purposes. */
-	return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS &&
-	       module && !module->nir;
+	return (module && !module->nir) || is_gs_copy_shader; /* is_gs_copy_shader passes regardless of module */
 }

 static inline bool
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -87,6 +87,89 @@ static void get_deref_offset(nir_deref_var *deref, unsigned *const_out)
 	*const_out = const_offset;
 }

+static void
+gather_intrinsic_load_var_info(const nir_shader *nir,
+			       const nir_intrinsic_instr *instr,
+			       struct radv_shader_info *info)
+{
+	switch (nir->info.stage) {
+	case MESA_SHADER_VERTEX: {
+		nir_deref_var *dvar = instr->variables[0];
+		nir_variable *var = dvar->var;
+
+		if (var->data.mode == nir_var_shader_in) {
+			unsigned idx = var->data.location;
+			uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa);
+
+			info->vs.input_usage_mask[idx] |=
+				mask << var->data.location_frac;
+		}
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+static void
+gather_intrinsic_store_var_info(const nir_shader *nir,
+				const nir_intrinsic_instr *instr,
+				struct radv_shader_info *info)
+{
+	nir_deref_var *dvar = instr->variables[0];
+	nir_variable *var = dvar->var;
+
+	if (var->data.mode == nir_var_shader_out) {
+		unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
+		unsigned idx = var->data.location;
+		unsigned comp = var->data.location_frac;
+		unsigned const_offset = 0;
+
+		get_deref_offset(dvar, &const_offset);
+
+		switch (nir->info.stage) {
+		case MESA_SHADER_VERTEX:
+			for (unsigned i = 0; i < attrib_count; i++) {
+				info->vs.output_usage_mask[idx + i + const_offset] |=
+					instr->const_index[0] << comp;
+			}
+			break;
+		case MESA_SHADER_GEOMETRY:
+			for (unsigned i = 0; i < attrib_count; i++) {
+				info->gs.output_usage_mask[idx + i + const_offset] |=
+					instr->const_index[0] << comp;
+			}
+			break;
+		case MESA_SHADER_TESS_EVAL:
+			for (unsigned i = 0; i < attrib_count; i++) {
+				info->tes.output_usage_mask[idx + i + const_offset] |=
+					instr->const_index[0] << comp;
+			}
+			break;
+		case MESA_SHADER_TESS_CTRL: {
+			unsigned param = shader_io_get_unique_index(idx);
+			const struct glsl_type *type = var->type;
+
+			if (!var->data.patch)
+				type = glsl_get_array_element(var->type);
+
+			unsigned slots =
+				var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
+						  : glsl_count_attribute_slots(type, false);
+
+			if (idx == VARYING_SLOT_CLIP_DIST0)
+				slots = (nir->info.clip_distance_array_size +
+					 nir->info.cull_distance_array_size > 4) ? 2 : 1;
+
+			mark_tess_output(info, var->data.patch, param, slots);
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
 static void
 gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
 		      struct radv_shader_info *info)
@@ -197,55 +280,11 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
 			info->ps.writes_memory = true;
 		break;
 	case nir_intrinsic_load_var:
-		if (nir->info.stage == MESA_SHADER_VERTEX) {
-			nir_deref_var *dvar = instr->variables[0];
-			nir_variable *var = dvar->var;
-
-			if (var->data.mode == nir_var_shader_in) {
-				unsigned idx = var->data.location;
-				uint8_t mask =
-					nir_ssa_def_components_read(&instr->dest.ssa) << var->data.location_frac;
-				info->vs.input_usage_mask[idx] |= mask;
-			}
-		}
+		gather_intrinsic_load_var_info(nir, instr, info);
 		break;
-	case nir_intrinsic_store_var: {
-		nir_deref_var *dvar = instr->variables[0];
-		nir_variable *var = dvar->var;
-
-		if (var->data.mode == nir_var_shader_out) {
-			unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
-			unsigned idx = var->data.location;
-			unsigned comp = var->data.location_frac;
-			unsigned const_offset = 0;
-
-			get_deref_offset(dvar, &const_offset);
-
-			if (nir->info.stage == MESA_SHADER_VERTEX) {
-				for (unsigned i = 0; i < attrib_count; i++) {
-					info->vs.output_usage_mask[idx + i + const_offset] |=
-						instr->const_index[0] << comp;
-				}
-			} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
-				for (unsigned i = 0; i < attrib_count; i++) {
-					info->tes.output_usage_mask[idx + i + const_offset] |=
-						instr->const_index[0] << comp;
-				}
-			} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-				unsigned param = shader_io_get_unique_index(idx);
-				const struct glsl_type *type = var->type;
-				if (!var->data.patch)
-					type = glsl_get_array_element(var->type);
-				unsigned slots =
-					var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
-					: glsl_count_attribute_slots(type, false);
-				if (idx == VARYING_SLOT_CLIP_DIST0)
-					slots = (nir->info.clip_distance_array_size + nir->info.cull_distance_array_size > 4) ? 2 : 1;
-				mark_tess_output(info, var->data.patch, param, slots);
-			}
-		}
+	case nir_intrinsic_store_var:
+		gather_intrinsic_store_var_info(nir, instr, info);
 		break;
-	}
 	default:
 		break;
 	}
@@ -391,7 +430,7 @@ radv_nir_shader_info_pass(const struct nir_shader *nir,
 	struct nir_function *func =
 		(struct nir_function *)exec_list_get_head_const(&nir->functions);

-	if (options->layout->dynamic_offset_count)
+	if (options->layout && options->layout->dynamic_offset_count)
 		info->loads_push_constants = true;

 	nir_foreach_variable(variable, &nir->inputs)
--- a/src/amd/vulkan/radv_util.c
+++ b/src/amd/vulkan/radv_util.c
@@ -29,6 +29,7 @@
 #include <assert.h>

 #include "radv_private.h"
+#include "radv_debug.h"
 #include "vk_enum_to_str.h"

 #include "util/u_math.h"
@@ -53,6 +54,26 @@ radv_loge_v(const char *format, va_list va)
 	fprintf(stderr, "\n");
 }

+/** Log an informational message.  */
+void radv_printflike(1, 2)
+	radv_logi(const char *format, ...)
+{
+	va_list va;
+
+	va_start(va, format);
+	radv_logi_v(format, va);
+	va_end(va);
+}
+
+/** \see radv_logi() */
+void
+radv_logi_v(const char *format, va_list va)
+{
+	fprintf(stderr, "radv: info: ");
+	vfprintf(stderr, format, va);
+	fprintf(stderr, "\n");
+}
+
 void radv_printflike(3, 4)
 	__radv_finishme(const char *file, int line, const char *format, ...)
 {
@@ -67,13 +88,19 @@ void radv_printflike(3, 4)
 }

 VkResult
-__vk_errorf(VkResult error, const char *file, int line, const char *format, ...)
+__vk_errorf(struct radv_instance *instance, VkResult error, const char *file,
+	    int line, const char *format, ...)
 {
 	va_list ap;
 	char buffer[256];

 	const char *error_str = vk_Result_to_str(error);

+#ifndef DEBUG
+	if (instance && !(instance->debug_flags & RADV_DEBUG_ERRORS))
+		return error;
+#endif
+
 	if (format) {
 		va_start(ap, format);
 		vsnprintf(buffer, sizeof(buffer), format, ap);
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -41,110 +41,16 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
 				  unsigned raster_config,
 				  unsigned raster_config_1)
 {
-	unsigned sh_per_se = MAX2(physical_device->rad_info.max_sh_per_se, 1);
 	unsigned num_se = MAX2(physical_device->rad_info.max_se, 1);
-	unsigned rb_mask = physical_device->rad_info.enabled_rb_mask;
-	unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16);
-	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
-	unsigned rb_per_se = num_rb / num_se;
-	unsigned se_mask[4];
+	unsigned raster_config_se[4];
 	unsigned se;

-	se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
-	se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
-	se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
-	se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
-
-	assert(num_se == 1 || num_se == 2 || num_se == 4);
-	assert(sh_per_se == 1 || sh_per_se == 2);
-	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
-
-	/* XXX: I can't figure out what the *_XSEL and *_YSEL
-	 * fields are for, so I'm leaving them as their default
-	 * values. */
-
-	if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
-			     (!se_mask[2] && !se_mask[3]))) {
-		raster_config_1 &= C_028354_SE_PAIR_MAP;
-
-		if (!se_mask[0] && !se_mask[1]) {
-			raster_config_1 |=
-				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
-		} else {
-			raster_config_1 |=
-				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
-		}
-	}
+	ac_get_harvested_configs(&physical_device->rad_info,
+				 raster_config,
+				 &raster_config_1,
+				 raster_config_se);

 	for (se = 0; se < num_se; se++) {
-		unsigned raster_config_se = raster_config;
-		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
-		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
-		int idx = (se / 2) * 2;
-
-		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
-			raster_config_se &= C_028350_SE_MAP;
-
-			if (!se_mask[idx]) {
-				raster_config_se |=
-					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
-			} else {
-				raster_config_se |=
-					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
-			}
-		}
-
-		pkr0_mask &= rb_mask;
-		pkr1_mask &= rb_mask;
-		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
-			raster_config_se &= C_028350_PKR_MAP;
-
-			if (!pkr0_mask) {
-				raster_config_se |=
-					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
-			} else {
-				raster_config_se |=
-					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
-			}
-		}
-
-		if (rb_per_se >= 2) {
-			unsigned rb0_mask = 1 << (se * rb_per_se);
-			unsigned rb1_mask = rb0_mask << 1;
-
-			rb0_mask &= rb_mask;
-			rb1_mask &= rb_mask;
-			if (!rb0_mask || !rb1_mask) {
-				raster_config_se &= C_028350_RB_MAP_PKR0;
-
-				if (!rb0_mask) {
-					raster_config_se |=
-						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
-				} else {
-					raster_config_se |=
-						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
-				}
-			}
-
-			if (rb_per_se > 2) {
-				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
-				rb1_mask = rb0_mask << 1;
-				rb0_mask &= rb_mask;
-				rb1_mask &= rb_mask;
-				if (!rb0_mask || !rb1_mask) {
-					raster_config_se &= C_028350_RB_MAP_PKR1;
-
-					if (!rb0_mask) {
-						raster_config_se |=
-							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
-					} else {
-						raster_config_se |=
-							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
-					}
-				}
-			}
-		}
-
 		/* GRBM_GFX_INDEX has a different offset on SI and CI+ */
 		if (physical_device->rad_info.chip_class < CIK)
 			radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
@@ -155,9 +61,7 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
 			radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
 					       S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) |
 					       S_030800_INSTANCE_BROADCAST_WRITES(1));
-		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
-		if (physical_device->rad_info.chip_class >= CIK)
-			radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
+		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
 	}

 	/* GRBM_GFX_INDEX has a different offset on SI and CI+ */
@@ -170,6 +74,9 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
 		radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
 				       S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
 				       S_030800_INSTANCE_BROADCAST_WRITES(1));
+
+	if (physical_device->rad_info.chip_class >= CIK)
+		radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
 }

 static void
@@ -234,88 +141,9 @@ si_set_raster_config(struct radv_physical_device *physical_device,
 	unsigned rb_mask = physical_device->rad_info.enabled_rb_mask;
 	unsigned raster_config, raster_config_1;

-	switch (physical_device->rad_info.family) {
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-		raster_config = 0x2a00126a;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_VERDE:
-		raster_config = 0x0000124a;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_OLAND:
-		raster_config = 0x00000082;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_HAINAN:
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_BONAIRE:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_HAWAII:
-		raster_config = 0x3a00161a;
-		raster_config_1 = 0x0000002e;
-		break;
-	case CHIP_FIJI:
-		if (physical_device->rad_info.cik_macrotile_mode_array[0] == 0x000000e8) {
-			/* old kernels with old tiling config */
-			raster_config = 0x16000012;
-			raster_config_1 = 0x0000002a;
-		} else {
-			raster_config = 0x3a00161a;
-			raster_config_1 = 0x0000002e;
-		}
-		break;
-	case CHIP_POLARIS10:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x0000002a;
-		break;
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_VEGAM:
-		raster_config = 0x3a00161a;
-		raster_config_1 = 0x0000002e;
-		break;
-	case CHIP_TONGA:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x0000002a;
-		break;
-	case CHIP_ICELAND:
-		if (num_rb == 1)
-			raster_config = 0x00000000;
-		else
-			raster_config = 0x00000002;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_CARRIZO:
-		raster_config = 0x00000002;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_KAVERI:
-		/* KV should be 0x00000002, but that causes problems with radeon */
-		raster_config = 0x00000000; /* 0x00000002 */
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_STONEY:
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	default:
-		fprintf(stderr,
-			"radv: Unknown GPU, using 0 for raster_config\n");
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	}
+	ac_get_raster_config(&physical_device->rad_info,
+			     &raster_config,
+			     &raster_config_1);

 	/* Always use the default config when all backends are enabled
 	 * (or when we failed to determine the enabled backends).
--- a/src/amd/vulkan/vk_format.h
+++ b/src/amd/vulkan/vk_format.h
@@ -416,6 +416,46 @@ vk_format_is_srgb(VkFormat format)
 	return desc->colorspace == VK_FORMAT_COLORSPACE_SRGB;
 }

+static inline VkFormat
+vk_format_no_srgb(VkFormat format)
+{
+	switch(format) {
+	case VK_FORMAT_R8_SRGB:
+		return VK_FORMAT_R8_UNORM;
+	case VK_FORMAT_R8G8_SRGB:
+		return VK_FORMAT_R8G8_UNORM;
+	case VK_FORMAT_R8G8B8_SRGB:
+		return VK_FORMAT_R8G8B8_UNORM;
+	case VK_FORMAT_B8G8R8_SRGB:
+		return VK_FORMAT_B8G8R8_UNORM;
+	case VK_FORMAT_R8G8B8A8_SRGB:
+		return VK_FORMAT_R8G8B8A8_UNORM;
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		return VK_FORMAT_B8G8R8A8_UNORM;
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+	case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
+		return VK_FORMAT_BC1_RGB_UNORM_BLOCK;
+	case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
+		return VK_FORMAT_BC1_RGBA_UNORM_BLOCK;
+	case VK_FORMAT_BC2_SRGB_BLOCK:
+		return VK_FORMAT_BC2_UNORM_BLOCK;
+	case VK_FORMAT_BC3_SRGB_BLOCK:
+		return VK_FORMAT_BC3_UNORM_BLOCK;
+	case VK_FORMAT_BC7_SRGB_BLOCK:
+		return VK_FORMAT_BC7_UNORM_BLOCK;
+	case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
+		return VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK;
+	case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
+		return VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK;
+	case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
+		return VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK;
+	default:
+		assert(!vk_format_is_srgb(format));
+		return format;
+	}
+}
+
 static inline VkFormat
 vk_format_stencil_only(VkFormat format)
 {
@@ -488,4 +528,11 @@ vk_to_non_srgb_format(VkFormat format)
 	}
 }

+static inline unsigned
+vk_format_get_nr_components(VkFormat format)
+{
+	const struct vk_format_description *desc = vk_format_description(format);
+	return desc->nr_channels;
+}
+
 #endif /* VK_FORMAT_H */
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
@@ -38,7 +38,6 @@

 #include "util/u_atomic.h"

-
 static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo);

 static int
@@ -306,7 +305,9 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws,
 	}

 	r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-				  size, alignment, 0, &va, &va_handle, 0);
+				  size, alignment, 0, &va, &va_handle,
+				  (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
+				   AMDGPU_VA_RANGE_HIGH);
 	if (r)
 		goto error_va_alloc;

@@ -424,7 +425,8 @@ radv_amdgpu_winsys_bo_from_ptr(struct radeon_winsys *_ws,
 		goto error;

 	if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-	                          size, 1 << 12, 0, &va, &va_handle, 0))
+	                          size, 1 << 12, 0, &va, &va_handle,
+				  AMDGPU_VA_RANGE_HIGH))
 		goto error_va_alloc;

 	if (amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP))
@@ -480,7 +482,8 @@ radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws,
 		goto error_query;

 	r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-				  result.alloc_size, 1 << 20, 0, &va, &va_handle, 0);
+				  result.alloc_size, 1 << 20, 0, &va, &va_handle,
+				  AMDGPU_VA_RANGE_HIGH);
 	if (r)
 		goto error_query;

@@ -501,6 +504,7 @@ radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws,
 	bo->size = result.alloc_size;
 	bo->is_shared = true;
 	bo->ws = ws;
+	bo->ref_count = 1;
 	radv_amdgpu_add_buffer_to_global_list(bo);
 	return (struct radeon_winsys_bo *)bo;
 error_va_map:
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -45,13 +45,6 @@ do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
 	if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
 		return false;

-	/* LLVM 5.0 is required for GFX9. */
-	if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
-		fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n",
-			HAVE_LLVM >> 8, HAVE_LLVM & 255);
-		return false;
-	}
-
 	ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
 	if (!ws->addrlib) {
 		fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
--- a/src/broadcom/Makefile.am
+++ b/src/broadcom/Makefile.am
@@ -60,6 +60,6 @@ PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)

 include Makefile.genxml.am
 include Makefile.cle.am
-include Makefile.vc5.am
+include Makefile.v3d.am

 CLEANFILES += $(BUILT_SOURCES)
--- a/src/broadcom/Makefile.v3d.am
+++ b/src/broadcom/Makefile.v3d.am
@@ -3,9 +3,9 @@ noinst_LTLIBRARIES += libbroadcom_v33.la
 noinst_LTLIBRARIES += libbroadcom_v41.la
 noinst_LTLIBRARIES += libbroadcom_v42.la

-if USE_VC5_SIMULATOR
-AM_CFLAGS += $(VC5_SIMULATOR_CFLAGS)
-libbroadcom_la_LDFLAGS = $(VC5_SIMULATOR_LIBS)
+if USE_V3D_SIMULATOR
+AM_CFLAGS += $(V3D_SIMULATOR_CFLAGS)
+libbroadcom_la_LDFLAGS = $(V3D_SIMULATOR_LIBS)
 endif

 libbroadcom_v33_la_SOURCES = $(BROADCOM_PER_VERSION_SOURCES)
--- a/src/broadcom/cle/gen_pack_header.py
+++ b/src/broadcom/cle/gen_pack_header.py
@@ -43,7 +43,7 @@ pack_header = """%(license)s
 #ifndef %(guard)s
 #define %(guard)s

-#include "v3d_packet_helpers.h"
+#include "cle/v3d_packet_helpers.h"

 """

--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -702,15 +702,17 @@
    <field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
    <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/>
    <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="98" type="bool"/>
+    <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/>
    <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
    <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
    <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/>
    <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/>
+    <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/>
    <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
    <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
    <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/>
    <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/>
+    <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/>
    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
  </struct>

--- a/src/broadcom/cle/v3d_packet_v41.xml
+++ b/src/broadcom/cle/v3d_packet_v41.xml
@@ -277,7 +277,7 @@
    <field name="Clear buffer being stored" size="1" start="18" type="bool"/>
    <field name="Output Image Format" size="6" start="12" type="Output Image Format"/>

-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
+    <field name="Decimate mode" size="2" start="10" type="Decimate Mode"/>

    <field name="A dithered" size="1" start="9" type="bool"/>
    <field name="BGR dithered" size="1" start="8" type="bool"/>
@@ -311,7 +311,7 @@

    <field name="Input Image Format" size="6" start="12" type="Output Image Format"/>

-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
+    <field name="Decimate mode" size="2" start="10" type="Decimate Mode"/>

    <field name="Flip Y" size="1" start="7" type="bool"/>

@@ -475,6 +475,11 @@
    <field name="Varying offset V0" size="4" start="0" type="uint"/>
  </packet>

+  <packet code="91" name="Sample State">
+    <field name="Coverage" size="16" start="16" type="uint"/> <!-- float-1-8-7 -->
+    <field name="Mask" size="4" start="0" type="uint"/>
+  </packet>
+
  <packet code="92" name="Occlusion Query Counter">
    <field name="address" size="32" start="0" type="address"/>
  </packet>
@@ -781,17 +786,19 @@
    <field name="Fragment Shader Code Address" size="32" start="12b" type="address"/>
    <field name="Fragment Shader 4-way threadable" size="1" start="96" type="bool"/>
    <field name="Fragment Shader start in final thread section" size="1" start="97" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="98" type="bool"/>
+    <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/>
    <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>

    <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
    <field name="Vertex Shader 4-way threadable" size="1" start="160" type="bool"/>
    <field name="Vertex Shader start in final thread section" size="1" start="161" type="bool"/>
+    <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/>
    <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>

    <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
    <field name="Coordinate Shader 4-way threadable" size="1" start="224" type="bool"/>
    <field name="Coordinate Shader start in final thread section" size="1" start="225" type="bool"/>
+    <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/>
    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
  </struct>

--- a/src/broadcom/cle/v3d_packet_v42.xml
+++ b/src/broadcom/cle/v3d_packet_v42.xml
@@ -278,7 +278,7 @@
    <field name="Clear buffer being stored" size="1" start="18" type="bool"/>
    <field name="Output Image Format" size="6" start="12" type="Output Image Format"/>

-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
+    <field name="Decimate mode" size="2" start="10" type="Decimate Mode"/>

    <field name="A dithered" size="1" start="9" type="bool"/>
    <field name="BGR dithered" size="1" start="8" type="bool"/>
@@ -312,7 +312,7 @@

    <field name="Input Image Format" size="6" start="12" type="Output Image Format"/>

-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
+    <field name="Decimate mode" size="2" start="10" type="Decimate Mode"/>

    <field name="Flip Y" size="1" start="7" type="bool"/>

@@ -476,6 +476,11 @@
    <field name="Varying offset V0" size="4" start="0" type="uint"/>
  </packet>

+  <packet code="91" name="Sample State">
+    <field name="Coverage" size="16" start="16" type="uint"/> <!-- float-1-8-7 -->
+    <field name="Mask" size="4" start="0" type="uint"/>
+  </packet>
+
  <packet code="92" name="Occlusion Query Counter">
    <field name="address" size="32" start="0" type="address"/>
  </packet>
@@ -782,17 +787,19 @@
    <field name="Fragment Shader Code Address" size="32" start="12b" type="address"/>
    <field name="Fragment Shader 4-way threadable" size="1" start="96" type="bool"/>
    <field name="Fragment Shader start in final thread section" size="1" start="97" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="98" type="bool"/>
+    <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/>
    <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>

    <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
    <field name="Vertex Shader 4-way threadable" size="1" start="160" type="bool"/>
    <field name="Vertex Shader start in final thread section" size="1" start="161" type="bool"/>
+    <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/>
    <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>

    <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
    <field name="Coordinate Shader 4-way threadable" size="1" start="224" type="bool"/>
    <field name="Coordinate Shader start in final thread section" size="1" start="225" type="bool"/>
+    <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/>
    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
  </struct>

--- a/src/broadcom/common/v3d_debug.h
+++ b/src/broadcom/common/v3d_debug.h
@@ -57,7 +57,11 @@ extern uint32_t V3D_DEBUG;

 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "BROADCOM-MESA"
+#if ANDROID_API_LEVEL >= 26
+#include <log/log.h>
+#else
 #include <cutils/log.h>
+#endif /* use log/log.h starting with Android 8 (API level 26) */
 #ifndef ALOGW
 #define ALOGW LOGW
 #endif
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -436,6 +436,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                /* FALLTHROUGH */
        case INTERP_MODE_SMOOTH:
                if (var->data.centroid) {
+                        BITSET_SET(c->centroid_flags, i);
                        return vir_FADD(c, vir_FMUL(c, vary,
                                                    c->payload_w_centroid), r5);
                } else {
@@ -754,6 +755,10 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                result = vir_NOT(c, src[0]);
                break;

+        case nir_op_ufind_msb:
+                result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
+                break;
+
        case nir_op_imul:
                result = vir_UMUL(c, src[0], src[1]);
                break;
@@ -852,6 +857,13 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                result = vir_FDY(c, src[0]);
                break;

+        case nir_op_uadd_carry:
+                vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC);
+                result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
+                                            vir_uniform_ui(c, ~0),
+                                            vir_uniform_ui(c, 0)));
+                break;
+
        default:
                fprintf(stderr, "unknown NIR ALU inst: ");
                nir_print_instr(&instr->instr, stderr);
@@ -957,6 +969,9 @@ emit_frag_end(struct v3d_compile *c)
                conf |= TLB_SAMPLE_MODE_PER_PIXEL;
                conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;

+                if (c->fs_key->swap_color_rb & (1 << rt))
+                        num_components = MAX2(num_components, 3);
+
                assert(num_components != 0);
                switch (glsl_get_base_type(var->type)) {
                case GLSL_TYPE_UINT:
@@ -985,7 +1000,7 @@ emit_frag_end(struct v3d_compile *c)
                        struct qreg b = color[2];
                        struct qreg a = color[3];

-                        if (c->fs_key->f32_color_rb) {
+                        if (c->fs_key->f32_color_rb & (1 << rt)) {
                                conf |= TLB_TYPE_F32_COLOR;
                                conf |= ((num_components - 1) <<
                                         TLB_VEC_SIZE_MINUS_1_SHIFT);
@@ -1348,7 +1363,7 @@ ntq_setup_outputs(struct v3d_compile *c)
                assert(array_len == 1);
                (void)array_len;

-                for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
+                for (int i = 0; i < 4; i++) {
                        add_output(c, loc + var->data.location_frac + i,
                                   var->data.location,
                                   var->data.location_frac + i);
@@ -1893,8 +1908,11 @@ const nir_shader_compiler_options v3d_nir_options = {
        .lower_all_io_to_temps = true,
        .lower_extract_byte = true,
        .lower_extract_word = true,
-        .lower_bitfield_insert = true,
-        .lower_bitfield_extract = true,
+        .lower_bfm = true,
+        .lower_bitfield_insert_to_shifts = true,
+        .lower_bitfield_extract_to_shifts = true,
+        .lower_bitfield_reverse = true,
+        .lower_bit_count = true,
        .lower_pack_unorm_2x16 = true,
        .lower_pack_snorm_2x16 = true,
        .lower_pack_unorm_4x8 = true,
@@ -1902,12 +1920,15 @@ const nir_shader_compiler_options v3d_nir_options = {
        .lower_unpack_unorm_4x8 = true,
        .lower_unpack_snorm_4x8 = true,
        .lower_fdiv = true,
+        .lower_find_lsb = true,
        .lower_ffma = true,
        .lower_flrp32 = true,
        .lower_fpow = true,
        .lower_fsat = true,
        .lower_fsqrt = true,
+        .lower_ifind_msb = true,
        .lower_ldexp = true,
+        .lower_mul_high = true,
        .native_integers = true,
 };

@@ -1985,6 +2006,29 @@ vir_emit_last_thrsw(struct v3d_compile *c)
                c->last_thrsw->is_last_thrsw = true;
 }

+/* There's a flag in the shader for "center W is needed for reasons other than
+ * non-centroid varyings", so we just walk the program after VIR optimization
+ * to see if it's used.  It should be harmless to set even if we only use
+ * center W for varyings.
+ */
+static void
+vir_check_payload_w(struct v3d_compile *c)
+{
+        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+                return;
+
+        vir_for_each_inst_inorder(inst, c) {
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        if (inst->src[i].file == QFILE_REG &&
+                            inst->src[i].index == 0) {
+                                c->uses_center_w = true;
+                                return;
+                        }
+                }
+        }
+
+}
+
 void
 v3d_nir_to_vir(struct v3d_compile *c)
 {
@@ -2024,6 +2068,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
        vir_optimize(c);
        vir_lower_uniforms(c);

+        vir_check_payload_w(c);
+
        /* XXX: vir_schedule_instructions(c); */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,7 +41,15 @@ struct v3d_qpu_validate_state {
        int last_sfu_write;
        int last_branch_ip;
        int last_thrsw_ip;
+
+        /* Set when we've found the last-THRSW signal, or if we were started
+         * in single-segment mode.
+         */
        bool last_thrsw_found;
+
+        /* Set when we've found the THRSW after the last THRSW */
+        bool thrend_found;
+
        int thrsw_count;
 };

@@ -116,6 +124,19 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
                fail_instr(state, "LDUNIF after a LDVARY");
        }

+        /* GFXH-1633 */
+        bool last_reads_ldunif = (state->last && (state->last->sig.ldunif ||
+                                                  state->last->sig.ldunifrf));
+        bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa ||
+                                                   state->last->sig.ldunifarf));
+        bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf;
+        bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf;
+        if ((last_reads_ldunif && reads_ldunifa) ||
+            (last_reads_ldunifa && reads_ldunif)) {
+                fail_instr(state,
+                           "LDUNIF and LDUNIFA can't be next to each other");
+        }
+
        int tmu_writes = 0;
        int sfu_writes = 0;
        int vpm_writes = 0;
@@ -204,6 +225,9 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
                if (in_branch_delay_slots(state))
                        fail_instr(state, "THRSW in a branch delay slot.");

+                if (state->last_thrsw_found)
+                        state->thrend_found = true;
+
                if (state->last_thrsw_ip == state->ip - 1) {
                        /* If it's the second THRSW in a row, then it's just a
                         * last-thrsw signal.
@@ -221,6 +245,28 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
                }
        }

+        /* THREND delay slots.  last_thrsw_ip is never ahead of ip, so the
+         * distance from the THRSW must be computed as ip - last_thrsw_ip. */
+        if (state->thrend_found &&
+            state->ip - state->last_thrsw_ip <= 2 &&
+            inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                if (inst->alu.add.op != V3D_QPU_A_NOP &&
+                    !inst->alu.add.magic_write)
+                        fail_instr(state, "RF write after THREND");
+
+                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+                    !inst->alu.mul.magic_write)
+                        fail_instr(state, "RF write after THREND");
+
+                if (v3d_qpu_sig_writes_address(devinfo, &inst->sig))
+                        fail_instr(state, "RF write after THREND");
+
+                /* GFXH-1625: No TMUWT in the last instruction */
+                if (state->ip - state->last_thrsw_ip == 2 &&
+                    inst->alu.add.op == V3D_QPU_A_TMUWT)
+                        fail_instr(state, "TMUWT in last instruction");
+        }
+
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (in_branch_delay_slots(state))
                        fail_instr(state, "branch in a branch delay slot.");
@@ -262,6 +308,8 @@ qpu_validate(struct v3d_compile *c)
                .last_thrsw_ip = -10,
                .last_branch_ip = -10,
                .ip = 0,
+
+                .last_thrsw_found = !c->last_thrsw,
        };

        vir_for_each_block(block, c) {
@@ -273,8 +321,6 @@ qpu_validate(struct v3d_compile *c)
                           "thread switch found without last-THRSW in program");
        }

-        if (state.thrsw_count == 0 ||
-            (state.last_thrsw_found && state.thrsw_count == 1)) {
+        if (!state.thrend_found)
                fail_instr(&state, "No program-end THRSW found");
-        }
 }
--- a/Show More
+++ b/Show More