Update version to 19.1.0-rc3

Signed-off-by: Juan A. Suarez Romero <jasuarez@igalia.com>
nir: Fix clone of nir_variable state slots
2019-05-21 14:09:14 +00:00 · 2019-05-21 09:04:42 +00:00 · 2019-05-21 09:02:06 +00:00 · 2019-05-21 08:59:28 +00:00 · 2019-05-21 08:54:23 +00:00 · 2019-05-21 08:42:32 +00:00
64 changed files with 887 additions and 572 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,198 +1,40 @@
 language: c

-dist: xenial
+os: osx

 cache:
-  apt: true
  ccache: true

 env:
  global:
-    - XORG_RELEASES=https://xorg.freedesktop.org/releases/individual
-    - XCB_RELEASES=https://xcb.freedesktop.org/dist
-    - WAYLAND_RELEASES=https://wayland.freedesktop.org/releases
-    - XORGMACROS_VERSION=util-macros-1.19.0
-    - GLPROTO_VERSION=glproto-1.4.17
-    - DRI2PROTO_VERSION=dri2proto-2.8
-    - LIBPCIACCESS_VERSION=libpciaccess-0.13.4
-    - LIBDRM_VERSION=libdrm-2.4.97
-    - XCBPROTO_VERSION=xcb-proto-1.13
-    - RANDRPROTO_VERSION=randrproto-1.3.0
-    - LIBXRANDR_VERSION=libXrandr-1.3.0
-    - LIBXCB_VERSION=libxcb-1.13
-    - LIBXSHMFENCE_VERSION=libxshmfence-1.2
-    - LIBVDPAU_VERSION=libvdpau-1.1
-    - LIBVA_VERSION=libva-1.7.0
-    - LIBWAYLAND_VERSION=wayland-1.15.0
-    - WAYLAND_PROTOCOLS_VERSION=wayland-protocols-1.8
-    - PKG_CONFIG_PATH=$HOME/prefix/lib/pkgconfig:$HOME/prefix/share/pkgconfig
-    - LD_LIBRARY_PATH="$HOME/prefix/lib:$LD_LIBRARY_PATH"
-    - PATH="$HOME/prefix/bin:$PATH"
-
-matrix:
-  include:
-    - env:
-        - LABEL="macOS meson"
-        - BUILD=meson
-        - DRI_LOADERS="-Dplatforms=x11"
-        - GALLIUM_DRIVERS=swrast
-      os: osx
+    - PKG_CONFIG_PATH=""

 before_install:
-  - |
-    if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-      HOMEBREW_NO_AUTO_UPDATE=1 brew install python3 ninja expat gettext
-      # Set PATH for homebrew pip3 installs
-      PATH="$HOME/Library/Python/3.6/bin:${PATH}"
-      # Set PKG_CONFIG_PATH for keg-only expat
-      PKG_CONFIG_PATH="/usr/local/opt/expat/lib/pkgconfig:${PKG_CONFIG_PATH}"
-      # Set PATH for keg-only gettext
-      PATH="/usr/local/opt/gettext/bin:${PATH}"
+  - HOMEBREW_NO_AUTO_UPDATE=1 brew install python3 ninja expat gettext
+  # Set PATH for homebrew pip3 installs
+  - PATH="$HOME/Library/Python/3.6/bin:${PATH}"
+  # Set PKG_CONFIG_PATH for keg-only expat
+  - PKG_CONFIG_PATH="/usr/local/opt/expat/lib/pkgconfig:${PKG_CONFIG_PATH}"
+  # Set PATH for keg-only gettext
+  - PATH="/usr/local/opt/gettext/bin:${PATH}"

-      # Install xquartz for prereqs ...
-      XQUARTZ_VERSION="2.7.11"
-      wget -nv https://dl.bintray.com/xquartz/downloads/XQuartz-${XQUARTZ_VERSION}.dmg
-      hdiutil attach XQuartz-${XQUARTZ_VERSION}.dmg
-      sudo installer -pkg /Volumes/XQuartz-${XQUARTZ_VERSION}/XQuartz.pkg -target /
-      hdiutil detach /Volumes/XQuartz-${XQUARTZ_VERSION}
-      # ... and set paths
-      PATH="/opt/X11/bin:${PATH}"
-      PKG_CONFIG_PATH="/opt/X11/share/pkgconfig:/opt/X11/lib/pkgconfig:${PKG_CONFIG_PATH}"
-      ACLOCAL="aclocal -I /opt/X11/share/aclocal -I /usr/local/share/aclocal"
-    fi
+  # Install xquartz for prereqs ...
+  - XQUARTZ_VERSION="2.7.11"
+  - wget -nv https://dl.bintray.com/xquartz/downloads/XQuartz-${XQUARTZ_VERSION}.dmg
+  - hdiutil attach XQuartz-${XQUARTZ_VERSION}.dmg
+  - sudo installer -pkg /Volumes/XQuartz-${XQUARTZ_VERSION}/XQuartz.pkg -target /
+  - hdiutil detach /Volumes/XQuartz-${XQUARTZ_VERSION}
+  # ... and set paths
+  - PKG_CONFIG_PATH="/opt/X11/share/pkgconfig:/opt/X11/lib/pkgconfig:${PKG_CONFIG_PATH}"

 install:
-  # Install a more modern meson from pip, since the version in the
-  # ubuntu repos is often quite old.
-  - if test "x$BUILD" = xmeson; then
-      pip3 install --user meson;
-      pip3 install --user mako;
-    fi
-
-  # Install dependencies where we require specific versions (or where
-  # disallowed by Travis CI's package whitelisting).
-
-  - |
-    if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-      wget $XORG_RELEASES/util/$XORGMACROS_VERSION.tar.bz2
-      tar -jxvf $XORGMACROS_VERSION.tar.bz2
-      (cd $XORGMACROS_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XORG_RELEASES/proto/$GLPROTO_VERSION.tar.bz2
-      tar -jxvf $GLPROTO_VERSION.tar.bz2
-      (cd $GLPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XORG_RELEASES/proto/$DRI2PROTO_VERSION.tar.bz2
-      tar -jxvf $DRI2PROTO_VERSION.tar.bz2
-      (cd $DRI2PROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XCB_RELEASES/$XCBPROTO_VERSION.tar.bz2
-      tar -jxvf $XCBPROTO_VERSION.tar.bz2
-      (cd $XCBPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XCB_RELEASES/$LIBXCB_VERSION.tar.bz2
-      tar -jxvf $LIBXCB_VERSION.tar.bz2
-      (cd $LIBXCB_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XORG_RELEASES/lib/$LIBPCIACCESS_VERSION.tar.bz2
-      tar -jxvf $LIBPCIACCESS_VERSION.tar.bz2
-      (cd $LIBPCIACCESS_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget https://dri.freedesktop.org/libdrm/$LIBDRM_VERSION.tar.bz2
-      tar -jxvf $LIBDRM_VERSION.tar.bz2
-      (cd $LIBDRM_VERSION && ./configure --prefix=$HOME/prefix --enable-vc4 --enable-freedreno --enable-etnaviv-experimental-api && make install)
-
-      wget $XORG_RELEASES/proto/$RANDRPROTO_VERSION.tar.bz2
-      tar -jxvf $RANDRPROTO_VERSION.tar.bz2
-      (cd $RANDRPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XORG_RELEASES/lib/$LIBXRANDR_VERSION.tar.bz2
-      tar -jxvf $LIBXRANDR_VERSION.tar.bz2
-      (cd $LIBXRANDR_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget $XORG_RELEASES/lib/$LIBXSHMFENCE_VERSION.tar.bz2
-      tar -jxvf $LIBXSHMFENCE_VERSION.tar.bz2
-      (cd $LIBXSHMFENCE_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget https://people.freedesktop.org/~aplattner/vdpau/$LIBVDPAU_VERSION.tar.bz2
-      tar -jxvf $LIBVDPAU_VERSION.tar.bz2
-      (cd $LIBVDPAU_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      wget https://www.freedesktop.org/software/vaapi/releases/libva/$LIBVA_VERSION.tar.bz2
-      tar -jxvf $LIBVA_VERSION.tar.bz2
-      (cd $LIBVA_VERSION && ./configure --prefix=$HOME/prefix --disable-wayland --disable-dummy-driver && make install)
-
-      wget $WAYLAND_RELEASES/$LIBWAYLAND_VERSION.tar.xz
-      tar -axvf $LIBWAYLAND_VERSION.tar.xz
-      (cd $LIBWAYLAND_VERSION && ./configure --prefix=$HOME/prefix --enable-libraries --without-host-scanner --disable-documentation --disable-dtd-validation && make install)
-
-      wget $WAYLAND_RELEASES/$WAYLAND_PROTOCOLS_VERSION.tar.xz
-      tar -axvf $WAYLAND_PROTOCOLS_VERSION.tar.xz
-      (cd $WAYLAND_PROTOCOLS_VERSION && ./configure --prefix=$HOME/prefix && make install)
-
-      # Meson requires ninja >= 1.6, but xenial has 1.3.x
-      wget https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-linux.zip
-      unzip ninja-linux.zip
-      mv ninja $HOME/prefix/bin/
-
-      # Generate this header since one is missing on the Travis instance
-      mkdir -p linux
-      printf "%s\n" \
-           "#ifndef _LINUX_MEMFD_H" \
-           "#define _LINUX_MEMFD_H" \
-           "" \
-           "#define MFD_CLOEXEC             0x0001U" \
-           "#define MFD_ALLOW_SEALING       0x0002U" \
-           "" \
-           "#endif /* _LINUX_MEMFD_H */" > linux/memfd.h
-
-      # Generate this header, including the missing SYS_memfd_create
-      # macro, which is not provided by the header in the Travis
-      # instance
-      mkdir -p sys
-      printf "%s\n" \
-           "#ifndef _SYSCALL_H" \
-           "#define _SYSCALL_H      1" \
-           "" \
-           "#include <asm/unistd.h>" \
-           "" \
-           "#ifndef _LIBC" \
-           "# include <bits/syscall.h>" \
-           "#endif" \
-           "" \
-           "#ifndef __NR_memfd_create" \
-           "# define __NR_memfd_create 319 /* Taken from <asm/unistd_64.h> */" \
-           "#endif" \
-           "" \
-           "#ifndef SYS_memfd_create" \
-           "# define SYS_memfd_create __NR_memfd_create" \
-           "#endif" \
-           "" \
-           "#endif" > sys/syscall.h
-    fi
+  - pip3 install --user meson
+  - pip3 install --user mako

 script:
-  if test "x$BUILD" = xmeson; then
-    if test -n "$LLVM_CONFIG"; then
-      # We need to control the version of llvm-config we're using, so we'll
-      # generate a native file to do so. This requires meson >=0.49
-      #
-      echo -e "[binaries]\nllvm-config = '`which $LLVM_CONFIG`'" > native.file
-
-      $LLVM_CONFIG --version
-    else
-      : > native.file
-    fi
-
-    export CFLAGS="$CFLAGS -isystem`pwd`"
-    meson _build \
-                  --native-file=native.file \
-                  -Dbuild-tests=true \
-                  ${DRI_LOADERS} \
-                  -Ddri-drivers=${DRI_DRIVERS:-[]} \
-                  -Dgallium-drivers=${GALLIUM_DRIVERS:-[]} \
-                  -Dvulkan-drivers=${VULKAN_DRIVERS:-[]}
-    meson configure _build
-    ninja -C _build
-    ninja -C _build test
-  fi
+  - meson _build
+      -Dbuild-tests=true
+      -Dplatforms=x11
+      -Dgallium-drivers=swrast
+  - ninja -C _build
+  - ninja -C _build test
--- a/Android.mk
+++ b/Android.mk
@@ -110,6 +110,7 @@ endef

 # add subdirectories
 SUBDIRS := \
+	src/freedreno \
 	src/gbm \
 	src/loader \
 	src/mapi \
--- a/2
+++ b/2
@@ -1 +1 @@
-19.1.0-devel
+19.1.0-rc3
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,2 @@
+stable: this commit causes issues in several systems
+78e35df52aa2f7d770f929a0866a0faa89c261a9 radeonsi: update buffer descriptors in all contexts after buffer invalidation
--- a/meson.build
+++ b/meson.build
@@ -1258,6 +1258,7 @@ if _llvm != 'false'
      with_gallium_opencl or _llvm == 'true'
    ),
    static : not _shared_llvm,
+    method : 'config-tool',
  )
  with_llvm = dep_llvm.found()
 endif
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -151,13 +151,14 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
 	LLVMTargetRef target = ac_get_llvm_target(triple);

 	snprintf(features, sizeof(features),
-		 "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s",
+		 "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s",
 		 HAVE_LLVM >= 0x0800 ? "" : ",+vgpr-spilling",
 		 tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "",
 		 tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "",
 		 tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "",
-		 tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "");
-	
+		 tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "",
+		 tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : "");
+
 	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
 	                             target,
 	                             triple,
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -65,6 +65,7 @@ enum ac_target_machine_options {
 	AC_TM_CHECK_IR = (1 << 5),
 	AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6),
 	AC_TM_CREATE_LOW_OPT = (1 << 7),
+	AC_TM_NO_LOAD_STORE_OPT = (1 << 8),
 };

 enum ac_float_mode {
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -301,7 +301,6 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
 static VkResult
 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 {
-
 	cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

 	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
@@ -326,6 +325,8 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)

 	cmd_buffer->record_result = VK_SUCCESS;

+	memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
+
 	for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
 		cmd_buffer->descriptors[i].dirty = 0;
 		cmd_buffer->descriptors[i].valid = 0;
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -51,6 +51,7 @@ enum {
 	RADV_DEBUG_CHECKIR           = 0x200000,
 	RADV_DEBUG_NOTHREADLLVM      = 0x400000,
 	RADV_DEBUG_NOBINNING         = 0x800000,
+	RADV_DEBUG_NO_LOAD_STORE_OPT = 0x1000000,
 };

 enum {
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -200,7 +200,7 @@ VkResult radv_CreateDescriptorSetLayout(
 			break;
 		case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
 			/* main descriptor + fmask descriptor + sampler */
-			set_layout->binding[b].size = 32 + 32 * max_sampled_image_descriptors;
+			set_layout->binding[b].size = 96;
 			binding_buffer_count = 1;
 			alignment = 32;
 			break;
@@ -247,7 +247,8 @@ VkResult radv_CreateDescriptorSetLayout(

 			/* Don't reserve space for the samplers if they're not accessed. */
 			if (set_layout->binding[b].immutable_samplers_equal) {
-				if (binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+				if (binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER &&
+				    max_sampled_image_descriptors <= 2)
 					set_layout->binding[b].size -= 32;
 				else if (binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER)
 					set_layout->binding[b].size -= 16;
--- a/src/amd/vulkan/radv_descriptor_set.h
+++ b/src/amd/vulkan/radv_descriptor_set.h
@@ -104,7 +104,7 @@ radv_immutable_samplers(const struct radv_descriptor_set_layout *set,
 static inline unsigned
 radv_combined_image_descriptor_sampler_offset(const struct radv_descriptor_set_binding_layout *binding)
 {
-	return binding->size - ((!binding->immutable_samplers_equal) ? 32 : 0);
+	return binding->size - ((!binding->immutable_samplers_equal) ? 16 : 0);
 }

 static inline const struct radv_sampler_ycbcr_conversion *
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -464,6 +464,7 @@ static const struct debug_control radv_debug_options[] = {
 	{"checkir", RADV_DEBUG_CHECKIR},
 	{"nothreadllvm", RADV_DEBUG_NOTHREADLLVM},
 	{"nobinning", RADV_DEBUG_NOBINNING},
+	{"noloadstoreopt", RADV_DEBUG_NO_LOAD_STORE_OPT},
 	{NULL, 0}
 };

@@ -510,6 +511,13 @@ radv_handle_per_app_options(struct radv_instance *instance,
 	} else if (!strcmp(name, "DOOM_VFR")) {
 		/* Work around a Doom VFR game bug */
 		instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS;
+	} else if (!strcmp(name, "MonsterHunterWorld.exe")) {
+		/* Workaround for a WaW hazard when LLVM moves/merges
+		 * load/store memory operations.
+		 * See https://reviews.llvm.org/D61313
+		 */
+		if (HAVE_LLVM < 0x900)
+			instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT;
 	}
 }

--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@@ -635,7 +635,8 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
 	const struct vk_format_description *desc = vk_format_description(format);
 	bool blendable;
 	bool scaled = false;
-	if (!desc) {
+	/* TODO: implement some software emulation of SUBSAMPLED formats. */
+	if (!desc || desc->layout == VK_FORMAT_LAYOUT_SUBSAMPLED) {
 		out_properties->linearTilingFeatures = linear;
 		out_properties->optimalTilingFeatures = tiled;
 		out_properties->bufferFeatures = buffer;
@@ -655,6 +656,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
 		uint32_t tiling = VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
 		                  VK_FORMAT_FEATURE_TRANSFER_DST_BIT |
 		                  VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
+		                  VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT |
 		                  VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT;

 		/* The subsampled formats have no support for linear filters. */
--- a/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c
+++ b/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c
@@ -156,6 +156,73 @@ convert_ycbcr(struct ycbcr_state *state,
 	                converted_channels[2], nir_imm_float(b, 1.0f));
 }

+static nir_ssa_def *
+get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture)
+{
+	nir_builder *b = state->builder;
+	const struct glsl_type *type = texture->type;
+	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
+
+	tex->op = nir_texop_txs;
+	tex->sampler_dim = glsl_get_sampler_dim(type);
+	tex->is_array = glsl_sampler_type_is_array(type);
+	tex->is_shadow = glsl_sampler_type_is_shadow(type);
+	tex->dest_type = nir_type_int;
+
+	tex->src[0].src_type = nir_tex_src_texture_deref;
+	tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa);
+
+	nir_ssa_dest_init(&tex->instr, &tex->dest,
+	                  nir_tex_instr_dest_size(tex), 32, NULL);
+	nir_builder_instr_insert(b, &tex->instr);
+
+	return nir_i2f32(b, &tex->dest.ssa);
+}
+
+static nir_ssa_def *
+implicit_downsampled_coord(nir_builder *b,
+                           nir_ssa_def *value,
+                           nir_ssa_def *max_value,
+                           int div_scale)
+{
+	return nir_fadd(b,
+	                value,
+	                nir_fdiv(b,
+	                         nir_imm_float(b, 1.0f),
+	                         nir_fmul(b,
+	                                  nir_imm_float(b, div_scale),
+	                                  max_value)));
+}
+
+static nir_ssa_def *
+implicit_downsampled_coords(struct ycbcr_state *state,
+                            nir_ssa_def *old_coords)
+{
+	nir_builder *b = state->builder;
+	const struct radv_sampler_ycbcr_conversion *conversion = state->conversion;
+	nir_ssa_def *image_size = NULL;
+	nir_ssa_def *comp[4] = { NULL, };
+	const struct vk_format_description *fmt_desc = vk_format_description(state->conversion->format);
+	const unsigned divisors[2] = {fmt_desc->width_divisor, fmt_desc->height_divisor};
+
+	for (int c = 0; c < old_coords->num_components; c++) {
+		if (c < ARRAY_SIZE(divisors) && divisors[c] > 1 &&
+		    conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) {
+			if (!image_size)
+				image_size = get_texture_size(state, state->tex_deref);
+
+			comp[c] = implicit_downsampled_coord(b,
+			                                     nir_channel(b, old_coords, c),
+			                                     nir_channel(b, image_size, c),
+			                                     divisors[c]);
+		} else {
+			comp[c] = nir_channel(b, old_coords, c);
+		}
+	}
+
+	return nir_vec(b, comp, old_coords->num_components);
+}
+
 static nir_ssa_def *
 create_plane_tex_instr_implicit(struct ycbcr_state *state,
                                uint32_t plane)
@@ -163,10 +230,23 @@ create_plane_tex_instr_implicit(struct ycbcr_state *state,
 	nir_builder *b = state->builder;
 	nir_tex_instr *old_tex = state->origin_tex;
 	nir_tex_instr *tex = nir_tex_instr_create(b->shader, old_tex->num_srcs+ 1);
-
 	for (uint32_t i = 0; i < old_tex->num_srcs; i++) {
 		tex->src[i].src_type = old_tex->src[i].src_type;
-		nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, tex);
+
+		switch (old_tex->src[i].src_type) {
+		case nir_tex_src_coord:
+			if (plane && true/*state->conversion->chroma_reconstruction*/) {
+				assert(old_tex->src[i].src.is_ssa);
+				tex->src[i].src =
+					nir_src_for_ssa(implicit_downsampled_coords(state,
+					                                            old_tex->src[i].src.ssa));
+				break;
+			}
+		/* fall through */
+		default:
+			nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, tex);
+			break;
+		}
 	}

 	tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane));
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2019,16 +2019,34 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,

 	assert(stride % type_size == 0);

-	if (!index)
-		index = ctx->ac.i32_0;
+	LLVMValueRef adjusted_index = index;
+	if (!adjusted_index)
+		adjusted_index = ctx->ac.i32_0;

-	index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
+	adjusted_index = LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");

 	list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0));
 	list = LLVMBuildPointerCast(builder, list,
 				    ac_array_in_const32_addr_space(type), "");

-	return ac_build_load_to_sgpr(&ctx->ac, list, index);
+	LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
+
+	/* 3 plane formats always have same size and format for plane 1 & 2, so
+	 * use the tail from plane 1 so that we can store only the first 16 bytes
+	 * of the last plane. */
+	if (desc_type == AC_DESC_PLANE_2) {
+		LLVMValueRef descriptor2 = radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, AC_DESC_PLANE_1,image, write, bindless);
+
+		LLVMValueRef components[8];
+		for (unsigned i = 0; i < 4; ++i)
+			components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
+
+		for (unsigned i = 4; i < 8; ++i)
+			components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
+		descriptor = ac_build_gather_values(&ctx->ac, components, 8);
+	}
+
+	return descriptor;
 }

 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1417,11 +1417,13 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,

 	const  VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
 			vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
-	if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
+	if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
 		dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount;
-		typed_memcpy(dynamic->discard_rectangle.rectangles,
-		             discard_rectangle_info->pDiscardRectangles,
-		             discard_rectangle_info->discardRectangleCount);
+		if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
+			typed_memcpy(dynamic->discard_rectangle.rectangles,
+			             discard_rectangle_info->pDiscardRectangles,
+			             discard_rectangle_info->discardRectangleCount);
+		}
 	}

 	pipeline->dynamic_state.mask = states;
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -624,6 +624,8 @@ shader_variant_create(struct radv_device *device,
 		tm_options |= AC_TM_SISCHED;
 	if (options->check_ir)
 		tm_options |= AC_TM_CHECK_IR;
+	if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT)
+		tm_options |= AC_TM_NO_LOAD_STORE_OPT;

 	thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);
 	radv_init_llvm_once();
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -244,6 +244,7 @@ NIR_FILES = \
 	nir/nir_lower_constant_initializers.c \
 	nir/nir_lower_double_ops.c \
 	nir/nir_lower_drawpixels.c \
+	nir/nir_lower_fb_read.c \
 	nir/nir_lower_fragcoord_wtrans.c \
 	nir/nir_lower_frexp.c \
 	nir/nir_lower_global_vars_to_local.c \
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -50,7 +50,7 @@ glsl_type::glsl_type(GLenum gl_type,
   gl_type(gl_type),
   base_type(base_type), sampled_type(GLSL_TYPE_VOID),
   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   interface_packing(0), interface_row_major(row_major),
+   interface_packing(0), interface_row_major(row_major), packed(0),
   vector_elements(vector_elements), matrix_columns(matrix_columns),
   length(0), explicit_stride(explicit_stride)
 {
@@ -85,7 +85,7 @@ glsl_type::glsl_type(GLenum gl_type, glsl_base_type base_type,
   base_type(base_type), sampled_type(type),
   sampler_dimensionality(dim), sampler_shadow(shadow),
   sampler_array(array), interface_packing(0),
-   interface_row_major(0),
+   interface_row_major(0), packed(0),
   length(0), explicit_stride(0)
 {
   this->mem_ctx = ralloc_context(NULL);
@@ -134,7 +134,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
   base_type(GLSL_TYPE_INTERFACE), sampled_type(GLSL_TYPE_VOID),
   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
   interface_packing((unsigned) packing),
-   interface_row_major((unsigned) row_major),
+   interface_row_major((unsigned) row_major), packed(0),
   vector_elements(0), matrix_columns(0),
   length(num_fields), explicit_stride(0)
 {
@@ -159,7 +159,7 @@ glsl_type::glsl_type(const glsl_type *return_type,
   gl_type(0),
   base_type(GLSL_TYPE_FUNCTION), sampled_type(GLSL_TYPE_VOID),
   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   interface_packing(0), interface_row_major(0),
+   interface_packing(0), interface_row_major(0), packed(0),
   vector_elements(0), matrix_columns(0),
   length(num_params), explicit_stride(0)
 {
@@ -188,7 +188,7 @@ glsl_type::glsl_type(const char *subroutine_name) :
   gl_type(0),
   base_type(GLSL_TYPE_SUBROUTINE), sampled_type(GLSL_TYPE_VOID),
   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   interface_packing(0), interface_row_major(0),
+   interface_packing(0), interface_row_major(0), packed(0),
   vector_elements(1), matrix_columns(1),
   length(0), explicit_stride(0)
 {
@@ -534,7 +534,7 @@ glsl_type::glsl_type(const glsl_type *array, unsigned length,
                     unsigned explicit_stride) :
   base_type(GLSL_TYPE_ARRAY), sampled_type(GLSL_TYPE_VOID),
   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
-   interface_packing(0), interface_row_major(0),
+   interface_packing(0), interface_row_major(0), packed(0),
   vector_elements(0), matrix_columns(0),
   length(length), name(NULL), explicit_stride(explicit_stride)
 {
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -151,9 +151,11 @@ nir_variable_clone(const nir_variable *var, nir_shader *shader)
   nvar->name = ralloc_strdup(nvar, var->name);
   nvar->data = var->data;
   nvar->num_state_slots = var->num_state_slots;
-   nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots);
-   memcpy(nvar->state_slots, var->state_slots,
-          var->num_state_slots * sizeof(nir_state_slot));
+   if (var->num_state_slots) {
+      nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots);
+      memcpy(nvar->state_slots, var->state_slots,
+             var->num_state_slots * sizeof(nir_state_slot));
+   }
   if (var->constant_initializer) {
      nvar->constant_initializer =
         nir_constant_clone(var->constant_initializer, nvar);
--- a/src/compiler/nir/nir_lower_non_uniform_access.c
+++ b/src/compiler/nir/nir_lower_non_uniform_access.c
@@ -34,6 +34,7 @@ read_first_invocation(nir_builder *b, nir_ssa_def *x)
   first->src[0] = nir_src_for_ssa(x);
   nir_ssa_dest_init(&first->instr, &first->dest,
                     x->num_components, x->bit_size, NULL);
+   nir_builder_instr_insert(b, &first->instr);
   return &first->dest.ssa;
 }

@@ -129,7 +130,7 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
+      nir_foreach_instr_safe(instr, block) {
         switch (instr->type) {
         case nir_instr_type_tex: {
            nir_tex_instr *tex = nir_instr_as_tex(instr);
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -1086,9 +1086,6 @@ late_optimizations = [
   (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

-   (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)),
-   (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)),
-
   # we do these late so that we don't get in the way of creating ffmas
   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
--- a/src/compiler/nir/nir_opt_idiv_const.c
+++ b/src/compiler/nir/nir_opt_idiv_const.c
@@ -65,15 +65,17 @@ build_umod(nir_builder *b, nir_ssa_def *n, uint64_t d)
 static nir_ssa_def *
 build_idiv(nir_builder *b, nir_ssa_def *n, int64_t d)
 {
+   uint64_t abs_d = d < 0 ? -d : d;
+
   if (d == 0) {
      return nir_imm_intN_t(b, 0, n->bit_size);
   } else if (d == 1) {
      return n;
   } else if (d == -1) {
      return nir_ineg(b, n);
-   } else if (util_is_power_of_two_or_zero64(d)) {
-      uint64_t abs_d = d < 0 ? -d : d;
-      nir_ssa_def *uq = nir_ishr(b, n, nir_imm_int(b, util_logbase2_64(abs_d)));
+   } else if (util_is_power_of_two_or_zero64(abs_d)) {
+      nir_ssa_def *uq = nir_ushr(b, nir_iabs(b, n),
+                                    nir_imm_int(b, util_logbase2_64(abs_d)));
      nir_ssa_def *n_neg = nir_ilt(b, n, nir_imm_intN_t(b, 0, n->bit_size));
      nir_ssa_def *neg = d < 0 ? nir_inot(b, n_neg) : n_neg;
      return nir_bcsel(b, neg, nir_ineg(b, uq), uq);
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -143,22 +143,6 @@ is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components,
   return !nir_src_is_const(instr->src[src].src);
 }

-static inline bool
-is_used_more_than_once(nir_alu_instr *instr)
-{
-   bool zero_if_use = list_empty(&instr->dest.dest.ssa.if_uses);
-   bool zero_use = list_empty(&instr->dest.dest.ssa.uses);
-
-   if (zero_use && zero_if_use)
-      return false;
-   else if (zero_use && list_is_singular(&instr->dest.dest.ssa.if_uses))
-      return false;
-   else if (zero_if_use && list_is_singular(&instr->dest.dest.ssa.uses))
-      return false;
-
-   return true;
-}
-
 static inline bool
 is_used_once(nir_alu_instr *instr)
 {
--- a/src/freedreno/Android.drm.mk
+++ b/src/freedreno/Android.drm.mk
@@ -0,0 +1,41 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# Android.mk for libfreedreno_drm.a
+
+# ---------------------------------------
+# Build libfreedreno_drm
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+	$(drm_SOURCES)
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/gallium/include \
+	$(MESA_TOP)/src/gallium/auxiliary
+
+LOCAL_MODULE := libfreedreno_drm
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
--- a/src/freedreno/Android.ir3.mk
+++ b/src/freedreno/Android.ir3.mk
@@ -0,0 +1,51 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# Android.mk for libfreedreno_ir3.a
+
+# ---------------------------------------
+# Build libfreedreno_ir3
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+	$(ir3_SOURCES)
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/compiler/nir \
+	$(MESA_TOP)/src/gallium/include \
+	$(MESA_TOP)/src/gallium/auxiliary \
+	$(MESA_TOP)/prebuilt-intermediates/nir \
+
+# We need libmesa_nir to get NIR's generated include directories.
+LOCAL_STATIC_LIBRARIES := \
+	libmesa_nir
+
+LOCAL_MODULE := libfreedreno_ir3
+
+LOCAL_GENERATED_SOURCES := \
+	$(MESA_GEN_GLSL_H) \
+	$(MESA_GEN_NIR_H)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
--- a/src/freedreno/Android.mk
+++ b/src/freedreno/Android.mk
@@ -0,0 +1,30 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# Android.mk for libfreedreno_*
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+include $(MESA_TOP)/src/gallium/drivers/freedreno/Android.gen.mk
+include $(LOCAL_PATH)/Android.drm.mk
+include $(LOCAL_PATH)/Android.ir3.mk
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -36,6 +36,8 @@ ir3_SOURCES := \
 	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
 	ir3/ir3_nir_analyze_ubo_ranges.c \
+	ir3/ir3_nir_lower_load_barycentric_at_sample.c \
+	ir3/ir3_nir_lower_load_barycentric_at_offset.c \
 	ir3/ir3_nir_lower_io_offsets.c \
 	ir3/ir3_nir_lower_tg4_to_tex.c \
 	ir3/ir3_nir_move_varying_inputs.c \
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1304,7 +1304,8 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 			idx += nir_src_as_uint(intr->src[1]);
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned inloc = idx * 4 + i + comp;
-				if (ctx->so->inputs[idx].bary) {
+				if (ctx->so->inputs[idx].bary &&
+						!ctx->so->inputs[idx].use_ldlv) {
 					dst[i] = ir3_BARY_F(b, create_immed(b, inloc), 0, coord, 0);
 				} else {
 					/* for non-varyings use the pre-setup input, since
@@ -2402,8 +2403,6 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
 				so->inputs[n].bary = true;
 				instr = create_frag_input(ctx, false, idx);
 			} else {
-				bool use_ldlv = false;
-
 				/* detect the special case for front/back colors where
 				 * we need to do flat vs smooth shading depending on
 				 * rast state:
@@ -2424,12 +2423,12 @@ setup_input(struct ir3_context *ctx, nir_variable *in)
 				if (ctx->compiler->flat_bypass) {
 					if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
 							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
-						use_ldlv = true;
+						so->inputs[n].use_ldlv = true;
 				}

 				so->inputs[n].bary = true;

-				instr = create_frag_input(ctx, use_ldlv, idx);
+				instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx);
 			}

 			compile_assert(ctx, idx < ctx->ir->ninputs);
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -414,6 +414,7 @@ struct ir3_shader_variant {
 		/* fragment shader specific: */
 		bool    bary       : 1;   /* fetched varying (vs one loaded into reg) */
 		bool    rasterflat : 1;   /* special handling for emit->rasterflat */
+		bool    use_ldlv   : 1;   /* internal to ir3_compiler_nir */
 		bool    half       : 1;
 		enum glsl_interp_mode interpolate;
 	} inputs[16 + 2];  /* +POSITION +FACE */
--- a/src/gallium/Android.common.mk
+++ b/src/gallium/Android.common.mk
@@ -27,6 +27,9 @@ LOCAL_C_INCLUDES += \
 	$(GALLIUM_TOP)/include \
 	$(GALLIUM_TOP)/auxiliary \
 	$(GALLIUM_TOP)/winsys \
-	$(GALLIUM_TOP)/drivers
+	$(GALLIUM_TOP)/drivers \
+	$(MESA_TOP)/src/freedreno \
+	$(MESA_TOP)/src/freedreno/ir3 \
+	$(MESA_TOP)/src/freedreno/registers

 include $(MESA_COMMON_MK)
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -950,6 +950,8 @@ draw_set_mapped_so_targets(struct draw_context *draw,
 {
   int i;

+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
   for (i = 0; i < num_targets; i++)
      draw->so.targets[i] = targets[i];
   for (i = num_targets; i < PIPE_MAX_SO_BUFFERS; i++)
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -753,8 +753,10 @@ void draw_gs_destroy( struct draw_context *draw )
 {
   int i;
   if (draw->gs.tgsi.machine) {
-      for (i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++)
+      for (i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
         align_free(draw->gs.tgsi.machine->Primitives[i]);
+         align_free(draw->gs.tgsi.machine->PrimitiveOffsets[i]);
+      }
      tgsi_exec_machine_destroy(draw->gs.tgsi.machine);
   }
 }
--- a/src/gallium/drivers/freedreno/Android.gen.mk
+++ b/src/gallium/drivers/freedreno/Android.gen.mk
@@ -25,7 +25,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 endif

 ir3_nir_trig_deps := \
-	$(LOCAL_PATH)/ir3/ir3_nir_trig.py \
+	$(MESA_TOP)/src/freedreno/ir3/ir3_nir_trig.py \
 	$(MESA_TOP)/src/compiler/nir/nir_algebraic.py

 intermediates := $(call local-generated-sources-dir)
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -44,7 +44,7 @@ LOCAL_C_INCLUDES := \
 LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)

 LOCAL_SHARED_LIBRARIES := libdrm
-LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir
+LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3
 LOCAL_MODULE := libmesa_pipe_freedreno

 include $(LOCAL_PATH)/Android.gen.mk
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -631,6 +631,7 @@ iris_emit_l3_config(struct iris_batch *batch, const struct gen_l3_config *cfg,
       * desirable behavior.
       */
      reg.ErrorDetectionBehaviorControl = true;
+      reg.UseFullWays = true;
 #endif
      reg.URBAllocation = cfg->n[GEN_L3P_URB];
      reg.ROAllocation = cfg->n[GEN_L3P_RO];
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -995,7 +995,7 @@ emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
 {
        nir_ssa_def def = instr->def;

-        float *v = ralloc_array(NULL, float, 4);
+        float *v = rzalloc_array(NULL, float, 4);
        nir_const_load_to_arr(v, instr, f32);
        _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
 }
--- a/src/gallium/drivers/panfrost/pan_swizzle.c
+++ b/src/gallium/drivers/panfrost/pan_swizzle.c
@@ -164,10 +164,10 @@ panfrost_texture_swizzle(unsigned off_x,

        /* Use fast path if available */
        if (!(off_x || off_y) && (width == dest_width)) {
-                if (bytes_per_pixel == 4 /* && (ALIGN(width, 16) == width) */) {
+                if (bytes_per_pixel == 4 && (ALIGN(width, 16) == width)) {
                        swizzle_bpp4_align16(width, height, source_stride >> 2, (block_pitch * 256 >> 4), (const uint32_t *) pixels, (uint32_t *) ldest);
                        return;
-                } else if (bytes_per_pixel == 1 /* && (ALIGN(width, 16) == width) */) {
+                } else if (bytes_per_pixel == 1 && (ALIGN(width, 16) == width)) {
                        swizzle_bpp1_align16(width, height, source_stride, (block_pitch * 256 >> 4), pixels, (uint8_t *) ldest);
                        return;
                }
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -287,11 +287,9 @@ si_invalidate_buffer(struct si_context *sctx,
 	/* Check if mapping this buffer would cause waiting for the GPU. */
 	if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
 	    !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
-		uint64_t old_va = buf->gpu_address;
-
 		/* Reallocate the buffer in the same pipe_resource. */
 		si_alloc_resource(sctx->screen, buf);
-		si_rebind_buffer(sctx, &buf->b.b, old_va);
+		si_rebind_buffer(sctx, &buf->b.b);
 	} else {
 		util_range_set_empty(&buf->valid_buffer_range);
 	}
@@ -307,7 +305,6 @@ void si_replace_buffer_storage(struct pipe_context *ctx,
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_resource *sdst = si_resource(dst);
 	struct si_resource *ssrc = si_resource(src);
-	uint64_t old_gpu_address = sdst->gpu_address;

 	pb_reference(&sdst->buf, ssrc->buf);
 	sdst->gpu_address = ssrc->gpu_address;
@@ -322,7 +319,7 @@ void si_replace_buffer_storage(struct pipe_context *ctx,
 	assert(sdst->bo_alignment == ssrc->bo_alignment);
 	assert(sdst->domains == ssrc->domains);

-	si_rebind_buffer(sctx, dst, old_gpu_address);
+	si_rebind_buffer(sctx, dst);
 }

 static void si_invalidate_resource(struct pipe_context *ctx,
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -999,6 +999,7 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 	buffers->priority = priority;
 	buffers->priority_constbuf = priority_constbuf;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
+	buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0]));

 	si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers);
 }
@@ -1013,6 +1014,7 @@ static void si_release_buffer_resources(struct si_buffer_resources *buffers,
 	}

 	FREE(buffers->buffers);
+	FREE(buffers->offsets);
 }

 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
@@ -1219,11 +1221,10 @@ static void si_set_constant_buffer(struct si_context *sctx,
 	if (input && (input->buffer || input->user_buffer)) {
 		struct pipe_resource *buffer = NULL;
 		uint64_t va;
+		unsigned buffer_offset;

 		/* Upload the user buffer if needed. */
 		if (input->user_buffer) {
-			unsigned buffer_offset;
-
 			si_upload_const_buffer(sctx,
 					       (struct si_resource**)&buffer, input->user_buffer,
 					       input->buffer_size, &buffer_offset);
@@ -1232,12 +1233,13 @@ static void si_set_constant_buffer(struct si_context *sctx,
 				si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
 				return;
 			}
-			va = si_resource(buffer)->gpu_address + buffer_offset;
 		} else {
 			pipe_resource_reference(&buffer, input->buffer);
-			va = si_resource(buffer)->gpu_address + input->buffer_offset;
+			buffer_offset = input->buffer_offset;
 		}

+		va = si_resource(buffer)->gpu_address + buffer_offset;
+
 		/* Set the descriptor. */
 		uint32_t *desc = descs->list + slot*4;
 		desc[0] = va;
@@ -1252,6 +1254,7 @@ static void si_set_constant_buffer(struct si_context *sctx,
 			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

 		buffers->buffers[slot] = buffer;
+		buffers->offsets[slot] = buffer_offset;
 		radeon_add_to_gfx_buffer_list_check_mem(sctx,
 							si_resource(buffer),
 							RADEON_USAGE_READ,
@@ -1336,6 +1339,7 @@ static void si_set_shader_buffer(struct si_context *sctx,
 		  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

 	pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
+	buffers->offsets[slot] = sbuffer->buffer_offset;
 	radeon_add_to_gfx_buffer_list_check_mem(sctx, buf,
 						writable ? RADEON_USAGE_READWRITE :
 							   RADEON_USAGE_READ,
@@ -1505,20 +1509,6 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot,
 	sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
 }

-static void si_desc_reset_buffer_offset(uint32_t *desc, uint64_t old_buf_va,
-					struct pipe_resource *new_buf)
-{
-	/* Retrieve the buffer offset from the descriptor. */
-	uint64_t old_desc_va = si_desc_extract_buffer_address(desc);
-
-	assert(old_buf_va <= old_desc_va);
-	uint64_t offset_within_buffer = old_desc_va - old_buf_va;
-
-	/* Update the descriptor. */
-	si_set_buf_desc_address(si_resource(new_buf), offset_within_buffer,
-				desc);
-}
-
 /* INTERNAL CONST BUFFERS */

 static void si_set_polygon_stipple(struct pipe_context *ctx,
@@ -1603,7 +1593,6 @@ static void si_reset_buffer_resources(struct si_context *sctx,
 				      unsigned descriptors_idx,
 				      unsigned slot_mask,
 				      struct pipe_resource *buf,
-				      uint64_t old_va,
 				      enum radeon_bo_priority priority)
 {
 	struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
@@ -1612,8 +1601,8 @@ static void si_reset_buffer_resources(struct si_context *sctx,
 	while (mask) {
 		unsigned i = u_bit_scan(&mask);
 		if (buffers->buffers[i] == buf) {
-			si_desc_reset_buffer_offset(descs->list + i*4,
-						    old_va, buf);
+			si_set_buf_desc_address(si_resource(buf), buffers->offsets[i],
+						descs->list + i*4);
 			sctx->descriptors_dirty |= 1u << descriptors_idx;

 			radeon_add_to_gfx_buffer_list_check_mem(sctx,
@@ -1629,8 +1618,7 @@ static void si_reset_buffer_resources(struct si_context *sctx,
 /* Update all resource bindings where the buffer is bound, including
 * all resource descriptors. This is invalidate_buffer without
 * the invalidation. */
-void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
-		      uint64_t old_va)
+void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
 {
 	struct si_resource *buffer = si_resource(buf);
 	unsigned i, shader;
@@ -1670,8 +1658,8 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
 			if (buffers->buffers[i] != buf)
 				continue;

-			si_desc_reset_buffer_offset(descs->list + i*4,
-						    old_va, buf);
+			si_set_buf_desc_address(si_resource(buf), buffers->offsets[i],
+						descs->list + i*4);
 			sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;

 			radeon_add_to_gfx_buffer_list_check_mem(sctx,
@@ -1694,7 +1682,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
 			si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
 						  si_const_and_shader_buffer_descriptors_idx(shader),
 						  u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
-						  buf, old_va,
+						  buf,
 						  sctx->const_and_shader_buffers[shader].priority_constbuf);
 	}

@@ -1703,7 +1691,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
 			si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
 						  si_const_and_shader_buffer_descriptors_idx(shader),
 						  u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS),
-						  buf, old_va,
+						  buf,
 						  sctx->const_and_shader_buffers[shader].priority);
 	}

@@ -1720,9 +1708,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
 				if (samplers->views[i]->texture == buf) {
 					unsigned desc_slot = si_get_sampler_slot(i);

-					si_desc_reset_buffer_offset(descs->list +
-								    desc_slot * 16 + 4,
-								    old_va, buf);
+					si_set_buf_desc_address(si_resource(buf),
+								samplers->views[i]->u.buf.offset,
+								descs->list + desc_slot * 16 + 4);
 					sctx->descriptors_dirty |=
 						1u << si_sampler_and_image_descriptors_idx(shader);

@@ -1752,9 +1740,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
 					if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
 						si_mark_image_range_valid(&images->views[i]);

-					si_desc_reset_buffer_offset(
-						descs->list + desc_slot * 8 + 4,
-						old_va, buf);
+					si_set_buf_desc_address(si_resource(buf),
+								images->views[i].u.buf.offset,
+								descs->list + desc_slot * 8 + 4);
 					sctx->descriptors_dirty |=
 						1u << si_sampler_and_image_descriptors_idx(shader);

--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -409,6 +409,7 @@ struct si_descriptors {

 struct si_buffer_resources {
 	struct pipe_resource		**buffers; /* this has num_buffers elements */
+	unsigned			*offsets; /* this has num_buffers elements */

 	enum radeon_bo_priority		priority:6;
 	enum radeon_bo_priority		priority_constbuf:6;
@@ -487,8 +488,7 @@ struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
 						  unsigned entry_size,
 						  unsigned group_index);
 void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab);
-void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
-		      uint64_t old_va);
+void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf);
 /* si_state.c */
 void si_init_state_compute_functions(struct si_context *sctx);
 void si_init_state_functions(struct si_context *sctx);
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -53,6 +53,10 @@ LOCAL_SHARED_LIBRARIES += \
 	libexpat
 endif

+LOCAL_STATIC_LIBRARIES += \
+	libfreedreno_drm \
+	libfreedreno_ir3
+
 ifeq ($(USE_LIBBACKTRACE),true)
 	LOCAL_SHARED_LIBRARIES += libbacktrace
 endif
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -78,8 +78,8 @@ foreach d : [[with_gallium_kmsro, [
               'pl111_dri.so',
               'repaper_dri.so',
               'rockchip_dri.so',
-               'st7586.so',
-               'st7735r.so',
+               'st7586_dri.so',
+               'st7735r_dri.so',
 	       'sun4i-drm_dri.so',
             ]],
             [with_gallium_radeonsi, 'radeonsi_dri.so'],
--- a/src/gallium/targets/osmesa/meson.build
+++ b/src/gallium/targets/osmesa/meson.build
@@ -43,9 +43,9 @@ libosmesa = shared_library(
    inc_gallium_drivers,
  ],
  link_depends : osmesa_link_deps,
-  link_whole : [libosmesa_st],
+  link_whole : [libosmesa_st, libglapi_static],
  link_with : [
-    libmesa_gallium, libgallium, libglapi_static, libws_null, osmesa_link_with,
+    libmesa_gallium, libgallium, libws_null, osmesa_link_with,
  ],
  dependencies : [
    dep_selinux, dep_thread, dep_clock, dep_unwind,
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -378,7 +378,8 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC &&
-          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC;
+          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC &&
+          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG;
 }

 static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
--- a/src/glx/glx_error.c
+++ b/src/glx/glx_error.c
@@ -54,7 +54,7 @@ __glXSendError(Display * dpy, int_fast8_t errorCode, uint_fast32_t resourceID,
      error.errorCode = glx_dpy->codes->first_error + errorCode;
   }

-   error.sequenceNumber = dpy->last_request_read;
+   error.sequenceNumber = dpy->request;
   error.resourceID = resourceID;
   error.minorCode = minorCode;
   error.majorCode = glx_dpy->majorOpcode;
@@ -73,7 +73,7 @@ __glXSendErrorForXcb(Display * dpy, const xcb_generic_error_t *err)

   error.type = X_Error;
   error.errorCode = err->error_code;
-   error.sequenceNumber = dpy->last_request_read;
+   error.sequenceNumber = err->sequence;
   error.resourceID = err->resource_id;
   error.minorCode = err->minor_code;
   error.majorCode = err->major_code;
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -591,7 +591,7 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
-         for (unsigned i = 0; i < 3; i++) {
+         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == VGRF) {
               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
            }
@@ -710,14 +710,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
         if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
             inst->src[2].file == VGRF &&
             inst->src[3].file == VGRF &&
-             inst->src[2].nr != inst->src[3].nr) {
-            for (unsigned i = 0; i < inst->mlen; i++) {
-               for (unsigned j = 0; j < inst->ex_mlen; j++) {
-                  ra_add_node_interference(g, inst->src[2].nr + i,
-                                           inst->src[3].nr + j);
-               }
-            }
-         }
+             inst->src[2].nr != inst->src[3].nr)
+            ra_add_node_interference(g, inst->src[2].nr,
+                                     inst->src[3].nr);
      }
   }

--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -165,7 +165,7 @@ anv_state_table_init(struct anv_state_table *table,
      goto fail_fd;
   }

-   if (!u_vector_init(&table->mmap_cleanups,
+   if (!u_vector_init(&table->cleanups,
                      round_to_power_of_two(sizeof(struct anv_state_table_cleanup)),
                      128)) {
      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
@@ -179,12 +179,12 @@ anv_state_table_init(struct anv_state_table *table,
   uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
   result = anv_state_table_expand_range(table, initial_size);
   if (result != VK_SUCCESS)
-      goto fail_mmap_cleanups;
+      goto fail_cleanups;

   return VK_SUCCESS;

- fail_mmap_cleanups:
-   u_vector_finish(&table->mmap_cleanups);
+ fail_cleanups:
+   u_vector_finish(&table->cleanups);
 fail_fd:
   close(table->fd);

@@ -195,7 +195,7 @@ static VkResult
 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
 {
   void *map;
-   struct anv_mmap_cleanup *cleanup;
+   struct anv_state_table_cleanup *cleanup;

   /* Assert that we only ever grow the pool */
   assert(size >= table->state.end);
@@ -204,11 +204,11 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
   if (size > BLOCK_POOL_MEMFD_SIZE)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

-   cleanup = u_vector_add(&table->mmap_cleanups);
+   cleanup = u_vector_add(&table->cleanups);
   if (!cleanup)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

-   *cleanup = ANV_MMAP_CLEANUP_INIT;
+   *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;

   /* Just leak the old map until we destroy the pool.  We can't munmap it
    * without races or imposing locking on the block allocate fast path. On
@@ -272,12 +272,12 @@ anv_state_table_finish(struct anv_state_table *table)
 {
   struct anv_state_table_cleanup *cleanup;

-   u_vector_foreach(cleanup, &table->mmap_cleanups) {
+   u_vector_foreach(cleanup, &table->cleanups) {
      if (cleanup->map)
         munmap(cleanup->map, cleanup->size);
   }

-   u_vector_finish(&table->mmap_cleanups);
+   u_vector_finish(&table->cleanups);

   close(table->fd);
 }
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -103,6 +103,12 @@ anv_descriptor_data_for_type(const struct anv_physical_device *device,
        type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC))
      data |= ANV_DESCRIPTOR_ADDRESS_RANGE;

+   /* On Ivy Bridge and Bay Trail, we need swizzles textures in the shader */
+   if (device->info.gen == 7 && !device->info.is_haswell &&
+       (type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE ||
+        type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER))
+      data |= ANV_DESCRIPTOR_TEXTURE_SWIZZLE;
+
   return data;
 }

@@ -123,6 +129,9 @@ anv_descriptor_data_size(enum anv_descriptor_data data)
   if (data & ANV_DESCRIPTOR_ADDRESS_RANGE)
      size += sizeof(struct anv_address_range_descriptor);

+   if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE)
+      size += sizeof(struct anv_texture_swizzle_descriptor);
+
   return size;
 }

@@ -1184,6 +1193,26 @@ anv_descriptor_set_write_image_view(struct anv_device *device,

      anv_descriptor_set_write_image_param(desc_map, image_param);
   }
+
+   if (bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) {
+      assert(!(bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE));
+      assert(image_view);
+      struct anv_texture_swizzle_descriptor desc_data[3];
+      memset(desc_data, 0, sizeof(desc_data));
+
+      for (unsigned p = 0; p < image_view->n_planes; p++) {
+         desc_data[p] = (struct anv_texture_swizzle_descriptor) {
+            .swizzle = {
+               (uint8_t)image_view->planes[p].isl.swizzle.r,
+               (uint8_t)image_view->planes[p].isl.swizzle.g,
+               (uint8_t)image_view->planes[p].isl.swizzle.b,
+               (uint8_t)image_view->planes[p].isl.swizzle.a,
+            },
+         };
+      }
+      memcpy(desc_map, desc_data,
+             MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0]));
+   }
 }

 void
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1170,6 +1170,11 @@ void anv_GetPhysicalDeviceFeatures2(
   }
 }

+#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS   64
+
+#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64
+#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS       256
+
 void anv_GetPhysicalDeviceProperties(
    VkPhysicalDevice                            physicalDevice,
    VkPhysicalDeviceProperties*                 pProperties)
@@ -1215,20 +1220,20 @@ void anv_GetPhysicalDeviceProperties(
      .sparseAddressSpaceSize                   = 0,
      .maxBoundDescriptorSets                   = MAX_SETS,
      .maxPerStageDescriptorSamplers            = max_samplers,
-      .maxPerStageDescriptorUniformBuffers      = 64,
+      .maxPerStageDescriptorUniformBuffers      = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
      .maxPerStageDescriptorStorageBuffers      = max_ssbos,
      .maxPerStageDescriptorSampledImages       = max_textures,
      .maxPerStageDescriptorStorageImages       = max_images,
-      .maxPerStageDescriptorInputAttachments    = 64,
+      .maxPerStageDescriptorInputAttachments    = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
      .maxPerStageResources                     = max_per_stage,
      .maxDescriptorSetSamplers                 = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
-      .maxDescriptorSetUniformBuffers           = 6 * 64,           /* number of stages * maxPerStageDescriptorUniformBuffers */
+      .maxDescriptorSetUniformBuffers           = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,           /* number of stages * maxPerStageDescriptorUniformBuffers */
      .maxDescriptorSetUniformBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
      .maxDescriptorSetStorageBuffers           = 6 * max_ssbos,    /* number of stages * maxPerStageDescriptorStorageBuffers */
      .maxDescriptorSetStorageBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
      .maxDescriptorSetSampledImages            = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
      .maxDescriptorSetStorageImages            = 6 * max_images,   /* number of stages * maxPerStageDescriptorStorageImages */
-      .maxDescriptorSetInputAttachments         = 256,
+      .maxDescriptorSetInputAttachments         = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
      .maxVertexInputAttributes                 = MAX_VBS,
      .maxVertexInputBindings                   = MAX_VBS,
      .maxVertexInputAttributeOffset            = 2047,
@@ -1393,20 +1398,20 @@ void anv_GetPhysicalDeviceProperties2(
         props->robustBufferAccessUpdateAfterBind = true;
         props->quadDivergentImplicitLod = false;
         props->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
-         props->maxPerStageDescriptorUpdateAfterBindUniformBuffers = 0;
+         props->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
         props->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
         props->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
         props->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
-         props->maxPerStageDescriptorUpdateAfterBindInputAttachments = 0;
+         props->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
         props->maxPerStageUpdateAfterBindResources = UINT32_MAX;
         props->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
-         props->maxDescriptorSetUpdateAfterBindUniformBuffers = 0;
-         props->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = 0;
+         props->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+         props->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
         props->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
         props->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
         props->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
         props->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
-         props->maxDescriptorSetUpdateAfterBindInputAttachments = 0;
+         props->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
         break;
      }

@@ -2995,6 +3000,9 @@ void anv_FreeMemory(
   if (mem->map)
      anv_UnmapMemory(_device, _mem);

+   p_atomic_add(&pdevice->memory.heaps[mem->type->heapIndex].used,
+                -mem->bo->size);
+
   anv_bo_cache_release(device, &device->bo_cache, mem->bo);

 #if defined(ANDROID) && ANDROID_API_LEVEL >= 26
@@ -3002,9 +3010,6 @@ void anv_FreeMemory(
      AHardwareBuffer_release(mem->ahw);
 #endif

-   p_atomic_add(&pdevice->memory.heaps[mem->type->heapIndex].used,
-                -mem->bo->size);
-
   vk_free2(&device->alloc, pAllocator, mem);
 }

--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -1278,6 +1278,10 @@ anv_image_fill_surface_state(struct anv_device *device,
   if (view_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT)
      view.swizzle = anv_swizzle_for_render(view.swizzle);

+   /* On Ivy Bridge and Bay Trail we do the swizzle in the shader */
+   if (device->info.gen == 7 && !device->info.is_haswell)
+      view.swizzle = ISL_SWIZZLE_IDENTITY;
+
   /* If this is a HiZ buffer we can sample from with a programmable clear
    * value (SKL+), define the clear value to the optimal constant.
    */
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -725,6 +725,10 @@ lower_image_intrinsic(nir_intrinsic_instr *intrin,
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

+   unsigned set = var->data.descriptor_set;
+   unsigned binding = var->data.binding;
+   unsigned binding_offset = state->set[set].surface_offsets[binding];
+
   nir_builder *b = &state->builder;
   b->cursor = nir_before_instr(&intrin->instr);

@@ -742,7 +746,7 @@ lower_image_intrinsic(nir_intrinsic_instr *intrin,
                               intrin->dest.ssa.bit_size, state);

      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc));
-   } else if (use_bindless) {
+   } else if (binding_offset > MAX_BINDING_TABLE_SIZE) {
      const bool write_only =
         (var->data.image.access & ACCESS_NON_READABLE) != 0;
      nir_ssa_def *desc =
@@ -750,9 +754,6 @@ lower_image_intrinsic(nir_intrinsic_instr *intrin,
      nir_ssa_def *handle = nir_channel(b, desc, write_only ? 1 : 0);
      nir_rewrite_image_intrinsic(intrin, handle, true);
   } else {
-      unsigned set = var->data.descriptor_set;
-      unsigned binding = var->data.binding;
-      unsigned binding_offset = state->set[set].surface_offsets[binding];
      unsigned array_size =
         state->layout->set[set].layout->binding[binding].array_size;

@@ -899,13 +900,98 @@ tex_instr_get_and_remove_plane_src(nir_tex_instr *tex)
   return plane;
 }

+static nir_ssa_def *
+build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx,
+                       unsigned start, unsigned end)
+{
+   if (start == end - 1) {
+      return srcs[start];
+   } else {
+      unsigned mid = start + (end - start) / 2;
+      return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)),
+                       build_def_array_select(b, srcs, idx, start, mid),
+                       build_def_array_select(b, srcs, idx, mid, end));
+   }
+}
+
+static void
+lower_gen7_tex_swizzle(nir_tex_instr *tex, unsigned plane,
+                       struct apply_pipeline_layout_state *state)
+{
+   assert(state->pdevice->info.gen == 7 && !state->pdevice->info.is_haswell);
+   if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ||
+       nir_tex_instr_is_query(tex) ||
+       tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */
+       (tex->is_shadow && tex->is_new_style_shadow))
+      return;
+
+   int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
+   assert(deref_src_idx >= 0);
+
+   nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
+   UNUSED nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   UNUSED unsigned set = var->data.descriptor_set;
+   UNUSED unsigned binding = var->data.binding;
+   UNUSED const struct anv_descriptor_set_binding_layout *bind_layout =
+      &state->layout->set[set].layout->binding[binding];
+   assert(bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE);
+
+   nir_builder *b = &state->builder;
+   b->cursor = nir_before_instr(&tex->instr);
+
+   const unsigned plane_offset =
+      plane * sizeof(struct anv_texture_swizzle_descriptor);
+   nir_ssa_def *swiz =
+      build_descriptor_load(deref, plane_offset, 1, 32, state);
+
+   b->cursor = nir_after_instr(&tex->instr);
+
+   assert(tex->dest.ssa.bit_size == 32);
+   assert(tex->dest.ssa.num_components == 4);
+
+   /* Initializing to undef is ok; nir_opt_undef will clean it up. */
+   nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
+   nir_ssa_def *comps[8];
+   for (unsigned i = 0; i < ARRAY_SIZE(comps); i++)
+      comps[i] = undef;
+
+   comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0);
+   if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float)
+      comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1);
+   else
+      comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1);
+   comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0);
+   comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1);
+   comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2);
+   comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3);
+
+   nir_ssa_def *swiz_comps[4];
+   for (unsigned i = 0; i < 4; i++) {
+      nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i));
+      swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8);
+   }
+   nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4);
+
+   /* Rewrite uses before we insert so we don't rewrite this use */
+   nir_ssa_def_rewrite_uses_after(&tex->dest.ssa,
+                                  nir_src_for_ssa(swiz_tex_res),
+                                  swiz_tex_res->parent_instr);
+}
+
 static void
 lower_tex(nir_tex_instr *tex, struct apply_pipeline_layout_state *state)
 {
-   state->builder.cursor = nir_before_instr(&tex->instr);
-
   unsigned plane = tex_instr_get_and_remove_plane_src(tex);

+   /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader.  Do this
+    * before we lower the derefs away so we can still find the descriptor.
+    */
+   if (state->pdevice->info.gen == 7 && !state->pdevice->info.is_haswell)
+      lower_gen7_tex_swizzle(tex, plane, state);
+
+   state->builder.cursor = nir_before_instr(&tex->instr);
+
   lower_tex_deref(tex, nir_tex_src_texture_deref,
                   &tex->texture_index, plane, state);

--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -400,12 +400,12 @@ populate_wm_prog_key(const struct gen_device_info *devinfo,
       * harmless to compute it and then let dead-code take care of it.
       */
      if (ms_info->rasterizationSamples > 1) {
-         key->persample_interp =
+         key->persample_interp = ms_info->sampleShadingEnable &&
            (ms_info->minSampleShading * ms_info->rasterizationSamples) > 1;
         key->multisample_fbo = true;
      }

-      key->frag_coord_adds_sample_pos = ms_info->sampleShadingEnable;
+      key->frag_coord_adds_sample_pos = key->persample_interp;
   }
 }

--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -744,7 +744,7 @@ struct anv_state_table {
   struct anv_free_entry *map;
   uint32_t size;
   struct anv_block_state state;
-   struct u_vector mmap_cleanups;
+   struct u_vector cleanups;
 };

 struct anv_state_pool {
@@ -1548,6 +1548,17 @@ struct anv_sampled_image_descriptor {
   uint32_t sampler;
 };

+struct anv_texture_swizzle_descriptor {
+   /** Texture swizzle
+    *
+    * See also nir_intrinsic_channel_select_intel
+    */
+   uint8_t swizzle[4];
+
+   /** Unused padding to ensure the struct is a multiple of 64 bits */
+   uint32_t _pad;
+};
+
 /** Struct representing a storage image descriptor */
 struct anv_storage_image_descriptor {
   /** Bindless image handles
@@ -1589,6 +1600,8 @@ enum anv_descriptor_data {
   ANV_DESCRIPTOR_SAMPLED_IMAGE  = (1 << 6),
   /** Storage image handles */
   ANV_DESCRIPTOR_STORAGE_IMAGE  = (1 << 7),
+   /** Storage image handles */
+   ANV_DESCRIPTOR_TEXTURE_SWIZZLE  = (1 << 8),
 };

 struct anv_descriptor_set_binding_layout {
@@ -3201,7 +3214,13 @@ anv_can_sample_with_hiz(const struct gen_device_info * const devinfo,
   if (!(image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return false;

-   if (devinfo->gen < 8)
+   /* Allow this feature on BDW even though it is disabled in the BDW devinfo
+    * struct. There's documentation which suggests that this feature actually
+    * reduces performance on BDW, but it has only been observed to help so
+    * far. Sampling fast-cleared blocks on BDW must also be handled with care
+    * (see depth_stencil_attachment_compute_aux_usage() for more info).
+    */
+   if (devinfo->gen != 8 && !devinfo->has_sample_with_hiz)
      return false;

   return image->samples == 1;
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -346,14 +346,23 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
 }

 static void
-emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
-                        struct anv_address addr)
+emit_query_mi_availability(struct gen_mi_builder *b,
+                           struct anv_address addr,
+                           bool available)
+{
+   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
+}
+
+static void
+emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
+                           struct anv_address addr,
+                           bool available)
 {
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
-      pc.ImmediateData           = 1;
+      pc.ImmediateData           = available;
   }
 }

@@ -366,11 +375,39 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
 {
-   for (uint32_t i = 0; i < num_queries; i++) {
-      struct anv_address slot_addr =
-         anv_query_address(pool, first_index + i);
-      gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
-      emit_query_availability(cmd_buffer, slot_addr);
+   switch (pool->type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+   case VK_QUERY_TYPE_TIMESTAMP:
+      /* These queries are written with a PIPE_CONTROL so clear them using the
+       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
+       * of operations.
+       */
+      assert((pool->stride % 8) == 0);
+      for (uint32_t i = 0; i < num_queries; i++) {
+         struct anv_address slot_addr =
+            anv_query_address(pool, first_index + i);
+
+         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
+            emit_query_pc_availability(cmd_buffer,
+                                       anv_address_add(slot_addr, qword * 8),
+                                       false);
+         }
+         emit_query_pc_availability(cmd_buffer, slot_addr, true);
+      }
+      break;
+
+   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+      for (uint32_t i = 0; i < num_queries; i++) {
+         struct anv_address slot_addr =
+            anv_query_address(pool, first_index + i);
+         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
+         emit_query_mi_availability(b, slot_addr, true);
+      }
+      break;
+
+   default:
+      unreachable("Unsupported query type");
   }
 }

@@ -383,11 +420,28 @@ void genX(CmdResetQueryPool)(
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

-   for (uint32_t i = 0; i < queryCount; i++) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
-         sdm.Address = anv_query_address(pool, firstQuery + i);
-         sdm.ImmediateData = 0;
+   switch (pool->type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+   case VK_QUERY_TYPE_TIMESTAMP:
+      for (uint32_t i = 0; i < queryCount; i++) {
+         emit_query_pc_availability(cmd_buffer,
+                                    anv_query_address(pool, firstQuery + i),
+                                    false);
      }
+      break;
+
+   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+      struct gen_mi_builder b;
+      gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+      for (uint32_t i = 0; i < queryCount; i++)
+         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
+      break;
+   }
+
+   default:
+      unreachable("Unsupported query type");
   }
 }

@@ -525,7 +579,7 @@ void genX(CmdEndQueryIndexedEXT)(
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
-      emit_query_availability(cmd_buffer, query_addr);
+      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
@@ -543,7 +597,7 @@ void genX(CmdEndQueryIndexedEXT)(
         offset += 16;
      }

-      emit_query_availability(cmd_buffer, query_addr);
+      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

@@ -554,7 +608,7 @@ void genX(CmdEndQueryIndexedEXT)(
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
-      emit_query_availability(cmd_buffer, query_addr);
+      emit_query_mi_availability(&b, query_addr, true);
      break;

   default:
@@ -613,7 +667,7 @@ void genX(CmdWriteTimestamp)(
      break;
   }

-   emit_query_availability(cmd_buffer, query_addr);
+   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
@@ -684,7 +738,20 @@ void genX(CmdCopyQueryPoolResults)(
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
-       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) {
+       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
+       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+        * because we're about to copy values from MI commands, we need to
+        * stall the command streamer to make sure the PIPE_CONTROL values have
+        * landed, otherwise we could see inconsistent values & availability.
+        *
+        *  From the vulkan spec:
+        *
+        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+        *     previous uses of vkCmdResetQueryPool in the same queue, without
+        *     any additional synchronization."
+        */
+       pool->type == VK_QUERY_TYPE_OCCLUSION ||
+       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -402,6 +402,8 @@ vma_alloc(struct brw_bufmgr *bufmgr,
   /* Without softpin support, we let the kernel assign addresses. */
   assert(brw_using_softpin(bufmgr));

+   alignment = ALIGN(alignment, PAGE_SIZE);
+
   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
   uint64_t addr;

@@ -1717,6 +1719,9 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)

   const uint64_t _4GB = 4ull << 30;

+   /* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */
+   const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE;
+
   if (devinfo->gen >= 8 && gtt_size > _4GB) {
      bufmgr->initial_kflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

@@ -1726,9 +1731,13 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)
         bufmgr->initial_kflags |= EXEC_OBJECT_PINNED;

         util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_LOW_4G],
-                            PAGE_SIZE, _4GB);
+                            PAGE_SIZE, _4GB_minus_1);
+
+         /* Leave the last 4GB out of the high vma range, so that no state
+          * base address + size can overflow 48 bits.
+          */
         util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_OTHER],
-                            1 * _4GB, gtt_size - 1 * _4GB);
+                            1 * _4GB, gtt_size - 2 * _4GB);
      } else if (devinfo->gen >= 10) {
         /* Softpin landed in 4.5, but GVT used an aliasing PPGTT until
          * kernel commit 6b3816d69628becb7ff35978aa0751798b4a940a in
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1685,6 +1685,11 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
                                    ISL_FORMAT_RAW,
                                    3 * sizeof(GLuint), 1,
                                    RELOC_WRITE);
+
+      /* The state buffer now holds a reference to our upload, drop ours. */
+      if (bo != brw->compute.num_work_groups_bo)
+         brw_bo_unreference(bo);
+
      brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
   }
 }
--- a/src/mesa/drivers/osmesa/meson.build
+++ b/src/mesa/drivers/osmesa/meson.build
@@ -33,7 +33,8 @@ libosmesa = shared_library(
  include_directories : [
    inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux,
  ],
-  link_with : [libmesa_classic, libglapi_static, osmesa_link_with],
+  link_whole : libglapi_static,
+  link_with : [libmesa_classic, osmesa_link_with],
  dependencies : [dep_thread, dep_selinux],
  version : '8.0.0',
  install : true,
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -1767,6 +1767,10 @@ _mesa_make_current( struct gl_context *newCtx,

         check_init_viewport(newCtx, drawBuffer->Width, drawBuffer->Height);
      }
+      else {
+         _mesa_reference_framebuffer(&newCtx->WinSysDrawBuffer, NULL);
+         _mesa_reference_framebuffer(&newCtx->WinSysReadBuffer, NULL);
+      }

      if (newCtx->FirstTimeCurrent) {
         handle_first_current(newCtx);
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -1105,10 +1105,17 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi,
   else {
      GET_CURRENT_CONTEXT(ctx);

-      ret = _mesa_make_current(NULL, NULL, NULL);
-
-      if (ctx)
+      if (ctx) {
+         /* Before releasing the context, release its associated
+          * winsys buffers first. Then purge the context's winsys buffers list
+          * to free the resources of any winsys buffers that no longer have
+          * an existing drawable.
+          */
+         ret = _mesa_make_current(ctx, NULL, NULL);
         st_framebuffers_purge(ctx->st);
+      }
+
+      ret = _mesa_make_current(NULL, NULL, NULL);
   }

   return ret;
--- a/src/mesa/state_tracker/st_tgsi_lower_yuv.c
+++ b/src/mesa/state_tracker/st_tgsi_lower_yuv.c
@@ -269,31 +269,39 @@ yuv_to_rgb(struct tgsi_transform_context *tctx,
   tctx->emit_instruction(tctx, &inst);

   /* DP3 dst.x, tmpA, imm[0] */
-   inst = dp3_instruction();
-   reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X);
-   reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
-   reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W));
-   tctx->emit_instruction(tctx, &inst);
+   if (dst->Register.WriteMask & TGSI_WRITEMASK_X) {
+      inst = dp3_instruction();
+      reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X);
+      reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
+      reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W));
+      tctx->emit_instruction(tctx, &inst);
+   }

   /* DP3 dst.y, tmpA, imm[1] */
-   inst = dp3_instruction();
-   reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y);
-   reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
-   reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W));
-   tctx->emit_instruction(tctx, &inst);
+   if (dst->Register.WriteMask & TGSI_WRITEMASK_Y) {
+      inst = dp3_instruction();
+      reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y);
+      reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
+      reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W));
+      tctx->emit_instruction(tctx, &inst);
+   }

   /* DP3 dst.z, tmpA, imm[2] */
-   inst = dp3_instruction();
-   reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z);
-   reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
-   reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W));
-   tctx->emit_instruction(tctx, &inst);
+   if (dst->Register.WriteMask & TGSI_WRITEMASK_Z) {
+      inst = dp3_instruction();
+      reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z);
+      reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
+      reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W));
+      tctx->emit_instruction(tctx, &inst);
+   }

   /* MOV dst.w, imm[0].x */
-   inst = mov_instruction();
-   reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W);
-   reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W));
-   tctx->emit_instruction(tctx, &inst);
+   if (dst->Register.WriteMask & TGSI_WRITEMASK_W) {
+      inst = mov_instruction();
+      reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W);
+      reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W));
+      tctx->emit_instruction(tctx, &inst);
+   }
 }

 static void
@@ -434,7 +442,7 @@ st_tgsi_lower_yuv(const struct tgsi_token *tokens, unsigned free_slots,
   /* TODO better job of figuring out how many extra tokens we need..
    * this is a pain about tgsi_transform :-/
    */
-   newlen = tgsi_num_tokens(tokens) + 120;
+   newlen = tgsi_num_tokens(tokens) + 300;
   newtoks = tgsi_alloc_tokens(newlen);
   if (!newtoks)
      return NULL;
--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@@ -111,6 +111,11 @@ TODO: document the other workarounds.
            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
        </application>

+        <application name="Doom 3: BFG" executable="Doom3BFG.exe">
+            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
+            <option name="force_glsl_extensions_warn" value="true" />
+        </application>
+
        <application name="Dying Light" executable="DyingLightGame">
            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
        </application>
@@ -463,6 +468,9 @@ TODO: document the other workarounds.
        <application name="ARK: Survival Evolved (and unintentionally the UE4 demo template)" executable="ShooterGame">
            <option name="radeonsi_clear_db_cache_before_clear" value="true" />
        </application>
+        <application name="Counter-Strike Global Offensive" executable="csgo_linux64">
+            <option name="radeonsi_zerovram" value="true" />
+        </application>
        <application name="No Mans Sky" executable="NMS.exe">
            <option name="radeonsi_zerovram" value="true" />
        </application>
--- a/src/util/os_file.c
+++ b/src/util/os_file.c
@@ -38,11 +38,29 @@ readN(int fd, char *buf, size_t len)
   return total ? total : err;
 }

-static char *
-read_grow(int fd)
+char *
+os_read_file(const char *filename)
 {
+   /* Note that this also serves as a slight margin to avoid a 2x grow when
+    * the file is just a few bytes larger when we read it than when we
+    * fstat'ed it.
+    * The string's NULL terminator is also included in here.
+    */
   size_t len = 64;

+   int fd = open(filename, O_RDONLY);
+   if (fd == -1) {
+      /* errno set by open() */
+      return NULL;
+   }
+
+   /* Pre-allocate a buffer at least the size of the file if we can read
+    * that information.
+    */
+   struct stat stat;
+   if (fstat(fd, &stat) == 0)
+      len += stat.st_size;
+
   char *buf = malloc(len);
   if (!buf) {
      close(fd);
@@ -77,46 +95,6 @@ read_grow(int fd)
   return buf;
 }

-char *
-os_read_file(const char *filename)
-{
-   size_t len = 0;
-
-   int fd = open(filename, O_RDONLY);
-   if (fd == -1) {
-      /* errno set by open() */
-      return NULL;
-   }
-
-   struct stat stat;
-   if (fstat(fd, &stat) == 0)
-      len = stat.st_size;
-
-   if (!len)
-      return read_grow(fd);
-
-   /* add NULL terminator */
-   len++;
-
-   char *buf = malloc(len);
-   if (!buf) {
-      close(fd);
-      errno = -ENOMEM;
-      return NULL;
-   }
-
-   ssize_t read = readN(fd, buf, len - 1);
-
-   close(fd);
-
-   if (read == -1)
-      return NULL;
-
-   buf[read] = '\0';
-
-   return buf;
-}
-
 #else

 char *
--- a/src/vulkan/overlay-layer/overlay.cpp
+++ b/src/vulkan/overlay-layer/overlay.cpp
@@ -109,6 +109,23 @@ struct queue_data {
   struct list_head running_command_buffer;
 };

+struct overlay_draw {
+   struct list_head link;
+
+   VkCommandBuffer command_buffer;
+
+   VkSemaphore semaphore;
+   VkFence fence;
+
+   VkBuffer vertex_buffer;
+   VkDeviceMemory vertex_buffer_mem;
+   VkDeviceSize vertex_buffer_size;
+
+   VkBuffer index_buffer;
+   VkDeviceMemory index_buffer_mem;
+   VkDeviceSize index_buffer_size;
+};
+
 /* Mapped from VkSwapchainKHR */
 struct swapchain_data {
   struct device_data *device;
@@ -135,17 +152,7 @@ struct swapchain_data {

   VkCommandPool command_pool;

-   struct {
-      VkCommandBuffer command_buffer;
-
-      VkBuffer vertex_buffer;
-      VkDeviceMemory vertex_buffer_mem;
-      VkDeviceSize vertex_buffer_size;
-
-      VkBuffer index_buffer;
-      VkDeviceMemory index_buffer_mem;
-      VkDeviceSize index_buffer_size;
-   } frame_data[2];
+   struct list_head draws; /* List of struct overlay_draw */

   bool font_uploaded;
   VkImage font_image;
@@ -154,8 +161,6 @@ struct swapchain_data {
   VkBuffer upload_font_buffer;
   VkDeviceMemory upload_font_buffer_mem;

-   VkSemaphore submission_semaphore;
-
   /**/
   ImGuiContext* imgui_context;
   ImVec2 window_size;
@@ -194,49 +199,45 @@ static const VkQueryPipelineStatisticFlags overlay_query_flags =
   VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
 #define OVERLAY_QUERY_COUNT (11)

-static struct hash_table *vk_object_to_data = NULL;
+static struct hash_table_u64 *vk_object_to_data = NULL;
 static simple_mtx_t vk_object_to_data_mutex = _SIMPLE_MTX_INITIALIZER_NP;

 thread_local ImGuiContext* __MesaImGui;

 static inline void ensure_vk_object_map(void)
 {
-   if (!vk_object_to_data) {
-      vk_object_to_data = _mesa_hash_table_create(NULL,
-                                                  _mesa_hash_pointer,
-                                                  _mesa_key_pointer_equal);
-   }
+   if (!vk_object_to_data)
+      vk_object_to_data = _mesa_hash_table_u64_create(NULL);
 }

-#define FIND_SWAPCHAIN_DATA(obj) ((struct swapchain_data *)find_object_data((void *) obj))
-#define FIND_CMD_BUFFER_DATA(obj) ((struct command_buffer_data *)find_object_data((void *) obj))
-#define FIND_DEVICE_DATA(obj) ((struct device_data *)find_object_data((void *) obj))
-#define FIND_QUEUE_DATA(obj) ((struct queue_data *)find_object_data((void *) obj))
-#define FIND_PHYSICAL_DEVICE_DATA(obj) ((struct instance_data *)find_object_data((void *) obj))
-#define FIND_INSTANCE_DATA(obj) ((struct instance_data *)find_object_data((void *) obj))
-static void *find_object_data(void *obj)
+#define HKEY(obj) ((uint64_t)(obj))
+#define FIND_SWAPCHAIN_DATA(obj) ((struct swapchain_data *)find_object_data(HKEY(obj)))
+#define FIND_CMD_BUFFER_DATA(obj) ((struct command_buffer_data *)find_object_data(HKEY(obj)))
+#define FIND_DEVICE_DATA(obj) ((struct device_data *)find_object_data(HKEY(obj)))
+#define FIND_QUEUE_DATA(obj) ((struct queue_data *)find_object_data(HKEY(obj)))
+#define FIND_PHYSICAL_DEVICE_DATA(obj) ((struct instance_data *)find_object_data(HKEY(obj)))
+#define FIND_INSTANCE_DATA(obj) ((struct instance_data *)find_object_data(HKEY(obj)))
+static void *find_object_data(uint64_t obj)
 {
   simple_mtx_lock(&vk_object_to_data_mutex);
   ensure_vk_object_map();
-   struct hash_entry *entry = _mesa_hash_table_search(vk_object_to_data, obj);
-   void *data = entry ? entry->data : NULL;
+   void *data = _mesa_hash_table_u64_search(vk_object_to_data, obj);
   simple_mtx_unlock(&vk_object_to_data_mutex);
   return data;
 }

-static void map_object(void *obj, void *data)
+static void map_object(uint64_t obj, void *data)
 {
   simple_mtx_lock(&vk_object_to_data_mutex);
   ensure_vk_object_map();
-   _mesa_hash_table_insert(vk_object_to_data, obj, data);
+   _mesa_hash_table_u64_insert(vk_object_to_data, obj, data);
   simple_mtx_unlock(&vk_object_to_data_mutex);
 }

-static void unmap_object(void *obj)
+static void unmap_object(uint64_t obj)
 {
   simple_mtx_lock(&vk_object_to_data_mutex);
-   struct hash_entry *entry = _mesa_hash_table_search(vk_object_to_data, obj);
-   _mesa_hash_table_remove(vk_object_to_data, entry);
+   _mesa_hash_table_u64_remove(vk_object_to_data, obj);
   simple_mtx_unlock(&vk_object_to_data_mutex);
 }

@@ -321,7 +322,7 @@ static struct instance_data *new_instance_data(VkInstance instance)
 {
   struct instance_data *data = rzalloc(NULL, struct instance_data);
   data->instance = instance;
-   map_object(data->instance, data);
+   map_object(HKEY(data->instance), data);
   return data;
 }

@@ -329,7 +330,7 @@ static void destroy_instance_data(struct instance_data *data)
 {
   if (data->params.output_file)
      fclose(data->params.output_file);
-   unmap_object(data->instance);
+   unmap_object(HKEY(data->instance));
   ralloc_free(data);
 }

@@ -348,9 +349,9 @@ static void instance_data_map_physical_devices(struct instance_data *instance_da

   for (uint32_t i = 0; i < physicalDeviceCount; i++) {
      if (map)
-         map_object(physicalDevices[i], instance_data);
+         map_object(HKEY(physicalDevices[i]), instance_data);
      else
-         unmap_object(physicalDevices[i]);
+         unmap_object(HKEY(physicalDevices[i]));
   }

   free(physicalDevices);
@@ -362,7 +363,7 @@ static struct device_data *new_device_data(VkDevice device, struct instance_data
   struct device_data *data = rzalloc(NULL, struct device_data);
   data->instance = instance;
   data->device = device;
-   map_object(data->device, data);
+   map_object(HKEY(data->device), data);
   return data;
 }

@@ -375,10 +376,10 @@ static struct queue_data *new_queue_data(VkQueue queue,
   data->device = device_data;
   data->queue = queue;
   data->flags = family_props->queueFlags;
-   data->timestamp_mask = (1ul << family_props->timestampValidBits) - 1;
+   data->timestamp_mask = (1ull << family_props->timestampValidBits) - 1;
   data->family_index = family_index;
   LIST_INITHEAD(&data->running_command_buffer);
-   map_object(data->queue, data);
+   map_object(HKEY(data->queue), data);

   /* Fence synchronizing access to queries on that queue. */
   VkFenceCreateInfo fence_info = {};
@@ -400,7 +401,7 @@ static void destroy_queue(struct queue_data *data)
 {
   struct device_data *device_data = data->device;
   device_data->vtable.DestroyFence(device_data->device, data->queries_fence, NULL);
-   unmap_object(data->queue);
+   unmap_object(HKEY(data->queue));
   ralloc_free(data);
 }

@@ -449,7 +450,7 @@ static void device_unmap_queues(struct device_data *data)

 static void destroy_device_data(struct device_data *data)
 {
-   unmap_object(data->device);
+   unmap_object(HKEY(data->device));
   ralloc_free(data);
 }

@@ -469,13 +470,13 @@ static struct command_buffer_data *new_command_buffer_data(VkCommandBuffer cmd_b
   data->timestamp_query_pool = timestamp_query_pool;
   data->query_index = query_index;
   list_inithead(&data->link);
-   map_object((void *) data->cmd_buffer, data);
+   map_object(HKEY(data->cmd_buffer), data);
   return data;
 }

 static void destroy_command_buffer_data(struct command_buffer_data *data)
 {
-   unmap_object((void *) data->cmd_buffer);
+   unmap_object(HKEY(data->cmd_buffer));
   list_delinit(&data->link);
   ralloc_free(data);
 }
@@ -489,16 +490,63 @@ static struct swapchain_data *new_swapchain_data(VkSwapchainKHR swapchain,
   data->device = device_data;
   data->swapchain = swapchain;
   data->window_size = ImVec2(instance_data->params.width, instance_data->params.height);
-   map_object((void *) data->swapchain, data);
+   list_inithead(&data->draws);
+   map_object(HKEY(data->swapchain), data);
   return data;
 }

 static void destroy_swapchain_data(struct swapchain_data *data)
 {
-   unmap_object((void *) data->swapchain);
+   unmap_object(HKEY(data->swapchain));
   ralloc_free(data);
 }

+struct overlay_draw *get_overlay_draw(struct swapchain_data *data)
+{
+   struct device_data *device_data = data->device;
+   struct overlay_draw *draw = list_empty(&data->draws) ?
+      NULL : list_first_entry(&data->draws, struct overlay_draw, link);
+
+   VkSemaphoreCreateInfo sem_info = {};
+   sem_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+
+   if (draw && device_data->vtable.GetFenceStatus(device_data->device, draw->fence) == VK_SUCCESS) {
+      list_del(&draw->link);
+      VK_CHECK(device_data->vtable.ResetFences(device_data->device,
+                                               1, &draw->fence));
+      list_addtail(&draw->link, &data->draws);
+      return draw;
+   }
+
+   draw = rzalloc(data, struct overlay_draw);
+
+   VkCommandBufferAllocateInfo cmd_buffer_info = {};
+   cmd_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+   cmd_buffer_info.commandPool = data->command_pool;
+   cmd_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+   cmd_buffer_info.commandBufferCount = 1;
+   VK_CHECK(device_data->vtable.AllocateCommandBuffers(device_data->device,
+                                                       &cmd_buffer_info,
+                                                       &draw->command_buffer));
+   VK_CHECK(device_data->set_device_loader_data(device_data->device,
+                                                draw->command_buffer));
+
+
+   VkFenceCreateInfo fence_info = {};
+   fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+   VK_CHECK(device_data->vtable.CreateFence(device_data->device,
+                                            &fence_info,
+                                            NULL,
+                                            &draw->fence));
+
+   VK_CHECK(device_data->vtable.CreateSemaphore(device_data->device, &sem_info,
+                                                NULL, &draw->semaphore));
+
+   list_addtail(&draw->link, &data->draws);
+
+   return draw;
+}
+
 static const char *param_unit(enum overlay_param_enabled param)
 {
   switch (param) {
@@ -872,20 +920,19 @@ static void CreateOrResizeBuffer(struct device_data *data,
    *buffer_size = new_size;
 }

-static void render_swapchain_display(struct swapchain_data *data,
-                                     const VkSemaphore *wait_semaphores,
-                                     unsigned n_wait_semaphores,
-                                     unsigned image_index)
+static struct overlay_draw *render_swapchain_display(struct swapchain_data *data,
+                                                     const VkSemaphore *wait_semaphores,
+                                                     unsigned n_wait_semaphores,
+                                                     unsigned image_index)
 {
   ImDrawData* draw_data = ImGui::GetDrawData();
   if (draw_data->TotalVtxCount == 0)
-      return;
+      return NULL;

   struct device_data *device_data = data->device;
-   uint32_t idx = data->n_frames % ARRAY_SIZE(data->frame_data);
-   VkCommandBuffer command_buffer = data->frame_data[idx].command_buffer;
+   struct overlay_draw *draw = get_overlay_draw(data);

-   device_data->vtable.ResetCommandBuffer(command_buffer, 0);
+   device_data->vtable.ResetCommandBuffer(draw->command_buffer, 0);

   VkRenderPassBeginInfo render_pass_info = {};
   render_pass_info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
@@ -897,9 +944,9 @@ static void render_swapchain_display(struct swapchain_data *data,
   VkCommandBufferBeginInfo buffer_begin_info = {};
   buffer_begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;

-   device_data->vtable.BeginCommandBuffer(command_buffer, &buffer_begin_info);
+   device_data->vtable.BeginCommandBuffer(draw->command_buffer, &buffer_begin_info);

-   ensure_swapchain_fonts(data, command_buffer);
+   ensure_swapchain_fonts(data, draw->command_buffer);

   /* Bounce the image to display back to color attachment layout for
    * rendering on top of it.
@@ -919,7 +966,7 @@ static void render_swapchain_display(struct swapchain_data *data,
   imb.subresourceRange.layerCount = 1;
   imb.srcQueueFamilyIndex = device_data->graphic_queue->family_index;
   imb.dstQueueFamilyIndex = device_data->graphic_queue->family_index;
-   device_data->vtable.CmdPipelineBarrier(command_buffer,
+   device_data->vtable.CmdPipelineBarrier(draw->command_buffer,
                                          VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT,
                                          VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT,
                                          0,          /* dependency flags */
@@ -927,37 +974,33 @@ static void render_swapchain_display(struct swapchain_data *data,
                                          0, nullptr, /* buffer memory barriers */
                                          1, &imb);   /* image memory barriers */

-   device_data->vtable.CmdBeginRenderPass(command_buffer, &render_pass_info,
+   device_data->vtable.CmdBeginRenderPass(draw->command_buffer, &render_pass_info,
                                          VK_SUBPASS_CONTENTS_INLINE);

   /* Create/Resize vertex & index buffers */
   size_t vertex_size = draw_data->TotalVtxCount * sizeof(ImDrawVert);
   size_t index_size = draw_data->TotalIdxCount * sizeof(ImDrawIdx);
-   if (data->frame_data[idx].vertex_buffer_size < vertex_size) {
+   if (draw->vertex_buffer_size < vertex_size) {
      CreateOrResizeBuffer(device_data,
-                           &data->frame_data[idx].vertex_buffer,
-                           &data->frame_data[idx].vertex_buffer_mem,
-                           &data->frame_data[idx].vertex_buffer_size,
+                           &draw->vertex_buffer,
+                           &draw->vertex_buffer_mem,
+                           &draw->vertex_buffer_size,
                           vertex_size, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
   }
-   if (data->frame_data[idx].index_buffer_size < index_size) {
+   if (draw->index_buffer_size < index_size) {
      CreateOrResizeBuffer(device_data,
-                           &data->frame_data[idx].index_buffer,
-                           &data->frame_data[idx].index_buffer_mem,
-                           &data->frame_data[idx].index_buffer_size,
+                           &draw->index_buffer,
+                           &draw->index_buffer_mem,
+                           &draw->index_buffer_size,
                           index_size, VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
   }

    /* Upload vertex & index data */
-    VkBuffer vertex_buffer = data->frame_data[idx].vertex_buffer;
-    VkDeviceMemory vertex_mem = data->frame_data[idx].vertex_buffer_mem;
-    VkBuffer index_buffer = data->frame_data[idx].index_buffer;
-    VkDeviceMemory index_mem = data->frame_data[idx].index_buffer_mem;
    ImDrawVert* vtx_dst = NULL;
    ImDrawIdx* idx_dst = NULL;
-    VK_CHECK(device_data->vtable.MapMemory(device_data->device, vertex_mem,
+    VK_CHECK(device_data->vtable.MapMemory(device_data->device, draw->vertex_buffer_mem,
                                           0, vertex_size, 0, (void**)(&vtx_dst)));
-    VK_CHECK(device_data->vtable.MapMemory(device_data->device, index_mem,
+    VK_CHECK(device_data->vtable.MapMemory(device_data->device, draw->index_buffer_mem,
                                           0, index_size, 0, (void**)(&idx_dst)));
    for (int n = 0; n < draw_data->CmdListsCount; n++)
        {
@@ -969,26 +1012,26 @@ static void render_swapchain_display(struct swapchain_data *data,
        }
    VkMappedMemoryRange range[2] = {};
    range[0].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
-    range[0].memory = vertex_mem;
+    range[0].memory = draw->vertex_buffer_mem;
    range[0].size = VK_WHOLE_SIZE;
    range[1].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
-    range[1].memory = index_mem;
+    range[1].memory = draw->index_buffer_mem;
    range[1].size = VK_WHOLE_SIZE;
    VK_CHECK(device_data->vtable.FlushMappedMemoryRanges(device_data->device, 2, range));
-    device_data->vtable.UnmapMemory(device_data->device, vertex_mem);
-    device_data->vtable.UnmapMemory(device_data->device, index_mem);
+    device_data->vtable.UnmapMemory(device_data->device, draw->vertex_buffer_mem);
+    device_data->vtable.UnmapMemory(device_data->device, draw->index_buffer_mem);

    /* Bind pipeline and descriptor sets */
-    device_data->vtable.CmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, data->pipeline);
+    device_data->vtable.CmdBindPipeline(draw->command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, data->pipeline);
    VkDescriptorSet desc_set[1] = { data->descriptor_set };
-    device_data->vtable.CmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS,
+    device_data->vtable.CmdBindDescriptorSets(draw->command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS,
                                              data->pipeline_layout, 0, 1, desc_set, 0, NULL);

    /* Bind vertex & index buffers */
-    VkBuffer vertex_buffers[1] = { vertex_buffer };
+    VkBuffer vertex_buffers[1] = { draw->vertex_buffer };
    VkDeviceSize vertex_offset[1] = { 0 };
-    device_data->vtable.CmdBindVertexBuffers(command_buffer, 0, 1, vertex_buffers, vertex_offset);
-    device_data->vtable.CmdBindIndexBuffer(command_buffer, index_buffer, 0, VK_INDEX_TYPE_UINT16);
+    device_data->vtable.CmdBindVertexBuffers(draw->command_buffer, 0, 1, vertex_buffers, vertex_offset);
+    device_data->vtable.CmdBindIndexBuffer(draw->command_buffer, draw->index_buffer, 0, VK_INDEX_TYPE_UINT16);

    /* Setup viewport */
    VkViewport viewport;
@@ -998,7 +1041,7 @@ static void render_swapchain_display(struct swapchain_data *data,
    viewport.height = draw_data->DisplaySize.y;
    viewport.minDepth = 0.0f;
    viewport.maxDepth = 1.0f;
-    device_data->vtable.CmdSetViewport(command_buffer, 0, 1, &viewport);
+    device_data->vtable.CmdSetViewport(draw->command_buffer, 0, 1, &viewport);


    /* Setup scale and translation through push constants :
@@ -1013,10 +1056,10 @@ static void render_swapchain_display(struct swapchain_data *data,
    float translate[2];
    translate[0] = -1.0f - draw_data->DisplayPos.x * scale[0];
    translate[1] = -1.0f - draw_data->DisplayPos.y * scale[1];
-    device_data->vtable.CmdPushConstants(command_buffer, data->pipeline_layout,
+    device_data->vtable.CmdPushConstants(draw->command_buffer, data->pipeline_layout,
                                         VK_SHADER_STAGE_VERTEX_BIT,
                                         sizeof(float) * 0, sizeof(float) * 2, scale);
-    device_data->vtable.CmdPushConstants(command_buffer, data->pipeline_layout,
+    device_data->vtable.CmdPushConstants(draw->command_buffer, data->pipeline_layout,
                                         VK_SHADER_STAGE_VERTEX_BIT,
                                         sizeof(float) * 2, sizeof(float) * 2, translate);

@@ -1037,42 +1080,33 @@ static void render_swapchain_display(struct swapchain_data *data,
            scissor.offset.y = (int32_t)(pcmd->ClipRect.y - display_pos.y) > 0 ? (int32_t)(pcmd->ClipRect.y - display_pos.y) : 0;
            scissor.extent.width = (uint32_t)(pcmd->ClipRect.z - pcmd->ClipRect.x);
            scissor.extent.height = (uint32_t)(pcmd->ClipRect.w - pcmd->ClipRect.y + 1); // FIXME: Why +1 here?
-            device_data->vtable.CmdSetScissor(command_buffer, 0, 1, &scissor);
+            device_data->vtable.CmdSetScissor(draw->command_buffer, 0, 1, &scissor);

            // Draw
-            device_data->vtable.CmdDrawIndexed(command_buffer, pcmd->ElemCount, 1, idx_offset, vtx_offset, 0);
+            device_data->vtable.CmdDrawIndexed(draw->command_buffer, pcmd->ElemCount, 1, idx_offset, vtx_offset, 0);

            idx_offset += pcmd->ElemCount;
        }
        vtx_offset += cmd_list->VtxBuffer.Size;
    }

-   device_data->vtable.CmdEndRenderPass(command_buffer);
-   device_data->vtable.EndCommandBuffer(command_buffer);
-
-   if (data->submission_semaphore) {
-      device_data->vtable.DestroySemaphore(device_data->device,
-                                           data->submission_semaphore,
-                                           NULL);
-   }
-   /* Submission semaphore */
-   VkSemaphoreCreateInfo semaphore_info = {};
-   semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
-   VK_CHECK(device_data->vtable.CreateSemaphore(device_data->device, &semaphore_info,
-                                                NULL, &data->submission_semaphore));
+   device_data->vtable.CmdEndRenderPass(draw->command_buffer);
+   device_data->vtable.EndCommandBuffer(draw->command_buffer);

   VkSubmitInfo submit_info = {};
   VkPipelineStageFlags stage_wait = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
   submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
   submit_info.commandBufferCount = 1;
-   submit_info.pCommandBuffers = &command_buffer;
+   submit_info.pCommandBuffers = &draw->command_buffer;
   submit_info.pWaitDstStageMask = &stage_wait;
   submit_info.waitSemaphoreCount = n_wait_semaphores;
   submit_info.pWaitSemaphores = wait_semaphores;
   submit_info.signalSemaphoreCount = 1;
-   submit_info.pSignalSemaphores = &data->submission_semaphore;
+   submit_info.pSignalSemaphores = &draw->semaphore;

-   device_data->vtable.QueueSubmit(device_data->graphic_queue->queue, 1, &submit_info, VK_NULL_HANDLE);
+   device_data->vtable.QueueSubmit(device_data->graphic_queue->queue, 1, &submit_info, draw->fence);
+
+   return draw;
 }

 static const uint32_t overlay_vert_spv[] = {
@@ -1437,7 +1471,7 @@ static void setup_swapchain_data(struct swapchain_data *data,
                                                     NULL, &data->framebuffers[i]));
   }

-   /* Command buffer */
+   /* Command buffer pool */
   VkCommandPoolCreateInfo cmd_buffer_pool_info = {};
   cmd_buffer_pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
   cmd_buffer_pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
@@ -1445,29 +1479,21 @@ static void setup_swapchain_data(struct swapchain_data *data,
   VK_CHECK(device_data->vtable.CreateCommandPool(device_data->device,
                                                  &cmd_buffer_pool_info,
                                                  NULL, &data->command_pool));
-
-   VkCommandBuffer cmd_bufs[ARRAY_SIZE(data->frame_data)];
-
-   VkCommandBufferAllocateInfo cmd_buffer_info = {};
-   cmd_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
-   cmd_buffer_info.commandPool = data->command_pool;
-   cmd_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
-   cmd_buffer_info.commandBufferCount = 2;
-   VK_CHECK(device_data->vtable.AllocateCommandBuffers(device_data->device,
-                                                       &cmd_buffer_info,
-                                                       cmd_bufs));
-   for (uint32_t i = 0; i < ARRAY_SIZE(data->frame_data); i++) {
-      VK_CHECK(device_data->set_device_loader_data(device_data->device,
-                                                   cmd_bufs[i]));
-
-      data->frame_data[i].command_buffer = cmd_bufs[i];
-   }
 }

 static void shutdown_swapchain_data(struct swapchain_data *data)
 {
   struct device_data *device_data = data->device;

+   list_for_each_entry_safe(struct overlay_draw, draw, &data->draws, link) {
+      device_data->vtable.DestroySemaphore(device_data->device, draw->semaphore, NULL);
+      device_data->vtable.DestroyFence(device_data->device, draw->fence, NULL);
+      device_data->vtable.DestroyBuffer(device_data->device, draw->vertex_buffer, NULL);
+      device_data->vtable.DestroyBuffer(device_data->device, draw->index_buffer, NULL);
+      device_data->vtable.FreeMemory(device_data->device, draw->vertex_buffer_mem, NULL);
+      device_data->vtable.FreeMemory(device_data->device, draw->index_buffer_mem, NULL);
+   }
+
   for (uint32_t i = 0; i < data->n_images; i++) {
      device_data->vtable.DestroyImageView(device_data->device, data->image_views[i], NULL);
      device_data->vtable.DestroyFramebuffer(device_data->device, data->framebuffers[i], NULL);
@@ -1475,24 +1501,8 @@ static void shutdown_swapchain_data(struct swapchain_data *data)

   device_data->vtable.DestroyRenderPass(device_data->device, data->render_pass, NULL);

-   for (uint32_t i = 0; i < ARRAY_SIZE(data->frame_data); i++) {
-      device_data->vtable.FreeCommandBuffers(device_data->device,
-                                             data->command_pool,
-                                             1, &data->frame_data[i].command_buffer);
-      if (data->frame_data[i].vertex_buffer)
-         device_data->vtable.DestroyBuffer(device_data->device, data->frame_data[i].vertex_buffer, NULL);
-      if (data->frame_data[i].index_buffer)
-         device_data->vtable.DestroyBuffer(device_data->device, data->frame_data[i].index_buffer, NULL);
-      if (data->frame_data[i].vertex_buffer_mem)
-         device_data->vtable.FreeMemory(device_data->device, data->frame_data[i].vertex_buffer_mem, NULL);
-      if (data->frame_data[i].index_buffer_mem)
-         device_data->vtable.FreeMemory(device_data->device, data->frame_data[i].index_buffer_mem, NULL);
-   }
   device_data->vtable.DestroyCommandPool(device_data->device, data->command_pool, NULL);

-   if (data->submission_semaphore)
-      device_data->vtable.DestroySemaphore(device_data->device, data->submission_semaphore, NULL);
-
   device_data->vtable.DestroyPipeline(device_data->device, data->pipeline, NULL);
   device_data->vtable.DestroyPipelineLayout(device_data->device, data->pipeline_layout, NULL);

@@ -1512,19 +1522,24 @@ static void shutdown_swapchain_data(struct swapchain_data *data)
   ImGui::DestroyContext(data->imgui_context);
 }

-static void before_present(struct swapchain_data *swapchain_data,
-                           const VkSemaphore *wait_semaphores,
-                           unsigned n_wait_semaphores,
-                           unsigned imageIndex)
+static struct overlay_draw *before_present(struct swapchain_data *swapchain_data,
+                                           const VkSemaphore *wait_semaphores,
+                                           unsigned n_wait_semaphores,
+                                           unsigned imageIndex)
 {
   struct instance_data *instance_data = swapchain_data->device->instance;
+   struct overlay_draw *draw = NULL;

   snapshot_swapchain_frame(swapchain_data);

   if (!instance_data->params.no_display && swapchain_data->n_frames > 0) {
      compute_swapchain_display(swapchain_data);
-      render_swapchain_display(swapchain_data, wait_semaphores, n_wait_semaphores, imageIndex);
+      draw = render_swapchain_display(swapchain_data,
+                                      wait_semaphores, n_wait_semaphores,
+                                      imageIndex);
   }
+
+   return draw;
 }

 static VkResult overlay_CreateSwapchainKHR(
@@ -1642,16 +1657,19 @@ static VkResult overlay_QueuePresentKHR(
         present_info.swapchainCount = 1;
         present_info.pSwapchains = &swapchain;

-         before_present(swapchain_data,
-                        pPresentInfo->pWaitSemaphores,
-                        pPresentInfo->waitSemaphoreCount,
-                        pPresentInfo->pImageIndices[i]);
+         uint32_t image_index = pPresentInfo->pImageIndices[i];
+
+         struct overlay_draw *draw = before_present(swapchain_data,
+                                                    pPresentInfo->pWaitSemaphores,
+                                                    pPresentInfo->waitSemaphoreCount,
+                                                    image_index);
+
         /* Because the submission of the overlay draw waits on the semaphores
          * handed for present, we don't need to have this present operation
          * wait on them as well, we can just wait on the overlay submission
          * semaphore.
          */
-         present_info.pWaitSemaphores = &swapchain_data->submission_semaphore;
+         present_info.pWaitSemaphores = &draw->semaphore;
         present_info.waitSemaphoreCount = 1;

         VkResult chain_result = queue_data->device->vtable.QueuePresentKHR(queue, &present_info);
@@ -2011,9 +2029,9 @@ static VkResult overlay_AllocateCommandBuffers(
   }

   if (pipeline_query_pool)
-      map_object(pipeline_query_pool, (void *)(uintptr_t) pAllocateInfo->commandBufferCount);
+      map_object(HKEY(pipeline_query_pool), (void *)(uintptr_t) pAllocateInfo->commandBufferCount);
   if (timestamp_query_pool)
-      map_object(timestamp_query_pool, (void *)(uintptr_t) pAllocateInfo->commandBufferCount);
+      map_object(HKEY(timestamp_query_pool), (void *)(uintptr_t) pAllocateInfo->commandBufferCount);

   return result;
 }
@@ -2028,21 +2046,21 @@ static void overlay_FreeCommandBuffers(
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      struct command_buffer_data *cmd_buffer_data =
         FIND_CMD_BUFFER_DATA(pCommandBuffers[i]);
-      uint64_t count = (uintptr_t)find_object_data((void *)cmd_buffer_data->pipeline_query_pool);
+      uint64_t count = (uintptr_t)find_object_data(HKEY(cmd_buffer_data->pipeline_query_pool));
      if (count == 1) {
-         unmap_object(cmd_buffer_data->pipeline_query_pool);
+         unmap_object(HKEY(cmd_buffer_data->pipeline_query_pool));
         device_data->vtable.DestroyQueryPool(device_data->device,
                                              cmd_buffer_data->pipeline_query_pool, NULL);
      } else if (count != 0) {
-         map_object(cmd_buffer_data->pipeline_query_pool, (void *)(uintptr_t)(count - 1));
+         map_object(HKEY(cmd_buffer_data->pipeline_query_pool), (void *)(uintptr_t)(count - 1));
      }
-      count = (uintptr_t)find_object_data((void *)cmd_buffer_data->timestamp_query_pool);
+      count = (uintptr_t)find_object_data(HKEY(cmd_buffer_data->timestamp_query_pool));
      if (count == 1) {
-         unmap_object(cmd_buffer_data->timestamp_query_pool);
+         unmap_object(HKEY(cmd_buffer_data->timestamp_query_pool));
         device_data->vtable.DestroyQueryPool(device_data->device,
                                              cmd_buffer_data->timestamp_query_pool, NULL);
      } else if (count != 0) {
-         map_object(cmd_buffer_data->timestamp_query_pool, (void *)(uintptr_t)(count - 1));
+         map_object(HKEY(cmd_buffer_data->timestamp_query_pool), (void *)(uintptr_t)(count - 1));
      }
      destroy_command_buffer_data(cmd_buffer_data);
   }