Compare commits

...

64 Commits

Chad Versace
3302281f00 CHROMIUM: i965: Implement EGL_KHR_mutable_render_buffer
Tested with a low-latency handwriting application on Android Nougat on
the Chrome OS Pixelbook (codename Eve) with Kabylake.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: Ia816fa6b0a1158f81e5b63477451bf337c2001aa
2018-05-01 03:16:01 -07:00
Chad Versace
54f07a7ebc CHROMIUM: egl/android: Implement EGL_KHR_mutable_render_buffer
Specifically, implement the extension DRI_MutableRenderBufferLoader.
However, the loader enables EGL_KHR_mutable_render_buffer only if the
DRI driver implements its half of the extension,
DRI_MutableRenderBufferDriver.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: I7fe68a5a674d1707b1e7251d900b3affd5dd7660
2018-05-01 03:16:00 -07:00
Chad Versace
bf85c6b160 CHROMIUM: egl/main: Add bits for EGL_KHR_mutable_render_buffer
A follow-up patch enables EGL_KHR_mutable_render_buffer for Android.
This patch is separate from the Android patch because I think it's
easier to review the platform-independent bits separately.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: I07470f2862796611b141f69f47f935b97b0e04a1
2018-05-01 03:15:58 -07:00
Chad Versace
272fd36b24 CHROMIUM: dri: Add param driCreateConfigs(mutable_render_buffer)
If set, then the config will have __DRI_ATTRIB_MUTABLE_RENDER_BUFFER,
which translates to EGL_MUTABLE_RENDER_BUFFER_BIT_KHR.

Not used yet.
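
A rough sketch of the intended translation (variable names here are
illustrative; the real plumbing lives in the EGL dri2 config code):

  if (dri_attrib & __DRI_ATTRIB_MUTABLE_RENDER_BUFFER)
     surface_type |= EGL_MUTABLE_RENDER_BUFFER_BIT_KHR;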

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: Icdf35794f3e9adf31e1f85740b87ce155efe1491
2018-05-01 03:15:56 -07:00
Chad Versace
2aaeab9fdd CHROMIUM: dri: Define DRI_MutableRenderBuffer extensions
Define extensions DRI_MutableRenderBufferDriver and
DRI_MutableRenderBufferLoader. These are the two halves for
EGL_KHR_mutable_render_buffer.

Outside the DRI code there is one additional change.  Add
gl_config::mutableRenderBuffer to match
__DRI_ATTRIB_MUTABLE_RENDER_BUFFER. Neither is used yet.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: I4ca03d81e4557380b19c44d8d799a7cc9365d928
2018-05-01 03:15:54 -07:00
Chad Versace
0d7eae5847 CHROMIUM: egl/dri2: In dri2_make_current, return early on failure
This pulls an 'else' block into the function's main body, making the
code easier to follow.

Without this change, the upcoming EGL_KHR_mutable_render_buffer patch
transforms dri2_make_current() into spaghetti.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: I26be2b7a8e78a162dcd867a44f62d6f48b9a8e4d
2018-05-01 03:15:42 -07:00
Chad Versace
3e8d93e1ff CHROMIUM: egl: Drop _EGLContext::WindowRenderBuffer
Replace it with two fields in _EGLSurface, RequestedRenderBuffer and
ActiveRenderBuffer. (_EGLSurface::RequestedRenderBuffer replaces
_EGLSurface::RenderBuffer).

There exist *two* queryable EGL_RENDER_BUFFER states in EGL:
eglQuerySurface(EGL_RENDER_BUFFER) and
eglQueryContext(EGL_RENDER_BUFFER). _EGLContext::WindowRenderBuffer was
related to eglQueryContext but not eglQuerySurface. Post-patch,
RequestedRenderBuffer is related to eglQuerySurface and
ActiveRenderBuffer is related to eglQueryContext.

The implementation of eglQuerySurface(EGL_RENDER_BUFFER) contained
abstruse logic which required comprehending the specification
complexities of how the two EGL_RENDER_BUFFER states interact. Sometimes
it returned _EGLContext::WindowRenderBuffer, sometimes
_EGLSurface::RenderBuffer. Why? The function tried to encode the actual
logic in the EGL spec. When did the function return which variable? Go
study the EGL spec, hope you understand it, then hope Mesa mutated the
EGL_RENDER_BUFFER state in all the correct places. Have fun.

I got a headache from the mental gymnastics.

To simplify eglQuerySurface(EGL_RENDER_BUFFER), and to improve
confidence in its correctness, flatten its indirect logic. For pixmap
and pbuffer surfaces, return a hard-coded literal value, as the spec
suggests. For window surfaces, simply return ActiveRenderBuffer.
Nothing difficult here.
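
A minimal sketch of the flattened logic, assuming illustrative field
names on _EGLSurface:

  switch (surf->Type) {
  case EGL_PBUFFER_BIT: value = EGL_BACK_BUFFER;          break;
  case EGL_PIXMAP_BIT:  value = EGL_SINGLE_BUFFER;        break;
  case EGL_WINDOW_BIT:  value = surf->ActiveRenderBuffer; break;
  }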

These changes eliminate potentially very fragile code in the upcoming
EGL_KHR_mutable_render_buffer implementation.

BUG=b:77899911
TEST=No android-cts-7.1 regressions on Eve.

Change-Id: Ic5f2ab1952f26a87081bc4f78bc7fa96734c8f2a
2018-05-01 03:15:38 -07:00
Dave Airlie
b239996965 UPSTREAM: virgl: also remove dimension on indirect.
This fixes some dEQP tests that generated bad shaders.

Fixes: b6f6ead19 (virgl: drop const dimensions on first block.)
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
Tested-by: Gurchetan Singh <gurchetansingh@chromium.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 49c61d8b84)

BUG=b:78132369
TEST=play war robots under arc++ in Chrome OS for 1 hour.
Change-Id: I7d8de9e2a8289e9119f839b1d1aa99012bdbd6e8
Reviewed-on: https://chromium-review.googlesource.com/1013329
Commit-Ready: Lepton Wu <lepton@chromium.org>
Tested-by: Lepton Wu <lepton@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2018-04-17 15:47:09 -07:00
Lepton Wu
b16fbdb135 CHROMIUM: platform_android: use general fallback.
Instead of handling the software fallback inside platform_android, just
let the EGL framework handle it. With this, we can still fall back
to the software driver when the hardware driver doesn't actually work.

BUG=b:77302150
TEST=manual - make sure betty still boots without the virgl driver.

Change-Id: I5d514f67c9dc6f68661e03fd9fc9546acd7277bd
Reviewed-on: https://chromium-review.googlesource.com/1004006
Commit-Ready: Lepton Wu <lepton@chromium.org>
Tested-by: Lepton Wu <lepton@chromium.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
2018-04-11 18:09:07 -07:00
Harish Krupo
c9bda01108 UPSTREAM: egl/android: Provide an option for the backend to expose KHR_image
From android cts 8.0_r4, a new test case checks if all the required egl
extensions are exposed. In the current implementation we expose KHR_image
if KHR_image_base and KHR_image_pixmap are supported, but the KHR_image
spec does not mandate the existence of both extensions.
This patch preserves the current check and also provides the backend
with an option to expose the KHR_image extension.
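
In sketch form (the exact code in the patch may differ), the resulting
check is roughly the old base+pixmap rule plus a backend opt-in:

  disp->Extensions.KHR_image |= disp->Extensions.KHR_image_base &&
                                disp->Extensions.KHR_image_pixmap;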

Test: run cts -m CtsOpenGLTestCases -t \
android.opengl.cts.OpenGlEsVersionTest#testRequiredEglExtensions

Signed-off-by: Harish Krupo <harish.krupo.kps@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 96fc5fbf23)
Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>

BUG=b:77786960
TEST=android.opengl.cts.OpenGlEsVersionTest#testRequiredEglExtensions
     passes on CTS 8.1

Change-Id: I7c057ea4aa00f99885259ca0a97cac4554551c80
Reviewed-on: https://chromium-review.googlesource.com/1002796
Commit-Ready: Nicolas Boichat <drinkcat@chromium.org>
Tested-by: Nicolas Boichat <drinkcat@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
2018-04-11 04:38:57 -07:00
Eric Engestrom
68ceff0712 UPSTREAM: egl: let each platform decided how to handle LIBGL_ALWAYS_SOFTWARE
My refactor in 47273d7312 missed this early return; because
of it, setting UseFallback one layer above actually prevented the
software path from being used.

Remove this early return and let each platform's dri2_initialize_*()
decide what it can do with the LIBGL_ALWAYS_SOFTWARE restriction.

platform_{surfaceless,x11,wayland} were already handling it themselves.
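
Roughly, each dri2_initialize_<platform>() can now decide on its own,
e.g. (sketch only):

  if (getenv("LIBGL_ALWAYS_SOFTWARE") != NULL) {
     /* take the swrast/kms_swrast path, or fail if the platform has none */
  }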

Fixes: 47273d7312 "egl: set UseFallback if LIBGL_ALWAYS_SOFTWARE is set"
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Reported-by: Brendan King <Brendan.King@imgtec.com>
(cherry picked from commit 2f421651ac)

BUG=b:77302150
TEST=manual - make sure betty still boots without virgl driver.
Change-Id: I5e2ddfbd7a72bf04d83cac8f08fafbe81a77e66c
Reviewed-on: https://chromium-review.googlesource.com/1004005
Commit-Ready: Lepton Wu <lepton@chromium.org>
Tested-by: Lepton Wu <lepton@chromium.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
2018-04-10 19:13:13 -07:00
Nicolas Boichat
d7681cc943 FROMLIST: configure.ac: Fix -latomic test
When compiling with LLVM 6.0, the test fails to detect that
-latomic is actually required, as the atomic call is inlined.

In the code itself (src/util/disk_cache.c), we see this pattern:
p_atomic_add(cache->size, - (uint64_t)size);
where cache->size is an uint64_t *, and results in the following
link time error without -latomic:
src/util/disk_cache.c:628: error: undefined reference to '__atomic_fetch_add_8'

Fix the configure test to replicate this pattern, which then
correctly realizes the need for -latomic.
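
A sketch of a link-time test program replicating that pattern (the actual
AC_LINK_IFELSE body in the patch may differ):

  #include <stdint.h>
  uint64_t v;
  int main(void)
  {
     uint64_t *p = &v;
     /* 8-byte atomic add through a pointer, as in disk_cache.c */
     return (int)__atomic_fetch_add(p, (uint64_t)1, __ATOMIC_SEQ_CST);
  }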

BUG=b:76397110
TEST=cros_workon_make --board=caroline-arcnext --reconf arc-mesa
(am from https://patchwork.freedesktop.org/patch/213657/, dropped
meson.build change)

Change-Id: I9cdad5fd32879a3577d6ef42e278960a934b23fb
Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/985676
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
2018-03-29 07:39:25 -07:00
Bas Nieuwenhuizen
a0f76c6a3b UPSTREAM: radv: Signal fence correctly after sparse binding.
It did not signal syncobjs in the fence, and also signalled too early
if there was work on the queue already, as we have to wait till that
work is done.

Fixes: d27aaae4d2 "radv: Add external fence support."
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 0347a83bbf)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I887768385833112effa1e8d414bf171640bb3564
Reviewed-on: https://chromium-review.googlesource.com/913504
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-14 09:42:21 -07:00
Bas Nieuwenhuizen
7c5b8d163f UPSTREAM: radv: Implement VK_ANDROID_native_buffer.
Passes
  dEQP-VK.api.smoke.*
  dEQP-VK.wsi.android.*

with android-cts-7.1_r12 .

Unlike the initial anv implementation this does
use syncobjs instead of waiting on the CPU.

This is missing meson build coverage for now.

One possible todo is that linux 4.15 now has a
syscall that allows us to export an amdgpu fence to
a sync_file, which allows us not to force all
fences and semaphores to use syncobjs. However,
I had trouble with my kernel crashing regularly
with NULL pointers, and I'm not sure how beneficial
it is in the first place given that intel uses
syncobjs for all fences if available.

Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit b1444c9ccb)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I002abe9e44ceba89c15f503d8c9fa3419aa2803e
Reviewed-on: https://chromium-review.googlesource.com/913503
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-14 07:03:57 -07:00
Lepton Wu
667a7dfd55 CHROMIUM: virgl: Fix crash on destruction.
We need to set destruction callbacks for surface and sampler_view objects
after https://chromium-review.googlesource.com/558120
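
In sketch form, assuming the usual gallium context hooks (the handler
names below are illustrative):

  vctx->base.surface_destroy      = virgl_surface_destroy;
  vctx->base.sampler_view_destroy = virgl_sampler_view_destroy;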

BUG=b:70179880
TEST=Run android apps on bettyvirgl (rendering only works sometimes.)

Change-Id: I85cedb53a3dd86caba1d8cf890f63a0a5dfce4bd
Reviewed-on: https://chromium-review.googlesource.com/959231
Commit-Ready: Lepton Wu <lepton@chromium.org>
Tested-by: Lepton Wu <lepton@chromium.org>
Reviewed-by: Joe Kniss <djmk@google.com>
2018-03-13 19:00:16 -07:00
Bas Nieuwenhuizen
d9619b2fba UPSTREAM: radv: Add create image flag to not use DCC/CMASK.
If we import an image, we might not have space in the
buffer for CMASK, even though it is compatible.

Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit a3e241ed07)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I4a5d6634e89ea8ec4ac200405e5c84d6f30dcb2f
Reviewed-on: https://chromium-review.googlesource.com/913502
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-13 07:37:06 -07:00
Bas Nieuwenhuizen
26796ca5ca UPSTREAM: radv: Generate VK_ANDROID_native_buffer.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit e344cd8178)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: Ia0957ef39597416bf3750e8138047c0c748a1d1b
Reviewed-on: https://chromium-review.googlesource.com/913501
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-08 11:33:15 -08:00
Bas Nieuwenhuizen
f45c9bb5d6 UPSTREAM: radv: reset semaphores & fences on sync_file export.
Per spec:

"Additionally, exporting a fence payload to a handle with copy transference has the same side effects
on the source fences payload as executing a fence reset operation. If the fence was using a
temporarily imported payload, the fences prior permanent payload will be restored."

And similar for semaphores:

"Additionally, exporting a semaphore payload to a handle with copy transference has the same side
effects on the source semaphores payload as executing a semaphore wait operation. If the
semaphore was using a temporarily imported payload, the semaphores prior permanent payload
will be restored."
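
A sketch of what that means in libdrm terms (field and variable names
are illustrative):

  #include <xf86drm.h>

  int fd = -1;
  uint32_t handle = fence->syncobj;
  if (drmSyncobjExportSyncFile(device_fd, handle, &fd) == 0)
     drmSyncobjReset(device_fd, &handle, 1);   /* reset the source payload */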

Fixes: 42bc25a79c "radv: Advertise sync fd import and export."
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit b9f4c615f8)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I5c79d6b69bb76b8f97018a4726d10f6b0d740350
Reviewed-on: https://chromium-review.googlesource.com/913500
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-08 08:48:25 -08:00
Marek Olšák
9b34b2cee4 UPSTREAM: ac: rename has_syncobj_wait -> has_syncobj_wait_for_submit
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 4f19cc82f9)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I1b78fb53ac25131117c6ac2bf78f9f4f964eed3d
Reviewed-on: https://chromium-review.googlesource.com/913499
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-08 06:32:34 -08:00
Bas Nieuwenhuizen
96f37fa7e0 UPSTREAM: radv: Advertise sync fd import and export.
Passes dEQP-VK.*.sync_fd.*

Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 42bc25a79c)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: Icf0b6516fb015a2639bff9fd29050896f04f8980
Reviewed-on: https://chromium-review.googlesource.com/913498
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-07 10:50:31 -08:00
Bas Nieuwenhuizen
730d7edbd7 UPSTREAM: radv: Implement sync file import/export for fences & semaphores.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 52b3f50df8)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I6ed541af269ceaefad15565f52b95c2e33f73a0d
Reviewed-on: https://chromium-review.googlesource.com/913497
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-07 05:58:59 -08:00
Bas Nieuwenhuizen
64278fcd0c UPSTREAM: radv/amdgpu: wrap sync fd import/export.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit b98bbdf490)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I0e84a9df34ab22e6dd54b894e245c8df3c254c3f
Reviewed-on: https://chromium-review.googlesource.com/913496
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-06 17:22:35 -08:00
Bas Nieuwenhuizen
c8541b9630 UPSTREAM: radv: Add external fence support.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit d27aaae4d2)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I77a8313fc5728c0fec30f2e61695b0e3e935a3e9
Reviewed-on: https://chromium-review.googlesource.com/913495
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-06 09:59:23 -08:00
Bas Nieuwenhuizen
7f3baaf5c7 UPSTREAM: radv: Implement VK_KHR_external_fence_fd.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 6abfa37879)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: Ie185825c75dc0b45dba4b17b7edbcce5309c3dfc
Reviewed-on: https://chromium-review.googlesource.com/913494
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-06 07:30:39 -08:00
Bas Nieuwenhuizen
5c32cb2c08 UPSTREAM: radv: Implement fences based on syncobjs.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 969421b7da)

Added WSI code for this, as the upstream WSI got a rework.

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: I0f96630935b61f9d927ea7e1d1ea0e6c71d56796
Reviewed-on: https://chromium-review.googlesource.com/913493
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-03-05 18:34:37 -08:00
Bas Nieuwenhuizen
fd1aa710bc UPSTREAM: amd/common: Add detection of the syncobj wait/signal/reset ioctls.
First amdgpu bump after inclusion was 20 (which was done for local BOs).

Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit b308bb8773)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: If2cede07cd2779d1d526aecee0b4c0f1d5bb5a4a
Reviewed-on: https://chromium-review.googlesource.com/913492
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-02-23 14:16:03 -08:00
Bas Nieuwenhuizen
7d019c72d8 UPSTREAM: radv: Add syncobj signal/reset/wait to winsys.
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 1c3cda7d27)

BUG=b:73102056
TEST=run nougat-mr1-cts-dev deqp vulkan tests.

Change-Id: Ieb018369d1bf588a27b4912475c05230c557b815
Reviewed-on: https://chromium-review.googlesource.com/913491
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2018-02-23 07:23:47 -08:00
Jason Ekstrand
6613048d9a UPSTREAM: i965: Call prepare_external after implicit window-system MSAA resolves
This fixes some rendering corruption in a couple of Android apps that
use window-system MSAA.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104741
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
(cherry picked from commit 2f7205be47)

Fix for Telegram and KineMaster graphic corruption

BUG=b:71872728
TEST=Telegram and KineMaster work without corruption

Change-Id: If8c489abe2d26a0c639dfe6d5f10f8fd4c3719c4
Signed-off-by: Dmytro Chystiakov <dmytro.chystiakov@intel.com>
Reviewed-on: https://chromium-review.googlesource.com/915190
Reviewed-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
2018-02-16 21:41:37 -08:00
Lepton Wu
037026e90f FROMLIST: gallium/winsys/kms: Add support for multi-planes
Add a new struct kms_sw_plane which delegates a plane and use it
in place of sw_displaytarget. Multiple planes share the same underlying
kms_sw_displaytarget. For map requests, we only hold two pointers (one
for the ro map and one for the rw map) and return different pointers
with an offset.
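
A sketch of the relationship described above (field names are
illustrative, not the exact ones from the patch):

  struct kms_sw_displaytarget {   /* shared backing allocation */
     uint32_t handle;
     unsigned size;
     void *ro_mapped;             /* the single cached read-only mapping */
     void *mapped;                /* the single cached read-write mapping */
  };

  struct kms_sw_plane {           /* what the rest of the winsys now sees */
     int width, height, stride;
     int offset;                  /* byte offset of this plane in the target */
     struct kms_sw_displaytarget *dt;
  };

  /* map() then returns (char *)dt->mapped + plane->offset (or ro_mapped) */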

Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-December/180761.html
(am from https://patchwork.freedesktop.org/patch/195118/)

TEST=play video with youtube android app inside emulator.
BUG=b:62836711

Signed-off-by: Lepton Wu <lepton@chromium.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I0863f522976cc8863d6e95492d9346df35c066ec
Reviewed-on: https://chromium-review.googlesource.com/843934
2018-02-02 23:53:32 -08:00
Bas Nieuwenhuizen
ec23d1a68a UPSTREAM: radeonsi: Export signalled sync file instead of -1.
-1 is considered an error for EGL_ANDROID_native_fence_sync, so
we need to actually create a sync file.

Fixes: f536f45250 "radeonsi: implement sync_file import/export"
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 5a3404d443)

BUG=b:72449616
TESTED=Try play store in ARC with 4.14 kernel on Kahlee.

Change-Id: Ib053b640e70a0fe529e5cea84fd4144f93c8c588
Reviewed-on: https://chromium-review.googlesource.com/886703
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Benjamin Gordon <bmgordon@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Benjamin Gordon <bmgordon@chromium.org>
2018-01-31 03:36:29 -08:00
Francisco Jerez
68a7077012 FROMLIST: intel/fs: Optimize and simplify the copy propagation dataflow logic.
Previously the dataflow propagation algorithm would calculate the ACP
live-in and -out sets in a two-pass fixed-point algorithm.  The first
pass would update the live-out sets of all basic blocks of the program
based on their live-in sets, while the second pass would update the
live-in sets based on the live-out sets.  This is incredibly
inefficient in the typical case where the CFG of the program is
approximately acyclic, because it can take up to 2*n passes for an ACP
entry introduced at the top of the program to reach the bottom (where
n is the number of basic blocks in the program), until which point the
algorithm won't be able to reach a fixed point.

The same effect can be achieved in a single pass by computing the
live-in and -out sets in lock-step, because that makes sure that
processing of any basic block will pick up the updated live-out sets
of the lexically preceding blocks.  This gives the dataflow
propagation algorithm effectively O(n) run-time instead of O(n^2) in
the acyclic case.
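
A toy C sketch of that lock-step update, using one 64-bit word per block
in place of Mesa's bitsets (this is not the actual fs_copy_prop_dataflow
code):

  #include <stdbool.h>
  #include <stdint.h>

  #define MAX_PREDS 4

  struct block {
     uint64_t copy;        /* ACP entries generated in this block */
     uint64_t kill;        /* ACP entries invalidated in this block */
     uint64_t livein, liveout;
     int num_preds;
     int pred[MAX_PREDS];  /* indices of predecessor blocks */
  };

  static void propagate(struct block *blk, int n)
  {
     bool progress;
     do {
        progress = false;
        for (int b = 0; b < n; b++) {
           /* live-in = intersection of the (already updated) live-out
            * sets of all predecessors */
           uint64_t in = blk[b].num_preds ? ~(uint64_t)0 : 0;
           for (int i = 0; i < blk[b].num_preds; i++)
              in &= blk[blk[b].pred[i]].liveout;

           /* live-out = entries generated here plus live-in entries
            * that survive this block */
           uint64_t out = blk[b].copy | (in & ~blk[b].kill);

           if (in != blk[b].livein || out != blk[b].liveout) {
              blk[b].livein = in;
              blk[b].liveout = out;
              progress = true;
           }
        }
     } while (progress);   /* converges in about one pass for acyclic CFGs */
  }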

The time spent in dataflow propagation is reduced by 30x in the
GLES31.functional.ssbo.layout.random.all_shared_buffer.5 dEQP
test-case on my CHV system (the improvement is likely to be of the
same order of magnitude on other platforms).  This more than reverses
an apparent run-time regression in this test-case from my previous
copy-propagation undefined-value handling patch, which was ultimately
caused by the additional work introduced in that commit to account for
undefined values being multiplied by a huge quadratic factor.

According to Chad this test was failing on CHV due to a 30s time-out
imposed by the Android CTS (this was the case regardless of my
undefined-value handling patch, even though my patch substantially
exacerbated the issue).  On my CHV system this patch reduces the
overall run-time of the test by approximately 12x, getting us to
around 13s, well below the time-out.

v2: Initialize live-out set to the universal set to avoid rather
    pessimistic dataflow estimation in shaders with cycles (Addresses
    performance regression reported by Eero in GpuTest Piano).
    Performance numbers given above still apply.  No shader-db changes
    with respect to master.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104271
Reported-by: Chad Versace <chadversary@chromium.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-December/180489.html
(am from https://patchwork.freedesktop.org/patch/194420/)

BUG=b:67394445
TEST=No regressions in Android CTS, GLES tests.
  Fixes timeouts in dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.5
  on Braswell boards.

Change-Id: I0d666c23693246b8d4fe8988f228f8c4ed7425f6
Reviewed-on: https://chromium-review.googlesource.com/862007
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Tested-by: Ilja H. Friedel <ihf@chromium.org>
2018-01-18 00:00:07 +00:00
Francisco Jerez
3519cdfcfa UPSTREAM: intel/cfg: Represent divergent control flow paths caused by non-uniform loop execution.
This addresses a long-standing back-end compiler bug that could lead
to cross-channel data corruption in loops executed non-uniformly.  In
some cases live variables extending through a loop divergence point
(e.g. a non-uniform break) into a convergence point (e.g. the end of
the loop) wouldn't be considered live along all physical control flow
paths the SIMD thread could possibly have taken in between due to some
channels remaining in the loop for additional iterations.

This patch fixes the problem by extending the CFG with physical edges
that don't exist in the idealized non-vectorized program, but
represent valid control flow paths the SIMD EU may take due to the
divergence of logical threads.  This makes sense because the i965 IR
is explicitly SIMD, and it's not uncommon for instructions to have an
influence on neighboring channels (e.g. a force_writemask_all header
setup), so the behavior of the SIMD thread as a whole needs to be
considered.

No changes in shader-db.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 4d1959e693)

This patch is a prerequisite for 4cbe48f5 "intel/fs: Optimize and
simplify the copy propagation dataflow logic".

BUG=b:67394445
TEST=No regressions in Android CTS, GLES tests.

Change-Id: I949f6f4e0127fec93d890e7669f870872f097a58
Reviewed-on: https://chromium-review.googlesource.com/862006
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Tested-by: Ilja H. Friedel <ihf@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
2018-01-17 23:59:39 +00:00
Francisco Jerez
a058539d21 UPSTREAM: intel/fs: Don't let undefined values prevent copy propagation.
This makes the dataflow propagation logic of the copy propagation pass
more intelligent in cases where the destination of a copy is known to
be undefined for some incoming CFG edges, building upon the
definedness information provided by the last patch.  Helps a few
programs, and avoids a handful shader-db regressions from the next
patch.

shader-db results on ILK:

  total instructions in shared programs: 6541547 -> 6541523 (-0.00%)
  instructions in affected programs: 360 -> 336 (-6.67%)
  helped: 8
  HURT: 0

  LOST:   0
  GAINED: 10

shader-db results on BDW:

  total instructions in shared programs: 8174323 -> 8173882 (-0.01%)
  instructions in affected programs: 7730 -> 7289 (-5.71%)
  helped: 5
  HURT: 2

  LOST:   0
  GAINED: 4

shader-db results on SKL:

  total instructions in shared programs: 8185669 -> 8184598 (-0.01%)
  instructions in affected programs: 10364 -> 9293 (-10.33%)
  helped: 5
  HURT: 2

  LOST:   0
  GAINED: 2

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 9355116bda)

This patch is a prerequisite for 4cbe48f5 "intel/fs: Optimize and
simplify the copy propagation dataflow logic".

BUG=b:67394445
TEST=No regressions in Android CTS, GLES tests.

Change-Id: I8719e67ac14d3db8a7d6989d127ca4222cbdbfe4
Reviewed-on: https://chromium-review.googlesource.com/862005
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Tested-by: Ilja H. Friedel <ihf@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
2018-01-17 23:59:36 +00:00
Francisco Jerez
6efb3d854f UPSTREAM: intel/fs: Restrict live intervals to the subset possibly reachable from any definition.
Currently the liveness analysis pass would extend a live interval up
to the top of the program when no unconditional and complete
definition of the variable is found that dominates all of its uses.

This can lead to a serious performance problem in shaders containing
many partial writes, like scalar arithmetic, FP64 and soon FP16
operations.  The number of oversize live intervals in such workloads
can cause the compilation time of the shader to explode because of the
worse than quadratic behavior of the register allocator and scheduler
when running out of registers, and it can also cause the running time
of the shader to explode due to the amount of spilling it leads to,
which is orders of magnitude slower than GRF memory.

This patch fixes it by computing the intersection of our current live
intervals with the subset of the program that can possibly be reached
from any definition of the variable.  Extending the storage allocation
of the variable beyond that is pretty useless because its value is
guaranteed to be undefined at a point that cannot be reached from any
definition.

According to Jason, this improves performance of the subgroup Vulkan
CTS tests significantly (e.g. the runtime of the dvec4 broadcast test
improves by nearly 50x).

No significant change in the running time of shader-db (with 5%
statistical significance).

shader-db results on IVB:

  total cycles in shared programs: 61108780 -> 60932856 (-0.29%)
  cycles in affected programs: 16335482 -> 16159558 (-1.08%)
  helped: 5121
  HURT: 4347

  total spills in shared programs: 1309 -> 1288 (-1.60%)
  spills in affected programs: 249 -> 228 (-8.43%)
  helped: 3
  HURT: 0

  total fills in shared programs: 1652 -> 1597 (-3.33%)
  fills in affected programs: 262 -> 207 (-20.99%)
  helped: 4
  HURT: 0

  LOST:   2
  GAINED: 209

shader-db results on BDW:

  total cycles in shared programs: 67617262 -> 67361220 (-0.38%)
  cycles in affected programs: 23397142 -> 23141100 (-1.09%)
  helped: 8045
  HURT: 6488

  total spills in shared programs: 1456 -> 1252 (-14.01%)
  spills in affected programs: 465 -> 261 (-43.87%)
  helped: 3
  HURT: 0

  total fills in shared programs: 1720 -> 1465 (-14.83%)
  fills in affected programs: 471 -> 216 (-54.14%)
  helped: 4
  HURT: 0

  LOST:   2
  GAINED: 162

shader-db results on SKL:

  total cycles in shared programs: 65436248 -> 65245186 (-0.29%)
  cycles in affected programs: 22560936 -> 22369874 (-0.85%)
  helped: 8457
  HURT: 6247

  total spills in shared programs: 437 -> 437 (0.00%)
  spills in affected programs: 0 -> 0
  helped: 0
  HURT: 0

  total fills in shared programs: 870 -> 854 (-1.84%)
  fills in affected programs: 16 -> 0
  helped: 1
  HURT: 0

  LOST:   0
  GAINED: 107

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit c3c1aa5aeb)

This patch is a prerequisite for 4cbe48f5 "intel/fs: Optimize and
simplify the copy propagation dataflow logic".

BUG=b:67394445
TEST=No regressions in Android CTS, GLES tests.

Change-Id: Icbe71f099618e45098a61502b79f3694bcc49877
Reviewed-on: https://chromium-review.googlesource.com/862004
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Tested-by: Ilja H. Friedel <ihf@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
2018-01-17 23:59:32 +00:00
Francisco Jerez
d78b9b2232 UPSTREAM: intel/fs: Teach instruction scheduler about GRF bank conflict cycles.
This should allow the post-RA scheduler to do a slightly better job at
hiding latency in presence of instructions incurring bank conflicts.
The main purpose of this patch is not to improve performance though,
but to get conflict cycles to show up in shader-db statistics in order
to make sure that regressions in the bank conflict mitigation pass
don't go unnoticed.

Acked-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit acf98ff933)

This patch is a prerequisite for 4cbe48f5 "intel/fs: Optimize and
simplify the copy propagation dataflow logic".

BUG=b:67394445
TEST=No regressions in Android CTS, GLES tests.

Change-Id: Ie10e8bf2116b28a637fd7a3829a44a00b2867f11
Reviewed-on: https://chromium-review.googlesource.com/862003
Commit-Ready: Chad Versace <chadversary@chromium.org>
Tested-by: Ilja H. Friedel <ihf@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
2018-01-16 12:02:24 -08:00
Francisco Jerez
dafe2a86ab UPSTREAM: intel/fs: Implement GRF bank conflict mitigation pass.
Unnecessary GRF bank conflicts increase the issue time of ternary
instructions (the overwhelmingly most common of which is MAD) by
roughly 50%, leading to reduced ALU throughput.  This pass attempts to
minimize the number of bank conflicts by rearranging the layout of the
GRF space post-register allocation.  It's in general not possible to
eliminate all of them without introducing extra copies, which are
typically more expensive than the bank conflict itself.

In a shader-db run on SKL this helps roughly 46k shaders:

   total conflicts in shared programs: 1008981 -> 600461 (-40.49%)
   conflicts in affected programs: 816222 -> 407702 (-50.05%)
   helped: 46234
   HURT: 72

The running time of shader-db itself on SKL seems to be increased by
roughly 2.52% ±1.13% with n=20 due to the additional work done by the
compiler back-end.

On earlier generations the pass is somewhat less effective in relative
terms because the hardware incurs a bank conflict anytime the last two
sources of the instruction are duplicate (e.g. while trying to square
a value using MAD), which is impossible to avoid without introducing
copies.  E.g. for a shader-db run on SNB:

   total conflicts in shared programs: 944636 -> 623185 (-34.03%)
   conflicts in affected programs: 853258 -> 531807 (-37.67%)
   helped: 31052
   HURT: 19

And on BDW:

   total conflicts in shared programs: 1418393 -> 987539 (-30.38%)
   conflicts in affected programs: 1179787 -> 748933 (-36.52%)
   helped: 47592
   HURT: 70

On SKL GT4e this improves performance of GpuTest Volplosion by 3.64%
±0.33% with n=16.

NOTE: This patch intentionally disregards some i965 coding conventions
      for the sake of reviewability.  This is addressed by the next
      squash patch which introduces an amount of (for the most part
      boring) boilerplate that might distract reviewers from the
      non-trivial algorithmic details of the pass.

The following patch is squashed in:

SQUASH: intel/fs/bank_conflicts: Roll back to the nineties.

Acked-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit af2c320190)

This patch is a prerequisite for 4cbe48f5 "intel/fs: Optimize and
simplify the copy propagation dataflow logic".

BUG=b:67394445
TEST=No regressions in Android CTS, GLES tests.

Change-Id: I21b0563b3855434a702989fbc947b786c486f7e3
Reviewed-on: https://chromium-review.googlesource.com/862002
Commit-Ready: Chad Versace <chadversary@chromium.org>
Tested-by: Ilja H. Friedel <ihf@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
2018-01-16 12:02:24 -08:00
Bas Nieuwenhuizen
7152fe4723 UPSTREAM: radv: Don't advertise VK_EXT_debug_report.
We never supported it. Missed during copy and pasting.

Fixes: 17201a2eb0 "radv: port to using updated anv entrypoint/extension generator."
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry-picked from 4eb0dca46b)

BUG=b:67506532
TEST=run vkinfo on kahlee

Change-Id: I09f5053383cc9eded33510c24f953b496383f798
Reviewed-on: https://chromium-review.googlesource.com/827017
Commit-Ready: Bas Nieuwenhuizen <basni@chromium.org>
Tested-by: Bas Nieuwenhuizen <basni@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-12-15 07:54:33 -08:00
Jason Ekstrand
838e746fc9 FROMLIST: anv: Add support for the variablePointers feature
Not to be confused with variablePointersStorageBuffer which is the
subset of VK_KHR_variable_pointers required to enable the extension.
This means we now have "full" support for variable pointers.

Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173537.html
(am from https://patchwork.freedesktop.org/patch/183830/)

Needed for SPIR-V VariablePointers capability.

Testing:
  Tests for this feature were released in Vulkan CTS 1.0.2.4 on
  2017-07-11, and were later merged into the oreo branches of deqp. But
  in that release some testcases were buggy, causing some drivers to
  crash. In particular, the bugs crashed Anvil. All but one of the
  required fixes have landed in vk-gl-cts master@e52de55c. The last
  fix is still under review in Khronos's internal Gerrit as
  https://gerrit.khronos.org/#/c/1864/. I've pushed a public vk-gl-cts
  branch[1] containing the remaining fix, and tagged[2] the vk-gl-cts
  commit I tested against. Likewise for Mesa, I tagged[3] the commit
  I tested, based on branch cros/arc-17.3.

  Android is hard. Running the Oreo CTS on Nougat is even harder.
  I confirmed that some testcases for this feature passed when running
  the Oreo CTS on ARC++ Nougat, though the CTS eventually crashed due to
  the reasons explained above.

  I verified everything on Fedora instead. All 949 of the following
  tests passed:

  dEQP-VK.spirv_assembly.instruction.compute.variable_pointers.*
  dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.*

  [1]: http://git.kiwitree.net/cgit/~chadv/vk-gl-cts/log/?h=fixes/spirv-variable-pointers
  [2]: http://git.kiwitree.net/cgit/~chadv/vk-gl-cts/log/?h=chadv/test/spirv-variable-pointers-2017-11-29
  [3]: http://git.kiwitree.net/cgit/~chadv/mesa/log/?h=chadv/test/arc-17.3-anv-variable-pointers-2017-11-29

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I93f94a0d5f976575826397d60b42d3b11a919269
Reviewed-on: https://chromium-review.googlesource.com/799681
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:39:51 +00:00
Jason Ekstrand
d1d6bf7605 FROMLIST: spirv: Add support for lowering workgroup access to offsets
Before, we always left workgroup variables as shared nir_variables and
let the driver call nir_lower_io.  This adds an option to do the
lowering directly in spirv_to_nir.  To do this, we implicitly assign the
variables a std430 layout and then treat them like a UBO or SSBO and
immediately lower all the way to an offset.

As a side-effect, the spirv_to_nir pass now handles variable pointers
for workgroup variables.
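
A sketch of how a driver would opt in (the option field name here is an
assumption, not confirmed by this log):

  const struct spirv_to_nir_options opts = {
     .lower_workgroup_access_to_offsets = true,
  };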

Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173534.html
(am from https://patchwork.freedesktop.org/patch/183827/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: Ibdfb71194fd43fe899a71e7da162ee0633d2d11a
Reviewed-on: https://chromium-review.googlesource.com/799680
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:39:08 +00:00
Jason Ekstrand
14c7f4783a FROMLIST: spirv: Rename get_shared_nir_atomic_op to get_var_nir_atomic_op
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173536.html
(am from https://patchwork.freedesktop.org/patch/183829/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I096da96d15e7536b5536aa9f5d527ce8c47e9eaa
Reviewed-on: https://chromium-review.googlesource.com/799679
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:38:57 +00:00
Jason Ekstrand
69fae1186f FROMLIST: spirv: Add theoretical support for single component pointers
Up until now, all pointers have been ivec2s.  We're about to add support
for pointers to workgroup storage and those are going to be uints.

Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173532.html
(am from https://patchwork.freedesktop.org/patch/183825/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: Id8fc176fc1179d492dee77e5f018db8c67d884aa
Reviewed-on: https://chromium-review.googlesource.com/799678
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:38:43 +00:00
Jason Ekstrand
26ae9a5650 FROMLIST: spirv: Use offset_pointer_dereference to instead of get_vulkan_resource_index
There is no good reason why we should have the same logic repeated in
get_vulkan_resource_index and vtn_ssa_offset_pointer_dereference.  If
we're a bit more careful about how we do things, we can just use the one
function and get rid of the other entirely.  This also makes the push
constant special case a lot more clear.

Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173535.html
(am from https://patchwork.freedesktop.org/patch/183828/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I338bf2214e916c779b86628c684e434ece81b4a5
Reviewed-on: https://chromium-review.googlesource.com/799677
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:38:23 +00:00
Jason Ekstrand
00898cd71d FROMLIST: spirv: Refactor a couple of pointer query helpers
This commit moves them both into vtn_variables.c towards the top, makes
them take a vtn_builder, and replaces a hand-rolled instance of
is_external_block with a function call.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173531.html
(am from https://patchwork.freedesktop.org/patch/183826/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: If060e901394e8eb6a34a39a4f9b6b12aaf519c57
Reviewed-on: https://chromium-review.googlesource.com/799676
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:38:07 +00:00
Jason Ekstrand
4e763cb3c1 FROMLIST: spirv: Convert the supported_extensions struct to spirv_options
This is a bit more general and lets us pass additional options into the
spirv_to_nir pass beyond what capabilities we support.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173529.html
(am from https://patchwork.freedesktop.org/patch/183823/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I47f5272b801c1d642025242993e97befd1d918ce
Reviewed-on: https://chromium-review.googlesource.com/799675
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:37:29 +00:00
Jason Ekstrand
ed61e74c4d FROMLIST: spirv: Refactor the base case of offset_pointer_dereference
This makes us key off of !offset instead of !block_index.  It also puts
the guts inside a switch statement so that we can handle more than just
UBOs and SSBOs.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173533.html
(am from https://patchwork.freedesktop.org/patch/183824)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I5ec31e019875613ac192383b264bb08e0318ca60
Reviewed-on: https://chromium-review.googlesource.com/799674
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:37:08 +00:00
Jason Ekstrand
8142ac25cc FROMLIST: spirv: Add a switch statement for the block store opcode
This parallels what we do for vtn_block_load except that we don't yet
support anything except SSBO loads through this path.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173530.html
(am from https://patchwork.freedesktop.org/patch/183822/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I6269c0fc7802276d8e0b030b9a68802f09c7226c
Reviewed-on: https://chromium-review.googlesource.com/799673
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:35:49 +00:00
Jason Ekstrand
794d5bbee5 FROMLIST: spirv: Use a dereference instead of vtn_variable_resource_index
This is equivalent and means we don't have resource index code scattered
about.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173528.html
(am from https://patchwork.freedesktop.org/patch/183821/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I847aa91fe096c71dc88b229c72d60eb3c9a3fcc5
Reviewed-on: https://chromium-review.googlesource.com/799672
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:35:27 +00:00
Jason Ekstrand
6d25795e51 FROMLIST: spirv: Only emit functions which are actually used
Instead of emitting absolutely everything, just emit the few functions
that are actually referenced in some way by the entrypoint.  This should
save us quite a bit of time when handed large shader modules containing
many entrypoints.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173527.html
(am from https://patchwork.freedesktop.org/patch/183820/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I259660c64c6bb9b88af26f90b2a8b2f43f5138db
Reviewed-on: https://chromium-review.googlesource.com/799671
Tested-by: Chad Versace <chadversary@chromium.org>
Trybot-Ready: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:34:42 +00:00
Jason Ekstrand
0647d5800f FROMLIST: spirv: Drop the impl field from vtn_builder
We have a nir_builder and it has an impl field.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Archived-At: https://lists.freedesktop.org/archives/mesa-dev/2017-October/173526.html
(am from https://patchwork.freedesktop.org/patch/183819/)

Needed for SPIR-V VariablePointers capability.

BUG=b:68708929
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I1b8c8678e510cffd1dd897bc2b642e9c0d54c1de
Reviewed-on: https://chromium-review.googlesource.com/799670
Tested-by: Chad Versace <chadversary@chromium.org>
Trybot-Ready: Chad Versace <chadversary@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Commit-Queue: Kristian H. Kristensen <hoegsberg@chromium.org>
2017-11-30 23:33:07 +00:00
Tomasz Figa
e07a838408 HACK: egl/android: Partially handle HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED
There is no API available to properly query the IMPLEMENTATION_DEFINED
format. As a workaround we rely here on gralloc allocating either
an arbitrary YCbCr 4:2:0 or RGBX_8888, with the latter being recognized
by lock_ycbcr failing.
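
A sketch of that probing (gralloc module and buffer variables are
illustrative; lock_ycbcr/unlock are the standard gralloc0 entry points):

  struct android_ycbcr ycbcr;
  if (gralloc->lock_ycbcr(gralloc, buf->handle, GRALLOC_USAGE_SW_READ_OFTEN,
                          0, 0, buf->width, buf->height, &ycbcr) == 0) {
     gralloc->unlock(gralloc, buf->handle);
     /* some YCbCr 4:2:0 layout: pick the matching YUV format */
  } else {
     /* lock_ycbcr failed: treat the buffer as RGBX_8888 */
  }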

(replaces commit b0147e6603835a2cc64a99c5a6caa3316d6c2172 from
arc-12.1.0-pre2 branch / CL:367216)

BUG=b:28671744
BUG=b:33533853
BUG=b:37615277
TEST=android.view.cts.WindowTest#testSetLocalFocus
TEST=No CTS regressions on cyan and reef.
TEST=Camera preview on Poppy looks correct

Change-Id: Ifca4a7f82a6d04ccb50e0ee17f1998ffb243f85f
Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/566793
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit 8e0cdc96548416708890eee94b6cff6cd68e5ca5)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I226caf644e34312628a7606fbcf65b567cf338d9
Reviewed-on: https://chromium-review.googlesource.com/780840
Commit-Queue: Chad Versace <chadversary@chromium.org>
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:37:39 +00:00
Benjamin Gordon
b17ff37e6a UPSTREAM: configure: Allow android as an EGL platform
I'm working on radeonsi support in the Chrome OS Android container
(ARC++).  Mesa in ARC++ uses autotools instead of Android.mk, but all
the necessary EGL bits are there, so the existing check is too strict.

Signed-off-by: Benjamin Gordon <bmgordon@chromium.org>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
(cherry picked from commit de3555f834)

BUG=b:64515630
TEST=emerge-kahlee arc-mesa with crrev.com/c/698868

Change-Id: I9d7d1bed0bd166df174cfdc59c129cbfe4a81fd7
Reviewed-on: https://chromium-review.googlesource.com/780839
Commit-Queue: Chad Versace <chadversary@chromium.org>
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Chad Versace <chadversary@chromium.org>
2017-11-22 18:37:36 +00:00
Emil Velikov
42c96d393b FROMLIST: egl/android: remove HAL_PIXEL_FORMAT_BGRA_8888 support
As said in the EGL_KHR_platform_android extensions

    For each EGLConfig that belongs to the Android platform, the
    EGL_NATIVE_VISUAL_ID attribute is an Android window format, such as
    WINDOW_FORMAT_RGBA_8888.

Although it should be applicable overall.

Even though we use HAL_PIXEL_FORMAT here, those are numerically
identical to the  WINDOW_FORMAT_ and AHARDWAREBUFFER_FORMAT_ ones.

Barring the said format of course. That one is only listed in HAL.

Keep in mind that even if we try to use the said format, you'll get
caught by droid_create_surface(). The function compares the format of
the underlying window, against the NATIVE_VISUAL_ID of the config.

Unfortunately it only prints a warning, rather than erroring out, likely
leading to visual corruption.

SDL will even call ANativeWindow_setBuffersGeometry() with the wrong
format, and conveniently ignore the [expected] failure.
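
A sketch of the check in droid_create_surface() (the exact query
mechanism and field names may differ):

  int format;
  window->query(window, NATIVE_WINDOW_FORMAT, &format);
  if (format != conf->NativeVisualID)
     _eglLog(_EGL_WARNING,
             "native window format 0x%x != EGL_NATIVE_VISUAL_ID 0x%x",
             format, conf->NativeVisualID);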

Cc: mesa-stable@lists.freedesktop.org
Cc: Chad Versace <chadversary@google.com>
Cc: Tomasz Figa <tfiga@chromium.org>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Acked-by: Tomasz Figa <tfiga@chromium.org>
(am from https://patchwork.freedesktop.org/patch/166176/)
(tfiga: Remove only respective EGL config, leave EGL image as is.)

BUG=b:33533853
TEST=dEQP-EGL.functional.*.rgba8888_window tests pass on eve

Change-Id: I8eacfe852ede88b24c1a45bff1445aacd86f6992
Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/582263
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit cc1bc630a2b17693a6e8b93a6193b415b3859297)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I04bc06eeffcd4bd0cee64c55f10f0a039b6f2d73
Reviewed-on: https://chromium-review.googlesource.com/780798
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:28 +00:00
Tomasz Figa
1171bddb74 CHROMIUM: egl/android: Support opening render nodes from within EGL
This patch adds support for opening render nodes directly from within
display initialization, instead of relying on private interfaces
provided by gralloc.

In addition to having better separation from gralloc and being able to
use different render nodes for allocation and rendering, this also fixes
problems encountered when using the same DRI FD for gralloc and Mesa,
when both stepped over each other because of the shared GEM handle
namespace.
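
The general idea, as a standalone sketch (the patch itself goes through
the loader/DRM helpers rather than a hard-coded scan):

  #include <fcntl.h>
  #include <stdio.h>

  static int open_render_node(void)
  {
     char path[32];
     for (int i = 128; i < 128 + 16; i++) {
        snprintf(path, sizeof(path), "/dev/dri/renderD%d", i);
        int fd = open(path, O_RDWR | O_CLOEXEC);
        if (fd >= 0)
           return fd;
     }
     return -1;
  }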

BUG=b:29036398
TEST=No significant regressions in dEQP inside the container

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/367215
Reviewed-by: Nicolas Boichat <drinkcat@chromium.org>
(cherry picked from commit 4471713aa71d83943eb195868707ebe4e6515bb6)

BUG=b:32077712
BUG=b:33533853
TEST=No CTS regressions on cyan and reef.

Change-Id: I7f901eb9dadbfc2200484666fdc6a2bc0ca42a0c
Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/558138
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit d4c3c3b5b0a9a834736323dbcd43a424e9033fa2)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I050b5ba258f175d3d2543582963e4296c56df5ee
Reviewed-on: https://chromium-review.googlesource.com/780797
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:26 +00:00
Stéphane Marchesin
a10faeae28 CHROMIUM: i965: disable hiz on braswell
HiZ causes GPU hangs on Braswell, so let's disable it.

BUG=b/35570762, b/35574152
TEST=run graphics_GLBench on 3 * kefka for a total of 45 hours, no GPU hangs observed
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I57402696fb0e970f0a38d87a33f2179b294a2cf1
Reviewed-on: https://chromium-review.googlesource.com/558133
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit ffdf27b84904d4c4e8294ce22e5fd9c423cf0d7c)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I3fd81db442bc8e97b5d44f3f03b35358dbf11318
Reviewed-on: https://chromium-review.googlesource.com/780796
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:23 +00:00
Tapani Pälli
5ec83db3fc FROMLIST: glcpp: Hack to handle expressions in #line directives.
GLSL ES 320 technically allows #line to have arbitrary expression trees
rather than integer literal constants, unlike the C and C++ preprocessor.
This is likely a completely unused feature that does not make sense.

However, Android irritatingly mandates this useless behavior, so this
patch implements a hack to try and support it.

We handle a single expression:

    #line <line number expression>

but we avoid handling the double expression:

    #line <line number expression> <source string expression>

because this is an ambiguous grammar.  Instead, we handle the case that
wraps both in parenthesis, which is actually well defined:

    #line (<line number expression>) (<source string expression>)

With this change following tests pass:

   dEQP-GLES3.functional.shaders.preprocessor.builtin.line_expression_vertex
   dEQP-GLES3.functional.shaders.preprocessor.builtin.line_expression_fragment
   dEQP-GLES3.functional.shaders.preprocessor.builtin.line_and_file_expression_vertex
   dEQP-GLES3.functional.shaders.preprocessor.builtin.line_and_file_expression_fragment

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>

BUG=b:33352633
BUG=b:33247335
TEST=affected tests passing on CTS 7.1_r1 sentry

Reviewed-on: https://chromium-review.googlesource.com/427305
Tested-by: Haixia Shi <hshi@chromium.org>
Reviewed-by: Ilja H. Friedel <ihf@chromium.org>
Commit-Queue: Haixia Shi <hshi@chromium.org>
Trybot-Ready: Haixia Shi <hshi@chromium.org>
[chadv: Cherry-picked from branch arc-12.1.0-pre2]
(cherry picked from commit 18675d69bcd2a66483fcfc15f4c5fa5db4c257af)
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I7afbbb386bd4a582e3f241014a83eaccad1d50d9
Reviewed-on: https://chromium-review.googlesource.com/558132
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit f0e7e697a8403e1bdf56b6f555d9488fe4f620ad)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I010627302e20d7748fb2ef2b1bcdd1ef48811072
Reviewed-on: https://chromium-review.googlesource.com/780795
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:21 +00:00
Deepak Sharma
0a04b702a8 CHROMIUM: radeonsi: Fix crash on sampler_view_destroy
Set the sampler_view_destroy method for radeonsi so that the upper layer
can destroy sampler view objects without crashing.
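
A minimal sketch of the kind of hook involved (illustrative names only; this is
not the exact radeonsi code):

    /* Illustrative: give the context a sampler_view_destroy callback so the
     * state tracker has something valid to call. */
    #include <stdlib.h>
    #include "pipe/p_context.h"
    #include "pipe/p_state.h"
    #include "util/u_inlines.h"

    static void example_sampler_view_destroy(struct pipe_context *ctx,
                                             struct pipe_sampler_view *view)
    {
        pipe_resource_reference(&view->texture, NULL);  /* drop the texture reference */
        free(view);                                     /* free the view itself */
    }

    void example_init_sampler_view_functions(struct pipe_context *ctx)
    {
        ctx->sampler_view_destroy = example_sampler_view_destroy;
    }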

BUG=chrome-os-partner:56075
TEST=compile

Signed-off-by: Deepak Sharma <Deepak.Sharma@amd.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: Ia069a648617019f4df2eb3e9d8fa41b9d9b71ff7
Reviewed-on: https://chromium-review.googlesource.com/558131
Reviewed-by: Deepak Sharma <deepak.sharma@amd.com>
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit 6d7b21f317b9d017636db639b05c1dd49c78f8e0)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I1d68a7828f9632e36b57e3d10ba2dc73a748632a
Reviewed-on: https://chromium-review.googlesource.com/780794
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:18 +00:00
Corbin Simpson
dab369a824 CHROMIUM: i965: Clamp scissor state instead of truncating on gen6.
Replaces one undefined behavior with another, slightly more friendly,
undefined behavior.

This changes glScissor() behavior on i965 to clamp instead of truncate
out-of-range scissors. Technically either behavior is acceptable, but
clamping has more predictable results on out-of-range scissors.
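
A rough standalone sketch of the clamping idea (this is not the actual gen6
scissor state code):

    /* Illustrative: clamp the scissor rectangle to the framebuffer bounds
     * instead of letting out-of-range values truncate. */
    static int clamp_int(int v, int lo, int hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    static void clamp_scissor(int x, int y, int w, int h,
                              int fb_width, int fb_height,
                              int *xmin, int *ymin, int *xmax, int *ymax)
    {
        *xmin = clamp_int(x, 0, fb_width);
        *ymin = clamp_int(y, 0, fb_height);
        *xmax = clamp_int(x + w, 0, fb_width);
        *ymax = clamp_int(y + h, 0, fb_height);
    }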

BUG=chromium:360217
TEST=Watched some Youtube on Link; can't reproduce original bug as reported.

Signed-off-by: Corbin Simpson <simpsoco@chromium.org>
Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Signed-off-by: James Ausmus <james.ausmus@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I475deb2c102dd1b563ae4cc05f9fae5906c5c094
Reviewed-on: https://chromium-review.googlesource.com/558128
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit f390b920db89fd332f6c1398c886de43cdc4e868)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I3869ed8bc8e7e4551ff75c8e1c3fb1ec7ccb784f
Reviewed-on: https://chromium-review.googlesource.com/780791
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:15 +00:00
Haixia Shi
e96314b6cf CHROMIUM: i965: Fix corner cases of brw depth stencil workaround
Since we can't reproduce this bug, it's hard to track down, but there
appear to be multiple issues with the workaround, which this patch
tries to fix.

This fixes two corner cases with the workaround:
- Fix the case where there is a depth but no stencil
- Fix the case where the depth miptree hasn't been created

BUG=chromium:423546
TEST=builds and runs on link

Signed-off-by: Stéphane Marchesin <marcheu@chromium.org>
Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Signed-off-by: James Ausmus <james.ausmus@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: Ib2813252dc825443470f67b6214c16d38981cda5
Reviewed-on: https://chromium-review.googlesource.com/558127
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit 1d04841335da04fa7b97cf105ebf1514f081f7d9)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I2255687f69808e5ecb8eb9eb461921df4698108d
Reviewed-on: https://chromium-review.googlesource.com/780790
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:13 +00:00
Stéphane Marchesin
d226caef7a CHROMIUM: i965: Return NULL if we don't have a miptree
If we have no miptree (irb->mt == NULL), we still go ahead and look at
the stencil miptree, which causes crashes. Instead, let's return NULL if
we don't have a miptree; the NULL is handled correctly later.
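
The shape of the change, as a hedged sketch with simplified names (the real code
lives in the i965 renderbuffer/miptree helpers):

    /* Illustrative: bail out with NULL when the renderbuffer has no miptree,
     * instead of going on to dereference the stencil miptree. */
    struct example_mt;           /* stand-in for a miptree */

    struct example_renderbuffer {
        struct example_mt *mt;          /* may be NULL */
        struct example_mt *stencil_mt;  /* may also be NULL */
    };

    static struct example_mt *
    example_get_mt(struct example_renderbuffer *irb)
    {
        if (!irb || !irb->mt)
            return NULL;               /* callers already handle NULL */
        if (irb->stencil_mt)
            return irb->stencil_mt;    /* separate-stencil case */
        return irb->mt;
    }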

BUG=chromium:387897
TEST=can't reproduce the bug, but compiles and runs

Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Signed-off-by: James Ausmus <james.ausmus@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: Ief9c1fabd393b19a47f212885aff9e333c577785
Reviewed-on: https://chromium-review.googlesource.com/558126
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit ecfcf4db4ff27e0ec91fb31da3c49601b968346a)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I0fe67275f1227eaa4434d82594491fc27a9bd731
Reviewed-on: https://chromium-review.googlesource.com/780789
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:10 +00:00
Stéphane Marchesin
4bcb10cacc CHROMIUM: i965: Disable hardware contexts for gen6
They don't seem to work, and cause regular GPU hangs, so let's disable
them.

BUG=chromium:288818
TEST=by hand: (along with the kernel patch) run multiple flash videos with hardware decode, no GPU hang happens

Signed-off-by: Dominik Behr <dbehr@chromium.org>
Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Signed-off-by: James Ausmus <james.ausmus@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I374fd27f113c3399362f2ca40bdbbaf7574f5fae
Reviewed-on: https://chromium-review.googlesource.com/558125
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit f12a531c67e865e427f1f21c960e6c4a30ff45c8)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I0277438a011a9161f49d6bcbb57747d04a8e832d
Reviewed-on: https://chromium-review.googlesource.com/780788
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 18:33:00 +00:00
Stéphane Marchesin
403ab71152 CHROMIUM: glsl: Avoid crash when overflowing the samplers array
Fixes a crash when we have too many samplers.
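
A minimal sketch of the kind of guard this implies (illustrative constants and
names, not the actual GLSL linker code):

    /* Illustrative: refuse to write past the end of a fixed-size sampler
     * array instead of overflowing it and crashing. */
    #include <stdbool.h>

    #define EXAMPLE_MAX_SAMPLERS 32

    static bool example_add_sampler(unsigned *num_samplers,
                                    int samplers[EXAMPLE_MAX_SAMPLERS],
                                    int new_sampler)
    {
        if (*num_samplers >= EXAMPLE_MAX_SAMPLERS)
            return false;   /* too many samplers: report an error, don't crash */
        samplers[(*num_samplers)++] = new_sampler;
        return true;
    }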

BUG=chromium:141901
TEST=by hand

Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Signed-off-by: James Ausmus <james.ausmus@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I5a997d65080fee8f4536cca86f06a38af3786682
Reviewed-on: https://chromium-review.googlesource.com/558122
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit 4a87b3221cabe0ae76ac0ed017bbc7e86a88a90e)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I9eafec1dee5ee2e9b156cffa4731212d83585240
Reviewed-on: https://chromium-review.googlesource.com/780785
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 15:45:37 +00:00
James Ausmus
b178753c0a CHROMIUM: gallium: Fix renderbuffer destruction crash
Avoid crash on surface/sampler_view destruction when the context is gone

When we delete the context, sometimes there are pending surfaces and
sampler views left over.
context can go away before its resources. Until mesa is fixed to
properly refcount all these resources, let's just carry the destroy
function on the resource itself, which gives us a way to free it.
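
A hedged sketch of the idea: the resource carries its own destroy callback, so it
can be freed even after its context is gone (all names here are illustrative):

    #include <stdlib.h>

    /* Illustrative: the surface remembers how to destroy itself instead of
     * relying on a live context at destruction time. */
    struct example_surface {
        void (*destroy)(struct example_surface *surf);
        void *driver_data;
    };

    static void example_surface_destroy(struct example_surface *surf)
    {
        free(surf->driver_data);   /* driver-specific cleanup */
        free(surf);
    }

    /* The driver fills in the callback at creation time ... */
    static void example_surface_init(struct example_surface *surf)
    {
        surf->destroy = example_surface_destroy;
    }

    /* ... and whoever drops the last reference calls it, with or without
     * a live context. */
    static void example_surface_release(struct example_surface *surf)
    {
        if (surf && surf->destroy)
            surf->destroy(surf);
    }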

BUG=none
TEST=compile

Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Signed-off-by: James Ausmus <james.ausmus@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: Ibfd5d2de9606beedb8275979c0f155eee61f51fb
Reviewed-on: https://chromium-review.googlesource.com/558120
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit 8fb20c7216b406b560565ed23209d71dc2826a97)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: I299169e3b5204b590ce6d3e4385a3dbb8bdda4fb
Reviewed-on: https://chromium-review.googlesource.com/780783
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
2017-11-22 15:40:11 +00:00
Stéphane Marchesin
8940a624ae CHROMIUM: st/mesa: Do not flush front buffer on context flush
Make gallium work again with the new Chrome.

BUG=none
TEST=compile

Signed-off-by: James Ausmus <james.ausmus@intel.com>
Signed-off-by: Prince Agyeman <prince.agyeman@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
(applied manually from src/third_party/media-libs/mesa/files)

BUG=b:33533853
TEST=No CTS regressions on Cyan and Reef.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Change-Id: I023df483ceecc01e42150c1abb6e6963577efc60
Reviewed-on: https://chromium-review.googlesource.com/558119
Reviewed-by: Chad Versace <chadversary@chromium.org>
(cherry picked from commit 34218aac20aead9e6a159ecd3201655feb4d806d)

BUG=b:69553386
TEST=No regressions on Eve in `cts-tradefed run cts -m CtsDeqpTestCases`.

Change-Id: Id9cb60f87c42db613617bc1ad3ae8b2a62701746
Reviewed-on: https://chromium-review.googlesource.com/780782
Tested-by: Chad Versace <chadversary@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
2017-11-22 15:38:08 +00:00
Tomasz Figa
7cc5c96a9a CHROMIUM: Add PRESUBMIT.cfg to disable various checks
This makes it so that we don't need to run 'repo upload --no-verify'.

BUG=b:26574868
TEST=ran upload on some CLs

(cherry picked from commit 7529842c0739e1f1e54ac49abae3695c63e483b8)
Change-Id: I0b22a5ee0321dc454affeefcfac0eee75490bd6e
Reviewed-on: https://chromium-review.googlesource.com/780587
Tested-by: Chad Versace <chadversary@chromium.org>
Reviewed-by: Gurchetan Singh <gurchetansingh@chromium.org>
Commit-Queue: Chad Versace <chadversary@chromium.org>
2017-11-21 01:49:41 +00:00
88 changed files with 3410 additions and 459 deletions

PRESUBMIT.cfg Normal file
View File

@@ -0,0 +1,10 @@
# This sample config file disables all of the ChromiumOS source style checks.
# Comment out the disable-flags for any checks you want to leave enabled.
[Hook Overrides]
stray_whitespace_check: false
long_line_check: false
cros_license_check: false
tab_check: false
bug_field_check: false
test_field_check: false

View File

@@ -383,9 +383,11 @@ if test "x$GCC_ATOMIC_BUILTINS_SUPPORTED" = x1; then
AC_MSG_CHECKING(whether -latomic is needed)
AC_LINK_IFELSE([AC_LANG_SOURCE([[
#include <stdint.h>
uint64_t v;
struct {
uint64_t* v;
} x;
int main() {
return (int)__atomic_load_n(&v, __ATOMIC_ACQUIRE);
return (int)__atomic_load_n(x.v, __ATOMIC_ACQUIRE);
}]])], GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC=no, GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC=yes)
AC_MSG_RESULT($GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC)
if test "x$GCC_ATOMIC_BUILTINS_NEED_LIBATOMIC" = xyes; then
@@ -2408,12 +2410,13 @@ dnl Surfaceless is an alternative for the last one.
dnl
require_basic_egl() {
case "$with_platforms" in
*drm*|*surfaceless*)
*drm*|*surfaceless*|*android*)
;;
*)
AC_MSG_ERROR([$1 requires one of these:
1) --with-platforms=drm (X, Wayland, offscreen rendering based on DRM)
2) --with-platforms=surfaceless (offscreen only)
3) --with-platforms=android (Android only)
Recommended options: drm,x11])
;;
esac

View File

@@ -48,6 +48,7 @@ typedef unsigned int drm_drawable_t;
typedef struct drm_clip_rect drm_clip_rect_t;
#endif
#include <stdbool.h>
#include <stdint.h>
/**
@@ -704,7 +705,8 @@ struct __DRIuseInvalidateExtensionRec {
#define __DRI_ATTRIB_BIND_TO_TEXTURE_TARGETS 46
#define __DRI_ATTRIB_YINVERTED 47
#define __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE 48
#define __DRI_ATTRIB_MAX (__DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE + 1)
#define __DRI_ATTRIB_MUTABLE_RENDER_BUFFER 49 /* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR */
#define __DRI_ATTRIB_MAX 50
/* __DRI_ATTRIB_RENDER_TYPE */
#define __DRI_ATTRIB_RGBA_BIT 0x01
@@ -1810,7 +1812,48 @@ struct __DRI2rendererQueryExtensionRec {
enum __DRIimageBufferMask {
__DRI_IMAGE_BUFFER_BACK = (1 << 0),
__DRI_IMAGE_BUFFER_FRONT = (1 << 1)
__DRI_IMAGE_BUFFER_FRONT = (1 << 1),
/**
* A buffer shared between application and compositor. The buffer may be
* simultaneously accessed by each.
*
* A shared buffer is equivalent to an EGLSurface whose EGLConfig contains
* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR and whose active EGL_RENDER_BUFFER (as
* opposed to any pending, requested change to EGL_RENDER_BUFFER) is
* EGL_SINGLE_BUFFER.
*
* If the loader returns __DRI_IMAGE_BUFFER_SHARED, then it is returned
* alone without accompanying back nor front buffer.
*
* The loader returns __DRI_IMAGE_BUFFER_SHARED if and only if:
* - The loader supports __DRI_MUTABLE_RENDER_BUFFER_LOADER.
* - The driver supports __DRI_MUTABLE_RENDER_BUFFER_DRIVER.
* - The EGLConfig of the drawable EGLSurface contains
* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR.
* - The EGLContext's EGL_RENDER_BUFFER is EGL_SINGLE_BUFFER.
* Equivalently, the EGLSurface's active EGL_RENDER_BUFFER (as
* opposed to any pending, requested change to EGL_RENDER_BUFFER) is
* EGL_SINGLE_BUFFER.
*
* A shared buffer is similar to a front buffer in that all rendering to the
* buffer should appear promptly on the screen. It is different from
* a front buffer in that its behavior is independent from the
* GL_DRAW_BUFFER state. Specifically, if GL_DRAW_FRAMEBUFFER is 0 and the
* __DRIdrawable's current buffer mask is __DRI_IMAGE_BUFFER_SHARED, then
* all rendering should appear promptly on the screen if GL_DRAW_BUFFER is
* not GL_NONE.
*
* The difference between a shared buffer and a front buffer is motivated
* by the constraints of Android and OpenGL ES. OpenGL ES does not support
* front-buffer rendering. Android's SurfaceFlinger protocol provides the
* EGL driver only a back buffer and no front buffer. The shared buffer
* mode introduced by EGL_KHR_mutable_render_buffer is a backdoor through
* EGL that allows Android OpenGL ES applications to render to what is
* effectively the front buffer, a backdoor that required no change to the
* OpenGL ES API and little change to the SurfaceFlinger API.
*/
__DRI_IMAGE_BUFFER_SHARED = (1 << 2),
};
struct __DRIimageList {
@@ -1949,4 +1992,83 @@ struct __DRIbackgroundCallableExtensionRec {
GLboolean (*isThreadSafe)(void *loaderPrivate);
};
/**
* The driver portion of EGL_KHR_mutable_render_buffer.
*
* If the driver creates a __DRIconfig with
* __DRI_ATTRIB_MUTABLE_RENDER_BUFFER, then it must support this extension.
*
* To support this extension:
*
* - The driver should create at least one __DRIconfig with
* __DRI_ATTRIB_MUTABLE_RENDER_BUFFER. This is strongly recommended but
* not required.
*
* - The driver must be able to handle __DRI_IMAGE_BUFFER_SHARED if
* returned by __DRIimageLoaderExtension:getBuffers().
*
* - When rendering to __DRI_IMAGE_BUFFER_SHARED, it must call
* __DRImutableRenderBufferLoaderExtension::displaySharedBuffer() on each
* application-initiated flush. This includes glFlush, glFinish,
* GL_SYNC_FLUSH_COMMANDS_BIT, EGL_SYNC_FLUSH_COMMANDS_BIT, and possibly
* more. (Android applications expect that glFlush will immediately
* display the buffer when in shared buffer mode because that is common
* behavior among Android drivers). It :may: call displaySharedBuffer()
* more often than required.
*
* - When rendering to __DRI_IMAGE_BUFFER_SHARED, it must ensure that the
* buffer is always in a format compatible for display because the
* display engine (usually SurfaceFlinger or hwcomposer) may display the
* image at any time, even concurrently with 3D rendering. For example,
* display hardware and the GL hardware may be able to access the buffer
* simultaneously. In particular, if the buffer is compressed, take
* care that SurfaceFlinger and hwcomposer can consume the compression
* format.
*
* \see __DRI_IMAGE_BUFFER_SHARED
* \see __DRI_ATTRIB_MUTABLE_RENDER_BUFFER
* \see __DRI_MUTABLE_RENDER_BUFFER_LOADER
*/
#define __DRI_MUTABLE_RENDER_BUFFER_DRIVER "DRI_MutableRenderBufferDriver"
#define __DRI_MUTABLE_RENDER_BUFFER_DRIVER_VERSION 1
typedef struct __DRImutableRenderBufferDriverExtensionRec __DRImutableRenderBufferDriverExtension;
struct __DRImutableRenderBufferDriverExtensionRec {
__DRIextension base;
};
/**
* The loader portion of EGL_KHR_mutable_render_buffer.
*
* Requires loader extension DRI_IMAGE_LOADER, through which the loader sends
* __DRI_IMAGE_BUFFER_SHARED to the driver.
*
* \see __DRI_MUTABLE_RENDER_BUFFER_DRIVER
*/
#define __DRI_MUTABLE_RENDER_BUFFER_LOADER "DRI_MutableRenderBufferLoader"
#define __DRI_MUTABLE_RENDER_BUFFER_LOADER_VERSION 1
typedef struct __DRImutableRenderBufferLoaderExtensionRec __DRImutableRenderBufferLoaderExtension;
struct __DRImutableRenderBufferLoaderExtensionRec {
__DRIextension base;
/**
* Inform the display engine (usually SurfaceFlinger or hwcomposer)
* that the __DRIdrawable has new content. The display engine may ignore
* this, for example, if it continually refreshes and displays the buffer
* on every frame, as in EGL_ANDROID_front_buffer_auto_refresh. On the
* other extreme, the display engine may refresh and display the buffer
* only in frames in which the driver calls this.
*
* If the fence_fd is not -1, then the display engine will display the
* buffer only after the fence signals.
*
* The drawable's current __DRIimageBufferMask, as returned by
* __DRIimageLoaderExtension::getBuffers(), must contain
* __DRI_IMAGE_BUFFER_SHARED.
*/
void (*displaySharedBuffer)(__DRIdrawable *drawable, int fence_fd,
void *loaderPrivate);
};
#endif
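
Not part of the patch: a hedged illustration, with made-up function names, of how
a driver's flush path might use the loader extension documented above.

    /* Illustrative only. When the drawable is in shared-buffer mode, ask the
     * loader to display the buffer on every application-initiated flush. */
    #include "dri_interface.h"   /* include path depends on the build setup */

    static void
    example_driver_flush(__DRIdrawable *drawable,
                         const __DRImutableRenderBufferLoaderExtension *mrb_loader,
                         unsigned buffer_mask,   /* from getBuffers() */
                         int out_fence_fd,       /* -1 if no fence is available */
                         void *loader_private)
    {
        /* ... submit the pending rendering, producing out_fence_fd ... */
        if (mrb_loader && (buffer_mask & __DRI_IMAGE_BUFFER_SHARED))
            mrb_loader->displaySharedBuffer(drawable, out_fence_fd, loader_private);
    }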

View File

@@ -269,6 +269,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
vce.available_rings ? vce_version : 0;
info->has_userptr = true;
info->has_syncobj = has_syncobj(fd);
info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
info->has_sync_file = info->has_syncobj && info->drm_minor >= 21;
info->has_ctx_priority = info->drm_minor >= 22;
info->num_render_backends = amdinfo->rb_pipes;

View File

@@ -81,6 +81,7 @@ struct radeon_info {
uint32_t drm_patchlevel;
bool has_userptr;
bool has_syncobj;
bool has_syncobj_wait_for_submit;
bool has_sync_file;
bool has_ctx_priority;

View File

@@ -99,6 +99,13 @@ VULKAN_LIB_DEPS += \
$(WAYLAND_CLIENT_LIBS)
endif
if HAVE_PLATFORM_ANDROID
AM_CPPFLAGS += $(ANDROID_CPPFLAGS)
AM_CFLAGS += $(ANDROID_CFLAGS)
VULKAN_LIB_DEPS += $(ANDROID_LIBS)
VULKAN_SOURCES += $(VULKAN_ANDROID_FILES)
endif
noinst_LTLIBRARIES = libvulkan_common.la
libvulkan_common_la_SOURCES = $(VULKAN_SOURCES)
@@ -106,11 +113,14 @@ nodist_EXTRA_libvulkan_radeon_la_SOURCES = dummy.cpp
libvulkan_radeon_la_SOURCES = $(VULKAN_GEM_FILES)
vulkan_api_xml = $(top_srcdir)/src/vulkan/registry/vk.xml
vk_android_native_buffer_xml = $(top_srcdir)/src/vulkan/registry/vk_android_native_buffer.xml
radv_entrypoints.c: radv_entrypoints_gen.py radv_extensions.py $(vulkan_api_xml)
$(MKDIR_GEN)
$(AM_V_GEN)$(PYTHON2) $(srcdir)/radv_entrypoints_gen.py \
--xml $(vulkan_api_xml) --outdir $(builddir)
--xml $(vulkan_api_xml) \
--xml $(vk_android_native_buffer_xml) \
--outdir $(builddir)
radv_entrypoints.h: radv_entrypoints.c
radv_extensions.c: radv_extensions.py \
@@ -118,6 +128,7 @@ radv_extensions.c: radv_extensions.py \
$(MKDIR_GEN)
$(AM_V_GEN)$(PYTHON2) $(srcdir)/radv_extensions.py \
--xml $(vulkan_api_xml) \
--xml $(vk_android_native_buffer_xml) \
--out $@
vk_format_table.c: vk_format_table.py \

View File

@@ -69,6 +69,9 @@ VULKAN_FILES := \
vk_format.h \
$(RADV_WS_AMDGPU_FILES)
VULKAN_ANDROID_FILES := \
radv_android.c
VULKAN_WSI_WAYLAND_FILES := \
radv_wsi_wayland.c

View File

@@ -29,10 +29,11 @@ radv_entrypoints = custom_target(
radv_extensions_c = custom_target(
'radv_extensions.c',
input : ['radv_extensions.py', vk_api_xml],
input : ['radv_extensions.py', vk_api_xml, vk_android_native_buffer_xml],
output : ['radv_extensions.c'],
command : [prog_python2, '@INPUT0@', '--xml', '@INPUT1@',
'--out', '@OUTPUT@'],
command : [
prog_python2, '@INPUT0@', '--xml', '@INPUT1@', '--xml', '@INPUT2@', '--out', '@OUTPUT@',
],
)
vk_format_table_c = custom_target(

View File

@@ -0,0 +1,366 @@
/*
* Copyright © 2017, Google Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <hardware/gralloc.h>
#include <hardware/hardware.h>
#include <hardware/hwvulkan.h>
#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <libsync.h>
#include "radv_private.h"
static int radv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev);
static int radv_hal_close(struct hw_device_t *dev);
static void UNUSED
static_asserts(void)
{
STATIC_ASSERT(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC);
}
PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = {
.common = {
.tag = HARDWARE_MODULE_TAG,
.module_api_version = HWVULKAN_MODULE_API_VERSION_0_1,
.hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0),
.id = HWVULKAN_HARDWARE_MODULE_ID,
.name = "AMD Vulkan HAL",
.author = "Google",
.methods = &(hw_module_methods_t) {
.open = radv_hal_open,
},
},
};
/* If any bits in test_mask are set, then unset them and return true. */
static inline bool
unmask32(uint32_t *inout_mask, uint32_t test_mask)
{
uint32_t orig_mask = *inout_mask;
*inout_mask &= ~test_mask;
return *inout_mask != orig_mask;
}
static int
radv_hal_open(const struct hw_module_t* mod, const char* id,
struct hw_device_t** dev)
{
assert(mod == &HAL_MODULE_INFO_SYM.common);
assert(strcmp(id, HWVULKAN_DEVICE_0) == 0);
hwvulkan_device_t *hal_dev = malloc(sizeof(*hal_dev));
if (!hal_dev)
return -1;
*hal_dev = (hwvulkan_device_t) {
.common = {
.tag = HARDWARE_DEVICE_TAG,
.version = HWVULKAN_DEVICE_API_VERSION_0_1,
.module = &HAL_MODULE_INFO_SYM.common,
.close = radv_hal_close,
},
.EnumerateInstanceExtensionProperties = radv_EnumerateInstanceExtensionProperties,
.CreateInstance = radv_CreateInstance,
.GetInstanceProcAddr = radv_GetInstanceProcAddr,
};
*dev = &hal_dev->common;
return 0;
}
static int
radv_hal_close(struct hw_device_t *dev)
{
/* hwvulkan.h claims that hw_device_t::close() is never called. */
return -1;
}
VkResult
radv_image_from_gralloc(VkDevice device_h,
const VkImageCreateInfo *base_info,
const VkNativeBufferANDROID *gralloc_info,
const VkAllocationCallbacks *alloc,
VkImage *out_image_h)
{
RADV_FROM_HANDLE(radv_device, device, device_h);
VkImage image_h = VK_NULL_HANDLE;
struct radv_image *image = NULL;
struct radv_bo *bo = NULL;
VkResult result;
result = radv_image_create(device_h,
&(struct radv_image_create_info) {
.vk_info = base_info,
.scanout = true,
.no_metadata_planes = true},
alloc,
&image_h);
if (result != VK_SUCCESS)
return result;
if (gralloc_info->handle->numFds != 1) {
return vk_errorf(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR,
"VkNativeBufferANDROID::handle::numFds is %d, "
"expected 1", gralloc_info->handle->numFds);
}
/* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf
* must exceed that of the gralloc handle, and we do not own the gralloc
* handle.
*/
int dma_buf = gralloc_info->handle->data[0];
image = radv_image_from_handle(image_h);
VkDeviceMemory memory_h;
const VkMemoryDedicatedAllocateInfoKHR ded_alloc = {
.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
.pNext = NULL,
.buffer = VK_NULL_HANDLE,
.image = image_h
};
const VkImportMemoryFdInfoKHR import_info = {
.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
.pNext = &ded_alloc,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR,
.fd = dup(dma_buf),
};
/* Find the first VRAM memory type, or GART for PRIME images. */
int memory_type_index = -1;
for (int i = 0; i < device->physical_device->memory_properties.memoryTypeCount; ++i) {
bool is_local = !!(device->physical_device->memory_properties.memoryTypes[i].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if (is_local) {
memory_type_index = i;
break;
}
}
/* fallback */
if (memory_type_index == -1)
memory_type_index = 0;
result = radv_AllocateMemory(device_h,
&(VkMemoryAllocateInfo) {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = &import_info,
.allocationSize = image->size,
.memoryTypeIndex = memory_type_index,
},
alloc,
&memory_h);
if (result != VK_SUCCESS)
goto fail_create_image;
radv_BindImageMemory(device_h, image_h, memory_h, 0);
image->owned_memory = memory_h;
/* Don't clobber the out-parameter until success is certain. */
*out_image_h = image_h;
return VK_SUCCESS;
fail_create_image:
fail_size:
radv_DestroyImage(device_h, image_h, alloc);
return result;
}
VkResult radv_GetSwapchainGrallocUsageANDROID(
VkDevice device_h,
VkFormat format,
VkImageUsageFlags imageUsage,
int* grallocUsage)
{
RADV_FROM_HANDLE(radv_device, device, device_h);
struct radv_physical_device *phys_dev = device->physical_device;
VkPhysicalDevice phys_dev_h = radv_physical_device_to_handle(phys_dev);
VkResult result;
*grallocUsage = 0;
/* WARNING: Android Nougat's libvulkan.so hardcodes the VkImageUsageFlags
* returned to applications via VkSurfaceCapabilitiesKHR::supportedUsageFlags.
* The relevant code in libvulkan/swapchain.cpp contains this fun comment:
*
* TODO(jessehall): I think these are right, but haven't thought hard
* about it. Do we need to query the driver for support of any of
* these?
*
* Any disagreement between this function and the hardcoded
* VkSurfaceCapabilitiesKHR:supportedUsageFlags causes tests
* dEQP-VK.wsi.android.swapchain.*.image_usage to fail.
*/
const VkPhysicalDeviceImageFormatInfo2KHR image_format_info = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
.format = format,
.type = VK_IMAGE_TYPE_2D,
.tiling = VK_IMAGE_TILING_OPTIMAL,
.usage = imageUsage,
};
VkImageFormatProperties2KHR image_format_props = {
.sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
};
/* Check that requested format and usage are supported. */
result = radv_GetPhysicalDeviceImageFormatProperties2KHR(phys_dev_h,
&image_format_info, &image_format_props);
if (result != VK_SUCCESS) {
return vk_errorf(result,
"radv_GetPhysicalDeviceImageFormatProperties2KHR failed "
"inside %s", __func__);
}
if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT |
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))
*grallocUsage |= GRALLOC_USAGE_HW_RENDER;
if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
VK_IMAGE_USAGE_SAMPLED_BIT |
VK_IMAGE_USAGE_STORAGE_BIT |
VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
*grallocUsage |= GRALLOC_USAGE_HW_TEXTURE;
/* All VkImageUsageFlags not explicitly checked here are unsupported for
* gralloc swapchains.
*/
if (imageUsage != 0) {
return vk_errorf(VK_ERROR_FORMAT_NOT_SUPPORTED,
"unsupported VkImageUsageFlags(0x%x) for gralloc "
"swapchain", imageUsage);
}
/*
* FINISHME: Advertise all display-supported formats. Mostly
* DRM_FORMAT_ARGB2101010 and DRM_FORMAT_ABGR2101010, but need to check
* what we need for 30-bit colors.
*/
if (format == VK_FORMAT_B8G8R8A8_UNORM ||
format == VK_FORMAT_B5G6R5_UNORM_PACK16) {
*grallocUsage |= GRALLOC_USAGE_HW_FB |
GRALLOC_USAGE_HW_COMPOSER |
GRALLOC_USAGE_EXTERNAL_DISP;
}
if (*grallocUsage == 0)
return VK_ERROR_FORMAT_NOT_SUPPORTED;
return VK_SUCCESS;
}
VkResult
radv_AcquireImageANDROID(
VkDevice device,
VkImage image_h,
int nativeFenceFd,
VkSemaphore semaphore,
VkFence fence)
{
VkResult semaphore_result = VK_SUCCESS, fence_result = VK_SUCCESS;
if (semaphore != VK_NULL_HANDLE) {
int semaphore_fd = nativeFenceFd >= 0 ? dup(nativeFenceFd) : nativeFenceFd;
semaphore_result = radv_ImportSemaphoreFdKHR(device,
&(VkImportSemaphoreFdInfoKHR) {
.sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR,
.flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR,
.fd = semaphore_fd,
.semaphore = semaphore,
});
}
if (fence != VK_NULL_HANDLE) {
int fence_fd = nativeFenceFd >= 0 ? dup(nativeFenceFd) : nativeFenceFd;
fence_result = radv_ImportFenceFdKHR(device,
&(VkImportFenceFdInfoKHR) {
.sType = VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR,
.flags = VK_FENCE_IMPORT_TEMPORARY_BIT_KHR,
.fd = fence_fd,
.fence = fence,
});
}
close(nativeFenceFd);
if (semaphore_result != VK_SUCCESS)
return semaphore_result;
return fence_result;
}
VkResult
radv_QueueSignalReleaseImageANDROID(
VkQueue _queue,
uint32_t waitSemaphoreCount,
const VkSemaphore* pWaitSemaphores,
VkImage image,
int* pNativeFenceFd)
{
RADV_FROM_HANDLE(radv_queue, queue, _queue);
VkResult result = VK_SUCCESS;
if (waitSemaphoreCount == 0) {
if (pNativeFenceFd)
*pNativeFenceFd = -1;
return VK_SUCCESS;
}
int fd = -1;
for (uint32_t i = 0; i < waitSemaphoreCount; ++i) {
int tmp_fd;
result = radv_GetSemaphoreFdKHR(radv_device_to_handle(queue->device),
&(VkSemaphoreGetFdInfoKHR) {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR,
.semaphore = pWaitSemaphores[i],
}, &tmp_fd);
if (result != VK_SUCCESS) {
if (fd >= 0)
close (fd);
return result;
}
if (fd < 0)
fd = tmp_fd;
else if (tmp_fd >= 0) {
sync_accumulate("radv", &fd, tmp_fd);
close(tmp_fd);
}
}
if (pNativeFenceFd) {
*pNativeFenceFd = fd;
} else if (fd >= 0) {
close(fd);
/* We still need to do the exports, to reset the semaphores, but
* otherwise we don't wait on them. */
}
return VK_SUCCESS;
}

View File

@@ -1038,6 +1038,10 @@ VkResult radv_CreateDevice(
}
}
#ifdef ANDROID
device->always_use_syncobj = device->physical_device->rad_info.has_syncobj_wait_for_submit;
#endif
#if HAVE_LLVM < 0x0400
device->llvm_supports_spill = false;
#else
@@ -1794,12 +1798,14 @@ fail:
static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
int num_sems,
const VkSemaphore *sems,
VkFence _fence,
bool reset_temp)
{
int syncobj_idx = 0, sem_idx = 0;
if (num_sems == 0)
if (num_sems == 0 && _fence == VK_NULL_HANDLE)
return VK_SUCCESS;
for (uint32_t i = 0; i < num_sems; i++) {
RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]);
@@ -1809,6 +1815,12 @@ static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
counts->sem_count++;
}
if (_fence != VK_NULL_HANDLE) {
RADV_FROM_HANDLE(radv_fence, fence, _fence);
if (fence->temp_syncobj || fence->syncobj)
counts->syncobj_count++;
}
if (counts->syncobj_count) {
counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count);
if (!counts->syncobj)
@@ -1837,6 +1849,14 @@ static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
}
}
if (_fence != VK_NULL_HANDLE) {
RADV_FROM_HANDLE(radv_fence, fence, _fence);
if (fence->temp_syncobj)
counts->syncobj[syncobj_idx++] = fence->temp_syncobj;
else if (fence->syncobj)
counts->syncobj[syncobj_idx++] = fence->syncobj;
}
return VK_SUCCESS;
}
@@ -1867,15 +1887,16 @@ VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
int num_wait_sems,
const VkSemaphore *wait_sems,
int num_signal_sems,
const VkSemaphore *signal_sems)
const VkSemaphore *signal_sems,
VkFence fence)
{
VkResult ret;
memset(sem_info, 0, sizeof(*sem_info));
ret = radv_alloc_sem_counts(&sem_info->wait, num_wait_sems, wait_sems, true);
ret = radv_alloc_sem_counts(&sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
if (ret)
return ret;
ret = radv_alloc_sem_counts(&sem_info->signal, num_signal_sems, signal_sems, false);
ret = radv_alloc_sem_counts(&sem_info->signal, num_signal_sems, signal_sems, fence, false);
if (ret)
radv_free_sem_info(sem_info);
@@ -1885,6 +1906,32 @@ VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
return ret;
}
/* Signals fence as soon as all the work currently put on queue is done. */
static VkResult radv_signal_fence(struct radv_queue *queue,
struct radv_fence *fence)
{
int ret;
VkResult result;
struct radv_winsys_sem_info sem_info;
result = radv_alloc_sem_info(&sem_info, 0, NULL, 0, NULL,
radv_fence_to_handle(fence));
if (result != VK_SUCCESS)
return result;
ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
1, NULL, NULL, &sem_info,
false, fence->fence);
radv_free_sem_info(&sem_info);
/* TODO: find a better error */
if (ret)
return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
return VK_SUCCESS;
}
VkResult radv_QueueSubmit(
VkQueue _queue,
uint32_t submitCount,
@@ -1941,7 +1988,8 @@ VkResult radv_QueueSubmit(
pSubmits[i].waitSemaphoreCount,
pSubmits[i].pWaitSemaphores,
pSubmits[i].signalSemaphoreCount,
pSubmits[i].pSignalSemaphores);
pSubmits[i].pSignalSemaphores,
_fence);
if (result != VK_SUCCESS)
return result;
@@ -2010,11 +2058,7 @@ VkResult radv_QueueSubmit(
if (fence) {
if (!fence_emitted) {
struct radv_winsys_sem_info sem_info = {0};
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
1, NULL, NULL, &sem_info,
false, base_fence);
radv_signal_fence(queue, fence);
}
fence->submitted = true;
}
@@ -2506,7 +2550,8 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
pBindInfo[i].waitSemaphoreCount,
pBindInfo[i].pWaitSemaphores,
pBindInfo[i].signalSemaphoreCount,
pBindInfo[i].pSignalSemaphores);
pBindInfo[i].pSignalSemaphores,
_fence);
if (result != VK_SUCCESS)
return result;
@@ -2525,8 +2570,11 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
}
if (fence && !fence_emitted) {
fence->signalled = true;
if (fence) {
if (!fence_emitted) {
radv_signal_fence(queue, fence);
}
fence->submitted = true;
}
return VK_SUCCESS;
@@ -2539,6 +2587,11 @@ VkResult radv_CreateFence(
VkFence* pFence)
{
RADV_FROM_HANDLE(radv_device, device, _device);
const VkExportFenceCreateInfoKHR *export =
vk_find_struct_const(pCreateInfo->pNext, EXPORT_FENCE_CREATE_INFO_KHR);
VkExternalFenceHandleTypeFlagsKHR handleTypes =
export ? export->handleTypes : 0;
struct radv_fence *fence = vk_alloc2(&device->alloc, pAllocator,
sizeof(*fence), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -2549,10 +2602,24 @@ VkResult radv_CreateFence(
memset(fence, 0, sizeof(*fence));
fence->submitted = false;
fence->signalled = !!(pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT);
fence->fence = device->ws->create_fence();
if (!fence->fence) {
vk_free2(&device->alloc, pAllocator, fence);
return VK_ERROR_OUT_OF_HOST_MEMORY;
fence->temp_syncobj = 0;
if (device->always_use_syncobj || handleTypes) {
int ret = device->ws->create_syncobj(device->ws, &fence->syncobj);
if (ret) {
vk_free2(&device->alloc, pAllocator, fence);
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
device->ws->signal_syncobj(device->ws, fence->syncobj);
}
fence->fence = NULL;
} else {
fence->fence = device->ws->create_fence();
if (!fence->fence) {
vk_free2(&device->alloc, pAllocator, fence);
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
fence->syncobj = 0;
}
*pFence = radv_fence_to_handle(fence);
@@ -2570,7 +2637,13 @@ void radv_DestroyFence(
if (!fence)
return;
device->ws->destroy_fence(fence->fence);
if (fence->temp_syncobj)
device->ws->destroy_syncobj(device->ws, fence->temp_syncobj);
if (fence->syncobj)
device->ws->destroy_syncobj(device->ws, fence->syncobj);
if (fence->fence)
device->ws->destroy_fence(fence->fence);
vk_free2(&device->alloc, pAllocator, fence);
}
@@ -2605,6 +2678,18 @@ VkResult radv_WaitForFences(
RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
bool expired = false;
if (fence->temp_syncobj) {
if (!device->ws->wait_syncobj(device->ws, fence->temp_syncobj, timeout))
return VK_TIMEOUT;
continue;
}
if (fence->syncobj) {
if (!device->ws->wait_syncobj(device->ws, fence->syncobj, timeout))
return VK_TIMEOUT;
continue;
}
if (fence->signalled)
continue;
@@ -2621,13 +2706,26 @@ VkResult radv_WaitForFences(
return VK_SUCCESS;
}
VkResult radv_ResetFences(VkDevice device,
VkResult radv_ResetFences(VkDevice _device,
uint32_t fenceCount,
const VkFence *pFences)
{
RADV_FROM_HANDLE(radv_device, device, _device);
for (unsigned i = 0; i < fenceCount; ++i) {
RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
fence->submitted = fence->signalled = false;
/* Per spec, we first restore the permanent payload, and then reset, so
* having a temp syncobj should not skip resetting the permanent syncobj. */
if (fence->temp_syncobj) {
device->ws->destroy_syncobj(device->ws, fence->temp_syncobj);
fence->temp_syncobj = 0;
}
if (fence->syncobj) {
device->ws->reset_syncobj(device->ws, fence->syncobj);
}
}
return VK_SUCCESS;
@@ -2638,11 +2736,20 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence)
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_fence, fence, _fence);
if (fence->temp_syncobj) {
bool success = device->ws->wait_syncobj(device->ws, fence->temp_syncobj, 0);
return success ? VK_SUCCESS : VK_NOT_READY;
}
if (fence->syncobj) {
bool success = device->ws->wait_syncobj(device->ws, fence->syncobj, 0);
return success ? VK_SUCCESS : VK_NOT_READY;
}
if (fence->signalled)
return VK_SUCCESS;
if (!fence->submitted)
return VK_NOT_READY;
if (!device->ws->fence_wait(device->ws, fence->fence, false, 0))
return VK_NOT_READY;
@@ -2672,9 +2779,8 @@ VkResult radv_CreateSemaphore(
sem->temp_syncobj = 0;
/* create a syncobject if we are going to export this semaphore */
if (handleTypes) {
if (device->always_use_syncobj || handleTypes) {
assert (device->physical_device->rad_info.has_syncobj);
assert (handleTypes == VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
int ret = device->ws->create_syncobj(device->ws, &sem->syncobj);
if (ret) {
vk_free2(&device->alloc, pAllocator, sem);
@@ -3523,18 +3629,59 @@ VkResult radv_GetMemoryFdPropertiesKHR(VkDevice _device,
return VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR;
}
static VkResult radv_import_opaque_fd(struct radv_device *device,
int fd,
uint32_t *syncobj)
{
uint32_t syncobj_handle = 0;
int ret = device->ws->import_syncobj(device->ws, fd, &syncobj_handle);
if (ret != 0)
return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
if (*syncobj)
device->ws->destroy_syncobj(device->ws, *syncobj);
*syncobj = syncobj_handle;
close(fd);
return VK_SUCCESS;
}
static VkResult radv_import_sync_fd(struct radv_device *device,
int fd,
uint32_t *syncobj)
{
/* If we create a syncobj we do it locally so that if we have an error, we don't
* leave a syncobj in an undetermined state in the fence. */
uint32_t syncobj_handle = *syncobj;
if (!syncobj_handle) {
int ret = device->ws->create_syncobj(device->ws, &syncobj_handle);
if (ret) {
return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
}
}
if (fd == -1) {
device->ws->signal_syncobj(device->ws, syncobj_handle);
} else {
int ret = device->ws->import_syncobj_from_sync_file(device->ws, syncobj_handle, fd);
if (ret != 0)
return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
}
*syncobj = syncobj_handle;
if (fd != -1)
close(fd);
return VK_SUCCESS;
}
VkResult radv_ImportSemaphoreFdKHR(VkDevice _device,
const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_semaphore, sem, pImportSemaphoreFdInfo->semaphore);
uint32_t syncobj_handle = 0;
uint32_t *syncobj_dst = NULL;
assert(pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
int ret = device->ws->import_syncobj(device->ws, pImportSemaphoreFdInfo->fd, &syncobj_handle);
if (ret != 0)
return VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR;
if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR) {
syncobj_dst = &sem->temp_syncobj;
@@ -3542,12 +3689,14 @@ VkResult radv_ImportSemaphoreFdKHR(VkDevice _device,
syncobj_dst = &sem->syncobj;
}
if (*syncobj_dst)
device->ws->destroy_syncobj(device->ws, *syncobj_dst);
*syncobj_dst = syncobj_handle;
close(pImportSemaphoreFdInfo->fd);
return VK_SUCCESS;
switch(pImportSemaphoreFdInfo->handleType) {
case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR:
return radv_import_opaque_fd(device, pImportSemaphoreFdInfo->fd, syncobj_dst);
case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR:
return radv_import_sync_fd(device, pImportSemaphoreFdInfo->fd, syncobj_dst);
default:
unreachable("Unhandled semaphore handle type");
}
}
VkResult radv_GetSemaphoreFdKHR(VkDevice _device,
@@ -3559,12 +3708,30 @@ VkResult radv_GetSemaphoreFdKHR(VkDevice _device,
int ret;
uint32_t syncobj_handle;
assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
if (sem->temp_syncobj)
syncobj_handle = sem->temp_syncobj;
else
syncobj_handle = sem->syncobj;
ret = device->ws->export_syncobj(device->ws, syncobj_handle, pFd);
switch(pGetFdInfo->handleType) {
case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR:
ret = device->ws->export_syncobj(device->ws, syncobj_handle, pFd);
break;
case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR:
ret = device->ws->export_syncobj_to_sync_file(device->ws, syncobj_handle, pFd);
if (!ret) {
if (sem->temp_syncobj) {
close (sem->temp_syncobj);
sem->temp_syncobj = 0;
} else {
device->ws->reset_syncobj(device->ws, syncobj_handle);
}
}
break;
default:
unreachable("Unhandled semaphore handle type");
}
if (ret)
return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
return VK_SUCCESS;
@@ -3575,7 +3742,17 @@ void radv_GetPhysicalDeviceExternalSemaphorePropertiesKHR(
const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo,
VkExternalSemaphorePropertiesKHR* pExternalSemaphoreProperties)
{
if (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR) {
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
/* Require has_syncobj_wait_for_submit for the syncobj signal ioctl introduced at virtually the same time */
if (pdevice->rad_info.has_syncobj_wait_for_submit &&
(pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR ||
pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR)) {
pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR;
pExternalSemaphoreProperties->compatibleHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR;
pExternalSemaphoreProperties->externalSemaphoreFeatures = VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR |
VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR;
} else if (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR) {
pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
pExternalSemaphoreProperties->compatibleHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
pExternalSemaphoreProperties->externalSemaphoreFeatures = VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR |
@@ -3586,3 +3763,86 @@ void radv_GetPhysicalDeviceExternalSemaphorePropertiesKHR(
pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
}
}
VkResult radv_ImportFenceFdKHR(VkDevice _device,
const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_fence, fence, pImportFenceFdInfo->fence);
uint32_t *syncobj_dst = NULL;
if (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT_KHR) {
syncobj_dst = &fence->temp_syncobj;
} else {
syncobj_dst = &fence->syncobj;
}
switch(pImportFenceFdInfo->handleType) {
case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR:
return radv_import_opaque_fd(device, pImportFenceFdInfo->fd, syncobj_dst);
case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR:
return radv_import_sync_fd(device, pImportFenceFdInfo->fd, syncobj_dst);
default:
unreachable("Unhandled fence handle type");
}
}
VkResult radv_GetFenceFdKHR(VkDevice _device,
const VkFenceGetFdInfoKHR *pGetFdInfo,
int *pFd)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_fence, fence, pGetFdInfo->fence);
int ret;
uint32_t syncobj_handle;
if (fence->temp_syncobj)
syncobj_handle = fence->temp_syncobj;
else
syncobj_handle = fence->syncobj;
switch(pGetFdInfo->handleType) {
case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR:
ret = device->ws->export_syncobj(device->ws, syncobj_handle, pFd);
break;
case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR:
ret = device->ws->export_syncobj_to_sync_file(device->ws, syncobj_handle, pFd);
if (!ret) {
if (fence->temp_syncobj) {
close (fence->temp_syncobj);
fence->temp_syncobj = 0;
} else {
device->ws->reset_syncobj(device->ws, syncobj_handle);
}
}
break;
default:
unreachable("Unhandled fence handle type");
}
if (ret)
return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
return VK_SUCCESS;
}
void radv_GetPhysicalDeviceExternalFencePropertiesKHR(
VkPhysicalDevice physicalDevice,
const VkPhysicalDeviceExternalFenceInfoKHR* pExternalFenceInfo,
VkExternalFencePropertiesKHR* pExternalFenceProperties)
{
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
if (pdevice->rad_info.has_syncobj_wait_for_submit &&
(pExternalFenceInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR ||
pExternalFenceInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR)) {
pExternalFenceProperties->exportFromImportedHandleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR | VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR;
pExternalFenceProperties->compatibleHandleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR | VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR;
pExternalFenceProperties->externalFenceFeatures = VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT_KHR |
VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR;
} else {
pExternalFenceProperties->exportFromImportedHandleTypes = 0;
pExternalFenceProperties->compatibleHandleTypes = 0;
pExternalFenceProperties->externalFenceFeatures = 0;
}
}

View File

@@ -237,7 +237,9 @@ def get_entrypoints(doc, entrypoints_to_defines, start_index):
if extension.attrib['name'] not in supported:
continue
assert extension.attrib['supported'] == 'vulkan'
if extension.attrib['supported'] != 'vulkan':
continue
for command in extension.findall('./require/command'):
enabled_commands.add(command.attrib['name'])

View File

@@ -50,9 +50,13 @@ class Extension:
# the those extension strings, then tests dEQP-VK.api.info.instance.extensions
# and dEQP-VK.api.info.device fail due to the duplicated strings.
EXTENSIONS = [
Extension('VK_ANDROID_native_buffer', 5, 'ANDROID && device->rad_info.has_syncobj_wait_for_submit'),
Extension('VK_KHR_bind_memory2', 1, True),
Extension('VK_KHR_dedicated_allocation', 1, True),
Extension('VK_KHR_descriptor_update_template', 1, True),
Extension('VK_KHR_external_fence', 1, 'device->rad_info.has_syncobj_wait_for_submit'),
Extension('VK_KHR_external_fence_capabilities', 1, True),
Extension('VK_KHR_external_fence_fd', 1, 'device->rad_info.has_syncobj_wait_for_submit'),
Extension('VK_KHR_external_memory', 1, True),
Extension('VK_KHR_external_memory_capabilities', 1, True),
Extension('VK_KHR_external_memory_fd', 1, True),
@@ -77,7 +81,6 @@ EXTENSIONS = [
Extension('VK_KHR_xcb_surface', 6, 'VK_USE_PLATFORM_XCB_KHR'),
Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'),
Extension('VK_KHX_multiview', 1, True),
Extension('VK_EXT_debug_report', 8, True),
Extension('VK_EXT_global_priority', 1, 'device->rad_info.has_ctx_priority'),
Extension('VK_AMD_draw_indirect_count', 1, True),
Extension('VK_AMD_rasterization_order', 1, 'device->rad_info.chip_class >= VI && device->rad_info.max_se >= 2'),

View File

@@ -904,29 +904,34 @@ radv_image_create(VkDevice _device,
image->size = image->surface.surf_size;
image->alignment = image->surface.surf_alignment;
/* Try to enable DCC first. */
if (radv_image_can_enable_dcc(image)) {
radv_image_alloc_dcc(image);
} else {
/* When DCC cannot be enabled, try CMASK. */
image->surface.dcc_size = 0;
if (radv_image_can_enable_cmask(image)) {
radv_image_alloc_cmask(device, image);
}
}
/* Try to enable FMASK for multisampled images. */
if (radv_image_can_enable_fmask(image)) {
radv_image_alloc_fmask(device, image);
} else {
/* Otherwise, try to enable HTILE for depth surfaces. */
if (radv_image_can_enable_htile(image) &&
!(device->instance->debug_flags & RADV_DEBUG_NO_HIZ)) {
radv_image_alloc_htile(image);
image->tc_compatible_htile = image->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE;
if (!create_info->no_metadata_planes) {
/* Try to enable DCC first. */
if (radv_image_can_enable_dcc(image)) {
radv_image_alloc_dcc(image);
} else {
image->surface.htile_size = 0;
/* When DCC cannot be enabled, try CMASK. */
image->surface.dcc_size = 0;
if (radv_image_can_enable_cmask(image)) {
radv_image_alloc_cmask(device, image);
}
}
/* Try to enable FMASK for multisampled images. */
if (radv_image_can_enable_fmask(image)) {
radv_image_alloc_fmask(device, image);
} else {
/* Otherwise, try to enable HTILE for depth surfaces. */
if (radv_image_can_enable_htile(image) &&
!(device->instance->debug_flags & RADV_DEBUG_NO_HIZ)) {
radv_image_alloc_htile(image);
image->tc_compatible_htile = image->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE;
} else {
image->surface.htile_size = 0;
}
}
} else {
image->surface.dcc_size = 0;
image->surface.htile_size = 0;
}
if (pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
@@ -1114,6 +1119,15 @@ radv_CreateImage(VkDevice device,
const VkAllocationCallbacks *pAllocator,
VkImage *pImage)
{
#ifdef ANDROID
const VkNativeBufferANDROID *gralloc_info =
vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
if (gralloc_info)
return radv_image_from_gralloc(device, pCreateInfo, gralloc_info,
pAllocator, pImage);
#endif
return radv_image_create(device,
&(struct radv_image_create_info) {
.vk_info = pCreateInfo,
@@ -1136,6 +1150,9 @@ radv_DestroyImage(VkDevice _device, VkImage _image,
if (image->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT)
device->ws->buffer_destroy(image->bo);
if (image->owned_memory != VK_NULL_HANDLE)
radv_FreeMemory(_device, image->owned_memory, pAllocator);
vk_free2(&device->alloc, pAllocator, image);
}

View File

@@ -69,6 +69,7 @@ typedef uint32_t xcb_window_t;
#include <vulkan/vulkan.h>
#include <vulkan/vulkan_intel.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vk_android_native_buffer.h>
#include "radv_entrypoints.h"
@@ -532,6 +533,7 @@ struct radv_device {
int queue_count[RADV_MAX_QUEUE_FAMILIES];
struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];
bool always_use_syncobj;
bool llvm_supports_spill;
bool has_distributed_tess;
uint32_t tess_offchip_block_dw_size;
@@ -1249,6 +1251,9 @@ struct radv_image {
struct radv_cmask_info cmask;
uint64_t clear_value_offset;
uint64_t dcc_pred_offset;
/* For VK_ANDROID_native_buffer, the WSI image owns the memory, */
VkDeviceMemory owned_memory;
};
/* Whether the image has a htile that is known consistent with the contents of
@@ -1333,6 +1338,7 @@ struct radv_image_view {
struct radv_image_create_info {
const VkImageCreateInfo *vk_info;
bool scanout;
bool no_metadata_planes;
};
VkResult radv_image_create(VkDevice _device,
@@ -1340,6 +1346,13 @@ VkResult radv_image_create(VkDevice _device,
const VkAllocationCallbacks* alloc,
VkImage *pImage);
VkResult
radv_image_from_gralloc(VkDevice device_h,
const VkImageCreateInfo *base_info,
const VkNativeBufferANDROID *gralloc_info,
const VkAllocationCallbacks *alloc,
VkImage *out_image_h);
void radv_image_view_init(struct radv_image_view *view,
struct radv_device *device,
const VkImageViewCreateInfo* pCreateInfo);
@@ -1521,7 +1534,8 @@ VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
int num_wait_sems,
const VkSemaphore *wait_sems,
int num_signal_sems,
const VkSemaphore *signal_sems);
const VkSemaphore *signal_sems,
VkFence fence);
void radv_free_sem_info(struct radv_winsys_sem_info *sem_info);
void
@@ -1556,6 +1570,9 @@ struct radv_fence {
struct radeon_winsys_fence *fence;
bool submitted;
bool signalled;
uint32_t syncobj;
uint32_t temp_syncobj;
};
struct radeon_winsys_sem;

View File

@@ -256,9 +256,18 @@ struct radeon_winsys {
int (*create_syncobj)(struct radeon_winsys *ws, uint32_t *handle);
void (*destroy_syncobj)(struct radeon_winsys *ws, uint32_t handle);
void (*reset_syncobj)(struct radeon_winsys *ws, uint32_t handle);
void (*signal_syncobj)(struct radeon_winsys *ws, uint32_t handle);
bool (*wait_syncobj)(struct radeon_winsys *ws, uint32_t handle, uint64_t timeout);
int (*export_syncobj)(struct radeon_winsys *ws, uint32_t syncobj, int *fd);
int (*import_syncobj)(struct radeon_winsys *ws, int fd, uint32_t *syncobj);
int (*export_syncobj_to_sync_file)(struct radeon_winsys *ws, uint32_t syncobj, int *fd);
/* Note that this, unlike the normal import, uses an existing syncobj. */
int (*import_syncobj_from_sync_file)(struct radeon_winsys *ws, uint32_t syncobj, int fd);
};
static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)

View File

@@ -194,19 +194,22 @@ radv_shader_compile_to_nir(struct radv_device *device,
spec_entries[i].data32 = *(const uint32_t *)data;
}
}
const struct nir_spirv_supported_extensions supported_ext = {
.draw_parameters = true,
.float64 = true,
.image_read_without_format = true,
.image_write_without_format = true,
.tessellation = true,
.int64 = true,
.multiview = true,
.variable_pointers = true,
const struct spirv_to_nir_options spirv_options = {
.caps = {
.draw_parameters = true,
.float64 = true,
.image_read_without_format = true,
.image_write_without_format = true,
.tessellation = true,
.int64 = true,
.multiview = true,
.variable_pointers = true,
},
};
entry_point = spirv_to_nir(spirv, module->size / 4,
spec_entries, num_spec_entries,
stage, entrypoint_name, &supported_ext, &nir_options);
stage, entrypoint_name,
&spirv_options, &nir_options);
nir = entry_point->shader;
assert(nir->info.stage == stage);
nir_validate_shader(nir);

View File

@@ -445,13 +445,14 @@ VkResult radv_GetSwapchainImagesKHR(
}
VkResult radv_AcquireNextImageKHR(
VkDevice device,
VkDevice _device,
VkSwapchainKHR _swapchain,
uint64_t timeout,
VkSemaphore semaphore,
VkFence _fence,
uint32_t* pImageIndex)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(wsi_swapchain, swapchain, _swapchain);
RADV_FROM_HANDLE(radv_fence, fence, _fence);
@@ -461,6 +462,11 @@ VkResult radv_AcquireNextImageKHR(
if (fence && (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR)) {
fence->submitted = true;
fence->signalled = true;
if (fence->temp_syncobj) {
device->ws->signal_syncobj(device->ws, fence->temp_syncobj);
} else if (fence->syncobj) {
device->ws->signal_syncobj(device->ws, fence->syncobj);
}
}
return result;
}
@@ -479,20 +485,6 @@ VkResult radv_QueuePresentKHR(
struct radeon_winsys_cs *cs;
const VkPresentRegionKHR *region = NULL;
VkResult item_result;
- struct radv_winsys_sem_info sem_info;
- item_result = radv_alloc_sem_info(&sem_info,
- pPresentInfo->waitSemaphoreCount,
- pPresentInfo->pWaitSemaphores,
- 0,
- NULL);
- if (pPresentInfo->pResults != NULL)
- pPresentInfo->pResults[i] = item_result;
- result = result == VK_SUCCESS ? item_result : result;
- if (item_result != VK_SUCCESS) {
- radv_free_sem_info(&sem_info);
- continue;
- }
assert(radv_device_from_handle(swapchain->device) == queue->device);
if (swapchain->fences[0] == VK_NULL_HANDLE) {
@@ -505,7 +497,6 @@ VkResult radv_QueuePresentKHR(
pPresentInfo->pResults[i] = item_result;
result = result == VK_SUCCESS ? item_result : result;
if (item_result != VK_SUCCESS) {
- radv_free_sem_info(&sem_info);
continue;
}
} else {
@@ -513,6 +504,22 @@ VkResult radv_QueuePresentKHR(
1, &swapchain->fences[0]);
}
struct radv_winsys_sem_info sem_info;
item_result = radv_alloc_sem_info(&sem_info,
pPresentInfo->waitSemaphoreCount,
pPresentInfo->pWaitSemaphores,
0,
NULL,
swapchain->fences[0]);
if (pPresentInfo->pResults != NULL)
pPresentInfo->pResults[i] = item_result;
result = result == VK_SUCCESS ? item_result : result;
if (item_result != VK_SUCCESS) {
radv_free_sem_info(&sem_info);
continue;
}
if (swapchain->needs_linear_copy) {
int idx = (queue->queue_family_index * swapchain->image_count) + pPresentInfo->pImageIndices[i];
cs = radv_cmd_buffer_from_handle(swapchain->cmd_buffers[idx])->cs;

View File

@@ -1257,6 +1257,43 @@ static void radv_amdgpu_destroy_syncobj(struct radeon_winsys *_ws,
amdgpu_cs_destroy_syncobj(ws->dev, handle);
}
static void radv_amdgpu_reset_syncobj(struct radeon_winsys *_ws,
uint32_t handle)
{
struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
amdgpu_cs_syncobj_reset(ws->dev, &handle, 1);
}
static void radv_amdgpu_signal_syncobj(struct radeon_winsys *_ws,
uint32_t handle)
{
struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
amdgpu_cs_syncobj_signal(ws->dev, &handle, 1);
}
static bool radv_amdgpu_wait_syncobj(struct radeon_winsys *_ws,
uint32_t handle, uint64_t timeout)
{
struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
uint32_t tmp;
/* The timeouts are signed, while vulkan timeouts are unsigned. */
timeout = MIN2(timeout, INT64_MAX);
int ret = amdgpu_cs_syncobj_wait(ws->dev, &handle, 1, timeout,
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
&tmp);
if (ret == 0) {
return true;
} else if (ret == -1 && errno == ETIME) {
return false;
} else {
fprintf(stderr, "amdgpu: radv_amdgpu_wait_syncobj failed!\nerrno: %d\n", errno);
return false;
}
}
static int radv_amdgpu_export_syncobj(struct radeon_winsys *_ws,
uint32_t syncobj,
int *fd)
@@ -1275,6 +1312,25 @@ static int radv_amdgpu_import_syncobj(struct radeon_winsys *_ws,
return amdgpu_cs_import_syncobj(ws->dev, fd, syncobj);
}
static int radv_amdgpu_export_syncobj_to_sync_file(struct radeon_winsys *_ws,
uint32_t syncobj,
int *fd)
{
struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
return amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, fd);
}
static int radv_amdgpu_import_syncobj_from_sync_file(struct radeon_winsys *_ws,
uint32_t syncobj,
int fd)
{
struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
return amdgpu_cs_syncobj_import_sync_file(ws->dev, syncobj, fd);
}
void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
{
ws->base.ctx_create = radv_amdgpu_ctx_create;
@@ -1295,7 +1351,12 @@ void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
ws->base.destroy_sem = radv_amdgpu_destroy_sem;
ws->base.create_syncobj = radv_amdgpu_create_syncobj;
ws->base.destroy_syncobj = radv_amdgpu_destroy_syncobj;
ws->base.reset_syncobj = radv_amdgpu_reset_syncobj;
ws->base.signal_syncobj = radv_amdgpu_signal_syncobj;
ws->base.wait_syncobj = radv_amdgpu_wait_syncobj;
ws->base.export_syncobj = radv_amdgpu_export_syncobj;
ws->base.import_syncobj = radv_amdgpu_import_syncobj;
ws->base.export_syncobj_to_sync_file = radv_amdgpu_export_syncobj_to_sync_file;
ws->base.import_syncobj_from_sync_file = radv_amdgpu_import_syncobj_from_sync_file;
ws->base.fence_wait = radv_amdgpu_fence_wait;
}

View File

@@ -225,10 +225,12 @@ expanded_line:
glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro);
_glcpp_parser_skip_stack_change_if (parser, & @1, "elif", $2.value);
}
- | LINE_EXPANDED integer_constant NEWLINE {
+ | LINE_EXPANDED expression NEWLINE {
+ if (parser->is_gles && $2.undefined_macro)
+ glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro);
parser->has_new_line_number = 1;
- parser->new_line_number = $2;
- _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2);
+ parser->new_line_number = $2.value;
+ _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2.value);
}
| LINE_EXPANDED integer_constant integer_constant NEWLINE {
parser->has_new_line_number = 1;
@@ -239,6 +241,19 @@ expanded_line:
"#line %" PRIiMAX " %" PRIiMAX "\n",
$2, $3);
}
| LINE_EXPANDED '(' expression ')' '(' expression ')' NEWLINE {
if (parser->is_gles && $3.undefined_macro)
glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $3.undefined_macro);
if (parser->is_gles && $6.undefined_macro)
glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $6.undefined_macro);
parser->has_new_line_number = 1;
parser->new_line_number = $3.value;
parser->has_new_source_number = 1;
parser->new_source_number = $6.value;
_mesa_string_buffer_printf(parser->output,
"#line %" PRIiMAX " %" PRIiMAX "\n",
$3.value, $6.value);
}
;
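To make the new rules concrete, a hypothetical snippet that glcpp now accepts (the macro names are invented for illustration): after macro expansion, the first directive matches the 'LINE_EXPANDED expression NEWLINE' rule and emits "#line 102", and the second matches the new parenthesized two-expression rule and emits "#line 105 7".

#define BASE_LINE 100
#define SRC_ID 7
#line BASE_LINE + 2
#line (BASE_LINE + 5) (SRC_ID)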
define:

View File

@@ -637,6 +637,9 @@ private:
this->record_next_sampler))
return;
/* Avoid overflowing the sampler array. (crbug.com/141901) */
this->next_sampler = MIN2(this->next_sampler, MAX_SAMPLERS);
for (unsigned i = uniform->opaque[shader_type].index;
i < MIN2(this->next_sampler, MAX_SAMPLERS);
i++) {

View File

@@ -42,24 +42,34 @@ struct nir_spirv_specialization {
};
};
struct nir_spirv_supported_extensions {
bool float64;
bool image_ms_array;
bool tessellation;
bool draw_parameters;
bool image_read_without_format;
bool image_write_without_format;
bool int64;
bool multiview;
bool variable_pointers;
struct spirv_to_nir_options {
/* Whether or not to lower all workgroup variable access to offsets
* up-front. This means you will get _shared intrinsics instead of _var
* for workgroup data access.
*
* This is currently required for full variable pointers support.
*/
bool lower_workgroup_access_to_offsets;
struct {
bool float64;
bool image_ms_array;
bool tessellation;
bool draw_parameters;
bool image_read_without_format;
bool image_write_without_format;
bool int64;
bool multiview;
bool variable_pointers;
} caps;
};
nir_function *spirv_to_nir(const uint32_t *words, size_t word_count,
struct nir_spirv_specialization *specializations,
unsigned num_specializations,
gl_shader_stage stage, const char *entry_point_name,
const struct nir_spirv_supported_extensions *ext,
const nir_shader_compiler_options *options);
const struct spirv_to_nir_options *options,
const nir_shader_compiler_options *nir_options);
#ifdef __cplusplus
}
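For reference, a sketch of how a caller is expected to use the reshaped interface (the capability selection, shader words and compiler options below are placeholders; only the struct and the spirv_to_nir() signature come from this header):

const struct spirv_to_nir_options spirv_options = {
	.lower_workgroup_access_to_offsets = true, /* hypothetical choice */
	.caps = {
		.tessellation = true,
		.int64 = true,
	},
};
nir_function *entry =
	spirv_to_nir(words, word_count,
	             NULL /* specializations */, 0,
	             MESA_SHADER_COMPUTE, "main",
	             &spirv_options, nir_compiler_options);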

View File

@@ -117,7 +117,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
load->value = constant->values[0];
- nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
+ nir_instr_insert_before_cf_list(&b->nb.impl->body, &load->instr);
val->def = &load->def;
} else {
assert(glsl_type_is_matrix(type));
@@ -133,7 +133,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
load->value = constant->values[i];
- nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
+ nir_instr_insert_before_cf_list(&b->nb.impl->body, &load->instr);
col_val->def = &load->def;
val->elems[i] = col_val;
@@ -729,6 +729,64 @@ translate_image_format(SpvImageFormat format)
}
}
static struct vtn_type *
vtn_type_layout_std430(struct vtn_builder *b, struct vtn_type *type,
uint32_t *size_out, uint32_t *align_out)
{
switch (type->base_type) {
case vtn_base_type_scalar: {
uint32_t comp_size = glsl_get_bit_size(type->type) / 8;
*size_out = comp_size;
*align_out = comp_size;
return type;
}
case vtn_base_type_vector: {
uint32_t comp_size = glsl_get_bit_size(type->type) / 8;
assert(type->length > 0 && type->length <= 4);
unsigned align_comps = type->length == 3 ? 4 : type->length;
*size_out = comp_size * type->length,
*align_out = comp_size * align_comps;
return type;
}
case vtn_base_type_matrix:
case vtn_base_type_array: {
/* We're going to add an array stride */
type = vtn_type_copy(b, type);
uint32_t elem_size, elem_align;
type->array_element = vtn_type_layout_std430(b, type->array_element,
&elem_size, &elem_align);
type->stride = vtn_align_u32(elem_size, elem_align);
*size_out = type->stride * type->length;
*align_out = elem_align;
return type;
}
case vtn_base_type_struct: {
/* We're going to add member offsets */
type = vtn_type_copy(b, type);
uint32_t offset = 0;
uint32_t align = 0;
for (unsigned i = 0; i < type->length; i++) {
uint32_t mem_size, mem_align;
type->members[i] = vtn_type_layout_std430(b, type->members[i],
&mem_size, &mem_align);
offset = vtn_align_u32(offset, mem_align);
type->offsets[i] = offset;
offset += mem_size;
align = MAX2(align, mem_align);
}
*size_out = offset;
*align_out = align;
return type;
}
default:
unreachable("Invalid SPIR-V type for std430");
}
}
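A small worked example of the std430 rules implemented above (illustrative only; the struct is hypothetical, not from the patch):

/* For a SPIR-V struct { float a; vec3 b; }, vtn_type_layout_std430 yields:
 *   a: scalar float -> size 4,  align 4,  offset 0
 *   b: vec3         -> size 12, align 16 (3-component vectors align as 4),
 *                      offset = vtn_align_u32(4, 16) = 16
 *   struct          -> *size_out = 16 + 12 = 28, *align_out = 16
 * An array of this struct would then get stride vtn_align_u32(28, 16) = 32.
 */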
static void
vtn_handle_type(struct vtn_builder *b, SpvOp opcode,
const uint32_t *w, unsigned count)
@@ -878,6 +936,19 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode,
*/
val->type->type = glsl_vector_type(GLSL_TYPE_UINT, 2);
}
if (storage_class == SpvStorageClassWorkgroup &&
b->options->lower_workgroup_access_to_offsets) {
uint32_t size, align;
val->type->deref = vtn_type_layout_std430(b, val->type->deref,
&size, &align);
val->type->length = size;
val->type->align = align;
/* These can actually be stored to nir_variables and used as SSA
* values so they need a real glsl_type.
*/
val->type->type = glsl_uint_type();
}
break;
}
@@ -1394,8 +1465,11 @@ vtn_handle_function_call(struct vtn_builder *b, SpvOp opcode,
const uint32_t *w, unsigned count)
{
struct vtn_type *res_type = vtn_value(b, w[1], vtn_value_type_type)->type;
- struct nir_function *callee =
- vtn_value(b, w[3], vtn_value_type_function)->func->impl->function;
+ struct vtn_function *vtn_callee =
+ vtn_value(b, w[3], vtn_value_type_function)->func;
+ struct nir_function *callee = vtn_callee->impl->function;
+ vtn_callee->referenced = true;
nir_call_instr *call = nir_call_instr_create(b->nb.shader, callee);
for (unsigned i = 0; i < call->num_params; i++) {
@@ -1410,7 +1484,7 @@ vtn_handle_function_call(struct vtn_builder *b, SpvOp opcode,
/* Make a temporary to store the argument in */
nir_variable *tmp =
nir_local_variable_create(b->impl, arg_ssa->type, "arg_tmp");
nir_local_variable_create(b->nb.impl, arg_ssa->type, "arg_tmp");
call->params[i] = nir_deref_var_create(call, tmp);
vtn_local_store(b, arg_ssa, call->params[i]);
@@ -1420,7 +1494,7 @@ vtn_handle_function_call(struct vtn_builder *b, SpvOp opcode,
nir_variable *out_tmp = NULL;
assert(res_type->type == callee->return_type);
if (!glsl_type_is_void(callee->return_type)) {
- out_tmp = nir_local_variable_create(b->impl, callee->return_type,
+ out_tmp = nir_local_variable_create(b->nb.impl, callee->return_type,
"out_tmp");
call->return_deref = nir_deref_var_create(call, out_tmp);
}
@@ -2098,6 +2172,32 @@ get_ssbo_nir_atomic_op(SpvOp opcode)
static nir_intrinsic_op
get_shared_nir_atomic_op(SpvOp opcode)
{
switch (opcode) {
case SpvOpAtomicLoad: return nir_intrinsic_load_shared;
case SpvOpAtomicStore: return nir_intrinsic_store_shared;
#define OP(S, N) case SpvOp##S: return nir_intrinsic_shared_##N;
OP(AtomicExchange, atomic_exchange)
OP(AtomicCompareExchange, atomic_comp_swap)
OP(AtomicIIncrement, atomic_add)
OP(AtomicIDecrement, atomic_add)
OP(AtomicIAdd, atomic_add)
OP(AtomicISub, atomic_add)
OP(AtomicSMin, atomic_imin)
OP(AtomicUMin, atomic_umin)
OP(AtomicSMax, atomic_imax)
OP(AtomicUMax, atomic_umax)
OP(AtomicAnd, atomic_and)
OP(AtomicOr, atomic_or)
OP(AtomicXor, atomic_xor)
#undef OP
default:
unreachable("Invalid shared atomic");
}
}
static nir_intrinsic_op
get_var_nir_atomic_op(SpvOp opcode)
{
switch (opcode) {
case SpvOpAtomicLoad: return nir_intrinsic_load_var;
@@ -2161,10 +2261,11 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
SpvMemorySemanticsMask semantics = w[5];
*/
- if (ptr->mode == vtn_variable_mode_workgroup) {
+ if (ptr->mode == vtn_variable_mode_workgroup &&
+ !b->options->lower_workgroup_access_to_offsets) {
nir_deref_var *deref = vtn_pointer_to_deref(b, ptr);
const struct glsl_type *deref_type = nir_deref_tail(&deref->deref)->type;
- nir_intrinsic_op op = get_shared_nir_atomic_op(opcode);
+ nir_intrinsic_op op = get_var_nir_atomic_op(opcode);
atomic = nir_intrinsic_instr_create(b->nb.shader, op);
atomic->variables[0] = nir_deref_var_clone(deref, atomic);
@@ -2201,27 +2302,36 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
}
} else {
assert(ptr->mode == vtn_variable_mode_ssbo);
nir_ssa_def *offset, *index;
offset = vtn_pointer_to_offset(b, ptr, &index, NULL);
- nir_intrinsic_op op = get_ssbo_nir_atomic_op(opcode);
+ nir_intrinsic_op op;
+ if (ptr->mode == vtn_variable_mode_ssbo) {
+ op = get_ssbo_nir_atomic_op(opcode);
+ } else {
+ assert(ptr->mode == vtn_variable_mode_workgroup &&
+ b->options->lower_workgroup_access_to_offsets);
+ op = get_shared_nir_atomic_op(opcode);
+ }
atomic = nir_intrinsic_instr_create(b->nb.shader, op);
int src = 0;
switch (opcode) {
case SpvOpAtomicLoad:
atomic->num_components = glsl_get_vector_elements(ptr->type->type);
- atomic->src[0] = nir_src_for_ssa(index);
- atomic->src[1] = nir_src_for_ssa(offset);
+ if (ptr->mode == vtn_variable_mode_ssbo)
+ atomic->src[src++] = nir_src_for_ssa(index);
+ atomic->src[src++] = nir_src_for_ssa(offset);
break;
case SpvOpAtomicStore:
atomic->num_components = glsl_get_vector_elements(ptr->type->type);
nir_intrinsic_set_write_mask(atomic, (1 << atomic->num_components) - 1);
- atomic->src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[4])->def);
- atomic->src[1] = nir_src_for_ssa(index);
- atomic->src[2] = nir_src_for_ssa(offset);
+ atomic->src[src++] = nir_src_for_ssa(vtn_ssa_value(b, w[4])->def);
+ if (ptr->mode == vtn_variable_mode_ssbo)
+ atomic->src[src++] = nir_src_for_ssa(index);
+ atomic->src[src++] = nir_src_for_ssa(offset);
break;
case SpvOpAtomicExchange:
@@ -2238,9 +2348,10 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
case SpvOpAtomicAnd:
case SpvOpAtomicOr:
case SpvOpAtomicXor:
- atomic->src[0] = nir_src_for_ssa(index);
- atomic->src[1] = nir_src_for_ssa(offset);
- fill_common_atomic_sources(b, opcode, w, &atomic->src[2]);
+ if (ptr->mode == vtn_variable_mode_ssbo)
+ atomic->src[src++] = nir_src_for_ssa(index);
+ atomic->src[src++] = nir_src_for_ssa(offset);
+ fill_common_atomic_sources(b, opcode, w, &atomic->src[src]);
break;
default:
@@ -2672,7 +2783,7 @@ stage_for_execution_model(SpvExecutionModel model)
}
#define spv_check_supported(name, cap) do { \
- if (!(b->ext && b->ext->name)) \
+ if (!(b->options && b->options->caps.name)) \
vtn_warn("Unsupported SPIR-V capability: %s", \
spirv_capability_to_string(cap)); \
} while(0)
@@ -3313,8 +3424,8 @@ nir_function *
spirv_to_nir(const uint32_t *words, size_t word_count,
struct nir_spirv_specialization *spec, unsigned num_spec,
gl_shader_stage stage, const char *entry_point_name,
- const struct nir_spirv_supported_extensions *ext,
- const nir_shader_compiler_options *options)
+ const struct spirv_to_nir_options *options,
+ const nir_shader_compiler_options *nir_options)
{
const uint32_t *word_end = words + word_count;
@@ -3336,7 +3447,7 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
exec_list_make_empty(&b->functions);
b->entry_point_stage = stage;
b->entry_point_name = entry_point_name;
- b->ext = ext;
+ b->options = options;
/* Handle all the preamble instructions */
words = vtn_foreach_instruction(b, words, word_end,
@@ -3348,7 +3459,7 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
return NULL;
}
- b->shader = nir_shader_create(NULL, stage, options, NULL);
+ b->shader = nir_shader_create(NULL, stage, nir_options, NULL);
/* Set shader info defaults */
b->shader->info.gs.invocations = 1;
@@ -3366,13 +3477,22 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
vtn_build_cfg(b, words, word_end);
foreach_list_typed(struct vtn_function, func, node, &b->functions) {
b->impl = func->impl;
b->const_table = _mesa_hash_table_create(b, _mesa_hash_pointer,
_mesa_key_pointer_equal);
assert(b->entry_point->value_type == vtn_value_type_function);
b->entry_point->func->referenced = true;
vtn_function_emit(b, func, vtn_handle_body_instruction);
}
bool progress;
do {
progress = false;
foreach_list_typed(struct vtn_function, func, node, &b->functions) {
if (func->referenced && !func->emitted) {
b->const_table = _mesa_hash_table_create(b, _mesa_hash_pointer,
_mesa_key_pointer_equal);
vtn_function_emit(b, func, vtn_handle_body_instruction);
progress = true;
}
}
} while (progress);
assert(b->entry_point->value_type == vtn_value_type_function);
nir_function *entry_point = b->entry_point->func->impl->function;

View File

@@ -606,7 +606,7 @@ vtn_emit_cf_list(struct vtn_builder *b, struct list_head *cf_list,
if ((*block->branch & SpvOpCodeMask) == SpvOpReturnValue) {
struct vtn_ssa_value *src = vtn_ssa_value(b, block->branch[1]);
vtn_local_store(b, src,
- nir_deref_var_create(b, b->impl->return_var));
+ nir_deref_var_create(b, b->nb.impl->return_var));
}
if (block->branch_type != vtn_branch_type_none) {
@@ -783,4 +783,6 @@ vtn_function_emit(struct vtn_builder *b, struct vtn_function *func,
*/
if (b->has_loop_continue)
nir_repair_ssa_impl(func->impl);
func->emitted = true;
}

View File

@@ -159,6 +159,9 @@ struct vtn_block {
struct vtn_function {
struct exec_node node;
bool referenced;
bool emitted;
nir_function_impl *impl;
struct vtn_block *start_block;
@@ -217,7 +220,10 @@ struct vtn_type {
/* The value that declares this type. Used for finding decorations */
struct vtn_value *val;
/* Specifies the length of complex types. */
/* Specifies the length of complex types.
*
* For Workgroup pointers, this is the size of the referenced type.
*/
unsigned length;
/* for arrays, matrices and pointers, the array stride */
@@ -268,6 +274,9 @@ struct vtn_type {
/* Storage class for pointers */
SpvStorageClass storage_class;
/* Required alignment for pointers */
uint32_t align;
};
/* Members for image types */
@@ -369,13 +378,6 @@ struct vtn_pointer {
struct nir_ssa_def *offset;
};
static inline bool
vtn_pointer_uses_ssa_offset(struct vtn_pointer *ptr)
{
return ptr->mode == vtn_variable_mode_ubo ||
ptr->mode == vtn_variable_mode_ssbo;
}
struct vtn_variable {
enum vtn_variable_mode mode;
@@ -389,6 +391,8 @@ struct vtn_variable {
nir_variable *var;
nir_variable **members;
int shared_location;
/**
* In some early released versions of GLSLang, it implemented all function
* calls by making copies of all parameters into temporary variables and
@@ -464,8 +468,7 @@ struct vtn_builder {
nir_builder nb;
nir_shader *shader;
nir_function_impl *impl;
const struct nir_spirv_supported_extensions *ext;
const struct spirv_to_nir_options *options;
struct vtn_block *block;
/* Current file, line, and column. Useful for debugging. Set
@@ -631,6 +634,13 @@ void vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
bool vtn_handle_glsl450_instruction(struct vtn_builder *b, uint32_t ext_opcode,
const uint32_t *words, unsigned count);
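/* Rounds v up to the next multiple of a; the assert below requires a to be
 * a non-zero power of two. For example, vtn_align_u32(13, 8) == 16. */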
static inline uint32_t
vtn_align_u32(uint32_t v, uint32_t a)
{
assert(a != 0 && a == (a & -a));
return (v + a - 1) & ~(a - 1);
}
static inline uint64_t
vtn_u64_literal(const uint32_t *w)
{

View File

@@ -57,6 +57,27 @@ vtn_access_chain_extend(struct vtn_builder *b, struct vtn_access_chain *old,
return chain;
}
static bool
vtn_pointer_uses_ssa_offset(struct vtn_builder *b,
struct vtn_pointer *ptr)
{
return ptr->mode == vtn_variable_mode_ubo ||
ptr->mode == vtn_variable_mode_ssbo ||
(ptr->mode == vtn_variable_mode_workgroup &&
b->options->lower_workgroup_access_to_offsets);
}
static bool
vtn_pointer_is_external_block(struct vtn_builder *b,
struct vtn_pointer *ptr)
{
return ptr->mode == vtn_variable_mode_ssbo ||
ptr->mode == vtn_variable_mode_ubo ||
ptr->mode == vtn_variable_mode_push_constant ||
(ptr->mode == vtn_variable_mode_workgroup &&
b->options->lower_workgroup_access_to_offsets);
}
/* Dereference the given base pointer by the access chain */
static struct vtn_pointer *
vtn_access_chain_pointer_dereference(struct vtn_builder *b,
@@ -150,7 +171,8 @@ vtn_ssa_offset_pointer_dereference(struct vtn_builder *b,
/* We need ptr_type for the stride */
assert(base->ptr_type);
/* This must be a pointer to an actual element somewhere */
- assert(block_index && offset);
+ assert(offset);
+ assert(block_index || base->mode == vtn_variable_mode_workgroup);
/* We need at least one element in the chain */
assert(deref_chain->length >= 1);
@@ -161,24 +183,49 @@ vtn_ssa_offset_pointer_dereference(struct vtn_builder *b,
idx++;
}
if (!block_index) {
if (!offset) {
/* This is the first access chain so we don't have a block index */
assert(!block_index);
assert(base->var);
if (glsl_type_is_array(type->type)) {
/* We need at least one element in the chain */
assert(deref_chain->length >= 1);
assert(base->ptr_type);
switch (base->mode) {
case vtn_variable_mode_ubo:
case vtn_variable_mode_ssbo:
if (glsl_type_is_array(type->type)) {
/* We need at least one element in the chain */
assert(deref_chain->length >= 1);
nir_ssa_def *desc_arr_idx =
vtn_access_link_as_ssa(b, deref_chain->link[0], 1);
block_index = vtn_variable_resource_index(b, base->var, desc_arr_idx);
type = type->array_element;
idx++;
} else {
block_index = vtn_variable_resource_index(b, base->var, NULL);
nir_ssa_def *desc_arr_idx =
vtn_access_link_as_ssa(b, deref_chain->link[0], 1);
block_index = vtn_variable_resource_index(b, base->var, desc_arr_idx);
type = type->array_element;
idx++;
} else {
block_index = vtn_variable_resource_index(b, base->var, NULL);
}
offset = nir_imm_int(&b->nb, 0);
break;
case vtn_variable_mode_workgroup:
/* Assign location on first use so that we don't end up bloating SLM
* address space for variables which are never statically used.
*/
if (base->var->shared_location < 0) {
assert(base->ptr_type->length > 0 && base->ptr_type->align > 0);
b->shader->num_shared = vtn_align_u32(b->shader->num_shared,
base->ptr_type->align);
base->var->shared_location = b->shader->num_shared;
b->shader->num_shared += base->ptr_type->length;
}
block_index = NULL;
offset = nir_imm_int(&b->nb, base->var->shared_location);
break;
default:
unreachable("Invalid offset pointer mode");
}
/* This is the first access chain so we also need an offset */
assert(!offset);
offset = nir_imm_int(&b->nb, 0);
}
assert(offset);
@@ -228,7 +275,7 @@ vtn_pointer_dereference(struct vtn_builder *b,
struct vtn_pointer *base,
struct vtn_access_chain *deref_chain)
{
- if (vtn_pointer_uses_ssa_offset(base)) {
+ if (vtn_pointer_uses_ssa_offset(b, base)) {
return vtn_ssa_offset_pointer_dereference(b, base, deref_chain);
} else {
return vtn_access_chain_pointer_dereference(b, base, deref_chain);
@@ -478,45 +525,27 @@ vtn_local_store(struct vtn_builder *b, struct vtn_ssa_value *src,
}
}
- static nir_ssa_def *
- get_vulkan_resource_index(struct vtn_builder *b, struct vtn_pointer *ptr,
- struct vtn_type **type, unsigned *chain_idx)
- {
- /* Push constants have no explicit binding */
- if (ptr->mode == vtn_variable_mode_push_constant) {
- *chain_idx = 0;
- *type = ptr->var->type;
- return NULL;
- }
- if (glsl_type_is_array(ptr->var->type->type)) {
- assert(ptr->chain->length > 0);
- nir_ssa_def *desc_array_index =
- vtn_access_link_as_ssa(b, ptr->chain->link[0], 1);
- *chain_idx = 1;
- *type = ptr->var->type->array_element;
- return vtn_variable_resource_index(b, ptr->var, desc_array_index);
- } else {
- *chain_idx = 0;
- *type = ptr->var->type;
- return vtn_variable_resource_index(b, ptr->var, NULL);
- }
- }
nir_ssa_def *
vtn_pointer_to_offset(struct vtn_builder *b, struct vtn_pointer *ptr,
nir_ssa_def **index_out, unsigned *end_idx_out)
{
- if (ptr->offset) {
- assert(ptr->block_index);
+ if (vtn_pointer_uses_ssa_offset(b, ptr)) {
+ if (!ptr->offset) {
+ assert(ptr->mode == vtn_variable_mode_workgroup);
+ struct vtn_access_chain chain = {
+ .length = 0,
+ };
+ ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
+ }
*index_out = ptr->block_index;
return ptr->offset;
}
- unsigned idx = 0;
- struct vtn_type *type;
- *index_out = get_vulkan_resource_index(b, ptr, &type, &idx);
+ assert(ptr->mode == vtn_variable_mode_push_constant);
+ *index_out = NULL;
+ unsigned idx = 0;
+ struct vtn_type *type = ptr->var->type;
nir_ssa_def *offset = nir_imm_int(&b->nb, 0);
for (; idx < ptr->chain->length; idx++) {
enum glsl_base_type base_type = glsl_get_base_type(type->type);
@@ -829,6 +858,9 @@ vtn_block_load(struct vtn_builder *b, struct vtn_pointer *src)
vtn_access_chain_get_offset_size(src->chain, src->var->type,
&access_offset, &access_size);
break;
case vtn_variable_mode_workgroup:
op = nir_intrinsic_load_shared;
break;
default:
unreachable("Invalid block variable mode");
}
@@ -848,22 +880,26 @@ static void
vtn_block_store(struct vtn_builder *b, struct vtn_ssa_value *src,
struct vtn_pointer *dst)
{
nir_intrinsic_op op;
switch (dst->mode) {
case vtn_variable_mode_ssbo:
op = nir_intrinsic_store_ssbo;
break;
case vtn_variable_mode_workgroup:
op = nir_intrinsic_store_shared;
break;
default:
unreachable("Invalid block variable mode");
}
nir_ssa_def *offset, *index = NULL;
unsigned chain_idx;
offset = vtn_pointer_to_offset(b, dst, &index, &chain_idx);
- _vtn_block_load_store(b, nir_intrinsic_store_ssbo, false, index, offset,
+ _vtn_block_load_store(b, op, false, index, offset,
0, 0, dst->chain, chain_idx, dst->type, &src);
}
- static bool
- vtn_pointer_is_external_block(struct vtn_pointer *ptr)
- {
- return ptr->mode == vtn_variable_mode_ssbo ||
- ptr->mode == vtn_variable_mode_ubo ||
- ptr->mode == vtn_variable_mode_push_constant;
- }
static void
_vtn_variable_load_store(struct vtn_builder *b, bool load,
struct vtn_pointer *ptr,
@@ -923,7 +959,7 @@ _vtn_variable_load_store(struct vtn_builder *b, bool load,
struct vtn_ssa_value *
vtn_variable_load(struct vtn_builder *b, struct vtn_pointer *src)
{
- if (vtn_pointer_is_external_block(src)) {
+ if (vtn_pointer_is_external_block(b, src)) {
return vtn_block_load(b, src);
} else {
struct vtn_ssa_value *val = NULL;
@@ -936,8 +972,9 @@ void
vtn_variable_store(struct vtn_builder *b, struct vtn_ssa_value *src,
struct vtn_pointer *dest)
{
- if (vtn_pointer_is_external_block(dest)) {
- assert(dest->mode == vtn_variable_mode_ssbo);
+ if (vtn_pointer_is_external_block(b, dest)) {
+ assert(dest->mode == vtn_variable_mode_ssbo ||
+ dest->mode == vtn_variable_mode_workgroup);
vtn_block_store(b, src, dest);
} else {
_vtn_variable_load_store(b, false, dest, &src);
@@ -1492,11 +1529,9 @@ vtn_pointer_to_ssa(struct vtn_builder *b, struct vtn_pointer *ptr)
assert(ptr->ptr_type);
assert(ptr->ptr_type->type);
if (ptr->offset && ptr->block_index) {
return nir_vec2(&b->nb, ptr->block_index, ptr->offset);
} else {
/* If we don't have an offset or block index, then we must be a pointer
* to the variable itself.
if (!ptr->offset) {
/* If we don't have an offset then we must be a pointer to the variable
* itself.
*/
assert(!ptr->offset && !ptr->block_index);
@@ -1506,8 +1541,20 @@ vtn_pointer_to_ssa(struct vtn_builder *b, struct vtn_pointer *ptr)
*/
assert(ptr->var && ptr->var->type->base_type == vtn_base_type_struct);
return nir_vec2(&b->nb, vtn_variable_resource_index(b, ptr->var, NULL),
nir_imm_int(&b->nb, 0));
struct vtn_access_chain chain = {
.length = 0,
};
ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
}
assert(ptr->offset);
if (ptr->block_index) {
assert(ptr->mode == vtn_variable_mode_ubo ||
ptr->mode == vtn_variable_mode_ssbo);
return nir_vec2(&b->nb, ptr->block_index, ptr->offset);
} else {
assert(ptr->mode == vtn_variable_mode_workgroup);
return ptr->offset;
}
}
@@ -1515,7 +1562,7 @@ struct vtn_pointer *
vtn_pointer_from_ssa(struct vtn_builder *b, nir_ssa_def *ssa,
struct vtn_type *ptr_type)
{
- assert(ssa->num_components == 2 && ssa->bit_size == 32);
+ assert(ssa->num_components <= 2 && ssa->bit_size == 32);
assert(ptr_type->base_type == vtn_base_type_pointer);
assert(ptr_type->deref->base_type != vtn_base_type_pointer);
/* This pointer type needs to have actual storage */
@@ -1526,8 +1573,19 @@ vtn_pointer_from_ssa(struct vtn_builder *b, nir_ssa_def *ssa,
ptr_type, NULL);
ptr->type = ptr_type->deref;
ptr->ptr_type = ptr_type;
- ptr->block_index = nir_channel(&b->nb, ssa, 0);
- ptr->offset = nir_channel(&b->nb, ssa, 1);
+ if (ssa->num_components > 1) {
+ assert(ssa->num_components == 2);
+ assert(ptr->mode == vtn_variable_mode_ubo ||
+ ptr->mode == vtn_variable_mode_ssbo);
+ ptr->block_index = nir_channel(&b->nb, ssa, 0);
+ ptr->offset = nir_channel(&b->nb, ssa, 1);
+ } else {
+ assert(ssa->num_components == 1);
+ assert(ptr->mode == vtn_variable_mode_workgroup);
+ ptr->block_index = NULL;
+ ptr->offset = ssa;
+ }
return ptr;
}
@@ -1599,7 +1657,6 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val,
case vtn_variable_mode_global:
case vtn_variable_mode_image:
case vtn_variable_mode_sampler:
- case vtn_variable_mode_workgroup:
/* For these, we create the variable normally */
var->var = rzalloc(b->shader, nir_variable);
var->var->name = ralloc_strdup(var->var, val->name);
@@ -1617,6 +1674,18 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val,
}
break;
case vtn_variable_mode_workgroup:
if (b->options->lower_workgroup_access_to_offsets) {
var->shared_location = -1;
} else {
/* Create the variable normally */
var->var = rzalloc(b->shader, nir_variable);
var->var->name = ralloc_strdup(var->var, val->name);
var->var->type = var->type->type;
var->var->data.mode = nir_var_shared;
}
break;
case vtn_variable_mode_input:
case vtn_variable_mode_output: {
/* In order to know whether or not we're a per-vertex inout, we need
@@ -1731,7 +1800,7 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val,
if (var->mode == vtn_variable_mode_local) {
assert(var->members == NULL && var->var != NULL);
- nir_function_impl_add_variable(b->impl, var->var);
+ nir_function_impl_add_variable(b->nb.impl, var->var);
} else if (var->var) {
nir_shader_add_variable(b->shader, var->var);
} else if (var->members) {
@@ -1741,9 +1810,7 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val,
nir_shader_add_variable(b->shader, var->members[i]);
}
} else {
- assert(var->mode == vtn_variable_mode_ubo ||
- var->mode == vtn_variable_mode_ssbo ||
- var->mode == vtn_variable_mode_push_constant);
+ assert(vtn_pointer_is_external_block(b, val->pointer));
}
}
@@ -1868,15 +1935,19 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
const uint32_t offset = ptr->var->type->offsets[w[4]];
const uint32_t stride = ptr->var->type->members[w[4]]->stride;
- unsigned chain_idx;
- struct vtn_type *type;
- nir_ssa_def *index =
- get_vulkan_resource_index(b, ptr, &type, &chain_idx);
+ if (!ptr->block_index) {
+ assert(ptr->mode == vtn_variable_mode_workgroup);
+ struct vtn_access_chain chain = {
+ .length = 0,
+ };
+ ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
+ assert(ptr->block_index);
+ }
nir_intrinsic_instr *instr =
nir_intrinsic_instr_create(b->nb.shader,
nir_intrinsic_get_buffer_size);
- instr->src[0] = nir_src_for_ssa(index);
+ instr->src[0] = nir_src_for_ssa(ptr->block_index);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
nir_builder_instr_insert(&b->nb, &instr->instr);
nir_ssa_def *buf_size = &instr->dest.ssa;

View File

@@ -41,6 +41,7 @@ LOCAL_SRC_FILES := \
LOCAL_CFLAGS := \
-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \
-D_EGL_BUILT_IN_DRIVER_DRI2 \
-DHAS_GRALLOC_DRM_HEADERS \
-DHAVE_ANDROID_PLATFORM
LOCAL_C_INCLUDES := \

View File

@@ -105,7 +105,9 @@ endif
if HAVE_PLATFORM_ANDROID
AM_CFLAGS += $(ANDROID_CFLAGS)
libEGL_common_la_LIBADD += $(ANDROID_LIBS)
- dri2_backend_FILES += drivers/dri2/platform_android.c
+ dri2_backend_FILES += \
+ drivers/dri2/platform_android.c \
+ drivers/dri2/egl_dri2_drm_gralloc.h
endif
AM_CFLAGS += \

View File

@@ -299,7 +299,10 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
_eglSetConfigKey(&base, EGL_MAX_PBUFFER_HEIGHT,
_EGL_MAX_PBUFFER_HEIGHT);
break;
case __DRI_ATTRIB_MUTABLE_RENDER_BUFFER:
if (disp->Extensions.KHR_mutable_render_buffer)
surface_type |= EGL_MUTABLE_RENDER_BUFFER_BIT_KHR;
break;
default:
key = dri2_to_egl_attribute_map[attrib];
if (key != 0)
@@ -457,6 +460,7 @@ static const struct dri2_extension_match optional_core_extensions[] = {
{ __DRI2_RENDERER_QUERY, 1, offsetof(struct dri2_egl_display, rendererQuery) },
{ __DRI2_INTEROP, 1, offsetof(struct dri2_egl_display, interop) },
{ __DRI_IMAGE, 1, offsetof(struct dri2_egl_display, image) },
{ __DRI_MUTABLE_RENDER_BUFFER_DRIVER, 1, offsetof(struct dri2_egl_display, mutable_render_buffer) },
{ NULL, 0, 0 }
};
@@ -904,10 +908,6 @@ dri2_initialize(_EGLDriver *drv, _EGLDisplay *disp)
return EGL_TRUE;
}
- /* not until swrast_dri is supported */
- if (disp->Options.UseFallback)
- return EGL_FALSE;
switch (disp->Platform) {
#ifdef HAVE_SURFACELESS_PLATFORM
case _EGL_PLATFORM_SURFACELESS:
@@ -1329,12 +1329,6 @@ dri2_create_context(_EGLDriver *drv, _EGLDisplay *disp, _EGLConfig *conf,
dri_config = dri2_config->dri_config[1][0];
else
dri_config = dri2_config->dri_config[0][0];
- /* EGL_WINDOW_BIT is set only when there is a double-buffered dri_config.
- * This makes sure the back buffer will always be used.
- */
- if (conf->SurfaceType & EGL_WINDOW_BIT)
- dri2_ctx->base.WindowRenderBuffer = EGL_BACK_BUFFER;
}
else
dri_config = NULL;
@@ -1525,6 +1519,8 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
{
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
_EGLDisplay *old_disp = NULL;
struct dri2_egl_display *old_dri2_dpy = NULL;
_EGLContext *old_ctx;
_EGLSurface *old_dsurf, *old_rsurf;
_EGLSurface *tmp_dsurf, *tmp_rsurf;
@@ -1541,6 +1537,11 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
return EGL_FALSE;
}
if (old_ctx) {
old_disp = old_ctx->Resource.Display;
old_dri2_dpy = dri2_egl_display(old_disp);
}
/* flush before context switch */
if (old_ctx)
dri2_gl_flush();
@@ -1554,31 +1555,30 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
if (old_dsurf)
dri2_surf_update_fence_fd(old_ctx, disp, old_dsurf);
/* Disable shared buffer mode */
if (old_dsurf && _eglSurfaceInSharedBufferMode(old_dsurf) &&
old_dri2_dpy->vtbl->set_shared_buffer_mode) {
old_dri2_dpy->vtbl->set_shared_buffer_mode(old_disp, old_dsurf, false);
}
dri2_dpy->core->unbindContext(old_cctx);
}
unbind = (cctx == NULL && ddraw == NULL && rdraw == NULL);
if (unbind || dri2_dpy->core->bindContext(cctx, ddraw, rdraw)) {
dri2_destroy_surface(drv, disp, old_dsurf);
dri2_destroy_surface(drv, disp, old_rsurf);
if (!unbind)
dri2_dpy->ref_count++;
if (old_ctx) {
EGLDisplay old_disp = _eglGetDisplayHandle(old_ctx->Resource.Display);
dri2_destroy_context(drv, disp, old_ctx);
dri2_display_release(old_disp);
}
return EGL_TRUE;
} else {
if (!unbind && !dri2_dpy->core->bindContext(cctx, ddraw, rdraw)) {
/* undo the previous _eglBindContext */
_eglBindContext(old_ctx, old_dsurf, old_rsurf, &ctx, &tmp_dsurf, &tmp_rsurf);
assert(&dri2_ctx->base == ctx &&
tmp_dsurf == dsurf &&
tmp_rsurf == rsurf);
if (old_dsurf && _eglSurfaceInSharedBufferMode(old_dsurf) &&
old_dri2_dpy->vtbl->set_shared_buffer_mode) {
old_dri2_dpy->vtbl->set_shared_buffer_mode(old_disp, old_dsurf, true);
}
_eglPutSurface(dsurf);
_eglPutSurface(rsurf);
_eglPutContext(ctx);
@@ -1593,6 +1593,31 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
*/
return _eglError(EGL_BAD_MATCH, "eglMakeCurrent");
}
dri2_destroy_surface(drv, disp, old_dsurf);
dri2_destroy_surface(drv, disp, old_rsurf);
if (!unbind)
dri2_dpy->ref_count++;
if (old_ctx) {
dri2_destroy_context(drv, disp, old_ctx);
dri2_display_release(old_disp);
}
if (dsurf && _eglSurfaceHasMutableRenderBuffer(dsurf) &&
dri2_dpy->vtbl->set_shared_buffer_mode) {
/* Always update the shared buffer mode. This is obviously needed when
* the active EGL_RENDER_BUFFER is EGL_SINGLE_BUFFER. When
* EGL_RENDER_BUFFER is EGL_BACK_BUFFER, the update protects us in the
* case where an external non-EGL API may have changed the window's shared
* buffer mode since we last saw it.
*/
bool mode = (dsurf->ActiveRenderBuffer == EGL_SINGLE_BUFFER);
dri2_dpy->vtbl->set_shared_buffer_mode(disp, dsurf, mode);
}
return EGL_TRUE;
}
__DRIdrawable *

View File

@@ -61,7 +61,7 @@ struct zwp_linux_dmabuf_v1;
#include <system/window.h>
#include <hardware/gralloc.h>
- #include <gralloc_drm_handle.h>
+ #include "platform_android_gralloc_drm.h"
#endif /* HAVE_ANDROID_PLATFORM */
@@ -147,6 +147,12 @@ struct dri2_egl_display_vtbl {
__DRIdrawable *(*get_dri_drawable)(_EGLSurface *surf);
void (*close_screen_notify)(_EGLDisplay *dpy);
/* Used in EGL_KHR_mutable_render_buffer to update the native window's
* shared buffer mode.
*/
bool (*set_shared_buffer_mode)(_EGLDisplay *dpy, _EGLSurface *surf,
bool mode);
};
struct dri2_egl_display
@@ -172,6 +178,7 @@ struct dri2_egl_display
const __DRI2fenceExtension *fence;
const __DRI2rendererQueryExtension *rendererQuery;
const __DRI2interopExtension *interop;
const __DRImutableRenderBufferDriverExtension *mutable_render_buffer;
int fd;
/* dri2_initialize/dri2_terminate increment/decrement this count, so does

View File

@@ -37,7 +37,7 @@
#include "loader.h"
#include "egl_dri2.h"
#include "egl_dri2_fallbacks.h"
#include "gralloc_drm.h"
#include "platform_android_gralloc_drm.h"
#define ALIGN(val, align) (((val) + (align) - 1) & ~((align) - 1))
@@ -59,6 +59,10 @@ static const struct droid_yuv_format droid_yuv_formats[] = {
{ HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 1, __DRI_IMAGE_FOURCC_YUV420 },
{ HAL_PIXEL_FORMAT_YCbCr_420_888, 1, 1, __DRI_IMAGE_FOURCC_YVU420 },
{ HAL_PIXEL_FORMAT_YV12, 1, 1, __DRI_IMAGE_FOURCC_YVU420 },
/* HACK: See droid_create_image_from_prime_fd() and b/32077885. */
{ HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 0, 2, __DRI_IMAGE_FOURCC_NV12 },
{ HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 0, 1, __DRI_IMAGE_FOURCC_YUV420 },
{ HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 1, 1, __DRI_IMAGE_FOURCC_YVU420 },
};
static int
@@ -90,6 +94,11 @@ get_format_bpp(int native)
switch (native) {
case HAL_PIXEL_FORMAT_RGBA_8888:
case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED:
/*
* HACK: Hardcode this to RGBX_8888 as per cros_gralloc hack.
* TODO: Remove this once b/32077885 is fixed.
*/
case HAL_PIXEL_FORMAT_RGBX_8888:
case HAL_PIXEL_FORMAT_BGRA_8888:
bpp = 4;
@@ -112,6 +121,11 @@ static int get_fourcc(int native)
case HAL_PIXEL_FORMAT_RGB_565: return __DRI_IMAGE_FOURCC_RGB565;
case HAL_PIXEL_FORMAT_BGRA_8888: return __DRI_IMAGE_FOURCC_ARGB8888;
case HAL_PIXEL_FORMAT_RGBA_8888: return __DRI_IMAGE_FOURCC_ABGR8888;
case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED:
/*
* HACK: Hardcode this to RGBX_8888 as per cros_gralloc hack.
* TODO: Remove this once b/32077885 is fixed.
*/
case HAL_PIXEL_FORMAT_RGBX_8888: return __DRI_IMAGE_FOURCC_XBGR8888;
default:
_eglLog(_EGL_WARNING, "unsupported native buffer format 0x%x", native);
@@ -125,6 +139,11 @@ static int get_format(int format)
case HAL_PIXEL_FORMAT_BGRA_8888: return __DRI_IMAGE_FORMAT_ARGB8888;
case HAL_PIXEL_FORMAT_RGB_565: return __DRI_IMAGE_FORMAT_RGB565;
case HAL_PIXEL_FORMAT_RGBA_8888: return __DRI_IMAGE_FORMAT_ABGR8888;
case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED:
/*
* HACK: Hardcode this to RGBX_8888 as per cros_gralloc hack.
* TODO: Revert this once b/32077885 is fixed.
*/
case HAL_PIXEL_FORMAT_RGBX_8888: return __DRI_IMAGE_FORMAT_XBGR8888;
default:
_eglLog(_EGL_WARNING, "unsupported native buffer format 0x%x", format);
@@ -273,6 +292,32 @@ droid_window_cancel_buffer(struct dri2_egl_surface *dri2_surf)
}
}
static bool
droid_set_shared_buffer_mode(_EGLDisplay *disp, _EGLSurface *surf, bool mode)
{
#if __ANDROID_API__ >= 24
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
struct ANativeWindow *window = dri2_surf->window;
assert(surf->Type == EGL_WINDOW_BIT);
assert(_eglSurfaceHasMutableRenderBuffer(&dri2_surf->base));
_eglLog(_EGL_DEBUG, "%s: mode=%d", __func__, mode);
if (native_window_set_shared_buffer_mode(window, mode)) {
_eglLog(_EGL_WARNING, "failed native_window_set_shared_buffer_mode"
"(window=%p, mode=%d)", window, mode);
return false;
}
return true;
#else
_eglLog(_EGL_FATAL, "%s:%d: internal error: unreachable", __FILE__, __LINE__);
return false;
#endif
}
static _EGLSurface *
droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
_EGLConfig *conf, void *native_window,
@@ -547,6 +592,21 @@ droid_image_get_buffers(__DRIdrawable *driDrawable,
if (update_buffers(dri2_surf) < 0)
return 0;
if (_eglSurfaceInSharedBufferMode(&dri2_surf->base)) {
if (get_back_bo(dri2_surf) < 0)
return 0;
/* We have dri_image_back because this is a window surface and
* get_back_bo() succeeded.
*/
assert(dri2_surf->dri_image_back);
images->back = dri2_surf->dri_image_back;
images->image_mask |= __DRI_IMAGE_BUFFER_SHARED;
/* There exists no accompanying back nor front buffer. */
return 1;
}
if (buffer_mask & __DRI_IMAGE_BUFFER_FRONT) {
if (get_front_bo(dri2_surf, format) < 0)
return 0;
@@ -593,6 +653,21 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
if (dri2_surf->base.Type != EGL_WINDOW_BIT)
return EGL_TRUE;
const bool has_mutable_rb = _eglSurfaceHasMutableRenderBuffer(draw);
/* From the EGL_KHR_mutable_render_buffer spec (v12):
*
* If surface is a single-buffered window, pixmap, or pbuffer surface
* for which there is no pending change to the EGL_RENDER_BUFFER
* attribute, eglSwapBuffers has no effect.
*/
if (has_mutable_rb &&
draw->RequestedRenderBuffer == EGL_SINGLE_BUFFER &&
draw->ActiveRenderBuffer == EGL_SINGLE_BUFFER) {
_eglLog(_EGL_DEBUG, "%s: remain in shared buffer mode", __func__);
return EGL_TRUE;
}
for (int i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
if (dri2_surf->color_buffers[i].age > 0)
dri2_surf->color_buffers[i].age++;
@@ -617,6 +692,18 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
/* Update the shared buffer mode */
if (has_mutable_rb &&
draw->ActiveRenderBuffer != draw->RequestedRenderBuffer) {
bool mode = (draw->RequestedRenderBuffer == EGL_SINGLE_BUFFER);
_eglLog(_EGL_DEBUG, "%s: change to shared buffer mode %d",
__func__, mode);
if (!droid_set_shared_buffer_mode(disp, draw, mode))
return EGL_FALSE;
draw->ActiveRenderBuffer = draw->RequestedRenderBuffer;
}
return EGL_TRUE;
}
@@ -678,6 +765,10 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx,
ret = dri2_dpy->gralloc->lock_ycbcr(dri2_dpy->gralloc, buf->handle,
0, 0, 0, 0, 0, &ycbcr);
if (ret) {
/* HACK: See droid_create_image_from_prime_fd() and b/32077885. */
if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED)
return NULL;
_eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret);
return NULL;
}
@@ -757,8 +848,20 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx,
{
unsigned int pitch;
- if (is_yuv(buf->format))
- return droid_create_image_from_prime_fd_yuv(disp, ctx, buf, fd);
+ if (is_yuv(buf->format)) {
+ _EGLImage *image;
+ image = droid_create_image_from_prime_fd_yuv(disp, ctx, buf, fd);
+ /*
+ * HACK: b/32077885
+ * There is no API available to properly query the IMPLEMENTATION_DEFINED
+ * format. As a workaround we rely here on gralloc allocating either
+ * an arbitrary YCbCr 4:2:0 or RGBX_8888, with the latter being recognized
+ * by lock_ycbcr failing.
+ */
+ if (image || buf->format != HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED)
+ return image;
+ }
const int fourcc = get_fourcc(buf->format);
if (fourcc == -1) {
@@ -1005,7 +1108,6 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
{ HAL_PIXEL_FORMAT_RGBA_8888, { 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000 } },
{ HAL_PIXEL_FORMAT_RGBX_8888, { 0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000 } },
{ HAL_PIXEL_FORMAT_RGB_565, { 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000 } },
- { HAL_PIXEL_FORMAT_BGRA_8888, { 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 } },
};
unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
@@ -1073,7 +1175,7 @@ droid_open_device(struct dri2_egl_display *dri2_dpy)
GRALLOC_MODULE_PERFORM_GET_DRM_FD,
&fd);
if (err || fd < 0) {
_eglLog(_EGL_WARNING, "fail to get drm fd");
_eglLog(_EGL_DEBUG, "fail to get drm fd");
fd = -1;
}
@@ -1102,6 +1204,7 @@ static const struct dri2_egl_display_vtbl droid_display_vtbl = {
.create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
.get_sync_values = dri2_fallback_get_sync_values,
.get_dri_drawable = dri2_surface_get_dri_drawable,
.set_shared_buffer_mode = droid_set_shared_buffer_mode,
};
static const __DRIdri2LoaderExtension droid_dri2_loader_extension = {
@@ -1121,10 +1224,89 @@ static const __DRIimageLoaderExtension droid_image_loader_extension = {
.getCapability = droid_get_capability,
};
static void
droid_display_shared_buffer(__DRIdrawable *driDrawable, int fence_fd,
void *loaderPrivate)
{
struct dri2_egl_surface *dri2_surf = loaderPrivate;
struct ANativeWindowBuffer *old_buffer UNUSED = dri2_surf->buffer;
if (!_eglSurfaceInSharedBufferMode(&dri2_surf->base)) {
_eglLog(_EGL_WARNING, "%s: internal error: buffer is not shared",
__func__);
return;
}
if (fence_fd >= 0) {
/* The driver's fence is more recent than the surface's out fence, if it
* exists at all. So use the driver's fence.
*/
if (dri2_surf->out_fence_fd >= 0) {
close(dri2_surf->out_fence_fd);
dri2_surf->out_fence_fd = -1;
}
} else if (dri2_surf->out_fence_fd >= 0) {
fence_fd = dri2_surf->out_fence_fd;
dri2_surf->out_fence_fd = -1;
}
if (dri2_surf->window->queueBuffer(dri2_surf->window, dri2_surf->buffer,
fence_fd)) {
_eglLog(_EGL_WARNING, "%s: ANativeWindow::queueBuffer failed", __func__);
close(fence_fd);
return;
}
fence_fd = -1;
if (dri2_surf->window->dequeueBuffer(dri2_surf->window, &dri2_surf->buffer,
&fence_fd)) {
/* Tear down the surface because it no longer has a back buffer. */
struct dri2_egl_display *dri2_dpy =
dri2_egl_display(dri2_surf->base.Resource.Display);
_eglLog(_EGL_WARNING, "%s: ANativeWindow::dequeueBuffer failed", __func__);
dri2_surf->base.Lost = true;
dri2_surf->buffer = NULL;
dri2_surf->back = NULL;
if (dri2_surf->dri_image_back) {
dri2_dpy->image->destroyImage(dri2_surf->dri_image_back);
dri2_surf->dri_image_back = NULL;
}
dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
return;
}
if (fence_fd < 0)
return;
/* Access to the buffer is controlled by a sync fence. Block on it.
*
* Ideally, we would submit the fence to the driver, and the driver would
* postpone command execution until it signalled. But DRI lacks API for
* that (as of 2018-04-11).
*
* SYNC_IOC_WAIT waits forever if timeout < 0
*/
sync_wait(fence_fd, -1);
close(fence_fd);
}
static const __DRImutableRenderBufferLoaderExtension droid_mutable_render_buffer_extension = {
.base = { __DRI_MUTABLE_RENDER_BUFFER_LOADER, 1 },
.displaySharedBuffer = droid_display_shared_buffer,
};
static const __DRIextension *droid_dri2_loader_extensions[] = {
&droid_dri2_loader_extension.base,
&image_lookup_extension.base,
&use_invalidate.base,
/* No __DRI_MUTABLE_RENDER_BUFFER_LOADER because it requires
* __DRI_IMAGE_LOADER.
*/
NULL,
};
@@ -1132,9 +1314,82 @@ static const __DRIextension *droid_image_loader_extensions[] = {
&droid_image_loader_extension.base,
&image_lookup_extension.base,
&use_invalidate.base,
&droid_mutable_render_buffer_extension.base,
NULL,
};
static bool
droid_probe_device(_EGLDisplay *dpy, bool swrast)
{
struct dri2_egl_display *dri2_dpy = dpy->DriverData;
bool loaded;
dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
if (!dri2_dpy->is_render_node && !gralloc_supports_gem_names()) {
_eglLog(_EGL_WARNING, "DRI2: control nodes not supported without GEM name suport in gralloc\n");
return false;
}
if (swrast)
dri2_dpy->driver_name = strdup("kms_swrast");
else
dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
if (dri2_dpy->driver_name == NULL) {
_eglLog(_EGL_WARNING, "DRI2: failed to get driver name");
return false;
}
/* render nodes cannot use Gem names, and thus do not support
* the __DRI_DRI2_LOADER extension */
if (!dri2_dpy->is_render_node) {
dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
loaded = dri2_load_driver(dpy);
} else {
dri2_dpy->loader_extensions = droid_image_loader_extensions;
loaded = dri2_load_driver_dri3(dpy);
}
if (!loaded) {
_eglLog(_EGL_WARNING, "DRI2: failed to load driver");
free(dri2_dpy->driver_name);
dri2_dpy->driver_name = NULL;
return false;
}
return true;
}
static bool
droid_probe_devices(_EGLDisplay *dpy, bool swrast)
{
struct dri2_egl_display *dri2_dpy = dpy->DriverData;
const char *name_template = "%s/renderD%d";
const int base = 128;
const int limit = 64;
int minor;
for (minor = base; minor < base + limit; ++minor) {
char *card_path;
if (asprintf(&card_path, name_template, DRM_DIR_NAME, minor) < 0)
continue;
dri2_dpy->fd = loader_open_device(card_path);
free(card_path);
if (dri2_dpy->fd < 0)
continue;
if (droid_probe_device(dpy, swrast))
return true;
close(dri2_dpy->fd);
dri2_dpy->fd = -1;
}
return false;
}
EGLBoolean
dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
{
@@ -1159,35 +1414,17 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
dpy->DriverData = (void *) dri2_dpy;
dri2_dpy->fd = droid_open_device(dri2_dpy);
if (dri2_dpy->fd < 0) {
err = "DRI2: failed to open device";
if (dri2_dpy->fd >= 0 &&
!droid_probe_device(dpy, dpy->Options.UseFallback)) {
_eglLog(_EGL_WARNING, "DRI2: Failed to load %s driver",
dpy->Options.UseFallback ? "software" : "hardware");
goto cleanup;
}
dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
if (dri2_dpy->driver_name == NULL) {
err = "DRI2: failed to get driver name";
} else if (!droid_probe_devices(dpy, dpy->Options.UseFallback)) {
_eglLog(_EGL_WARNING, "DRI2: Failed to load %s driver",
dpy->Options.UseFallback ? "software" : "hardware");
goto cleanup;
}
dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
/* render nodes cannot use Gem names, and thus do not support
* the __DRI_DRI2_LOADER extension */
if (!dri2_dpy->is_render_node) {
dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
if (!dri2_load_driver(dpy)) {
err = "DRI2: failed to load driver";
goto cleanup;
}
} else {
dri2_dpy->loader_extensions = droid_image_loader_extensions;
if (!dri2_load_driver_dri3(dpy)) {
err = "DRI3: failed to load driver";
goto cleanup;
}
}
if (!dri2_create_screen(dpy)) {
err = "DRI2: failed to create screen";
goto cleanup;
@@ -1200,11 +1437,6 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
dri2_setup_screen(dpy);
- if (!droid_add_configs_for_visuals(drv, dpy)) {
- err = "DRI2: failed to add configs";
- goto cleanup;
- }
dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE;
dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
dpy->Extensions.ANDROID_recordable = EGL_TRUE;
@@ -1212,6 +1444,21 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
#if ANDROID_API_LEVEL >= 23
dpy->Extensions.KHR_partial_update = EGL_TRUE;
#endif
dpy->Extensions.KHR_image = EGL_TRUE;
#if __ANDROID_API__ >= 24
if (dri2_dpy->mutable_render_buffer &&
dri2_dpy->loader_extensions == droid_image_loader_extensions) {
dpy->Extensions.KHR_mutable_render_buffer = EGL_TRUE;
}
#endif
/* Create configs *after* enabling extensions because presence of DRI
* driver extensions can affect the capabilities of EGLConfigs.
*/
if (!droid_add_configs_for_visuals(drv, dpy)) {
err = "DRI2: failed to add configs";
goto cleanup;
}
/* Fill vtbl last to prevent accidentally calling virtual function during
* initialization.

View File

@@ -0,0 +1,45 @@
/*
* Copyright 2016 Google Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#pragma once
#ifdef HAS_GRALLOC_DRM_HEADERS
#include <gralloc_drm.h>
#include <gralloc_drm_handle.h>
static inline bool gralloc_supports_gem_names(void) { return true; }
#else
#define GRALLOC_MODULE_PERFORM_GET_DRM_FD 0x0FD4DEAD
static inline int gralloc_drm_get_gem_handle(buffer_handle_t handle)
{
return 0; /* Not supported, return invalid handle. */
}
static inline bool gralloc_supports_gem_names(void) { return false; }
#endif

View File

@@ -652,6 +652,10 @@ dri2_initialize_drm(_EGLDriver *drv, _EGLDisplay *disp)
struct gbm_device *gbm;
const char *err;
/* Not supported yet */
if (disp->Options.UseFallback)
return EGL_FALSE;
loader_set_logger(_eglLog);
dri2_dpy = calloc(1, sizeof *dri2_dpy);

View File

@@ -504,9 +504,11 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
_EGL_CHECK_EXTENSION(KHR_gl_texture_3D_image);
_EGL_CHECK_EXTENSION(KHR_gl_texture_cubemap_image);
if (dpy->Extensions.KHR_image_base && dpy->Extensions.KHR_image_pixmap)
_eglAppendExtension(&exts, "EGL_KHR_image");
dpy->Extensions.KHR_image = EGL_TRUE;
_EGL_CHECK_EXTENSION(KHR_image);
_EGL_CHECK_EXTENSION(KHR_image_base);
_EGL_CHECK_EXTENSION(KHR_image_pixmap);
_EGL_CHECK_EXTENSION(KHR_mutable_render_buffer);
_EGL_CHECK_EXTENSION(KHR_no_config_context);
_EGL_CHECK_EXTENSION(KHR_partial_update);
_EGL_CHECK_EXTENSION(KHR_reusable_sync);

View File

@@ -268,6 +268,7 @@ static const struct {
EGLBoolean
_eglValidateConfig(const _EGLConfig *conf, EGLBoolean for_matching)
{
_EGLDisplay *disp = conf->Display;
EGLint i, attr, val;
EGLBoolean valid = EGL_TRUE;
@@ -331,6 +332,8 @@ _eglValidateConfig(const _EGLConfig *conf, EGLBoolean for_matching)
EGL_VG_ALPHA_FORMAT_PRE_BIT |
EGL_MULTISAMPLE_RESOLVE_BOX_BIT |
EGL_SWAP_BEHAVIOR_PRESERVED_BIT;
if (disp->Extensions.KHR_mutable_render_buffer)
mask |= EGL_MUTABLE_RENDER_BUFFER_BIT_KHR;
break;
case EGL_RENDERABLE_TYPE:
case EGL_CONFORMANT:

View File

@@ -579,7 +579,6 @@ _eglInitContext(_EGLContext *ctx, _EGLDisplay *dpy, _EGLConfig *conf,
_eglInitResource(&ctx->Resource, sizeof(*ctx), dpy);
ctx->ClientAPI = api;
ctx->Config = conf;
- ctx->WindowRenderBuffer = EGL_NONE;
ctx->Profile = EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR;
ctx->ClientMajorVersion = 1; /* the default, per EGL spec */
@@ -611,15 +610,42 @@ static EGLint
_eglQueryContextRenderBuffer(_EGLContext *ctx)
{
_EGLSurface *surf = ctx->DrawSurface;
EGLint rb;
/* From the EGL 1.5 spec:
*
* - If the context is not bound to a surface, then EGL_NONE will be
* returned.
*/
if (!surf)
return EGL_NONE;
if (surf->Type == EGL_WINDOW_BIT && ctx->WindowRenderBuffer != EGL_NONE)
rb = ctx->WindowRenderBuffer;
else
rb = surf->RenderBuffer;
return rb;
switch (surf->Type) {
default:
unreachable("bad EGLSurface type");
case EGL_PIXMAP_BIT:
/* - If the context is bound to a pixmap surface, then EGL_SINGLE_BUFFER
* will be returned.
*/
return EGL_SINGLE_BUFFER;
case EGL_PBUFFER_BIT:
/* - If the context is bound to a pbuffer surface, then EGL_BACK_BUFFER
* will be returned.
*/
return EGL_BACK_BUFFER;
case EGL_WINDOW_BIT:
/* - If the context is bound to a window surface, then either
* EGL_BACK_BUFFER or EGL_SINGLE_BUFFER may be returned. The value
* returned depends on both the buffer requested by the setting of the
* EGL_RENDER_BUFFER property of the surface [...], and on the client
* API (not all client APIs support single-buffer Rendering to window
* surfaces). Some client APIs allow control of whether rendering goes
* to the front or back buffer. This client API-specific choice is not
* reflected in the returned value, which only describes the buffer
* that will be rendered to by default if not overridden by the client
* API.
*/
return surf->ActiveRenderBuffer;
}
}
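To see how the two EGL_RENDER_BUFFER queries are meant to interact, here is an application-side sketch (not part of this patch; dpy, ctx and surf are assumed to be a current display, context and a window surface whose EGLConfig has EGL_MUTABLE_RENDER_BUFFER_BIT_KHR, and error checking is omitted):

/* Ask for single-buffered (shared) rendering on the window surface. */
eglSurfaceAttrib(dpy, surf, EGL_RENDER_BUFFER, EGL_SINGLE_BUFFER);

EGLint requested, active;
eglQuerySurface(dpy, surf, EGL_RENDER_BUFFER, &requested);
/* requested == EGL_SINGLE_BUFFER immediately: the surface reports the
 * value most recently requested by the application. */

eglQueryContext(dpy, ctx, EGL_RENDER_BUFFER, &active);
/* active is still EGL_BACK_BUFFER; the switch only takes effect at the
 * next eglSwapBuffers. */

eglSwapBuffers(dpy, surf);
eglQueryContext(dpy, ctx, EGL_RENDER_BUFFER, &active);
/* Now active == EGL_SINGLE_BUFFER as well. */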

View File

@@ -64,9 +64,6 @@ struct _egl_context
EGLint ResetNotificationStrategy;
EGLint ContextPriority;
EGLBoolean NoError;
- /* The real render buffer when a window surface is bound */
- EGLint WindowRenderBuffer;
};

View File

@@ -120,8 +120,10 @@ struct _egl_extensions
EGLBoolean KHR_gl_texture_2D_image;
EGLBoolean KHR_gl_texture_3D_image;
EGLBoolean KHR_gl_texture_cubemap_image;
EGLBoolean KHR_image;
EGLBoolean KHR_image_base;
EGLBoolean KHR_image_pixmap;
EGLBoolean KHR_mutable_render_buffer;
EGLBoolean KHR_no_config_context;
EGLBoolean KHR_partial_update;
EGLBoolean KHR_reusable_sync;

View File

@@ -122,7 +122,13 @@ _eglParseSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list)
err = EGL_BAD_ATTRIBUTE;
break;
}
surf->RenderBuffer = val;
surf->RequestedRenderBuffer = val;
if (surf->Config->SurfaceType & EGL_MUTABLE_RENDER_BUFFER_BIT_KHR) {
/* Unlike normal EGLSurfaces, one with a mutable render buffer
* uses the application-chosen render buffer.
*/
surf->ActiveRenderBuffer = val;
}
break;
case EGL_POST_SUB_BUFFER_SUPPORTED_NV:
if (!dpy->Extensions.NV_post_sub_buffer ||
@@ -285,7 +291,8 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
surf->TextureTarget = EGL_NO_TEXTURE;
surf->MipmapTexture = EGL_FALSE;
surf->LargestPbuffer = EGL_FALSE;
surf->RenderBuffer = renderBuffer;
surf->RequestedRenderBuffer = renderBuffer;
surf->ActiveRenderBuffer = renderBuffer;
surf->VGAlphaFormat = EGL_VG_ALPHA_FORMAT_NONPRE;
surf->VGColorspace = EGL_VG_COLORSPACE_sRGB;
surf->GLColorspace = EGL_GL_COLORSPACE_LINEAR_KHR;
@@ -358,7 +365,35 @@ _eglQuerySurface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
*value = surface->SwapBehavior;
break;
case EGL_RENDER_BUFFER:
*value = surface->RenderBuffer;
/* From the EGL_KHR_mutable_render_buffer spec (v12):
*
* Querying EGL_RENDER_BUFFER returns the buffer which client API
* rendering is requested to use. For a window surface, this is the
* attribute value specified when the surface was created or last set
* via eglSurfaceAttrib.
*
* In other words, querying a window surface returns the value most
* recently *requested* by the user.
*
* The paragraph continues in the EGL 1.5 spec (2014.08.27):
*
* For a pbuffer surface, it is always EGL_BACK_BUFFER . For a pixmap
* surface, it is always EGL_SINGLE_BUFFER . To determine the actual
* buffer being rendered to by a context, call eglQueryContext.
*/
switch (surface->Type) {
default:
unreachable("bad EGLSurface type");
case EGL_WINDOW_BIT:
*value = surface->RequestedRenderBuffer;
break;
case EGL_PBUFFER_BIT:
*value = EGL_BACK_BUFFER;
break;
case EGL_PIXMAP_BIT:
*value = EGL_SINGLE_BUFFER;
break;
}
break;
case EGL_PIXEL_ASPECT_RATIO:
*value = surface->AspectRatio;
@@ -450,6 +485,31 @@ _eglSurfaceAttrib(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
break;
surface->MultisampleResolve = value;
break;
case EGL_RENDER_BUFFER:
if (!dpy->Extensions.KHR_mutable_render_buffer) {
err = EGL_BAD_ATTRIBUTE;
break;
}
if (value != EGL_BACK_BUFFER && value != EGL_SINGLE_BUFFER) {
err = EGL_BAD_PARAMETER;
break;
}
/* From the EGL_KHR_mutable_render_buffer spec (v12):
*
* If attribute is EGL_RENDER_BUFFER, and the EGL_SURFACE_TYPE
* attribute of the EGLConfig used to create surface does not contain
* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR, [...] an EGL_BAD_MATCH error is
* generated [...].
*/
if (!(surface->Config->SurfaceType & EGL_MUTABLE_RENDER_BUFFER_BIT_KHR)) {
err = EGL_BAD_MATCH;
break;
}
surface->RequestedRenderBuffer = value;
break;
case EGL_SWAP_BEHAVIOR:
switch (value) {
case EGL_BUFFER_DESTROYED:
@@ -551,3 +611,18 @@ _eglSwapInterval(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf,
{
return EGL_TRUE;
}
EGLBoolean
_eglSurfaceHasMutableRenderBuffer(_EGLSurface *surf)
{
return surf->Type == EGL_WINDOW_BIT &&
surf->Config &&
(surf->Config->SurfaceType & EGL_MUTABLE_RENDER_BUFFER_BIT_KHR);
}
EGLBoolean
_eglSurfaceInSharedBufferMode(_EGLSurface *surf)
{
return _eglSurfaceHasMutableRenderBuffer(surf) &&
surf->ActiveRenderBuffer == EGL_SINGLE_BUFFER;
}
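Putting the new helpers together with the Requested/Active split, a typical application-side flow would look roughly like this (a hedged sketch under the stated assumptions, not code from this series; error checking omitted):

#include <EGL/egl.h>

/* dpy and surf are assumed valid, with surf created from a config that
 * advertises EGL_MUTABLE_RENDER_BUFFER_BIT_KHR. */
static void
enter_shared_buffer_mode(EGLDisplay dpy, EGLSurface surf)
{
   /* Only RequestedRenderBuffer changes here. */
   eglSurfaceAttrib(dpy, surf, EGL_RENDER_BUFFER, EGL_SINGLE_BUFFER);

   /* The next eglSwapBuffers flushes the pending change, copying
    * RequestedRenderBuffer into ActiveRenderBuffer; afterwards
    * _eglSurfaceInSharedBufferMode(surf) reports true. */
   eglSwapBuffers(dpy, surf);
}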

View File

@@ -67,7 +67,59 @@ struct _egl_surface
EGLenum TextureTarget;
EGLBoolean MipmapTexture;
EGLBoolean LargestPbuffer;
EGLenum RenderBuffer;
/**
* Value of EGL_RENDER_BUFFER selected at creation.
*
* The user may select, for window surfaces, the EGL_RENDER_BUFFER through
* the attribute list of eglCreateWindowSurface(). The EGL spec allows the
* implementation to ignore the request, though, which is why we maintain both
* RequestedRenderBuffer and ActiveRenderBuffer. For pbuffer and pixmap
* surfaces, the EGL spec hard-codes the EGL_RENDER_BUFFER value and the
* user must not provide it in the attribute list.
*
* Normally, the attribute is immutable after surface creation.
* However, EGL_KHR_mutable_render_buffer allows the user to change it in
* window surfaces via eglSurfaceAttrib, in which case
* eglQuerySurface(EGL_RENDER_BUFFER) will immediately afterwards return
* the requested value but the actual render buffer used by the context
* does not change until completion of the next eglSwapBuffers call.
*
* From the EGL_KHR_mutable_render_buffer spec (v12):
*
* Querying EGL_RENDER_BUFFER returns the buffer which client API
* rendering is requested to use. For a window surface, this is the
* attribute value specified when the surface was created or last set
* via eglSurfaceAttrib.
*
* eglQueryContext(EGL_RENDER_BUFFER) ignores this.
*/
EGLenum RequestedRenderBuffer;
/**
* The EGL_RENDER_BUFFER in use by the context.
*
* This is valid only when bound as the draw surface. This may differ from
* the RequestedRenderBuffer.
*
* Refer to eglQueryContext(EGL_RENDER_BUFFER) in the EGL spec.
* eglQuerySurface(EGL_RENDER_BUFFER) ignores this.
*
* If a window surface is bound as the draw surface and has a pending,
* user-requested change to EGL_RENDER_BUFFER, then the next eglSwapBuffers
* will flush the pending change. (The flush of EGL_RENDER_BUFFER state may
* occur without the implicit glFlush induced by eglSwapBuffers). The spec
* requires that the flush occur at that time and nowhere else. During the
* state-flush, we copy RequestedRenderBuffer to ActiveRenderBuffer.
*
* From the EGL_KHR_mutable_render_buffer spec (v12):
*
* If [...] there is a pending change to the EGL_RENDER_BUFFER
* attribute, eglSwapBuffers performs an implicit flush operation on the
* context and effects the attribute change.
*/
EGLenum ActiveRenderBuffer;
EGLenum VGAlphaFormat;
EGLenum VGColorspace;
EGLenum GLColorspace;
@@ -124,6 +176,11 @@ _eglReleaseTexImage(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf, EGLin
extern EGLBoolean
_eglSwapInterval(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf, EGLint interval);
extern EGLBoolean
_eglSurfaceHasMutableRenderBuffer(_EGLSurface *surf);
extern EGLBoolean
_eglSurfaceInSharedBufferMode(_EGLSurface *surf);
/**
* Increment reference count for the surface.

View File

@@ -110,7 +110,7 @@ pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
if (pipe_reference_described(&(*ptr)->reference, &surf->reference,
(debug_reference_descriptor)debug_describe_surface))
old_surf->context->surface_destroy(old_surf->context, old_surf);
old_surf->surface_destroy(old_surf->context, old_surf);
*ptr = surf;
}
@@ -156,7 +156,7 @@ pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_
if (pipe_reference_described(&(*ptr)->reference, &view->reference,
(debug_reference_descriptor)debug_describe_sampler_view))
old_view->context->sampler_view_destroy(old_view->context, old_view);
old_view->sampler_view_destroy(old_view->context, old_view);
*ptr = view;
}
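The hunk above moves destruction from the context's function table to a destroy callback stored on the object itself; the driver hunks below record that callback at creation time. A minimal self-contained sketch of the pattern, with hypothetical names:

struct ctx;

/* The object captures its destroy callback when it is created, so the release
 * helper uses the function recorded at creation time instead of looking it up
 * in a context vtable. */
struct obj {
   int refcount;
   struct ctx *context;
   void (*destroy)(struct ctx *ctx, struct obj *obj);
};

static void
obj_release(struct obj **ptr, struct obj *replacement)
{
   struct obj *old = *ptr;

   if (old && --old->refcount == 0)
      old->destroy(old->context, old); /* per-object callback */
   *ptr = replacement;
}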

View File

@@ -44,6 +44,10 @@
#include "i915_resource.h"
#include "i915_state.h"
static void
i915_sampler_view_destroy(struct pipe_context *pipe,
struct pipe_sampler_view *view);
/* The i915 (and related graphics cores) do not support GL_CLAMP. The
* Intel drivers for "other operating systems" implement GL_CLAMP as
* GL_CLAMP_TO_EDGE, so the same is done here.
@@ -827,6 +831,7 @@ i915_create_sampler_view_custom(struct pipe_context *pipe,
view->texture = NULL;
pipe_resource_reference(&view->texture, texture);
view->context = pipe;
view->sampler_view_destroy = i915_sampler_view_destroy;
}
return view;
@@ -845,6 +850,7 @@ i915_create_sampler_view(struct pipe_context *pipe,
view->texture = NULL;
pipe_resource_reference(&view->texture, texture);
view->context = pipe;
view->sampler_view_destroy = i915_sampler_view_destroy;
}
return view;

View File

@@ -376,6 +376,7 @@ i915_create_surface_custom(struct pipe_context *ctx,
ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
ps->context = ctx;
ps->surface_destroy = ctx->surface_destroy;
}
return ps;
}

View File

@@ -40,6 +40,9 @@
#include "lp_debug.h"
#include "state_tracker/sw_winsys.h"
static void
llvmpipe_sampler_view_destroy(struct pipe_context *pipe,
struct pipe_sampler_view *view);
static void *
llvmpipe_create_sampler_state(struct pipe_context *pipe,
@@ -183,6 +186,7 @@ llvmpipe_create_sampler_view(struct pipe_context *pipe,
view->texture = NULL;
pipe_resource_reference(&view->texture, texture);
view->context = pipe;
view->sampler_view_destroy = llvmpipe_sampler_view_destroy;
#ifdef DEBUG
/*

View File

@@ -147,6 +147,7 @@ llvmpipe_create_surface(struct pipe_context *pipe,
pipe_resource_reference(&ps->texture, pt);
ps->context = pipe;
ps->format = surf_tmpl->format;
ps->surface_destroy = pipe->surface_destroy;
if (llvmpipe_resource_is_texture(pt)) {
assert(surf_tmpl->u.tex.level <= pt->last_level);
assert(surf_tmpl->u.tex.first_layer <= surf_tmpl->u.tex.last_layer);

View File

@@ -46,6 +46,10 @@
#include "r300_texture.h"
#include "r300_vs.h"
static void
r300_sampler_view_destroy(struct pipe_context *pipe,
struct pipe_sampler_view *view);
/* r300_state: Functions used to initialize state context by translating
* Gallium state objects into semi-native r300 state objects. */
@@ -1609,6 +1613,7 @@ r300_create_sampler_view_custom(struct pipe_context *pipe,
view->base.reference.count = 1;
view->base.context = pipe;
view->base.texture = NULL;
view->base.sampler_view_destroy = r300_sampler_view_destroy;
pipe_resource_reference(&view->base.texture, texture);
view->width0_override = width0_override;

View File

@@ -1227,6 +1227,7 @@ struct pipe_surface* r300_create_surface_custom(struct pipe_context * ctx,
pipe_reference_init(&surface->base.reference, 1);
pipe_resource_reference(&surface->base.texture, texture);
surface->base.context = ctx;
surface->base.surface_destroy = ctx->surface_destroy;
surface->base.format = surf_tmpl->format;
surface->base.width = u_minify(width0_override, level);
surface->base.height = u_minify(height0_override, level);

View File

@@ -737,6 +737,8 @@ void r600_sampler_views_dirty(struct r600_context *rctx,
struct r600_samplerview_state *state);
void r600_sampler_states_dirty(struct r600_context *rctx,
struct r600_sampler_states *state);
void r600_sampler_view_destroy(struct pipe_context *ctx,
struct pipe_sampler_view *state);
void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf_state *state);
void r600_set_sample_locations_constant_buffer(struct r600_context *rctx);
uint32_t r600_translate_stencil_op(int s_op);

View File

@@ -682,6 +682,7 @@ r600_create_sampler_view_custom(struct pipe_context *ctx,
view->base.texture = texture;
view->base.reference.count = 1;
view->base.context = ctx;
view->base.sampler_view_destroy = r600_sampler_view_destroy;
if (texture->target == PIPE_BUFFER)
return texture_buffer_sampler_view(view, texture->width0, 1);

View File

@@ -385,8 +385,8 @@ static void r600_delete_rs_state(struct pipe_context *ctx, void *state)
FREE(rs);
}
static void r600_sampler_view_destroy(struct pipe_context *ctx,
struct pipe_sampler_view *state)
void r600_sampler_view_destroy(struct pipe_context *ctx,
struct pipe_sampler_view *state)
{
struct r600_pipe_sampler_view *view = (struct r600_pipe_sampler_view *)state;

View File

@@ -410,6 +410,8 @@ static int r600_fence_get_fd(struct pipe_screen *screen,
/* If we don't have FDs at this point, it means we don't have fences
* either. */
if (sdma_fd == -1 && gfx_fd == -1)
return ws->export_signalled_sync_file(ws);
if (sdma_fd == -1)
return gfx_fd;
if (gfx_fd == -1)

View File

@@ -1928,6 +1928,7 @@ struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
pipe_reference_init(&surface->base.reference, 1);
pipe_resource_reference(&surface->base.texture, texture);
surface->base.context = pipe;
surface->base.surface_destroy = pipe->surface_destroy;
surface->base.format = templ->format;
surface->base.width = width;
surface->base.height = height;

View File

@@ -609,6 +609,11 @@ struct radeon_winsys {
int (*fence_export_sync_file)(struct radeon_winsys *ws,
struct pipe_fence_handle *fence);
/**
* Return a sync file FD that is already signalled.
*/
int (*export_signalled_sync_file)(struct radeon_winsys *ws);
/**
* Initialize surface
*

View File

@@ -3786,6 +3786,15 @@ si_make_texture_descriptor(struct si_screen *screen,
}
}
static void si_sampler_view_destroy(struct pipe_context *ctx,
struct pipe_sampler_view *state)
{
struct si_sampler_view *view = (struct si_sampler_view *)state;
pipe_resource_reference(&state->texture, NULL);
FREE(view);
}
/**
* Create a sampler view.
*
@@ -3821,6 +3830,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
view->base.texture = NULL;
view->base.reference.count = 1;
view->base.context = ctx;
view->base.sampler_view_destroy = si_sampler_view_destroy;
assert(texture);
pipe_resource_reference(&view->base.texture, texture);
@@ -3956,15 +3966,6 @@ si_create_sampler_view(struct pipe_context *ctx,
texture ? texture->height0 : 0, 0);
}
static void si_sampler_view_destroy(struct pipe_context *ctx,
struct pipe_sampler_view *state)
{
struct si_sampler_view *view = (struct si_sampler_view *)state;
pipe_resource_reference(&state->texture, NULL);
FREE(view);
}
static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
{
return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||

View File

@@ -3467,6 +3467,7 @@ softpipe_create_sampler_view(struct pipe_context *pipe,
view->texture = NULL;
pipe_resource_reference(&view->texture, resource);
view->context = pipe;
view->sampler_view_destroy = pipe->sampler_view_destroy;
#ifdef DEBUG
/*

View File

@@ -300,6 +300,7 @@ softpipe_create_surface(struct pipe_context *pipe,
pipe_resource_reference(&ps->texture, pt);
ps->context = pipe;
ps->format = surf_tmpl->format;
ps->surface_destroy = pipe->surface_destroy;
if (pt->target != PIPE_BUFFER) {
assert(surf_tmpl->u.tex.level <= pt->last_level);
ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level);

View File

@@ -206,6 +206,7 @@ static struct pipe_surface *virgl_create_surface(struct pipe_context *ctx,
pipe_reference_init(&surf->base.reference, 1);
pipe_resource_reference(&surf->base.texture, resource);
surf->base.context = ctx;
surf->base.surface_destroy = ctx->surface_destroy;
surf->base.format = templ->format;
if (resource->target != PIPE_BUFFER) {
surf->base.width = u_minify(resource->width0, templ->u.tex.level);
@@ -676,6 +677,7 @@ static struct pipe_sampler_view *virgl_create_sampler_view(struct pipe_context *
grview->base.texture = NULL;
grview->base.context = ctx;
grview->base.sampler_view_destroy = ctx->sampler_view_destroy;
pipe_resource_reference(&grview->base.texture, texture);
grview->handle = handle;
return &grview->base;

View File

@@ -76,7 +76,6 @@ virgl_tgsi_transform_instruction(struct tgsi_transform_context *ctx,
for (unsigned i = 0; i < inst->Instruction.NumSrcRegs; i++) {
if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT &&
inst->Src[i].Register.Dimension &&
!inst->Src[i].Register.Indirect &&
inst->Src[i].Dimension.Index == 0)
inst->Src[i].Register.Dimension = 0;
}

View File

@@ -427,6 +427,9 @@ struct pipe_surface
uint16_t height; /**< logical height in pixels */
union pipe_surface_desc u;
void (*surface_destroy)(struct pipe_context *ctx,
struct pipe_surface *);
};
@@ -456,6 +459,9 @@ struct pipe_sampler_view
unsigned size; /**< size of the readable sub-range in bytes */
} buf;
} u;
void (*sampler_view_destroy)(struct pipe_context *ctx,
struct pipe_sampler_view *view);
};

View File

@@ -249,7 +249,7 @@ dri_fill_in_modes(struct dri_screen *screen)
depth_buffer_factor, back_buffer_modes,
ARRAY_SIZE(back_buffer_modes),
msaa_modes, 1,
GL_TRUE, !mixed_color_depth);
GL_TRUE, !mixed_color_depth, GL_FALSE);
configs = driConcatConfigs(configs, new_configs);
/* Multi-sample configs without an accumulation buffer. */
@@ -259,7 +259,7 @@ dri_fill_in_modes(struct dri_screen *screen)
depth_buffer_factor, back_buffer_modes,
ARRAY_SIZE(back_buffer_modes),
msaa_modes+1, num_msaa_modes-1,
GL_FALSE, !mixed_color_depth);
GL_FALSE, !mixed_color_depth, GL_FALSE);
configs = driConcatConfigs(configs, new_configs);
}
}

View File

@@ -113,6 +113,28 @@ static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
return fd;
}
static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
{
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
uint32_t syncobj;
int fd = -1;
int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
&syncobj);
if (r) {
return -1;
}
r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd);
if (r) {
fd = -1;
}
amdgpu_cs_destroy_syncobj(ws->dev, syncobj);
return fd;
}
static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
uint64_t seq_no,
uint64_t *user_fence_cpu_address)
@@ -1552,4 +1574,5 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
ws->base.fence_reference = amdgpu_fence_reference;
ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
}

View File

@@ -59,20 +59,29 @@
#define DEBUG_PRINT(msg, ...)
#endif
struct kms_sw_displaytarget;
struct kms_sw_plane {
unsigned width;
unsigned height;
unsigned stride;
unsigned offset;
struct kms_sw_displaytarget* dt;
struct list_head link;
};
struct kms_sw_displaytarget
{
enum pipe_format format;
unsigned width;
unsigned height;
unsigned stride;
unsigned size;
uint32_t handle;
void *mapped;
void *ro_mapped;
int ref_count;
struct list_head link;
struct list_head planes;
};
struct kms_sw_winsys
@@ -83,10 +92,10 @@ struct kms_sw_winsys
struct list_head bo_list;
};
static inline struct kms_sw_displaytarget *
kms_sw_displaytarget( struct sw_displaytarget *dt )
static inline struct kms_sw_plane *
kms_sw_plane( struct sw_displaytarget *dt )
{
return (struct kms_sw_displaytarget *)dt;
return (struct kms_sw_plane *)dt;
}
static inline struct kms_sw_winsys *
@@ -105,6 +114,42 @@ kms_sw_is_displaytarget_format_supported( struct sw_winsys *ws,
return TRUE;
}
static struct kms_sw_plane *get_plane(struct kms_sw_displaytarget *kms_sw_dt,
enum pipe_format format,
unsigned width, unsigned height,
unsigned stride, unsigned offset) {
struct kms_sw_plane * tmp, * plane = NULL;
if (offset + util_format_get_2d_size(format, stride, height) >
kms_sw_dt->size) {
DEBUG_PRINT("KMS-DEBUG: plane too big. format: %d stride: %d height: %d "
"offset: %d size:%d\n", format, stride, height, offset,
kms_sw_dt->size);
return NULL;
}
LIST_FOR_EACH_ENTRY(tmp, &kms_sw_dt->planes, link) {
if (tmp->offset == offset) {
plane = tmp;
break;
}
}
if (plane) {
assert(plane->width == width);
assert(plane->height == height);
assert(plane->stride == stride);
assert(plane->dt == kms_sw_dt);
} else {
plane = CALLOC_STRUCT(kms_sw_plane);
if (plane == NULL) return NULL;
plane->width = width;
plane->height = height;
plane->stride = stride;
plane->offset = offset;
plane->dt = kms_sw_dt;
list_add(&plane->link, &kms_sw_dt->planes);
}
return plane;
}
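get_plane() above lets several planes share a single dumb buffer, each identified by its byte offset, which is what allows multi-planar imports where every winsys handle carries the same BO but a different offset. A hedged sketch of the offsets involved for a hypothetical two-plane NV12 buffer (values purely illustrative):

/* Hypothetical NV12 layout inside one dumb buffer: a full-height Y plane
 * followed by a half-height interleaved CbCr plane. Each plane would be
 * looked up through get_plane() with its own offset and stride. */
static void
nv12_plane_offsets(unsigned stride, unsigned height,
                   unsigned *y_offset, unsigned *uv_offset)
{
   *y_offset  = 0;                 /* plane 0: Y,    stride x height     */
   *uv_offset = stride * height;   /* plane 1: CbCr, stride x height / 2 */
}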
static struct sw_displaytarget *
kms_sw_displaytarget_create(struct sw_winsys *ws,
unsigned tex_usage,
@@ -124,11 +169,10 @@ kms_sw_displaytarget_create(struct sw_winsys *ws,
if (!kms_sw_dt)
goto no_dt;
list_inithead(&kms_sw_dt->planes);
kms_sw_dt->ref_count = 1;
kms_sw_dt->format = format;
kms_sw_dt->width = width;
kms_sw_dt->height = height;
memset(&create_req, 0, sizeof(create_req));
create_req.bpp = 32;
@@ -138,17 +182,19 @@ kms_sw_displaytarget_create(struct sw_winsys *ws,
if (ret)
goto free_bo;
kms_sw_dt->stride = create_req.pitch;
kms_sw_dt->size = create_req.size;
kms_sw_dt->handle = create_req.handle;
struct kms_sw_plane* plane = get_plane(kms_sw_dt, format, width, height,
create_req.pitch, 0);
if (plane == NULL)
goto free_bo;
list_add(&kms_sw_dt->link, &kms_sw->bo_list);
DEBUG_PRINT("KMS-DEBUG: created buffer %u (size %u)\n", kms_sw_dt->handle, kms_sw_dt->size);
*stride = kms_sw_dt->stride;
return (struct sw_displaytarget *)kms_sw_dt;
*stride = create_req.pitch;
return (struct sw_displaytarget *) plane;
free_bo:
memset(&destroy_req, 0, sizeof destroy_req);
destroy_req.handle = create_req.handle;
@@ -163,13 +209,19 @@ kms_sw_displaytarget_destroy(struct sw_winsys *ws,
struct sw_displaytarget *dt)
{
struct kms_sw_winsys *kms_sw = kms_sw_winsys(ws);
struct kms_sw_displaytarget *kms_sw_dt = kms_sw_displaytarget(dt);
struct kms_sw_plane *plane = kms_sw_plane(dt);
struct kms_sw_displaytarget *kms_sw_dt = plane->dt;
struct drm_mode_destroy_dumb destroy_req;
kms_sw_dt->ref_count --;
if (kms_sw_dt->ref_count > 0)
return;
if (kms_sw_dt->ro_mapped)
munmap(kms_sw_dt->ro_mapped, kms_sw_dt->size);
if (kms_sw_dt->mapped)
munmap(kms_sw_dt->mapped, kms_sw_dt->size);
memset(&destroy_req, 0, sizeof destroy_req);
destroy_req.handle = kms_sw_dt->handle;
drmIoctl(kms_sw->fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy_req);
@@ -178,6 +230,10 @@ kms_sw_displaytarget_destroy(struct sw_winsys *ws,
DEBUG_PRINT("KMS-DEBUG: destroyed buffer %u\n", kms_sw_dt->handle);
struct kms_sw_plane * tmp;
LIST_FOR_EACH_ENTRY_SAFE(plane, tmp, &kms_sw_dt->planes, link) {
FREE(plane);
}
FREE(kms_sw_dt);
}
@@ -187,7 +243,8 @@ kms_sw_displaytarget_map(struct sw_winsys *ws,
unsigned flags)
{
struct kms_sw_winsys *kms_sw = kms_sw_winsys(ws);
struct kms_sw_displaytarget *kms_sw_dt = kms_sw_displaytarget(dt);
struct kms_sw_plane *plane = kms_sw_plane(dt);
struct kms_sw_displaytarget *kms_sw_dt = plane->dt;
struct drm_mode_map_dumb map_req;
int prot, ret;
@@ -198,16 +255,20 @@ kms_sw_displaytarget_map(struct sw_winsys *ws,
return NULL;
prot = (flags == PIPE_TRANSFER_READ) ? PROT_READ : (PROT_READ | PROT_WRITE);
kms_sw_dt->mapped = mmap(0, kms_sw_dt->size, prot, MAP_SHARED,
kms_sw->fd, map_req.offset);
void **ptr = (flags == PIPE_TRANSFER_READ) ? &kms_sw_dt->ro_mapped : &kms_sw_dt->mapped;
if (*ptr == NULL) {
void * tmp = mmap(0, kms_sw_dt->size, prot, MAP_SHARED,
kms_sw->fd, map_req.offset);
if (tmp == MAP_FAILED)
return NULL;
*ptr = tmp;
}
if (kms_sw_dt->mapped == MAP_FAILED)
return NULL;
DEBUG_PRINT("KMS-DEBUG: mapped buffer %u (size %u) at %p %dx%d \n",
kms_sw_dt->handle, kms_sw_dt->size, *ptr,
plane->width, plane->height);
DEBUG_PRINT("KMS-DEBUG: mapped buffer %u (size %u) at %p\n",
kms_sw_dt->handle, kms_sw_dt->size, kms_sw_dt->mapped);
return kms_sw_dt->mapped;
return *ptr + plane->offset;
}
static struct kms_sw_displaytarget *
@@ -230,10 +291,11 @@ kms_sw_displaytarget_find_and_ref(struct kms_sw_winsys *kms_sw,
return NULL;
}
static struct kms_sw_displaytarget *
static struct kms_sw_plane *
kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd,
enum pipe_format format,
unsigned width, unsigned height,
unsigned stride)
unsigned stride, unsigned offset)
{
uint32_t handle = -1;
struct kms_sw_displaytarget * kms_sw_dt;
@@ -245,13 +307,19 @@ kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd,
return NULL;
kms_sw_dt = kms_sw_displaytarget_find_and_ref(kms_sw, handle);
if (kms_sw_dt)
return kms_sw_dt;
struct kms_sw_plane * plane = NULL;
if (kms_sw_dt) {
plane = get_plane(kms_sw_dt, format, width, height, stride, offset);
if (plane == NULL)
kms_sw_dt->ref_count --;
return plane;
}
kms_sw_dt = CALLOC_STRUCT(kms_sw_displaytarget);
if (!kms_sw_dt)
return NULL;
list_inithead(&kms_sw_dt->planes);
off_t lseek_ret = lseek(fd, 0, SEEK_END);
if (lseek_ret == -1) {
FREE(kms_sw_dt);
@@ -260,27 +328,27 @@ kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd,
kms_sw_dt->size = lseek_ret;
kms_sw_dt->ref_count = 1;
kms_sw_dt->handle = handle;
kms_sw_dt->width = width;
kms_sw_dt->height = height;
kms_sw_dt->stride = stride;
lseek(fd, 0, SEEK_SET);
plane = get_plane(kms_sw_dt, format, width, height, stride, offset);
if (plane == NULL) {
FREE(kms_sw_dt);
return NULL;
}
list_add(&kms_sw_dt->link, &kms_sw->bo_list);
return kms_sw_dt;
return plane;
}
static void
kms_sw_displaytarget_unmap(struct sw_winsys *ws,
struct sw_displaytarget *dt)
{
struct kms_sw_displaytarget *kms_sw_dt = kms_sw_displaytarget(dt);
struct kms_sw_plane * plane = kms_sw_plane(dt);
struct kms_sw_displaytarget *kms_sw_dt = plane->dt;
DEBUG_PRINT("KMS-DEBUG: unmapped buffer %u (was %p)\n", kms_sw_dt->handle, kms_sw_dt->mapped);
munmap(kms_sw_dt->mapped, kms_sw_dt->size);
kms_sw_dt->mapped = NULL;
DEBUG_PRINT("KMS-DEBUG: ignore unmap buffer %u \n", kms_sw_dt->handle);
}
static struct sw_displaytarget *
@@ -291,30 +359,34 @@ kms_sw_displaytarget_from_handle(struct sw_winsys *ws,
{
struct kms_sw_winsys *kms_sw = kms_sw_winsys(ws);
struct kms_sw_displaytarget *kms_sw_dt;
struct kms_sw_plane *kms_sw_pl;
assert(whandle->type == DRM_API_HANDLE_TYPE_KMS ||
whandle->type == DRM_API_HANDLE_TYPE_FD);
if (whandle->offset != 0) {
DEBUG_PRINT("KMS-DEBUG: attempt to import unsupported winsys offset %d\n",
whandle->offset);
return NULL;
}
switch(whandle->type) {
case DRM_API_HANDLE_TYPE_FD:
kms_sw_dt = kms_sw_displaytarget_add_from_prime(kms_sw, whandle->handle,
kms_sw_pl = kms_sw_displaytarget_add_from_prime(kms_sw, whandle->handle,
templ->format,
templ->width0,
templ->height0,
whandle->stride);
if (kms_sw_dt)
*stride = kms_sw_dt->stride;
return (struct sw_displaytarget *)kms_sw_dt;
whandle->stride,
whandle->offset);
if (kms_sw_pl) {
*stride = kms_sw_pl->stride;
}
return (struct sw_displaytarget *)kms_sw_pl;
case DRM_API_HANDLE_TYPE_KMS:
kms_sw_dt = kms_sw_displaytarget_find_and_ref(kms_sw, whandle->handle);
if (kms_sw_dt) {
*stride = kms_sw_dt->stride;
return (struct sw_displaytarget *)kms_sw_dt;
struct kms_sw_plane * plane;
LIST_FOR_EACH_ENTRY(plane, &kms_sw_dt->planes, link) {
if (whandle->offset == plane->offset) {
*stride = plane->stride;
return (struct sw_displaytarget *)plane;
}
}
kms_sw_dt->ref_count --;
}
/* fallthrough */
default:
@@ -331,19 +403,20 @@ kms_sw_displaytarget_get_handle(struct sw_winsys *winsys,
struct winsys_handle *whandle)
{
struct kms_sw_winsys *kms_sw = kms_sw_winsys(winsys);
struct kms_sw_displaytarget *kms_sw_dt = kms_sw_displaytarget(dt);
struct kms_sw_plane *plane = kms_sw_plane(dt);
struct kms_sw_displaytarget *kms_sw_dt = plane->dt;
switch(whandle->type) {
case DRM_API_HANDLE_TYPE_KMS:
whandle->handle = kms_sw_dt->handle;
whandle->stride = kms_sw_dt->stride;
whandle->offset = 0;
whandle->stride = plane->stride;
whandle->offset = plane->offset;
return TRUE;
case DRM_API_HANDLE_TYPE_FD:
if (!drmPrimeHandleToFD(kms_sw->fd, kms_sw_dt->handle,
DRM_CLOEXEC, (int*)&whandle->handle)) {
whandle->stride = kms_sw_dt->stride;
whandle->offset = 0;
whandle->stride = plane->stride;
whandle->offset = plane->offset;
return TRUE;
}
/* fallthrough */

View File

@@ -46,6 +46,7 @@ COMPILER_FILES = \
compiler/brw_eu_util.c \
compiler/brw_eu_validate.c \
compiler/brw_fs_builder.h \
compiler/brw_fs_bank_conflicts.cpp \
compiler/brw_fs_cmod_propagation.cpp \
compiler/brw_fs_combine_constants.cpp \
compiler/brw_fs_copy_propagation.cpp \

View File

@@ -98,6 +98,7 @@ ends_block(const backend_instruction *inst)
op == BRW_OPCODE_ELSE ||
op == BRW_OPCODE_CONTINUE ||
op == BRW_OPCODE_BREAK ||
op == BRW_OPCODE_DO ||
op == BRW_OPCODE_WHILE;
}
@@ -268,13 +269,57 @@ cfg_t::cfg_t(exec_list *instructions)
}
cur->instructions.push_tail(inst);
/* Represent divergent execution of the loop as a pair of alternative
* edges coming out of the DO instruction: For any physical iteration
* of the loop a given logical thread can either start off enabled
* (which is represented as the "next" successor), or disabled (if it
* has reached a non-uniform exit of the loop during a previous
* iteration, which is represented as the "cur_while" successor).
*
* The disabled edge will be taken by the logical thread anytime we
* arrive at the DO instruction through a back-edge coming from a
* conditional exit of the loop where divergent control flow started.
*
* This guarantees that there is a control-flow path from any
* divergence point of the loop into the convergence point
* (immediately past the WHILE instruction) such that it overlaps the
* whole IP region of divergent control flow (potentially the whole
* loop) *and* doesn't imply the execution of any instructions part
* of the loop (since the corresponding execution mask bit will be
* disabled for a diverging thread).
*
* This way we make sure that any variables that are live throughout
* the region of divergence for an inactive logical thread are also
* considered to interfere with any other variables assigned by
* active logical threads within the same physical region of the
* program, since otherwise we would risk cross-channel data
* corruption.
*/
next = new_block();
cur->add_successor(mem_ctx, next);
cur->add_successor(mem_ctx, cur_while);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_CONTINUE:
cur->instructions.push_tail(inst);
/* A conditional CONTINUE may start a region of divergent control
* flow until the start of the next loop iteration (*not* until the
* end of the loop which is why the successor is not the top-level
* divergence point at cur_do). The live interval of any variable
* extending through a CONTINUE edge is guaranteed to overlap the
* whole region of divergent execution, because any variable live-out
* at the CONTINUE instruction will also be live-in at the top of the
* loop, and therefore also live-out at the bottom-most point of the
* loop which is reachable from the top (since a control flow path
* exists from a definition of the variable through this CONTINUE
* instruction, the top of the loop, the (reachable) bottom of the
* loop, the top of the loop again, into a use of the variable).
*/
assert(cur_do != NULL);
cur->add_successor(mem_ctx, cur_do);
cur->add_successor(mem_ctx, cur_do->next());
next = new_block();
if (inst->predicate)
@@ -286,8 +331,18 @@ cfg_t::cfg_t(exec_list *instructions)
case BRW_OPCODE_BREAK:
cur->instructions.push_tail(inst);
assert(cur_while != NULL);
cur->add_successor(mem_ctx, cur_while);
/* A conditional BREAK instruction may start a region of divergent
* control flow until the end of the loop if the condition is
* non-uniform, in which case the loop will execute additional
* iterations with the present channel disabled. We model this as a
* control flow path from the divergence point to the convergence
* point that overlaps the whole IP range of the loop and skips over
* the execution of any other instructions part of the loop.
*
* See the DO case for additional explanation.
*/
assert(cur_do != NULL);
cur->add_successor(mem_ctx, cur_do);
next = new_block();
if (inst->predicate)
@@ -300,10 +355,18 @@ cfg_t::cfg_t(exec_list *instructions)
cur->instructions.push_tail(inst);
assert(cur_do != NULL && cur_while != NULL);
cur->add_successor(mem_ctx, cur_do);
if (inst->predicate)
cur->add_successor(mem_ctx, cur_while);
/* A conditional WHILE instruction may start a region of divergent
* control flow until the end of the loop, just like the BREAK
* instruction. See the BREAK case for more details. OTOH an
* unconditional WHILE instruction is non-divergent (just like an
* unconditional CONTINUE), and will necessarily lead to the
* execution of an additional iteration of the loop for all enabled
* channels, so we may skip over the divergence point at the top of
* the loop to keep the CFG as unambiguous as possible.
*/
cur->add_successor(mem_ctx, inst->predicate ? cur_do :
cur_do->next());
set_next_block(&cur, cur_while, ip);

View File

@@ -5961,6 +5961,8 @@ fs_visitor::allocate_registers(bool allow_spilling)
if (failed)
return;
opt_bank_conflicts();
schedule_instructions(SCHEDULE_POST);
if (last_scratch > 0) {

View File

@@ -145,6 +145,8 @@ public:
exec_list *acp);
bool opt_drop_redundant_mov_to_flags();
bool opt_register_renaming();
bool opt_bank_conflicts();
unsigned bank_conflict_cycles(const fs_inst *inst) const;
bool register_coalesce();
bool compute_to_mrf();
bool eliminate_find_live_channel();

View File

@@ -0,0 +1,912 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_bank_conflicts.cpp
*
* This file contains a GRF bank conflict mitigation pass. The pass is
* intended to be run after register allocation and works by rearranging the
* layout of the GRF space (without altering the semantics of the program) in
* a way that minimizes the number of GRF bank conflicts incurred by ternary
* instructions.
*
* Unfortunately there is close to no information about bank conflicts in the
* hardware spec, but experimentally on Gen7-Gen9 ternary instructions seem to
* incur an average bank conflict penalty of one cycle per SIMD8 op whenever
* the second and third source are stored in the same GRF bank (\sa bank_of()
* for the exact bank layout) which cannot be fetched during the same cycle by
* the EU, unless the EU logic manages to optimize out the read cycle of a
* duplicate source register (\sa is_conflict_optimized_out()).
*
* The asymptotic run-time of the algorithm is dominated by the
* shader_conflict_weight_matrix() computation below, which is O(n) on the
* number of instructions in the program, however for small and medium-sized
* programs the run-time is likely to be dominated by
* optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of
* the program (\sa partitioning), which is bounded (since the program uses a
* bounded number of registers post-regalloc) and of the order of 100. For
* that reason optimize_reg_permutation() is vectorized in order to keep the
* cubic term within reasonable bounds for m close to its theoretical maximum.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#ifdef __SSE2__
#include <emmintrin.h>
/**
* Thin layer around vector intrinsics so they can be easily replaced with
* e.g. the fall-back scalar path, an implementation with different vector
* width or using different SIMD architectures (AVX-512?!).
*
* This implementation operates on pairs of independent SSE2 integer vectors à
* la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually
* all platforms that care about bank conflicts, so this path should almost
* always be available in practice.
*/
namespace {
/**
* SIMD integer vector data type.
*/
struct vector_type {
__m128i v[2];
};
/**
* Scalar data type matching the representation of a single component of \p
* vector_type.
*/
typedef int16_t scalar_type;
/**
* Maximum integer value representable as a \p scalar_type.
*/
const scalar_type max_scalar = INT16_MAX;
/**
* Number of components of a \p vector_type.
*/
const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
/**
* Set the i-th component of vector \p v to \p x.
*/
void
set(vector_type &v, unsigned i, scalar_type x)
{
assert(i < vector_width);
memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
}
/**
* Get the i-th component of vector \p v.
*/
scalar_type
get(const vector_type &v, unsigned i)
{
assert(i < vector_width);
scalar_type x;
memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
return x;
}
/**
* Add two vectors with saturation.
*/
vector_type
adds(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_adds_epi16(v.v[0], w.v[0]),
_mm_adds_epi16(v.v[1], w.v[1])
}};
return u;
}
/**
* Subtract two vectors with saturation.
*/
vector_type
subs(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_subs_epi16(v.v[0], w.v[0]),
_mm_subs_epi16(v.v[1], w.v[1])
}};
return u;
}
/**
* Compute the bitwise conjunction of two vectors.
*/
vector_type
mask(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_and_si128(v.v[0], w.v[0]),
_mm_and_si128(v.v[1], w.v[1])
}};
return u;
}
/**
* Reduce the components of a vector using saturating addition.
*/
scalar_type
sums(const vector_type &v)
{
const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
return _mm_extract_epi16(v1, 0);
}
}
#else
/**
* Thin layer around vector intrinsics so they can be easily replaced with
* e.g. the fall-back scalar path, an implementation with different vector
* width or using different SIMD architectures (AVX-512?!).
*
* This implementation operates on scalar values and doesn't rely on
* any vector extensions. This is mainly intended for debugging and
* to keep this file building on exotic platforms.
*/
namespace {
/**
* SIMD integer vector data type.
*/
typedef int16_t vector_type;
/**
* Scalar data type matching the representation of a single component of \p
* vector_type.
*/
typedef int16_t scalar_type;
/**
* Maximum integer value representable as a \p scalar_type.
*/
const scalar_type max_scalar = INT16_MAX;
/**
* Number of components of a \p vector_type.
*/
const unsigned vector_width = 1;
/**
* Set the i-th component of vector \p v to \p x.
*/
void
set(vector_type &v, unsigned i, scalar_type x)
{
assert(i < vector_width);
v = x;
}
/**
* Get the i-th component of vector \p v.
*/
scalar_type
get(const vector_type &v, unsigned i)
{
assert(i < vector_width);
return v;
}
/**
* Add two vectors with saturation.
*/
vector_type
adds(vector_type v, vector_type w)
{
return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
}
/**
* Subtract two vectors with saturation.
*/
vector_type
subs(vector_type v, vector_type w)
{
return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
}
/**
* Compute the bitwise conjunction of two vectors.
*/
vector_type
mask(vector_type v, vector_type w)
{
return v & w;
}
/**
* Reduce the components of a vector using saturating addition.
*/
scalar_type
sums(vector_type v)
{
return v;
}
}
#endif
/**
* Swap \p x and \p y.
*/
#define SWAP(x, y) do { \
__typeof(y) _swap_tmp = y; \
y = x; \
x = _swap_tmp; \
} while (0)
namespace {
/**
* Variable-length vector type intended to represent cycle-count costs for
* arbitrary atom-to-bank assignments. It's indexed by a pair of integers
* (i, p), where i is an atom index and p in {0, 1} indicates the parity of
* the conflict (respectively, whether the cost is incurred whenever the
* atoms are assigned the same bank b or opposite-parity banks b and b^1).
* \sa shader_conflict_weight_matrix()
*/
struct weight_vector_type {
weight_vector_type() : v(NULL), size(0) {}
weight_vector_type(unsigned n) :
v(new vector_type[DIV_ROUND_UP(n, vector_width)]()),
size(n) {}
weight_vector_type(const weight_vector_type &u) :
v(new vector_type[DIV_ROUND_UP(u.size, vector_width)]()),
size(u.size)
{
memcpy(v, u.v,
DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
}
~weight_vector_type()
{
delete[] v;
}
weight_vector_type &
operator=(weight_vector_type u)
{
SWAP(v, u.v);
SWAP(size, u.size);
return *this;
}
vector_type *v;
unsigned size;
};
/**
* Set the (i, p)-th component of weight vector \p v to \p x.
*/
void
set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
{
set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
}
/**
* Get the (i, p)-th component of weight vector \p v.
*/
scalar_type
get(const weight_vector_type &v, unsigned i, unsigned p)
{
return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
}
/**
* Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
*/
void
swap(weight_vector_type &v,
unsigned i, unsigned p,
unsigned j, unsigned q)
{
const scalar_type tmp = get(v, i, p);
set(v, i, p, get(v, j, q));
set(v, j, q, tmp);
}
}
namespace {
/**
* Object that represents the partitioning of an arbitrary register space
* into indivisible units (referred to as atoms below) that can potentially
* be rearranged independently from other registers. The partitioning is
* inferred from a number of contiguity requirements specified using
* require_contiguous(). This allows efficient look-up of the atom index a
* given register address belongs to, or conversely the range of register
* addresses that belong to a given atom.
*/
struct partitioning {
/**
* Create a (for the moment unrestricted) partitioning of a register
* file of size \p n. The units are arbitrary.
*/
partitioning(unsigned n) :
max_reg(n),
offsets(new unsigned[n + num_terminator_atoms]),
atoms(new unsigned[n + num_terminator_atoms])
{
for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
offsets[i] = i;
atoms[i] = i;
}
}
partitioning(const partitioning &p) :
max_reg(p.max_reg),
offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
atoms(new unsigned[p.max_reg + num_terminator_atoms])
{
memcpy(offsets, p.offsets,
sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
memcpy(atoms, p.atoms,
sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
}
~partitioning()
{
delete[] offsets;
delete[] atoms;
}
partitioning &
operator=(partitioning p)
{
SWAP(max_reg, p.max_reg);
SWAP(offsets, p.offsets);
SWAP(atoms, p.atoms);
return *this;
}
/**
* Require register range [reg, reg + n[ to be considered part of the
* same atom.
*/
void
require_contiguous(unsigned reg, unsigned n)
{
unsigned r = atoms[reg];
/* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
* case that the specified contiguity requirement leads to the fusion
* (yay) of one or more existing atoms.
*/
for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
if (offsets[atoms[reg1]] < reg + n) {
atoms[reg1] = r;
} else {
if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
r++;
offsets[r] = offsets[atoms[reg1]];
atoms[reg1] = r;
}
}
}
/**
* Get the atom index register address \p reg belongs to.
*/
unsigned
atom_of_reg(unsigned reg) const
{
return atoms[reg];
}
/**
* Get the base register address that belongs to atom \p r.
*/
unsigned
reg_of_atom(unsigned r) const
{
return offsets[r];
}
/**
* Get the size of atom \p r in register address units.
*/
unsigned
size_of_atom(unsigned r) const
{
assert(r < num_atoms());
return reg_of_atom(r + 1) - reg_of_atom(r);
}
/**
* Get the number of atoms the whole register space is partitioned into.
*/
unsigned
num_atoms() const
{
return atoms[max_reg];
}
private:
/**
* Number of trailing atoms inserted for convenience so among other
* things we don't need to special-case the last element in
* size_of_atom().
*/
static const unsigned num_terminator_atoms = 1;
unsigned max_reg;
unsigned *offsets;
unsigned *atoms;
};
/**
* Only GRF sources (whether they have been register-allocated or not) can
* possibly incur bank conflicts.
*/
bool
is_grf(const fs_reg &r)
{
return r.file == VGRF || r.file == FIXED_GRF;
}
/**
* Register offset of \p r in GRF units. Useful because the representation
* of GRFs post-register allocation is somewhat inconsistent and depends on
* whether the register already had a fixed GRF offset prior to register
* allocation or whether it was part of a VGRF allocation.
*/
unsigned
reg_of(const fs_reg &r)
{
assert(is_grf(r));
if (r.file == VGRF)
return r.nr + r.offset / REG_SIZE;
else
return reg_offset(r) / REG_SIZE;
}
/**
* Calculate the finest partitioning of the GRF space compatible with the
* register contiguity requirements derived from all instructions part of
* the program.
*/
partitioning
shader_reg_partitioning(const fs_visitor *v)
{
partitioning p(BRW_MAX_GRF);
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (is_grf(inst->dst))
p.require_contiguous(reg_of(inst->dst), regs_written(inst));
for (int i = 0; i < inst->sources; i++) {
if (is_grf(inst->src[i]))
p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
}
}
return p;
}
/**
* Return the set of GRF atoms that should be left untouched at their
* original location to avoid violating hardware or software assumptions.
*/
bool *
shader_reg_constraints(const fs_visitor *v, const partitioning &p)
{
bool *constrained = new bool[p.num_atoms()]();
/* These are read implicitly by some send-message instructions without
* any indication at the IR level. Assume they are unsafe to move
* around.
*/
for (unsigned reg = 0; reg < 2; reg++)
constrained[p.atom_of_reg(reg)] = true;
/* Assume that anything referenced via fixed GRFs is baked into the
* hardware's fixed-function logic and may be unsafe to move around.
* Also take into account the source GRF restrictions of EOT
* send-message instructions.
*/
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (inst->dst.file == FIXED_GRF)
constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == FIXED_GRF ||
(is_grf(inst->src[i]) && inst->eot))
constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
}
}
return constrained;
}
/**
* Return whether the hardware will be able to prevent a bank conflict by
* optimizing out the read cycle of a source register. The formula was
* found experimentally.
*/
bool
is_conflict_optimized_out(const gen_device_info *devinfo, const fs_inst *inst)
{
return devinfo->gen >= 9 &&
((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
reg_of(inst->src[1]) == reg_of(inst->src[2]));
}
/**
* Return a matrix that allows reasonably efficient computation of the
* cycle-count cost of bank conflicts incurred throughout the whole program
* for any given atom-to-bank assignment.
*
* More precisely, if C_r_s_p is the result of this function, the total
* cost of all bank conflicts involving any given atom r can be readily
* recovered as follows:
*
* S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
*
* where d_i_j is the Kronecker delta, and B_r indicates the bank
* assignment of r. \sa delta_conflicts() for a vectorized implementation
* of the expression above.
*
* FINISHME: Teach this about the Gen10+ bank conflict rules, which are
* somewhat more relaxed than on previous generations. In the
* meantime optimizing based on Gen9 weights is likely to be more
* helpful than not optimizing at all.
*/
weight_vector_type *
shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
{
weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
for (unsigned r = 0; r < p.num_atoms(); r++)
conflicts[r] = weight_vector_type(2 * p.num_atoms());
/* Crude approximation of the number of times the current basic block
* will be executed at run-time.
*/
unsigned block_scale = 1;
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (inst->opcode == BRW_OPCODE_DO) {
block_scale *= 10;
} else if (inst->opcode == BRW_OPCODE_WHILE) {
block_scale /= 10;
} else if (inst->is_3src(v->devinfo) &&
is_grf(inst->src[1]) && is_grf(inst->src[2])) {
const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
/* Estimate of the cycle-count cost of incurring a bank conflict
* for this instruction. This is only true on the average, for a
* sequence of back-to-back ternary instructions, since the EU
* front-end only seems to be able to issue a new instruction at
* an even cycle. The cost of a bank conflict incurred by an
* isolated ternary instruction may be higher.
*/
const unsigned exec_size = inst->dst.component_size(inst->exec_size);
const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
REG_SIZE);
/* Neglect same-atom conflicts (since they're either trivial or
* impossible to avoid without splitting the atom), and conflicts
* known to be optimized out by the hardware.
*/
if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
/* Calculate the parity of the sources relative to the start of
* their respective atoms. If their parity is the same (and
* none of the atoms straddle the 2KB mark), the instruction
* will incur a conflict iff both atoms are assigned the same
* bank b. If their parity is opposite, the instruction will
* incur a conflict iff they are assigned opposite banks (b and
* b^1).
*/
const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
const unsigned p = p_r ^ p_s;
/* Calculate the updated cost of a hypothetical conflict
* between atoms r and s. Note that the weight matrix is
* symmetric with respect to indices r and s by construction.
*/
const scalar_type w = MIN2(unsigned(max_scalar),
get(conflicts[r], s, p) + cycle_scale);
set(conflicts[r], s, p, w);
set(conflicts[s], r, p, w);
}
}
}
return conflicts;
}
/**
* Return the set of GRF atoms that could potentially lead to bank
* conflicts if laid out unfavorably in the GRF space according to
* the specified \p conflicts matrix (\sa
* shader_conflict_weight_matrix()).
*/
bool *
have_any_conflicts(const partitioning &p,
const weight_vector_type *conflicts)
{
bool *any_conflicts = new bool[p.num_atoms()]();
for (unsigned r = 0; r < p.num_atoms(); r++) {
const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
for (unsigned s = 0; s < m; s++)
any_conflicts[r] |= sums(conflicts[r].v[s]);
}
return any_conflicts;
}
/**
* Calculate the difference between two S(B) cost estimates as defined
* above (\sa shader_conflict_weight_matrix()). This represents the
* (partial) cycle-count benefit from moving an atom r from bank p to n.
* The respective bank assignments Bp and Bn are encoded as the \p
* bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
* according to the formula:
*
* bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
*
* Notice the similarity with the delta function in the S(B) expression
* above, and how bank_mask(B) can be precomputed for every possible
* selection of r since bank_mask(B) only depends on it via B_r that may
* only assume one of four different values, so the caller can keep every
* possible bank_mask(B) vector in memory without much hassle (\sa
* bank_characteristics()).
*/
int
delta_conflicts(const weight_vector_type &bank_mask_p,
const weight_vector_type &bank_mask_n,
const weight_vector_type &conflicts)
{
const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
vector_type s_p = {}, s_n = {};
for (unsigned r = 0; r < m; r++) {
s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
}
return sums(subs(s_p, s_n));
}
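Restating the two ASCII formulas from the comments above in conventional notation (same content; \delta is the Kronecker delta, \oplus is XOR, and B_r is the bank assigned to atom r):

   S(B) = \sum_{s,p} \delta_{(p \oplus B_r),\,B_s} \; C_{r,s,p}

   \mathrm{bank\_mask}(B)_{s,p} = -\,\delta_{(p \oplus B_r),\,B_s}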
/**
* Register atom permutation, represented as the start GRF offset each atom
* is mapped into.
*/
struct permutation {
permutation() : v(NULL), size(0) {}
permutation(unsigned n) :
v(new unsigned[n]()), size(n) {}
permutation(const permutation &p) :
v(new unsigned[p.size]), size(p.size)
{
memcpy(v, p.v, p.size * sizeof(unsigned));
}
~permutation()
{
delete[] v;
}
permutation &
operator=(permutation p)
{
SWAP(v, p.v);
SWAP(size, p.size);
return *this;
}
unsigned *v;
unsigned size;
};
/**
* Return an identity permutation of GRF atoms.
*/
permutation
identity_reg_permutation(const partitioning &p)
{
permutation map(p.num_atoms());
for (unsigned r = 0; r < map.size; r++)
map.v[r] = p.reg_of_atom(r);
return map;
}
/**
* Return the bank index of GRF address \p reg, numbered according to the
* table:
* Even Odd
* Lo 0 1
* Hi 2 3
*/
unsigned
bank_of(unsigned reg)
{
return (reg & 0x40) >> 5 | (reg & 1);
}
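A quick sanity check of the bank numbering table above (a small standalone sketch, not part of the pass):

#include <assert.h>

/* Reproduces the bank_of() mapping: bit 0 of the GRF address selects the
 * even/odd column, bit 6 (the 2KB half of the register file) the Lo/Hi row. */
static unsigned
example_bank_of(unsigned reg)
{
   return (reg & 0x40) >> 5 | (reg & 1);
}

static void
check_bank_layout(void)
{
   assert(example_bank_of(0)  == 0); /* Lo, even */
   assert(example_bank_of(1)  == 1); /* Lo, odd  */
   assert(example_bank_of(64) == 2); /* Hi, even */
   assert(example_bank_of(65) == 3); /* Hi, odd  */
}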
/**
* Return bitmasks suitable for use as bank mask arguments for the
* delta_conflicts() computation. Note that this is just the (negative)
* characteristic function of each bank, if you regard it as a set
* containing all atoms assigned to it according to the \p map array.
*/
weight_vector_type *
bank_characteristics(const permutation &map)
{
weight_vector_type *banks = new weight_vector_type[4];
for (unsigned b = 0; b < 4; b++) {
banks[b] = weight_vector_type(2 * map.size);
for (unsigned j = 0; j < map.size; j++) {
for (unsigned p = 0; p < 2; p++)
set(banks[b], j, p,
(b ^ p) == bank_of(map.v[j]) ? -1 : 0);
}
}
return banks;
}
/**
* Return an improved permutation of GRF atoms based on \p map attempting
* to reduce the total cycle-count cost of bank conflicts greedily.
*
* Note that this doesn't attempt to merge multiple atoms into one, which
* may allow it to do a better job in some cases -- It simply reorders
* existing atoms in the GRF space without affecting their identity.
*/
permutation
optimize_reg_permutation(const partitioning &p,
const bool *constrained,
const weight_vector_type *conflicts,
permutation map)
{
const bool *any_conflicts = have_any_conflicts(p, conflicts);
weight_vector_type *banks = bank_characteristics(map);
for (unsigned r = 0; r < map.size; r++) {
const unsigned bank_r = bank_of(map.v[r]);
if (!constrained[r]) {
unsigned best_s = r;
int best_benefit = 0;
for (unsigned s = 0; s < map.size; s++) {
const unsigned bank_s = bank_of(map.v[s]);
if (bank_r != bank_s && !constrained[s] &&
p.size_of_atom(r) == p.size_of_atom(s) &&
(any_conflicts[r] || any_conflicts[s])) {
const int benefit =
delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);
if (benefit > best_benefit) {
best_s = s;
best_benefit = benefit;
}
}
}
if (best_s != r) {
for (unsigned b = 0; b < 4; b++) {
for (unsigned p = 0; p < 2; p++)
swap(banks[b], r, p, best_s, p);
}
SWAP(map.v[r], map.v[best_s]);
}
}
}
delete[] banks;
delete[] any_conflicts;
return map;
}
/**
* Apply the GRF atom permutation given by \p map to register \p r and
* return the result.
*/
fs_reg
transform(const partitioning &p, const permutation &map, fs_reg r)
{
if (r.file == VGRF) {
const unsigned reg = reg_of(r);
const unsigned s = p.atom_of_reg(reg);
r.nr = map.v[s] + reg - p.reg_of_atom(s);
r.offset = r.offset % REG_SIZE;
}
return r;
}
}
bool
fs_visitor::opt_bank_conflicts()
{
assert(grf_used || !"Must be called after register allocation");
/* No ternary instructions -- No bank conflicts. */
if (devinfo->gen < 6)
return false;
const partitioning p = shader_reg_partitioning(this);
const bool *constrained = shader_reg_constraints(this, p);
const weight_vector_type *conflicts =
shader_conflict_weight_matrix(this, p);
const permutation map =
optimize_reg_permutation(p, constrained, conflicts,
identity_reg_permutation(p));
foreach_block_and_inst(block, fs_inst, inst, cfg) {
inst->dst = transform(p, map, inst->dst);
for (int i = 0; i < inst->sources; i++)
inst->src[i] = transform(p, map, inst->src[i]);
}
delete[] conflicts;
delete[] constrained;
return true;
}
/**
* Estimate the number of GRF bank conflict cycles incurred by an instruction.
*
* Note that this neglects conflict cycles prior to register allocation
* because we don't know which bank each VGRF is going to end up aligned to.
*/
unsigned
fs_visitor::bank_conflict_cycles(const fs_inst *inst) const
{
if (grf_used && inst->is_3src(devinfo) &&
is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
!is_conflict_optimized_out(devinfo, inst)) {
return DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
} else {
return 0;
}
}

View File

@@ -36,9 +36,12 @@
#include "util/bitset.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
#include "brw_eu.h"
using namespace brw;
namespace { /* avoid conflict with opt_copy_propagation_elements */
struct acp_entry : public exec_node {
fs_reg dst;
@@ -77,12 +80,19 @@ struct block_data {
* course of this block.
*/
BITSET_WORD *kill;
/**
* Which entries in the fs_copy_prop_dataflow acp table are guaranteed to
* have a fully uninitialized destination at the end of this block.
*/
BITSET_WORD *undef;
};
class fs_copy_prop_dataflow
{
public:
fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
const fs_live_variables *live,
exec_list *out_acp[ACP_HASH_SIZE]);
void setup_initial_values();
@@ -92,6 +102,7 @@ public:
void *mem_ctx;
cfg_t *cfg;
const fs_live_variables *live;
acp_entry **acp;
int num_acp;
@@ -102,8 +113,9 @@ public:
} /* anonymous namespace */
fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
const fs_live_variables *live,
exec_list *out_acp[ACP_HASH_SIZE])
: mem_ctx(mem_ctx), cfg(cfg)
: mem_ctx(mem_ctx), cfg(cfg), live(live)
{
bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
@@ -124,6 +136,7 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].undef = rzalloc_array(bd, BITSET_WORD, bitset_words);
for (int i = 0; i < ACP_HASH_SIZE; i++) {
foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
@@ -173,8 +186,7 @@ fs_copy_prop_dataflow::setup_initial_values()
/* Populate the initial values for the livein and liveout sets. For the
* block at the start of the program, livein = 0 and liveout = copy.
* For the others, set liveout to 0 (the empty set) and livein to ~0
* (the universal set).
* For the others, set liveout and livein to ~0 (the universal set).
*/
foreach_block (block, cfg) {
if (block->parents.is_empty()) {
@@ -184,11 +196,23 @@ fs_copy_prop_dataflow::setup_initial_values()
}
} else {
for (int i = 0; i < bitset_words; i++) {
bd[block->num].liveout[i] = 0u;
bd[block->num].liveout[i] = ~0u;
bd[block->num].livein[i] = ~0u;
}
}
}
/* Initialize the undef set. */
foreach_block (block, cfg) {
for (int i = 0; i < num_acp; i++) {
BITSET_SET(bd[block->num].undef, i);
for (unsigned off = 0; off < acp[i]->size_written; off += REG_SIZE) {
if (BITSET_TEST(live->block_data[block->num].defout,
live->var_from_reg(byte_offset(acp[i]->dst, off))))
BITSET_CLEAR(bd[block->num].undef, i);
}
}
}
}
/**
@@ -203,14 +227,40 @@ fs_copy_prop_dataflow::run()
do {
progress = false;
/* Update liveout for all blocks. */
foreach_block (block, cfg) {
if (block->parents.is_empty())
continue;
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD old_liveout = bd[block->num].liveout[i];
BITSET_WORD livein_from_any_block = 0;
/* Update livein for this block. If a copy is live out of all
* parent blocks, it's live coming in to this block.
*/
bd[block->num].livein[i] = ~0u;
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
bblock_t *parent = parent_link->block;
/* Consider ACP entries with a known-undefined destination to
* be available from the parent. This is valid because we're
* free to set the undefined variable equal to the source of
* the ACP entry without breaking the application's
* expectations, since the variable is undefined.
*/
bd[block->num].livein[i] &= (bd[parent->num].liveout[i] |
bd[parent->num].undef[i]);
livein_from_any_block |= bd[parent->num].liveout[i];
}
/* Limit to the set of ACP entries that can possibly be available
* at the start of the block, since propagating from a variable
* which is guaranteed to be undefined (rather than potentially
* undefined for some dynamic control-flow paths) doesn't seem
* particularly useful.
*/
bd[block->num].livein[i] &= livein_from_any_block;
/* Update liveout for this block. */
bd[block->num].liveout[i] =
bd[block->num].copy[i] | (bd[block->num].livein[i] &
~bd[block->num].kill[i]);
@@ -219,27 +269,6 @@ fs_copy_prop_dataflow::run()
progress = true;
}
}
/* Update livein for all blocks. If a copy is live out of all parent
* blocks, it's live coming in to this block.
*/
foreach_block (block, cfg) {
if (block->parents.is_empty())
continue;
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD old_livein = bd[block->num].livein[i];
bd[block->num].livein[i] = ~0u;
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
bblock_t *parent = parent_link->block;
bd[block->num].livein[i] &= bd[parent->num].liveout[i];
}
if (old_livein != bd[block->num].livein[i])
progress = true;
}
}
} while (progress);
}
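
Reading the rewritten loop as a dataflow recurrence may help; a minimal sketch of the per-block fixed point it iterates to, using | and & as bitset union and intersection:

   livein(B)  = ( AND over parents P of (liveout(P) | undef(P)) )
                & ( OR over parents P of liveout(P) )
   liveout(B) = copy(B) | ( livein(B) & ~kill(B) )

An ACP entry whose destination is provably undefined in a parent no longer blocks the intersection, but the entry must still be live out of at least one parent before it counts as available at the block entry.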
@@ -830,6 +859,8 @@ fs_visitor::opt_copy_propagation()
for (int i = 0; i < cfg->num_blocks; i++)
out_acp[i] = new exec_list [ACP_HASH_SIZE];
calculate_live_intervals();
/* First, walk through each block doing local copy propagation and getting
* the set of copies available at the end of the block.
*/
@@ -839,7 +870,7 @@ fs_visitor::opt_copy_propagation()
}
/* Do dataflow analysis for those available copies. */
fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp);
fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, live_intervals, out_acp);
/* Next, re-run local copy propagation, this time with the set of copies
* provided by the dataflow analysis available at the start of a block.

View File

@@ -83,9 +83,11 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
/* The def[] bitset marks when an initialization in a block completely
* screens off previous updates of that variable (VGRF channel).
*/
if (inst->dst.file == VGRF && !inst->is_partial_write()) {
if (!BITSET_TEST(bd->use, var))
if (inst->dst.file == VGRF) {
if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
BITSET_SET(bd->def, var);
BITSET_SET(bd->defout, var);
}
}
@@ -199,6 +201,28 @@ fs_live_variables::compute_live_variables()
}
}
}
/* Propagate defin and defout down the CFG to calculate the union of live
* variables potentially defined along any possible control flow path.
*/
do {
cont = false;
foreach_block (block, cfg) {
const struct block_data *bd = &block_data[block->num];
foreach_list_typed(bblock_link, child_link, link, &block->children) {
struct block_data *child_bd = &block_data[child_link->block->num];
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i];
child_bd->defin[i] |= new_def;
child_bd->defout[i] |= new_def;
cont |= new_def;
}
}
}
} while (cont);
}
/**
@@ -212,12 +236,12 @@ fs_live_variables::compute_start_end()
struct block_data *bd = &block_data[block->num];
for (int i = 0; i < num_vars; i++) {
if (BITSET_TEST(bd->livein, i)) {
if (BITSET_TEST(bd->livein, i) && BITSET_TEST(bd->defin, i)) {
start[i] = MIN2(start[i], block->start_ip);
end[i] = MAX2(end[i], block->start_ip);
}
if (BITSET_TEST(bd->liveout, i)) {
if (BITSET_TEST(bd->liveout, i) && BITSET_TEST(bd->defout, i)) {
start[i] = MIN2(start[i], block->end_ip);
end[i] = MAX2(end[i], block->end_ip);
}
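
A minimal sketch of the case the defin/defout gating above is aimed at (hypothetical pseudo-shader, not taken from the patch):

   if (cond)
      x = a;    /* the only write to x */
   use(x);      /* read is undefined when !cond */

x is live-in to the blocks leading up to the if (the use is reachable from them without crossing a write), but no definition of x reaches them, so their defin/defout bits stay clear. Previously the livein/liveout tests alone stretched x's interval back across those blocks; with the additional defin/defout test the interval only spans the range a definition can actually reach.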
@@ -260,6 +284,8 @@ fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].defin = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].defout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].flag_def[0] = 0;
block_data[i].flag_use[0] = 0;

View File

@@ -55,6 +55,18 @@ struct block_data {
/** Which defs reach the exit point of the block. */
BITSET_WORD *liveout;
/**
* Variables such that the entry point of the block may be reached from any
* of their definitions.
*/
BITSET_WORD *defin;
/**
* Variables such that the exit point of the block may be reached from any
* of their definitions.
*/
BITSET_WORD *defout;
BITSET_WORD flag_def[1];
BITSET_WORD flag_use[1];
BITSET_WORD flag_livein[1];

View File

@@ -1543,10 +1543,11 @@ vec4_instruction_scheduler::choose_instruction_to_schedule()
int
fs_instruction_scheduler::issue_time(backend_instruction *inst)
{
const unsigned overhead = v->bank_conflict_cycles((fs_inst *)inst);
if (is_compressed((fs_inst *)inst))
return 4;
return 4 + overhead;
else
return 2;
return 2 + overhead;
}
int

View File

@@ -41,6 +41,7 @@ libintel_compiler_files = files(
'brw_eu.h',
'brw_eu_util.c',
'brw_eu_validate.c',
'brw_fs_bank_conflicts.cpp',
'brw_fs_builder.h',
'brw_fs_cmod_propagation.cpp',
'brw_fs_combine_constants.cpp',

View File

@@ -701,7 +701,7 @@ void anv_GetPhysicalDeviceFeatures2KHR(
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR: {
VkPhysicalDeviceVariablePointerFeaturesKHR *features = (void *)ext;
features->variablePointersStorageBuffer = true;
features->variablePointers = false;
features->variablePointers = true;
break;
}

View File

@@ -123,20 +123,23 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
}
}
const struct nir_spirv_supported_extensions supported_ext = {
.float64 = device->instance->physicalDevice.info.gen >= 8,
.int64 = device->instance->physicalDevice.info.gen >= 8,
.tessellation = true,
.draw_parameters = true,
.image_write_without_format = true,
.multiview = true,
.variable_pointers = true,
struct spirv_to_nir_options spirv_options = {
.lower_workgroup_access_to_offsets = true,
.caps = {
.float64 = device->instance->physicalDevice.info.gen >= 8,
.int64 = device->instance->physicalDevice.info.gen >= 8,
.tessellation = true,
.draw_parameters = true,
.image_write_without_format = true,
.multiview = true,
.variable_pointers = true,
},
};
nir_function *entry_point =
spirv_to_nir(spirv, module->size / 4,
spec_entries, num_spec_entries,
stage, entrypoint_name, &supported_ext, nir_options);
stage, entrypoint_name, &spirv_options, nir_options);
nir_shader *nir = entry_point->shader;
assert(nir->info.stage == stage);
nir_validate_shader(nir);
@@ -385,10 +388,8 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
if (stage != MESA_SHADER_COMPUTE)
NIR_PASS_V(nir, anv_nir_lower_multiview, pipeline->subpass->view_mask);
if (stage == MESA_SHADER_COMPUTE) {
NIR_PASS_V(nir, brw_nir_lower_cs_shared);
if (stage == MESA_SHADER_COMPUTE)
prog_data->total_shared = nir->num_shared;
}
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

View File

@@ -81,6 +81,8 @@ setupLoaderExtensions(__DRIscreen *psp,
psp->swrast_loader = (__DRIswrastLoaderExtension *) extensions[i];
if (strcmp(extensions[i]->name, __DRI_IMAGE_LOADER) == 0)
psp->image.loader = (__DRIimageLoaderExtension *) extensions[i];
if (strcmp(extensions[i]->name, __DRI_MUTABLE_RENDER_BUFFER_LOADER) == 0)
psp->mutableRenderBuffer.loader = (__DRImutableRenderBufferLoaderExtension *) extensions[i];
}
}

View File

@@ -180,6 +180,10 @@ struct __DRIscreenRec {
const __DRIimageLoaderExtension *loader;
} image;
struct {
const __DRImutableRenderBufferLoaderExtension *loader;
} mutableRenderBuffer;
driOptionCache optionInfo;
driOptionCache optionCache;

View File

@@ -147,7 +147,10 @@ driGetRendererString( char * buffer, const char * hardware_name,
* \param color_depth_match Whether the color depth must match the zs depth
* This forces 32-bit color to have 24-bit depth, and
* 16-bit color to have 16-bit depth.
*
* \param mutable_render_buffer Enable __DRI_ATTRIB_MUTABLE_RENDER_BUFFER,
* which translates to
* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR.
*
* \returns
* Pointer to an array of pointers to the \c __DRIconfig structures created
* for the specified formats. If there is an error, \c NULL is returned.
@@ -160,7 +163,8 @@ driCreateConfigs(mesa_format format,
unsigned num_depth_stencil_bits,
const GLenum * db_modes, unsigned num_db_modes,
const uint8_t * msaa_samples, unsigned num_msaa_modes,
GLboolean enable_accum, GLboolean color_depth_match)
GLboolean enable_accum, GLboolean color_depth_match,
GLboolean mutable_render_buffer)
{
static const uint32_t masks_table[][4] = {
/* MESA_FORMAT_B5G6R5_UNORM */
@@ -314,6 +318,7 @@ driCreateConfigs(mesa_format format,
modes->yInverted = GL_TRUE;
modes->sRGBCapable = is_srgb;
modes->mutableRenderBuffer = mutable_render_buffer;
}
}
}
@@ -398,6 +403,7 @@ static const struct { unsigned int attrib, offset; } attribMap[] = {
__ATTRIB(__DRI_ATTRIB_BIND_TO_TEXTURE_TARGETS, bindToTextureTargets),
__ATTRIB(__DRI_ATTRIB_YINVERTED, yInverted),
__ATTRIB(__DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE, sRGBCapable),
__ATTRIB(__DRI_ATTRIB_MUTABLE_RENDER_BUFFER, mutableRenderBuffer),
/* The struct field doesn't matter here, these are handled by the
* switch in driGetConfigAttribIndex. We need them in the array

View File

@@ -45,7 +45,8 @@ driCreateConfigs(mesa_format format,
unsigned num_depth_stencil_bits,
const GLenum * db_modes, unsigned num_db_modes,
const uint8_t * msaa_samples, unsigned num_msaa_modes,
GLboolean enable_accum, GLboolean color_depth_match);
GLboolean enable_accum, GLboolean color_depth_match,
GLboolean mutable_render_buffer);
__DRIconfig **driConcatConfigs(__DRIconfig **a,
__DRIconfig **b);

View File

@@ -1094,7 +1094,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
num_depth_stencil_bits,
back_buffer_modes, 2,
singlesample_samples, 1,
false, false);
false, false, false);
configs = driConcatConfigs(configs, new_configs);
}
@@ -1116,7 +1116,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
depth_bits, stencil_bits, 1,
back_buffer_modes, 1,
singlesample_samples, 1,
true, false);
true, false, false);
configs = driConcatConfigs(configs, new_configs);
}

View File

@@ -235,6 +235,35 @@ intel_flush_front(struct gl_context *ctx)
}
}
static void
brw_display_shared_buffer(struct brw_context *brw)
{
__DRIcontext *dri_context = brw->driContext;
__DRIdrawable *dri_drawable = dri_context->driDrawablePriv;
__DRIscreen *dri_screen = brw->screen->driScrnPriv;
int fence_fd = -1;
if (!brw->is_shared_buffer_bound)
return;
if (!brw->is_shared_buffer_dirty)
return;
if (brw->screen->has_exec_fence) {
/* This function is always called during a flush operation, so there is
* no need to flush again here. But we want to provide a fence_fd to the
* loader, and a redundant flush is the easiest way to acquire one.
*/
if (intel_batchbuffer_flush_fence(brw, -1, &fence_fd))
return;
}
dri_screen->mutableRenderBuffer.loader
->displaySharedBuffer(dri_drawable, fence_fd,
dri_drawable->loaderPrivate);
brw->is_shared_buffer_dirty = false;
}
static void
intel_glFlush(struct gl_context *ctx)
{
@@ -242,7 +271,7 @@ intel_glFlush(struct gl_context *ctx)
intel_batchbuffer_flush(brw);
intel_flush_front(ctx);
brw_display_shared_buffer(brw);
brw->need_flush_throttle = true;
}
@@ -862,7 +891,9 @@ brwCreateContext(gl_api api,
brw->screen = screen;
brw->bufmgr = screen->bufmgr;
brw->has_hiz = devinfo->has_hiz_and_separate_stencil;
/* Braswell has hiz issues, disable it. */
brw->has_hiz = devinfo->has_hiz_and_separate_stencil &&
screen->deviceID != 0x22B1;
brw->has_separate_stencil = devinfo->has_hiz_and_separate_stencil;
brw->has_swizzling = screen->hw_has_swizzling;
@@ -942,7 +973,7 @@ brwCreateContext(gl_api api,
intel_batchbuffer_init(brw);
if (devinfo->gen >= 6) {
if (devinfo->gen >= 7) {
/* Create a new hardware context. Using a hardware context means that
* our GPU state will be saved/restored on context switch, allowing us
* to assume that the GPU is in the same state we left it in.
@@ -1258,6 +1289,21 @@ intel_resolve_for_dri2_flush(struct brw_context *brw,
intel_miptree_prepare_external(brw, rb->mt);
} else {
intel_renderbuffer_downsample(brw, rb);
/* Call prepare_external on the single-sample miptree to do any
* needed resolves prior to handing it off to the window system.
* This is needed in the case that rb->singlesample_mt is Y-tiled
* with CCS_E enabled but without I915_FORMAT_MOD_Y_TILED_CCS_E. In
* this case, the MSAA resolve above will write compressed data into
* rb->singlesample_mt.
*
* TODO: Some day, if we decide to care about the tiny performance
* hit we're taking by doing the MSAA resolve and then a CCS resolve,
* we could detect this case and just allocate the single-sampled
* miptree without aux. However, that would be a lot of plumbing and
* this is a rather exotic case so it's not really worth it.
*/
intel_miptree_prepare_external(brw, rb->singlesample_mt);
}
}
}
@@ -1393,6 +1439,11 @@ intel_prepare_render(struct brw_context *brw)
*/
if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer))
brw->front_buffer_dirty = true;
if (brw->is_shared_buffer_bound) {
/* Subsequent rendering will probably dirty the shared buffer. */
brw->is_shared_buffer_dirty = true;
}
}
/**
@@ -1622,8 +1673,12 @@ intel_update_image_buffer(struct brw_context *intel,
else
last_mt = rb->singlesample_mt;
if (last_mt && last_mt->bo == buffer->bo)
if (last_mt && last_mt->bo == buffer->bo) {
if (buffer_type == __DRI_IMAGE_BUFFER_SHARED) {
intel_miptree_make_shareable(intel, last_mt);
}
return;
}
struct intel_mipmap_tree *mt =
intel_miptree_create_for_dri_image(intel, buffer, GL_TEXTURE_2D,
@@ -1643,6 +1698,35 @@ intel_update_image_buffer(struct brw_context *intel,
rb->Base.Base.NumSamples > 1) {
intel_renderbuffer_upsample(intel, rb);
}
if (buffer_type == __DRI_IMAGE_BUFFER_SHARED) {
/* The compositor and the application may access this image
* concurrently. The display hardware may even scanout the image while
* the GPU is rendering to it. Aux surfaces cause difficulty with
* concurrent access, so permanently disable aux for this miptree.
*
* Perhaps we could improve overall application performance by
* re-enabling the aux surface when EGL_RENDER_BUFFER transitions to
* EGL_BACK_BUFFER, then disabling it again when EGL_RENDER_BUFFER
* returns to EGL_SINGLE_BUFFER. I expect the wins and losses with this
* approach to be highly dependent on the application's GL usage.
*
* I [chadv] expect clever disabling/reenabling to be counterproductive
* in the use cases I care about: applications that render nearly
* realtime handwriting to the surface while possibly undergoing
* simultaneous scanout as a display plane. The app requires low
* render latency. Even though the app spends most of its time in
* shared-buffer mode, it also frequently transitions between
* shared-buffer (EGL_SINGLE_BUFFER) and double-buffer (EGL_BACK_BUFFER)
* mode. Visual stutter during the transitions should be avoided.
*
* In this case, I [chadv] believe reducing the GPU workload at
* shared-buffer/double-buffer transitions would offer a smoother app
* experience than any savings due to aux compression. But I've
* collected no data to prove my theory.
*/
intel_miptree_make_shareable(intel, mt);
}
}
static void
@@ -1703,4 +1787,19 @@ intel_update_image_buffers(struct brw_context *brw, __DRIdrawable *drawable)
images.back,
__DRI_IMAGE_BUFFER_BACK);
}
if (images.image_mask & __DRI_IMAGE_BUFFER_SHARED) {
assert(images.image_mask == __DRI_IMAGE_BUFFER_SHARED);
drawable->w = images.back->width;
drawable->h = images.back->height;
intel_update_image_buffer(brw,
drawable,
back_rb,
images.back,
__DRI_IMAGE_BUFFER_SHARED);
brw->is_shared_buffer_bound = true;
} else {
brw->is_shared_buffer_bound = false;
brw->is_shared_buffer_dirty = false;
}
}
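
For context, a minimal sketch of the client-side sequence that leads the loader to report __DRI_IMAGE_BUFFER_SHARED here, using the standard EGL_KHR_mutable_render_buffer entry points (display, surface and context setup plus error handling omitted):

   /* The surface must come from an EGLConfig whose EGL_SURFACE_TYPE includes
    * EGL_MUTABLE_RENDER_BUFFER_BIT_KHR.
    */
   eglSurfaceAttrib(dpy, surface, EGL_RENDER_BUFFER, EGL_SINGLE_BUFFER);
   eglSwapBuffers(dpy, surface);   /* the mode switch takes effect at the next swap */

   /* ... low-latency rendering; each flush reaches brw_display_shared_buffer() ... */

   eglSurfaceAttrib(dpy, surface, EGL_RENDER_BUFFER, EGL_BACK_BUFFER);
   eglSwapBuffers(dpy, surface);   /* back to ordinary double buffering */

eglQueryContext(dpy, ctx, EGL_RENDER_BUFFER, &value) can be used to confirm which mode is actually in effect at any point.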

View File

@@ -711,6 +711,18 @@ struct brw_context
*/
bool front_buffer_dirty;
/**
* True if the __DRIdrawable's current __DRIimageBufferMask is
* __DRI_IMAGE_BUFFER_SHARED.
*/
bool is_shared_buffer_bound;
/**
* True if a shared buffer is bound and it has received any rendering since
* the previous __DRImutableRenderBufferLoaderExtension::displaySharedBuffer().
*/
bool is_shared_buffer_dirty;
/** Framerate throttling: @{ */
struct brw_bo *throttle_batch[2];

View File

@@ -127,6 +127,8 @@ get_stencil_miptree(struct intel_renderbuffer *irb)
{
if (!irb)
return NULL;
if (!irb->mt)
return NULL;
if (irb->mt->stencil_mt)
return irb->mt->stencil_mt;
return intel_renderbuffer_get_mt(irb);
@@ -225,11 +227,11 @@ brw_workaround_depthstencil_alignment(struct brw_context *brw,
/* Check if depth buffer is in depth/stencil format. If so, then it's only
* safe to invalidate it if we're also clearing stencil.
*/
if (depth_irb && invalidate_depth &&
if (depth_irb && depth_mt && invalidate_depth &&
_mesa_get_format_base_format(depth_mt->format) == GL_DEPTH_STENCIL)
invalidate_depth = invalidate_stencil && stencil_irb;
if (depth_irb) {
if (depth_irb && depth_mt) {
if (rebase_depth_stencil(brw, depth_irb, invalidate_depth)) {
/* In the case of stencil_irb being the same packed depth/stencil
* texture but not the same rb, make it point at our rebased mt, too.
@@ -242,7 +244,7 @@ brw_workaround_depthstencil_alignment(struct brw_context *brw,
}
}
if (stencil_irb) {
if (stencil_irb && stencil_irb->mt) {
assert(stencil_irb->mt == depth_irb->mt);
assert(stencil_irb->mt_level == depth_irb->mt_level);
assert(stencil_irb->mt_layer == depth_irb->mt_layer);
@@ -250,7 +252,7 @@ brw_workaround_depthstencil_alignment(struct brw_context *brw,
}
/* If there is no depth attachment, consider if stencil needs rebase. */
if (!depth_irb && stencil_irb)
if (!(depth_irb && depth_mt) && stencil_irb && stencil_irb->mt)
rebase_depth_stencil(brw, stencil_irb, invalidate_stencil);
}

View File

@@ -2206,6 +2206,14 @@ const struct brw_tracked_state genX(cc_vp) = {
/* ---------------------------------------------------------------------- */
/* Clamp scissors to 16-bit unsigned values; otherwise, the compiler truncates
* them to fit inside the bitfields, which is often not what is desired.
* My reading of the GL and GLES specs suggests that overly-large scissor
* values are not an error condition and that the actual behavior is
* undefined, so switching from truncation to clamping is probably not
* a problem. ~ C.
*/
#define CLAMP_SCISSOR(X) CLAMP(X, 0x0000, 0xffff)
static void
set_scissor_bits(const struct gl_context *ctx, int i,
bool render_to_fbo, unsigned fb_width, unsigned fb_height,
@@ -2232,16 +2240,16 @@ set_scissor_bits(const struct gl_context *ctx, int i,
sc->ScissorRectangleYMax = 0;
} else if (render_to_fbo) {
/* texmemory: Y=0=bottom */
sc->ScissorRectangleXMin = bbox[0];
sc->ScissorRectangleXMax = bbox[1] - 1;
sc->ScissorRectangleYMin = bbox[2];
sc->ScissorRectangleYMax = bbox[3] - 1;
sc->ScissorRectangleXMin = CLAMP_SCISSOR(bbox[0]);
sc->ScissorRectangleXMax = CLAMP_SCISSOR(bbox[1] - 1);
sc->ScissorRectangleYMin = CLAMP_SCISSOR(bbox[2]);
sc->ScissorRectangleYMax = CLAMP_SCISSOR(bbox[3] - 1);
} else {
/* memory: Y=0=top */
sc->ScissorRectangleXMin = bbox[0];
sc->ScissorRectangleXMax = bbox[1] - 1;
sc->ScissorRectangleYMin = fb_height - bbox[3];
sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
sc->ScissorRectangleXMin = CLAMP_SCISSOR(bbox[0]);
sc->ScissorRectangleXMax = CLAMP_SCISSOR(bbox[1] - 1);
sc->ScissorRectangleYMin = CLAMP_SCISSOR(fb_height - bbox[3]);
sc->ScissorRectangleYMax = CLAMP_SCISSOR(fb_height - bbox[2] - 1);
}
}
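
A quick worked example of the behavioral difference (hypothetical value): with 16-bit ScissorRectangle* bitfields and a computed bbox[1] - 1 of 70000,

   truncation: 70000 & 0xffff = 4464                    (the scissor silently shrinks)
   clamping:   CLAMP_SCISSOR(70000) = 0xffff = 65535    (the edge is effectively unclipped)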

View File

@@ -1423,12 +1423,17 @@ static const __DRIrobustnessExtension dri2Robustness = {
.base = { __DRI2_ROBUSTNESS, 1 }
};
static const __DRImutableRenderBufferDriverExtension intelMutableRenderBufferExtension = {
.base = { __DRI_MUTABLE_RENDER_BUFFER_DRIVER, 1 },
};
static const __DRIextension *screenExtensions[] = {
&intelTexBufferExtension.base,
&intelFenceExtension.base,
&intelFlushExtension.base,
&intelImageExtension.base,
&intelRendererQueryExtension.base,
&intelMutableRenderBufferExtension.base,
&dri2ConfigQueryExtension.base,
&dri2NoErrorExtension.base,
NULL
@@ -1440,6 +1445,7 @@ static const __DRIextension *intelRobustScreenExtensions[] = {
&intelFlushExtension.base,
&intelImageExtension.base,
&intelRendererQueryExtension.base,
&intelMutableRenderBufferExtension.base,
&dri2ConfigQueryExtension.base,
&dri2Robustness.base,
&dri2NoErrorExtension.base,
@@ -1952,7 +1958,9 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
else
num_formats = 3;
/* Generate singlesample configs without accumulation buffer. */
/* Generate singlesample configs, each without accumulation buffer
* and with EGL_MUTABLE_RENDER_BUFFER_BIT_KHR.
*/
for (unsigned i = 0; i < num_formats; i++) {
__DRIconfig **new_configs;
int num_depth_stencil_bits = 2;
@@ -1983,7 +1991,8 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
num_depth_stencil_bits,
back_buffer_modes, 2,
singlesample_samples, 1,
false, false);
false, false,
/*mutable_render_buffer*/ true);
configs = driConcatConfigs(configs, new_configs);
}
@@ -2005,7 +2014,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
depth_bits, stencil_bits, 1,
back_buffer_modes, 1,
singlesample_samples, 1,
true, false);
true, false, false);
configs = driConcatConfigs(configs, new_configs);
}
@@ -2067,7 +2076,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
back_buffer_modes, 1,
multisample_samples,
num_msaa_modes,
false, false);
false, false, false);
configs = driConcatConfigs(configs, new_configs);
}

View File

@@ -78,7 +78,7 @@ nouveau_get_configs(uint32_t chipset)
ARRAY_SIZE(back_buffer_modes),
msaa_samples,
ARRAY_SIZE(msaa_samples),
GL_TRUE, chipset < 0x10);
GL_TRUE, chipset < 0x10, GL_FALSE);
assert(config);
configs = driConcatConfigs(configs, config);

View File

@@ -804,7 +804,7 @@ __DRIconfig **radeonInitScreen2(__DRIscreen *psp)
ARRAY_SIZE(back_buffer_modes),
msaa_samples_array,
ARRAY_SIZE(msaa_samples_array),
GL_TRUE, GL_FALSE);
GL_TRUE, GL_FALSE, GL_FALSE);
configs = driConcatConfigs(configs, new_configs);
}

View File

@@ -275,7 +275,7 @@ swrastFillInModes(__DRIscreen *psp,
depth_bits_array, stencil_bits_array,
depth_buffer_factor, back_buffer_modes,
back_buffer_factor, msaa_samples_array, 1,
GL_TRUE, GL_FALSE);
GL_TRUE, GL_FALSE, GL_FALSE);
if (configs == NULL) {
fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
__LINE__);

View File

@@ -253,6 +253,9 @@ struct gl_config
/* EXT_framebuffer_sRGB */
GLint sRGBCapable;
/* EGL_KHR_mutable_render_buffer */
GLuint mutableRenderBuffer; /* bool */
};

View File

@@ -642,9 +642,6 @@ st_context_flush(struct st_context_iface *stctxi, unsigned flags,
st->pipe->screen->fence_reference(st->pipe->screen, fence, NULL);
}
if (flags & ST_FLUSH_FRONT)
st_manager_flush_frontbuffer(st);
/* DRI3 changes the framebuffer after SwapBuffers, but we need to invoke
* st_manager_validate_framebuffers to notice that.
*