Compare commits

...

47 Commits

Author SHA1 Message Date
Emil Velikov
19b62847e0 Update version to 17.3.0-rc3
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2017-11-07 11:51:45 +00:00
Plamena Manolova
d5cc7e47a8 i965: Fix ARB_indirect_parameters logic.
This patch modifies the ARB_indirect_parameters logic in
brw_draw_prims, so that our implementation isn't affected if
another application attempts to use predicates. Previously we
were using a predicate with a DELTAS_EQUAL comparison operation
and relying on the MI_PREDICATE_DATA register being 0. Our code
to initialize MI_PREDICATE_DATA to 0 was incorrect, so we were
accidentally using whatever value was written there. Because the
kernel does not initialize the MI_PREDICATE_DATA register on
hardware context creation, we might inherit the value from whatever
context was last running on the GPU (likely another process).
The Haswell command parser also does not currently allow us to write
the MI_PREDICATE_DATA register. Rather than fixing this and requiring
an updated kernel, we switch to a different approach which uses a
SRCS_EQUAL predicate that makes no assumptions about the states of any
of the predicate registers.
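
Illustratively (a standalone C sketch, not the actual brw_draw_prims
code), the difference between the two comparison modes:

    #include <stdbool.h>
    #include <stdint.h>

    /* DELTAS_EQUAL: roughly, the delta of the two sources is compared
     * against MI_PREDICATE_DATA -- state inherited from whatever
     * context last ran on the GPU. */
    static bool deltas_equal(uint64_t src0, uint64_t src1, uint64_t data)
    {
       return (src1 - src0) == data;
    }

    /* SRCS_EQUAL: only the two source registers, which the driver
     * loads itself, determine the result. */
    static bool srcs_equal(uint64_t src0, uint64_t src1)
    {
       return src0 == src1;
    }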

Fixes Piglit's spec/arb_indirect_parameters/tf-count-arrays test.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103085
Signed-off-by: Plamena Manolova <plamena.manolova@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 048d4c45c9)
2017-11-03 18:30:32 +00:00
Dave Airlie
9b44ef94b4 i915g: make gears run again.
We need to validate that some state structs exist before we dirty them,
and avoid the problem in some other places.

Fixes: e027935a7 ("st/mesa: don't update unrelated states in non-draw calls such as Clear")
(cherry picked from commit cc69f2385e)
2017-11-03 18:21:59 +00:00
Jordan Justen
a12ca3b231 disk_cache: Fix issue reading GLSL metadata
This would cause the read of the metadata content to fail, which would
prevent the linking from being skipped.

Seen on Rocket League with i965 shader cache.

Fixes: b86ecea344 "util/disk_cache: write cache item metadata to disk"
Cc: Timothy Arceri <tarceri@itsqueeze.com>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit e5b141634c)
2017-11-03 18:21:59 +00:00
Timothy Arceri
9710fbbcdf radeonsi: fix culldist_writemask in nir path
The shared si_create_shader_selector() code already offsets the mask.
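
A worked sketch of the double offset (illustrative values, not the
actual si_shader_nir.c code):

    #include <stdio.h>

    int main(void)
    {
       unsigned num_clip = 4;    /* clip distances written */
       unsigned cull_mask = 0x3; /* two cull distances, unshifted */

       /* Bug: the NIR scan pre-shifted the mask... */
       unsigned from_nir = cull_mask << num_clip;
       /* ...and si_create_shader_selector() shifted it again. */
       unsigned doubled = from_nir << num_clip;

       /* prints "0x30 vs 0x300" */
       printf("0x%x vs 0x%x\n", cull_mask << num_clip, doubled);
       return 0;
    }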

Fixes the following piglit tests:

arb_cull_distance/clip-cull-3.shader_test
arb_cull_distance/clip-cull-4.shader_test

Fixes: 29d7bdd179 (radeonsi: scan NIR shaders to obtain required info)
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit e80bbd6f52)
2017-11-03 18:21:59 +00:00
Timothy Arceri
b4bf9f6a41 radv: add cache items to in memory cache when reading from disk
Otherwise we will leak them, load duplicates from disk rather
than memory, and never write items loaded from disk to the app's
pipeline cache.

Fixes: fd24be134f 'radv: make use of on-disk cache'
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 1e84e53712)

Squashed with commit:

radv: use correct alloc function when loading from disk

Fixes regression in:

dEQP-VK.api.object_management.alloc_callback_fail.graphics_pipeline

Fixes: 1e84e53712 "radv: add cache items to in memory cache when reading from disk"
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit e92405c55a)
2017-11-03 18:21:42 +00:00
Bas Nieuwenhuizen
2516c3217d radv: Don't expose heaps with 0 memory.
It confuses CTS. This pregenerates the heap info into the
physical device, so we can use it for translating contiguous
indices into our "standard" ones.

This also makes the WSI a bit smarter in case the first preferred
heap does not exist.

Reviewed-by: Dave Airlie <airlied@redhat.com>
CC: <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 806721429a)
2017-11-03 18:20:06 +00:00
Jason Ekstrand
383b360348 intel/fs: Alloc pull constants off mem_ctx
It doesn't actually matter since the only user of push constants, i965,
ralloc_steals it back to NULL, but it's more consistent and probably
fixes memory leaks in some error cases.
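
The ownership pattern, sketched with Mesa's ralloc API (nr_params is
illustrative):

    #include <stdint.h>
    #include "util/ralloc.h"

    static uint32_t *compile_params(unsigned nr_params)
    {
       void *mem_ctx = ralloc_context(NULL);
       /* Allocated off mem_ctx, the array is freed with the compile
        * context unless the caller takes ownership... */
       uint32_t *params = rzalloc_array(mem_ctx, uint32_t, nr_params);
       /* ...which i965 does by stealing it back to NULL: */
       ralloc_steal(NULL, params);
       ralloc_free(mem_ctx); /* params survives; the rest is freed */
       return params;
    }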

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 7b4387519c)
2017-11-03 18:20:04 +00:00
Wladimir J. van der Laan
71571aab14 etnaviv: don't do resolve-in-place without valid TS
GC3000 resolve-in-place assumes that the TS state is configured.
If it is not, this will result in MMU errors. This is especially
apparent when using glGenerateMipmap().

Fixes: 78ade65956 ("etnaviv: Do GC3000 resolve-in-place when possible")
Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Wladimir J. van der Laan <laanwj@gmail.com>
Tested-by: Chris Healy <cphealy@gmail.com>
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
(cherry picked from commit 8fbd82f464)
2017-11-03 18:20:01 +00:00
Gert Wollny
13bfb83b31 r600/sb: bail out if prepare_alu_group() doesn't find a proper scheduling
It is possible that the optimizer ends up in an infinite loop in
post_scheduler::schedule_alu(), because post_scheduler::prepare_alu_group()
does not find a proper scheduling. This can be deduced from
pending.count() remaining larger than zero and not getting smaller.

This patch works around this problem by signalling the failure so that
the optimizer bails out and the unoptimized shader is used.
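
The bail-out idea, sketched (names are illustrative, not the actual
r600/sb classes):

    #include <stdbool.h>

    struct sched_state; /* stands in for the post_scheduler state */
    unsigned pending_count(struct sched_state *s);
    void prepare_alu_group_once(struct sched_state *s);

    /* If one pass of prepare_alu_group() makes no progress on the
     * pending list, signal failure instead of looping forever. */
    static bool schedule_alu_sketch(struct sched_state *s)
    {
       while (pending_count(s) > 0) {
          unsigned before = pending_count(s);
          prepare_alu_group_once(s);
          if (pending_count(s) >= before)
             return false; /* caller uses the unoptimized shader */
       }
       return true;
    }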

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103142
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 69eee511c6)
2017-11-03 18:19:59 +00:00
Neil Roberts
4c82f2c3a9 nir/opt_intrinsics: Fix values for gl_SubGroupG{e,t}MaskARB
Previously the values were calculated by just shifting ~0 by the
invocation ID. This would end up including bits that are higher than
gl_SubGroupSizeARB. The corresponding CTS test effectively requires that
these high bits be zero so it was failing. There is a Piglit test as
well but this appears to be checking the wrong values so it passes.

For the two greater-than bitmasks, this patch adds an extra mask with
(~0>>(64-gl_SubGroupSizeARB)) to force these bits to zero.
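
For example, with a subgroup size of 32 and invocation 5 (a standalone
sketch of the arithmetic; the real code is in the nir_opt_intrinsics.c
hunk later in this compare, and must additionally avoid shifting by 64):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       unsigned i = 5, n = 32;             /* invocation, subgroup size */
       uint64_t gt_old = ~0ull << (i + 1); /* bits 6..63: too many */
       uint64_t group = ~0ull >> (64 - n); /* bits 0..31 (n < 64 here) */
       uint64_t gt_new = gt_old & group;   /* bits 6..31: correct */
       printf("%016llx -> %016llx\n",
              (unsigned long long)gt_old, (unsigned long long)gt_new);
       return 0;
    }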

Fixes: KHR-GL45.shader_ballot_tests.ShaderBallotBitmasks

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102680#c3
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Neil Roberts <nroberts@igalia.com>
(cherry picked from commit b697ece10a)
2017-11-03 18:19:57 +00:00
Nanley Chery
14c40ebd0f i965: Check CCS_E compatibility for texture view rendering
Only use CCS_E to render to a texture that is CCS_E-compatible with the
original texture's miptree (linear) format. This prevents render
operations from writing data that can't be decoded with the original
miptree format.

On Gen10, with the new CCS_E-enabled formats handled, this enables the
driver to pass the arb_texture_view-rendering-formats piglit test.

v2. Add a TODO for texturing. (Jason)

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 9e849eb8bb)
2017-11-03 18:19:54 +00:00
Mauro Rossi
77839e9ba8 Android: move drivers' symlinks to /vendor (v2)
Having moved the gallium_dri.so library to /vendor/lib/dri, the
symlinks also need to be created using TARGET_OUT_VENDOR instead of
TARGET_OUT, or all non-Intel drivers will fail to load on Android N
and earlier, causing a SurfaceFlinger SIGABRT.

(v2) simplification of post install command

Fixes: c3f75d483c ("Android: move libraries to /vendor")

Cc: 17.3 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com> (v1)
Reviewed-by: Rob Herring <robh@kernel.org> (v1)
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 7dae419aa7)
2017-11-03 18:19:52 +00:00
Tapani Pälli
7826bc9538 i965: fix blorp stage_prog_data->param leak
Patch uses mem_ctx for allocation to ensure param array gets freed
later.

==6164== 48 bytes in 1 blocks are definitely lost in loss record 61 of 193
==6164==    at 0x4C2EB6B: malloc (vg_replace_malloc.c:299)
==6164==    by 0x12E31C6C: ralloc_size (ralloc.c:121)
==6164==    by 0x130189F1: fs_visitor::assign_constant_locations() (brw_fs.cpp:2095)
==6164==    by 0x13022D32: fs_visitor::optimize() (brw_fs.cpp:5715)
==6164==    by 0x13024D5A: fs_visitor::run_fs(bool, bool) (brw_fs.cpp:6229)
==6164==    by 0x1302549A: brw_compile_fs (brw_fs.cpp:6570)
==6164==    by 0x130C4B07: blorp_compile_fs (blorp.c:194)
==6164==    by 0x130D384B: blorp_params_get_clear_kernel (blorp_clear.c:79)
==6164==    by 0x130D3C56: blorp_fast_clear (blorp_clear.c:332)
==6164==    by 0x12EFA439: do_single_blorp_clear (brw_blorp.c:1261)
==6164==    by 0x12EFC4AF: brw_blorp_clear_color (brw_blorp.c:1326)
==6164==    by 0x12EFF72B: brw_clear (brw_clear.c:297)

Fixes: 8d90e28839 ("intel/compiler: Allocate pull_param in assign_constant_locations")
Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 446c5726ec)
2017-11-03 18:19:39 +00:00
Topi Pohjolainen
f0951a6aa9 intel/compiler/gen9: Pixel shader header only workaround
Fixes intermittent GPU hangs on Broxton with an Intel internal
test case.

There are plenty of similar fragment shaders in piglit that do
not use any varyings and any uniforms. According to the
documentation special timing is needed between pipeline stages.
Apparently we just don't hit that with piglit. Even with the
failing test case one doesn't always get the hang.

Moreover, according to the error states the hang happens
significantly later than the execution of the problematic shader.
There are multiple render cycles (primitive submissions) in between.
I've also seen error states where the ACTHD points outside the
batch. Almost as if the hardware writes somewhere that gets used
later on. That would also explain why piglit doesn't suffer from
this - most tests kick off one render cycle and any corruption
is left unseen.

v2 (Ken): Instead of enabling push constants, enable one of the
          inputs (PSIZ).
v3 (Ken, Jason): Use LAYER instead, making vulkan emit_3dstate_sbe()
                 happy.

Cc: "17.3 17.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
(cherry picked from commit 97e01adfd5)
2017-11-03 18:19:36 +00:00
Emil Velikov
3eb187f376 Update version to 17.3.0-rc2 2017-10-30 13:52:46 +00:00
Eric Engestrom
0c20849f9c wayland-egl: fix wayland cflags
Fixes: 80bfff5c4f "wayland-egl: adds CFLAGS for wayland.egl.h include"
Suggested-by: Daniel Stone <daniel@fooishbar.org>
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Acked-by: Emil Velikov <emil.velikov@collabora.com>
Acked-by: Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de>
(cherry picked from commit 866c8a94d4)
2017-10-27 21:33:04 +01:00
Eric Engestrom
fb09360ea5 vc4: fix release build
Mesa's DEBUG and assert's NDEBUG are not tied to each other, so we need
to explicitly compile this code out.
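
A sketch of the guard (the helper name is hypothetical; the point is
the explicit DEBUG check):

    #include <assert.h>

    static void check_reloc_count(int expected, int actual)
    {
    #ifdef DEBUG
       /* Compiled only into Mesa debug builds; NDEBUG alone would not
        * remove the tracking code feeding this assert. */
       assert(actual == expected);
    #else
       (void)expected;
       (void)actual;
    #endif
    }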

Fixes: 3df7892878 "vc4: Drop reloc_count tracking for debug
       asserts on non-debug builds."
Cc: Eric Anholt <eric@anholt.net>
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 5d44e35a8f)
2017-10-27 21:33:04 +01:00
Samuel Pitoiset
1664322838 radeonsi: update hack for HTILE corruption in ARK: Survival Evolved
It appears that flushing the DB metadata is actually not sufficient
since the driver uses the new VS blit shaders. This looks quite
strange though, but it seems like we need to flush DB for fixing
the corruption.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102955
Fixes: 69ccb9dae7 (radeonsi: use new VS blit shaders (VS inputs in SGPRs))
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit dd79aa4ad3)
2017-10-27 21:33:04 +01:00
Daniel Stone
c7e625df69 meson: wayland-egl depends on wayland-client
Since wayland-egl.h is currently provided by the core Wayland package,
depend on wayland-client to make sure we get it in our include path.

Signed-off-by: Daniel Stone <daniels@collabora.com>
Acked-by: Emil Velikov <emil.velikov@collabora.com>
Fixes: 108d257a16 ("meson: build libEGL")
Cc: Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de>
Cc: Dylan Baker <dylan@pnwbakers.com>
Cc: Gert Wollny <gw.fossdev@gmail.com>
(cherry picked from commit 9f7ed60b3e)
2017-10-27 21:33:04 +01:00
Jason Ekstrand
5addf041ef intel/eu: Use EXECUTE_1 for JMPI
The PRM says "The execution size must be 1."  In 73137997e2, the
execution size was set to 1 when it should have been BRW_EXECUTE_1
(which maps to 0).  Later, in dc2d3a7f5c, JMPI was used for
line AA on gen6 and earlier and we started manually stomping the
execution size to BRW_EXECUTE_1 in the generator.  This commit fixes the
original bug and makes brw_JMPI just do the right thing.
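
The execution-size field is a log2 encoding (values paraphrased from
the EU defines), so the literal 1 lands on SIMD2:

    #define BRW_EXECUTE_1  0
    #define BRW_EXECUTE_2  1
    #define BRW_EXECUTE_4  2
    #define BRW_EXECUTE_8  3
    #define BRW_EXECUTE_16 4
    #define BRW_EXECUTE_32 5
    /* Setting the field to the literal 1 therefore encodes SIMD2;
     * BRW_EXECUTE_1 (== 0) is what "execution size 1" means. */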

Reviewed-by: Matt Turner <mattst88@gmail.com>
Fixes: 73137997e2
(cherry picked from commit 562b8d458c)
2017-10-27 21:33:04 +01:00
Bas Nieuwenhuizen
f4b6883ebc radv: Fix truncation issue hexifying the cache uuid for the disk cache.
Going from binary to hex has a 2x blowup.

Fixes: 1421625292 'radv: create on-disk shader cache'
Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 5bfbab2fdc)
2017-10-27 21:33:04 +01:00
Eric Anholt
70ee0a4525 i965: Fix memmem compiler warnings.
gcc is throwing this warning in my meson build:

../src/intel/compiler/brw_eu_validate.c:50:11: warning:
argument 1 null where non-null expected [-Wnonnull]
    return memmem(haystack.str, haystack.len,
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                  needle.str, needle.len) != NULL;
                                  ~~~~~~~~~~~~~~~~~~~~~~~

The first check for CONTAINS has a NULL error_msg.str and 0 len.  The
glibc implementation will exit without looking at any haystack bytes if
haystack.len < needle.len, so this was safe, but silence the warning
anyway by guarding against implementation variability.

Fixes: 122ef3799d ("i965: Only insert error message if not already present")
Reviewed-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit e91c3540fc)
2017-10-27 21:33:04 +01:00
Timothy Arceri
17d988bfaa radv: move nir print after linking is done
We now have linking optimisations so we want to delay dumping the
NIR until after these are complete.

Fixes: 06f05040eb (radv: Link shaders)
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit f0a2bbd1a4)

Squashed with commit:

radv: print NIR before LLVM IR and disassembly

It's still printed after linking, but it makes more sense to
have SPIRV->NIR->LLVM IR->ASM.

Fixes: f0a2bbd1a4 (radv: move nir print after linking is done)
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 9711979df0)
2017-10-27 21:32:34 +01:00
Dave Airlie
03cf1953ad mesa/bufferobj: don't double negate the range
This fixes a regression I introduced while refactoring this code:
I managed to invert the range twice. I moved the inversion into
the common code but forgot to stop doing it in the callee.
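
The shape of the bug, illustratively (not the actual bufferobj.c code):

    /* Inverting twice cancels out, so the common code effectively saw
     * the original, wrong value. */
    static int common_path(int range)
    {
       return -range;              /* inversion moved here... */
    }

    static int callee(int range)
    {
       return common_path(-range); /* ...but still done here too */
    }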

Fixes: GL45-CTS.multi_bind.dispatch_bind_buffers_base

Fixes: 35ac13ed3 (mesa/bufferobj: consolidate some codepaths between ubo/ssbo/atomics.)
Reported-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 11d688d9f0)
2017-10-27 18:56:44 +01:00
Timothy Arceri
4fb6b83968 radv: clone meta shaders before linking
The IR is reused in different pipeline combinations so we need
to clone it to avoid link-time optimisations messing up the
original copy.

Fixes: 06f05040eb (radv: Link shaders)

Reviewed-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 013313cf89)
2017-10-27 18:56:41 +01:00
Dylan Baker
26b44eadac meson: fix egl build for meson version < 0.43
Meson 0.43 added the ability to pass nested lists to
include_directories, so the code that we have works for 0.43, but not
for 0.42. This patch changes the include_directories list to be flat so
it works with 0.42

fixes: 108d257a16 ("meson: build libEGL")
Tested-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Rhys Kidd <rhyskidd@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
(cherry picked from commit 77f7ef0287)
2017-10-27 18:56:37 +01:00
Kenneth Graunke
e22cf6e9b4 mesa: Accept GL_BACK in get_fb0_attachment with ARB_ES3_1_compatibility.
According to the ARB_ES3_1_compatibility specification,
glGetFramebufferAttachmentParameteriv is supposed to accept BACK,
and it behaves exactly like BACK_LEFT.
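
A sketch of the mapping (the helper name is hypothetical; GL_BACK and
GL_BACK_LEFT are the real enums):

    #include <GL/gl.h>

    static GLenum fb0_attachment(GLenum attachment)
    {
       /* With ARB_ES3_1_compatibility, BACK is accepted and behaves
        * exactly like BACK_LEFT for the window-system framebuffer. */
       return attachment == GL_BACK ? GL_BACK_LEFT : attachment;
    }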

Fixes a GL error in GFXBench 5 Aztec Ruins.

Cc: "17.3 17.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
(cherry picked from commit 4f538c3f99)
2017-10-27 18:56:11 +01:00
Tapani Pälli
7df1b901b9 i965: unref push_const_bo in intelDestroyContext
Valgrind shows that the leak is caused by gen6_upload_push_constants;
add an unref of push_const_bo per stage to the destructor to fix this
(as is done for scratch_bo).

   ==10952== 144 bytes in 1 blocks are definitely lost in loss record 44 of 66
   ==10952==    at 0x4C30A1E: calloc (vg_replace_malloc.c:711)
   ==10952==    by 0x8C02847: bo_alloc_internal.constprop.10 (brw_bufmgr.c:344)
   ==10952==    by 0x8C425C4: intel_upload_space (intel_upload.c:101)
   ==10952==    by 0x8C22ED0: gen6_upload_push_constants (gen6_constant_state.c:154)

v2: remove if conditions, brw_bo_unreference handles NULL (Ken, Emil)
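
The shape of the destructor change (a sketch assuming i965's
brw_context layout; no NULL checks needed per v2):

    static void unref_push_const_bos(struct brw_context *brw)
    {
       brw_bo_unreference(brw->vs.base.push_const_bo);
       brw_bo_unreference(brw->tcs.base.push_const_bo);
       brw_bo_unreference(brw->tes.base.push_const_bo);
       brw_bo_unreference(brw->gs.base.push_const_bo);
       brw_bo_unreference(brw->wm.base.push_const_bo);
       brw_bo_unreference(brw->cs.base.push_const_bo);
    }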

Fixes: 24891d7c05 ("i965: Store per-stage push constant BO pointers.")
Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 0b131ca427)
2017-10-27 18:55:56 +01:00
Jason Ekstrand
cbb8aec81c i965/miptree: Take an isl_format in render_aux_usage
Not all rendering matches the miptree format.  We allow rendering to
texture views so there are cases where it may not match.  In those
cases, our current scheme of just passing the value of ctx->sRGBEnabled
isn't viable.  Instead, just do what we do for texturing and pass the
view format in directly.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 39c5c12f8f)
2017-10-27 18:55:53 +01:00
Jason Ekstrand
ff8c152640 i965/blorp: Use more temporary isl_format variables
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 78e50185d6)
2017-10-27 18:55:50 +01:00
Jason Ekstrand
0fef0c7deb i965/blorp: Use blorp_to_isl_format for src_isl_format in blit_miptrees
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 94389943b6)
2017-10-27 18:55:48 +01:00
Jason Ekstrand
66603bff6f spirv: Claim support for the simple memory model
It's rather surprising that we've never actually hit this before.
Apparently, Ian's SPIR-V generator currently claims the Simple memory
model when you
don't do anything complex.  We really shouldn't assert-fail on it.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 8ab9820d34)
2017-10-27 18:55:46 +01:00
Marek Olšák
b0082632eb radeonsi: add a workaround for weird s_buffer_load_dword behavior on SI
See my LLVM patch which fixes the root cause.

Users have to apply this patch and then they have 2 choices:
- Downgrade to LLVM 5.0
- Update to LLVM git after my LLVM patch is pushed.

It won't be possible to use current and earlier development version
of LLVM 6.0.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Cc: 17.3 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 3f8e3c2bd8)
2017-10-27 18:55:43 +01:00
Leo Liu
3da6dd8003 radeon/video: add gfx9 offsets when rejoining the video surface
For CPU access.

Signed-off-by: Leo Liu <leo.liu@amd.com>
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Christian König <christian.koenig@amd.com>
(cherry picked from commit ea3dc75d72)
2017-10-27 18:55:41 +01:00
Jason Ekstrand
2e33d68046 anv/pipeline: Call nir_lower_system_values after brw_preprocess_nir
We currently have a bug where nir_lower_system_values gets called before
nir_lower_var_copies so it will miss any system value uses which come
from a copy_var intrinsic.  Moving it to after brw_preprocess_nir fixes
this problem.
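
The ordering constraint, sketched (simplified call sequence; return
values and other preprocessing elided):

    #include "compiler/brw_nir.h"
    #include "compiler/nir/nir.h"

    static nir_shader *
    order_sketch(const struct brw_compiler *compiler, nir_shader *nir)
    {
       /* copy_var intrinsics hide system-value uses, so lower the
        * copies first (brw_preprocess_nir does this, among others)... */
       nir = brw_preprocess_nir(compiler, nir);
       /* ...then lower system values, which now sees every direct use. */
       nir_lower_system_values(nir);
       return nir;
    }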

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 279f8fb69c)
2017-10-27 18:55:38 +01:00
Jason Ekstrand
3b699fdd19 anv/pipeline: Drop nir_lower_clip_cull_distance_arrays
We already handle it in brw_preprocess_nir.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
(cherry picked from commit afa0ddb81e)
2017-10-27 18:55:24 +01:00
Jason Ekstrand
a2123968fa intel/fs: Handle flag read/write aliasing in needs_src_copy
In order to implement the ballot intrinsic, we do a MOV from flag
register to some GRF.  If that GRF is used in a SEL, cmod propagation
helpfully changes it into a MOV from the flag register with a cmod.
This is perfectly valid but when lower_simd_width comes along, it simply
splits into two instructions which both have conditional modifiers.
This is a problem since we're reading the flag register.  This commit
makes us check whether or not flags_written() overlaps with the flag
values that we are reading via the instruction source and, if we have
any interference, will force us to emit a copy of the source.
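
The added check, reduced to its essence (a sketch; the real test uses
the backend's flag-bitmask helpers):

    #include <stdbool.h>

    static bool needs_src_copy_for_flags(unsigned flags_written,
                                         unsigned flags_read_by_src)
    {
       /* Splitting would leave both halves predicated on flags the
        * instruction itself writes, so copy the source first. */
       return (flags_written & flags_read_by_src) != 0;
    }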

Reviewed-by: Matt Turner <mattst88@gmail.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit fa6e74e33e)
2017-10-27 18:50:27 +01:00
Jan Vesely
1ce3fbeb91 clover: Fix compilation after clang r315871
v2: use a more generic compat function
v3: rename and formatting cleanup

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103388
Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
CC: <mesa-stable@lists.freedesktop.org>
(cherry picked from commit a6d38f476b)
2017-10-27 18:50:24 +01:00
Jason Ekstrand
8f2bc19856 nir/intrinsics: Set the correct num_indices for load_output
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
(cherry picked from commit c1b84256cc)
2017-10-27 18:50:21 +01:00
Matthew Nicholls
b6f0c16a89 ac/nir: generate correct instruction for atomic min/max on unsigned images
v2: fix silly typo

Cc: "17.2 17.3" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 27a0b24bf2)
2017-10-27 18:50:19 +01:00
Dave Airlie
5c8eb88553 radv: use device name in cache creation like radeonsi.
Not sure how useful this is, but it makes it more consistent.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: "17.3" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit d8cefaa197)
2017-10-27 18:50:12 +01:00
Alex Smith
afdb9da492 radv: Update code pointer correctly if a variant is already created
This was the actual cause of GPU hangs fixed by 0fdd531457 ("radv:
Fix pipeline cache locking issues"), since multiple threads would end
up trying to create the variants for a single entry.

Now that we're locking around the whole of this function, this isn't
really necessary (we either create all or none of the variants), but
fix this anyway in case things change later.

Signed-off-by: Alex Smith <asmith@feralinteractive.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
CC: 17.3 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit fee9d05e21)
2017-10-27 18:50:09 +01:00
Kenneth Graunke
b8f10fdf34 i965: Revert absolute mode for constant buffer pointers.
The kernel doesn't initialize the value of the INSTPM or CS_DEBUG_MODE2
registers at context initialization time.  Instead, they're inherited
from whatever happened to be running on the GPU prior to first run of a
new context.  So, when we started setting these, other contexts in the
system started inheriting our values.  Since this controls whether
3DSTATE_CONSTANT_* takes a pointer or an offset, getting the wrong
setting is fatal for almost any process which isn't expecting this.

Unfortunately, VA-API and Beignet don't initialize this (nor does older
Mesa), so they will die horribly if we start doing this.  UXA and SNA
don't use any push constants, so they are unaffected.

Until we have some kind of solution to this problem, I'm going to revert
this patch and abandon using the feature for now.  It will lead to fewer
pushed UBO ranges on Broadwell+, which may lead to lower performance,
though I don't have any data on the impact.

Cc: "17.3 17.2" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102774
(cherry picked from commit 013d331220)
2017-10-27 18:50:07 +01:00
Nicolai Hähnle
ea132f9265 amd/common/gfx9: workaround DCC corruption more conservatively
Fixes KHR-GL45.texture_swizzle.smoke and others on Vega.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102809
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit f9ccfda9bc)
2017-10-27 18:50:04 +01:00
Ilia Mirkin
08b41e70dd glsl: fix derived cs variables
There are two issues with the current implementation. First, it relies
on the layout(local_size_*) happening in the same shader as the main
function, and secondly it doesn't work for variable group sizes.

In both cases, the simplest fix is to move the setup of these derived
values to a later time, similar to how the gl_VertexID workarounds are
done. There already exist system values defined for both of the derived
values, so we use them unconditionally, and lower them after linking is
performed.

While we're at it, we move to using gl_LocalGroupSizeARB instead of
gl_WorkGroupSize for variable group sizes.

Also the dead code elimination avoidance can be removed, since there
can be situations where gl_LocalGroupSizeARB is needed but has not been
inserted for the shader with the main function. As a result, the lowering
code has to insert its own copies of the system values if needed.

Reported-by: Stephane Chevigny <stephane.chevigny@polymtl.ca>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103393
Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 4d24a7cb97)
2017-10-27 18:50:02 +01:00
Emil Velikov
ae720e2873 Update version to 17.3.0-rc1
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2017-10-23 13:30:56 +01:00
63 changed files with 781 additions and 435 deletions

View File

@@ -1 +1 @@
17.3.0-devel
17.3.0-rc3

View File

@@ -3631,15 +3631,17 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
LLVMValueRef i1true = LLVMConstInt(ctx->ac.i1, 1, false);
MAYBE_UNUSED int length;
bool is_unsigned = glsl_get_sampler_result_type(type) == GLSL_TYPE_UINT;
switch (instr->intrinsic) {
case nir_intrinsic_image_atomic_add:
atomic_name = "add";
break;
case nir_intrinsic_image_atomic_min:
atomic_name = "smin";
atomic_name = is_unsigned ? "umin" : "smin";
break;
case nir_intrinsic_image_atomic_max:
atomic_name = "smax";
atomic_name = is_unsigned ? "umax" : "smax";
break;
case nir_intrinsic_image_atomic_and:
atomic_name = "and";

View File

@@ -927,9 +927,11 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
in->numSamples == 1) {
ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
ADDR2_META_MIP_INFO meta_mip_info[RADEON_SURF_MAX_LEVELS] = {};
din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
dout.pMipInfo = meta_mip_info;
din.dccKeyFlags.pipeAligned = 1;
din.dccKeyFlags.rbAligned = 1;
@@ -955,21 +957,37 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
surf->dcc_alignment = dout.dccRamBaseAlign;
surf->num_dcc_levels = in->numMipLevels;
/* Disable DCC for the smallest levels. It seems to be
* required for DCC readability between CB and shaders
* when TC L2 isn't flushed. This was guessed.
/* Disable DCC for levels that are in the mip tail.
*
* There are two issues that this is intended to
* address:
*
* 1. Multiple mip levels may share a cache line. This
* can lead to corruption when switching between
* rendering to different mip levels because the
* RBs don't maintain coherency.
*
* 2. Texturing with metadata after rendering sometimes
* fails with corruption, probably for a similar
* reason.
*
* Working around these issues for all levels in the
* mip tail may be overly conservative, but it's what
* Vulkan does.
*
* Alternative solutions that also work but are worse:
* - Disable DCC.
* - Disable DCC entirely.
* - Flush TC L2 after rendering.
*/
for (unsigned i = 1; i < in->numMipLevels; i++) {
if (mip_info[i].pitch *
mip_info[i].height * surf->bpe < 1024) {
for (unsigned i = 0; i < in->numMipLevels; i++) {
if (meta_mip_info[i].inMiptail) {
surf->num_dcc_levels = i;
break;
}
}
if (!surf->num_dcc_levels)
surf->dcc_size = 0;
}
/* FMASK */

View File

@@ -104,6 +104,75 @@ get_chip_name(enum radeon_family family)
}
}
static void
radv_physical_device_init_mem_types(struct radv_physical_device *device)
{
STATIC_ASSERT(RADV_MEM_HEAP_COUNT <= VK_MAX_MEMORY_HEAPS);
uint64_t visible_vram_size = MIN2(device->rad_info.vram_size,
device->rad_info.vram_vis_size);
int vram_index = -1, visible_vram_index = -1, gart_index = -1;
device->memory_properties.memoryHeapCount = 0;
if (device->rad_info.vram_size - visible_vram_size > 0) {
vram_index = device->memory_properties.memoryHeapCount++;
device->memory_properties.memoryHeaps[vram_index] = (VkMemoryHeap) {
.size = device->rad_info.vram_size - visible_vram_size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
};
}
if (visible_vram_size) {
visible_vram_index = device->memory_properties.memoryHeapCount++;
device->memory_properties.memoryHeaps[visible_vram_index] = (VkMemoryHeap) {
.size = visible_vram_size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
};
}
if (device->rad_info.gart_size > 0) {
gart_index = device->memory_properties.memoryHeapCount++;
device->memory_properties.memoryHeaps[gart_index] = (VkMemoryHeap) {
.size = device->rad_info.gart_size,
.flags = 0,
};
}
STATIC_ASSERT(RADV_MEM_TYPE_COUNT <= VK_MAX_MEMORY_TYPES);
unsigned type_count = 0;
if (vram_index >= 0) {
device->mem_type_indices[type_count] = RADV_MEM_TYPE_VRAM;
device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
.heapIndex = vram_index,
};
}
if (gart_index >= 0) {
device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_WRITE_COMBINE;
device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
.heapIndex = gart_index,
};
}
if (visible_vram_index >= 0) {
device->mem_type_indices[type_count] = RADV_MEM_TYPE_VRAM_CPU_ACCESS;
device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
.heapIndex = visible_vram_index,
};
}
if (gart_index >= 0) {
device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_CACHED;
device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
.heapIndex = gart_index,
};
}
device->memory_properties.memoryTypeCount = type_count;
}
static VkResult
radv_physical_device_init(struct radv_physical_device *device,
struct radv_instance *instance,
@@ -152,6 +221,8 @@ radv_physical_device_init(struct radv_physical_device *device,
goto fail;
}
device->name = get_chip_name(device->rad_info.family);
if (radv_device_get_cache_uuid(device->rad_info.family, device->cache_uuid)) {
radv_finish_wsi(device);
device->ws->destroy(device->ws);
@@ -168,12 +239,11 @@ radv_physical_device_init(struct radv_physical_device *device,
/* The gpu id is already embedded in the uuid so we just pass "radv"
* when creating the cache.
*/
char buf[VK_UUID_SIZE + 1];
disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE);
device->disk_cache = disk_cache_create("radv", buf, shader_env_flags);
char buf[VK_UUID_SIZE * 2 + 1];
disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2);
device->disk_cache = disk_cache_create(device->name, buf, shader_env_flags);
fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n");
device->name = get_chip_name(device->rad_info.family);
radv_get_driver_uuid(&device->device_uuid);
radv_get_device_uuid(&device->rad_info, &device->device_uuid);
@@ -189,6 +259,7 @@ radv_physical_device_init(struct radv_physical_device *device,
*/
device->has_clear_state = device->rad_info.chip_class >= CIK;
radv_physical_device_init_mem_types(device);
return VK_SUCCESS;
fail:
@@ -779,49 +850,7 @@ void radv_GetPhysicalDeviceMemoryProperties(
{
RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
STATIC_ASSERT(RADV_MEM_TYPE_COUNT <= VK_MAX_MEMORY_TYPES);
pMemoryProperties->memoryTypeCount = RADV_MEM_TYPE_COUNT;
pMemoryProperties->memoryTypes[RADV_MEM_TYPE_VRAM] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
.heapIndex = RADV_MEM_HEAP_VRAM,
};
pMemoryProperties->memoryTypes[RADV_MEM_TYPE_GTT_WRITE_COMBINE] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
.heapIndex = RADV_MEM_HEAP_GTT,
};
pMemoryProperties->memoryTypes[RADV_MEM_TYPE_VRAM_CPU_ACCESS] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
.heapIndex = RADV_MEM_HEAP_VRAM_CPU_ACCESS,
};
pMemoryProperties->memoryTypes[RADV_MEM_TYPE_GTT_CACHED] = (VkMemoryType) {
.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
.heapIndex = RADV_MEM_HEAP_GTT,
};
STATIC_ASSERT(RADV_MEM_HEAP_COUNT <= VK_MAX_MEMORY_HEAPS);
uint64_t visible_vram_size = MIN2(physical_device->rad_info.vram_size,
physical_device->rad_info.vram_vis_size);
pMemoryProperties->memoryHeapCount = RADV_MEM_HEAP_COUNT;
pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM] = (VkMemoryHeap) {
.size = physical_device->rad_info.vram_size -
visible_vram_size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
};
pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM_CPU_ACCESS] = (VkMemoryHeap) {
.size = visible_vram_size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
};
pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_GTT] = (VkMemoryHeap) {
.size = physical_device->rad_info.gart_size,
.flags = 0,
};
*pMemoryProperties = physical_device->memory_properties;
}
void radv_GetPhysicalDeviceMemoryProperties2KHR(
@@ -2059,6 +2088,7 @@ VkResult radv_alloc_memory(VkDevice _device,
VkResult result;
enum radeon_bo_domain domain;
uint32_t flags = 0;
enum radv_mem_type mem_type_index = device->physical_device->mem_type_indices[pAllocateInfo->memoryTypeIndex];
assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
@@ -2101,18 +2131,18 @@ VkResult radv_alloc_memory(VkDevice _device,
}
uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_CACHED)
if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
domain = RADEON_DOMAIN_GTT;
else
domain = RADEON_DOMAIN_VRAM;
if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_VRAM)
if (mem_type_index == RADV_MEM_TYPE_VRAM)
flags |= RADEON_FLAG_NO_CPU_ACCESS;
else
flags |= RADEON_FLAG_CPU_ACCESS;
if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
flags |= RADEON_FLAG_GTT_WC;
if (mem_flags & RADV_MEM_IMPLICIT_SYNC)
@@ -2125,7 +2155,7 @@ VkResult radv_alloc_memory(VkDevice _device,
result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
goto fail;
}
mem->type_index = pAllocateInfo->memoryTypeIndex;
mem->type_index = mem_type_index;
out_success:
*pMem = radv_device_memory_to_handle(mem);
@@ -2218,13 +2248,14 @@ VkResult radv_InvalidateMappedMemoryRanges(
}
void radv_GetBufferMemoryRequirements(
VkDevice device,
VkDevice _device,
VkBuffer _buffer,
VkMemoryRequirements* pMemoryRequirements)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1;
pMemoryRequirements->memoryTypeBits = (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
if (buffer->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
pMemoryRequirements->alignment = 4096;
@@ -2258,13 +2289,14 @@ void radv_GetBufferMemoryRequirements2KHR(
}
void radv_GetImageMemoryRequirements(
VkDevice device,
VkDevice _device,
VkImage _image,
VkMemoryRequirements* pMemoryRequirements)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_image, image, _image);
pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1;
pMemoryRequirements->memoryTypeBits = (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
pMemoryRequirements->size = image->size;
pMemoryRequirements->alignment = image->alignment;

View File

@@ -1766,6 +1766,13 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
stage ? stage->pName : "main", i,
stage ? stage->pSpecializationInfo : NULL);
pipeline->active_stages |= mesa_to_vk_shader_stage(i);
/* We don't want to alter meta shaders IR directly so clone it
* first.
*/
if (nir[i]->info.name) {
nir[i] = nir_shader_clone(NULL, nir[i]);
}
}
if (nir[MESA_SHADER_TESS_CTRL]) {
@@ -1779,6 +1786,14 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
radv_link_shaders(pipeline, nir);
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
continue;
if (modules[i])
nir_print_shader(nir[i], stderr);
}
if (nir[MESA_SHADER_FRAGMENT]) {
if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
pipeline->shaders[MESA_SHADER_FRAGMENT] =
@@ -1863,7 +1878,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
free(codes[i]);
if (modules[i] && !modules[i]->nir && !pipeline->device->trace_bo)
if (modules[i] && !pipeline->device->trace_bo)
ralloc_free(nir[i]);
}

View File

@@ -170,81 +170,6 @@ radv_pipeline_cache_search(struct radv_pipeline_cache *cache,
return entry;
}
bool
radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
struct radv_pipeline_cache *cache,
const unsigned char *sha1,
struct radv_shader_variant **variants)
{
struct cache_entry *entry;
if (!cache)
cache = device->mem_cache;
pthread_mutex_lock(&cache->mutex);
entry = radv_pipeline_cache_search_unlocked(cache, sha1);
if (!entry) {
if (!device->physical_device->disk_cache ||
(device->instance->debug_flags & RADV_DEBUG_NO_CACHE)) {
pthread_mutex_unlock(&cache->mutex);
return false;
}
uint8_t disk_sha1[20];
disk_cache_compute_key(device->physical_device->disk_cache,
sha1, 20, disk_sha1);
entry = (struct cache_entry *)
disk_cache_get(device->physical_device->disk_cache,
disk_sha1, NULL);
if (!entry) {
pthread_mutex_unlock(&cache->mutex);
return false;
}
}
char *p = entry->code;
for(int i = 0; i < MESA_SHADER_STAGES; ++i) {
if (!entry->variants[i] && entry->code_sizes[i]) {
struct radv_shader_variant *variant;
struct cache_entry_variant_info info;
variant = calloc(1, sizeof(struct radv_shader_variant));
if (!variant) {
pthread_mutex_unlock(&cache->mutex);
return false;
}
memcpy(&info, p, sizeof(struct cache_entry_variant_info));
p += sizeof(struct cache_entry_variant_info);
variant->config = info.config;
variant->info = info.variant_info;
variant->rsrc1 = info.rsrc1;
variant->rsrc2 = info.rsrc2;
variant->code_size = entry->code_sizes[i];
variant->ref_count = 1;
void *ptr = radv_alloc_shader_memory(device, variant);
memcpy(ptr, p, entry->code_sizes[i]);
p += entry->code_sizes[i];
entry->variants[i] = variant;
}
}
for (int i = 0; i < MESA_SHADER_STAGES; ++i)
if (entry->variants[i])
p_atomic_inc(&entry->variants[i]->ref_count);
memcpy(variants, entry->variants, sizeof(entry->variants));
pthread_mutex_unlock(&cache->mutex);
return true;
}
static void
radv_pipeline_cache_set_entry(struct radv_pipeline_cache *cache,
struct cache_entry *entry)
@@ -314,6 +239,97 @@ radv_pipeline_cache_add_entry(struct radv_pipeline_cache *cache,
radv_pipeline_cache_set_entry(cache, entry);
}
bool
radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
struct radv_pipeline_cache *cache,
const unsigned char *sha1,
struct radv_shader_variant **variants)
{
struct cache_entry *entry;
if (!cache)
cache = device->mem_cache;
pthread_mutex_lock(&cache->mutex);
entry = radv_pipeline_cache_search_unlocked(cache, sha1);
if (!entry) {
if (!device->physical_device->disk_cache ||
(device->instance->debug_flags & RADV_DEBUG_NO_CACHE)) {
pthread_mutex_unlock(&cache->mutex);
return false;
}
uint8_t disk_sha1[20];
disk_cache_compute_key(device->physical_device->disk_cache,
sha1, 20, disk_sha1);
entry = (struct cache_entry *)
disk_cache_get(device->physical_device->disk_cache,
disk_sha1, NULL);
if (!entry) {
pthread_mutex_unlock(&cache->mutex);
return false;
} else {
size_t size = entry_size(entry);
struct cache_entry *new_entry = vk_alloc(&cache->alloc, size, 8,
VK_SYSTEM_ALLOCATION_SCOPE_CACHE);
if (!new_entry) {
free(entry);
pthread_mutex_unlock(&cache->mutex);
return false;
}
memcpy(new_entry, entry, entry_size(entry));
free(entry);
entry = new_entry;
radv_pipeline_cache_add_entry(cache, new_entry);
}
}
char *p = entry->code;
for(int i = 0; i < MESA_SHADER_STAGES; ++i) {
if (!entry->variants[i] && entry->code_sizes[i]) {
struct radv_shader_variant *variant;
struct cache_entry_variant_info info;
variant = calloc(1, sizeof(struct radv_shader_variant));
if (!variant) {
pthread_mutex_unlock(&cache->mutex);
return false;
}
memcpy(&info, p, sizeof(struct cache_entry_variant_info));
p += sizeof(struct cache_entry_variant_info);
variant->config = info.config;
variant->info = info.variant_info;
variant->rsrc1 = info.rsrc1;
variant->rsrc2 = info.rsrc2;
variant->code_size = entry->code_sizes[i];
variant->ref_count = 1;
void *ptr = radv_alloc_shader_memory(device, variant);
memcpy(ptr, p, entry->code_sizes[i]);
p += entry->code_sizes[i];
entry->variants[i] = variant;
} else if (entry->code_sizes[i]) {
p += sizeof(struct cache_entry_variant_info) + entry->code_sizes[i];
}
}
for (int i = 0; i < MESA_SHADER_STAGES; ++i)
if (entry->variants[i])
p_atomic_inc(&entry->variants[i]->ref_count);
memcpy(variants, entry->variants, sizeof(entry->variants));
pthread_mutex_unlock(&cache->mutex);
return true;
}
void
radv_pipeline_cache_insert_shaders(struct radv_device *device,
struct radv_pipeline_cache *cache,

View File

@@ -282,6 +282,9 @@ struct radv_physical_device {
* the pipeline cache defined by apps.
*/
struct disk_cache * disk_cache;
VkPhysicalDeviceMemoryProperties memory_properties;
enum radv_mem_type mem_type_indices[RADV_MEM_TYPE_COUNT];
};
struct radv_instance {

View File

@@ -291,9 +291,6 @@ radv_shader_compile_to_nir(struct radv_device *device,
nir_remove_dead_variables(nir, nir_var_local);
radv_optimize_nir(nir);
if (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)
nir_print_shader(nir, stderr);
return nir;
}

View File

@@ -194,12 +194,26 @@ radv_wsi_image_create(VkDevice device_h,
.image = image_h
};
/* Find the first VRAM memory type, or GART for PRIME images. */
int memory_type_index = -1;
for (int i = 0; i < device->physical_device->memory_properties.memoryTypeCount; ++i) {
bool is_local = !!(device->physical_device->memory_properties.memoryTypes[i].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if ((linear && !is_local) || (!linear && is_local)) {
memory_type_index = i;
break;
}
}
/* fallback */
if (memory_type_index == -1)
memory_type_index = 0;
result = radv_alloc_memory(device_h,
&(VkMemoryAllocateInfo) {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = &ded_alloc,
.allocationSize = image->size,
.memoryTypeIndex = linear ? 1 : 0,
.memoryTypeIndex = memory_type_index,
},
NULL /* XXX: pAllocator */,
RADV_MEM_IMPLICIT_SYNC,

View File

@@ -85,6 +85,7 @@ LIBGLSL_FILES = \
glsl/lower_buffer_access.cpp \
glsl/lower_buffer_access.h \
glsl/lower_const_arrays_to_uniforms.cpp \
glsl/lower_cs_derived.cpp \
glsl/lower_discard.cpp \
glsl/lower_discard_flow.cpp \
glsl/lower_distance.cpp \

View File

@@ -1295,15 +1295,10 @@ builtin_variable_generator::generate_cs_special_vars()
uvec3_t, "gl_LocalGroupSizeARB");
}
if (state->ctx->Const.LowerCsDerivedVariables) {
add_variable("gl_GlobalInvocationID", uvec3_t, ir_var_auto, 0);
add_variable("gl_LocalInvocationIndex", uint_t, ir_var_auto, 0);
} else {
add_system_value(SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
uvec3_t, "gl_GlobalInvocationID");
add_system_value(SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
uint_t, "gl_LocalInvocationIndex");
}
add_system_value(SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
uvec3_t, "gl_GlobalInvocationID");
add_system_value(SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
uint_t, "gl_LocalInvocationIndex");
}
@@ -1474,84 +1469,3 @@ _mesa_glsl_initialize_variables(exec_list *instructions,
break;
}
}
/**
* Initialize compute shader variables with values that are derived from other
* compute shader variable.
*/
static void
initialize_cs_derived_variables(gl_shader *shader,
ir_function_signature *const main_sig)
{
assert(shader->Stage == MESA_SHADER_COMPUTE);
ir_variable *gl_GlobalInvocationID =
shader->symbols->get_variable("gl_GlobalInvocationID");
assert(gl_GlobalInvocationID);
ir_variable *gl_WorkGroupID =
shader->symbols->get_variable("gl_WorkGroupID");
assert(gl_WorkGroupID);
ir_variable *gl_WorkGroupSize =
shader->symbols->get_variable("gl_WorkGroupSize");
if (gl_WorkGroupSize == NULL) {
void *const mem_ctx = ralloc_parent(shader->ir);
gl_WorkGroupSize = new(mem_ctx) ir_variable(glsl_type::uvec3_type,
"gl_WorkGroupSize",
ir_var_auto);
gl_WorkGroupSize->data.how_declared = ir_var_declared_implicitly;
gl_WorkGroupSize->data.read_only = true;
shader->ir->push_head(gl_WorkGroupSize);
}
ir_variable *gl_LocalInvocationID =
shader->symbols->get_variable("gl_LocalInvocationID");
assert(gl_LocalInvocationID);
/* gl_GlobalInvocationID =
* gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID
*/
ir_instruction *inst =
assign(gl_GlobalInvocationID,
add(mul(gl_WorkGroupID, gl_WorkGroupSize),
gl_LocalInvocationID));
main_sig->body.push_head(inst);
/* gl_LocalInvocationIndex =
* gl_LocalInvocationID.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y +
* gl_LocalInvocationID.y * gl_WorkGroupSize.x +
* gl_LocalInvocationID.x;
*/
ir_expression *index_z =
mul(mul(swizzle_z(gl_LocalInvocationID), swizzle_x(gl_WorkGroupSize)),
swizzle_y(gl_WorkGroupSize));
ir_expression *index_y =
mul(swizzle_y(gl_LocalInvocationID), swizzle_x(gl_WorkGroupSize));
ir_expression *index_y_plus_z = add(index_y, index_z);
operand index_x(swizzle_x(gl_LocalInvocationID));
ir_expression *index_x_plus_y_plus_z = add(index_y_plus_z, index_x);
ir_variable *gl_LocalInvocationIndex =
shader->symbols->get_variable("gl_LocalInvocationIndex");
assert(gl_LocalInvocationIndex);
inst = assign(gl_LocalInvocationIndex, index_x_plus_y_plus_z);
main_sig->body.push_head(inst);
}
/**
* Initialize builtin variables with values based on other builtin variables.
* These are initialized in the main function.
*/
void
_mesa_glsl_initialize_derived_variables(struct gl_context *ctx,
gl_shader *shader)
{
/* We only need to set CS variables currently. */
if (shader->Stage == MESA_SHADER_COMPUTE &&
ctx->Const.LowerCsDerivedVariables) {
ir_function_signature *const main_sig =
_mesa_get_main_function_signature(shader->symbols);
if (main_sig != NULL)
initialize_cs_derived_variables(shader, main_sig);
}
}

View File

@@ -2009,8 +2009,6 @@ opt_shader_and_create_symbol_table(struct gl_context *ctx,
break;
}
}
_mesa_glsl_initialize_derived_variables(ctx, shader);
}
void

View File

@@ -2412,10 +2412,6 @@ extern void
_mesa_glsl_initialize_variables(exec_list *instructions,
struct _mesa_glsl_parse_state *state);
extern void
_mesa_glsl_initialize_derived_variables(struct gl_context *ctx,
gl_shader *shader);
extern void
reparent_ir(exec_list *list, void *mem_ctx);

View File

@@ -166,6 +166,7 @@ void optimize_dead_builtin_variables(exec_list *instructions,
bool lower_tess_level(gl_linked_shader *shader);
bool lower_vertex_id(gl_linked_shader *shader);
bool lower_cs_derived(gl_linked_shader *shader);
bool lower_blend_equation_advanced(gl_linked_shader *shader);
bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);

View File

@@ -2374,6 +2374,9 @@ link_intrastage_shaders(void *mem_ctx,
if (ctx->Const.VertexID_is_zero_based)
lower_vertex_id(linked);
if (ctx->Const.LowerCsDerivedVariables)
lower_cs_derived(linked);
#ifdef DEBUG
/* Compute the source checksum. */
linked->SourceChecksum = 0;

View File

@@ -0,0 +1,234 @@
/*
* Copyright © 2017 Ilia Mirkin
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file lower_cs_derived.cpp
*
* For hardware that does not support the gl_GlobalInvocationID and
* gl_LocalInvocationIndex system values, replace them with fresh
* globals. Note that we can't rely on gl_WorkGroupSize or
* gl_LocalGroupSizeARB being available, since they may only have been defined
* in a non-main shader.
*
* [ This can happen if only a secondary shader has the layout(local_size_*)
* declaration. ]
*
* This is meant to be run post-linking.
*/
#include "glsl_symbol_table.h"
#include "ir_hierarchical_visitor.h"
#include "ir.h"
#include "ir_builder.h"
#include "linker.h"
#include "program/prog_statevars.h"
#include "builtin_functions.h"
using namespace ir_builder;
namespace {
class lower_cs_derived_visitor : public ir_hierarchical_visitor {
public:
explicit lower_cs_derived_visitor(gl_linked_shader *shader)
: progress(false),
shader(shader),
local_size_variable(shader->Program->info.cs.local_size_variable),
gl_WorkGroupSize(NULL),
gl_WorkGroupID(NULL),
gl_LocalInvocationID(NULL),
gl_GlobalInvocationID(NULL),
gl_LocalInvocationIndex(NULL)
{
main_sig = _mesa_get_main_function_signature(shader->symbols);
assert(main_sig);
}
virtual ir_visitor_status visit(ir_dereference_variable *);
ir_variable *add_system_value(
int slot, const glsl_type *type, const char *name);
void find_sysvals();
void make_gl_GlobalInvocationID();
void make_gl_LocalInvocationIndex();
bool progress;
private:
gl_linked_shader *shader;
bool local_size_variable;
ir_function_signature *main_sig;
ir_rvalue *gl_WorkGroupSize;
ir_variable *gl_WorkGroupID;
ir_variable *gl_LocalInvocationID;
ir_variable *gl_GlobalInvocationID;
ir_variable *gl_LocalInvocationIndex;
};
} /* anonymous namespace */
ir_variable *
lower_cs_derived_visitor::add_system_value(
int slot, const glsl_type *type, const char *name)
{
ir_variable *var = new(shader) ir_variable(type, name, ir_var_system_value);
var->data.how_declared = ir_var_declared_implicitly;
var->data.read_only = true;
var->data.location = slot;
var->data.explicit_location = true;
var->data.explicit_index = 0;
shader->ir->push_head(var);
return var;
}
void
lower_cs_derived_visitor::find_sysvals()
{
if (gl_WorkGroupSize != NULL)
return;
ir_variable *WorkGroupSize;
if (local_size_variable)
WorkGroupSize = shader->symbols->get_variable("gl_LocalGroupSizeARB");
else
WorkGroupSize = shader->symbols->get_variable("gl_WorkGroupSize");
if (WorkGroupSize)
gl_WorkGroupSize = new(shader) ir_dereference_variable(WorkGroupSize);
gl_WorkGroupID = shader->symbols->get_variable("gl_WorkGroupID");
gl_LocalInvocationID = shader->symbols->get_variable("gl_LocalInvocationID");
/*
* These may be missing due to either dead code elimination, or, in the
* case of the group size, due to the layout being declared in a non-main
* shader. Re-create them.
*/
if (!gl_WorkGroupID)
gl_WorkGroupID = add_system_value(
SYSTEM_VALUE_WORK_GROUP_ID, glsl_type::uvec3_type, "gl_WorkGroupID");
if (!gl_LocalInvocationID)
gl_LocalInvocationID = add_system_value(
SYSTEM_VALUE_LOCAL_INVOCATION_ID, glsl_type::uvec3_type,
"gl_LocalInvocationID");
if (!WorkGroupSize) {
if (local_size_variable) {
gl_WorkGroupSize = new(shader) ir_dereference_variable(
add_system_value(
SYSTEM_VALUE_LOCAL_GROUP_SIZE, glsl_type::uvec3_type,
"gl_LocalGroupSizeARB"));
} else {
ir_constant_data data;
memset(&data, 0, sizeof(data));
for (int i = 0; i < 3; i++)
data.u[i] = shader->Program->info.cs.local_size[i];
gl_WorkGroupSize = new(shader) ir_constant(glsl_type::uvec3_type, &data);
}
}
}
void
lower_cs_derived_visitor::make_gl_GlobalInvocationID()
{
if (gl_GlobalInvocationID != NULL)
return;
find_sysvals();
/* gl_GlobalInvocationID =
* gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID
*/
gl_GlobalInvocationID = new(shader) ir_variable(
glsl_type::uvec3_type, "__GlobalInvocationID", ir_var_temporary);
shader->ir->push_head(gl_GlobalInvocationID);
ir_instruction *inst =
assign(gl_GlobalInvocationID,
add(mul(gl_WorkGroupID, gl_WorkGroupSize->clone(shader, NULL)),
gl_LocalInvocationID));
main_sig->body.push_head(inst);
}
void
lower_cs_derived_visitor::make_gl_LocalInvocationIndex()
{
if (gl_LocalInvocationIndex != NULL)
return;
find_sysvals();
/* gl_LocalInvocationIndex =
* gl_LocalInvocationID.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y +
* gl_LocalInvocationID.y * gl_WorkGroupSize.x +
* gl_LocalInvocationID.x;
*/
gl_LocalInvocationIndex = new(shader)
ir_variable(glsl_type::uint_type, "__LocalInvocationIndex", ir_var_temporary);
shader->ir->push_head(gl_LocalInvocationIndex);
ir_expression *index_z =
mul(mul(swizzle_z(gl_LocalInvocationID), swizzle_x(gl_WorkGroupSize->clone(shader, NULL))),
swizzle_y(gl_WorkGroupSize->clone(shader, NULL)));
ir_expression *index_y =
mul(swizzle_y(gl_LocalInvocationID), swizzle_x(gl_WorkGroupSize->clone(shader, NULL)));
ir_expression *index_y_plus_z = add(index_y, index_z);
operand index_x(swizzle_x(gl_LocalInvocationID));
ir_expression *index_x_plus_y_plus_z = add(index_y_plus_z, index_x);
ir_instruction *inst =
assign(gl_LocalInvocationIndex, index_x_plus_y_plus_z);
main_sig->body.push_head(inst);
}
ir_visitor_status
lower_cs_derived_visitor::visit(ir_dereference_variable *ir)
{
if (ir->var->data.mode == ir_var_system_value &&
ir->var->data.location == SYSTEM_VALUE_GLOBAL_INVOCATION_ID) {
make_gl_GlobalInvocationID();
ir->var = gl_GlobalInvocationID;
progress = true;
}
if (ir->var->data.mode == ir_var_system_value &&
ir->var->data.location == SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) {
make_gl_LocalInvocationIndex();
ir->var = gl_LocalInvocationIndex;
progress = true;
}
return visit_continue;
}
bool
lower_cs_derived(gl_linked_shader *shader)
{
if (shader->Stage != MESA_SHADER_COMPUTE)
return false;
lower_cs_derived_visitor v(shader);
v.run(shader->ir);
return v.progress;
}
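
For context, a plausible linker-side driver of this pass, assuming the usual loop over linked stages (the iteration code is an assumption, not part of this patch):

bool progress = false;
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
   if (prog->_LinkedShaders[i] != NULL)
      progress |= lower_cs_derived(prog->_LinkedShaders[i]);
}
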

View File

@@ -124,6 +124,7 @@ files_libglsl = files(
'lower_buffer_access.cpp',
'lower_buffer_access.h',
'lower_const_arrays_to_uniforms.cpp',
'lower_cs_derived.cpp',
'lower_discard.cpp',
'lower_discard_flow.cpp',
'lower_distance.cpp',

View File

@@ -62,23 +62,6 @@ optimize_dead_builtin_variables(exec_list *instructions,
* information, so removing these variables from the user shader will
* cause problems later.
*
* For compute shaders, gl_GlobalInvocationID has some dependencies, so
* we avoid removing these dependencies.
*
* We also avoid removing gl_GlobalInvocationID at this stage because it
* might be used by a linked shader. In this case it still needs to be
* initialized by the main function.
*
* gl_GlobalInvocationID =
* gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID
*
* Similarly, we initialize gl_LocalInvocationIndex in the main function:
*
* gl_LocalInvocationIndex =
* gl_LocalInvocationID.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y +
* gl_LocalInvocationID.y * gl_WorkGroupSize.x +
* gl_LocalInvocationID.x;
*
* Matrix uniforms with "Transpose" are not eliminated because there's
* an optimization pass that can turn references to the regular matrix
* into references to the transpose matrix. Eliminating the transpose
@@ -90,11 +73,6 @@ optimize_dead_builtin_variables(exec_list *instructions,
*/
if (strcmp(var->name, "gl_ModelViewProjectionMatrix") == 0
|| strcmp(var->name, "gl_Vertex") == 0
|| strcmp(var->name, "gl_WorkGroupID") == 0
|| strcmp(var->name, "gl_WorkGroupSize") == 0
|| strcmp(var->name, "gl_LocalInvocationID") == 0
|| strcmp(var->name, "gl_GlobalInvocationID") == 0
|| strcmp(var->name, "gl_LocalInvocationIndex") == 0
|| strstr(var->name, "Transpose") != NULL)
continue;

View File

@@ -434,7 +434,7 @@ INTRINSIC(load_interpolated_input, 2, ARR(2, 1), true, 0, 0,
/* src[] = { buffer_index, offset }. No const_index */
LOAD(ssbo, 2, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
/* src[] = { offset }. const_index[] = { base, component } */
LOAD(output, 1, 1, BASE, COMPONENT, xx, NIR_INTRINSIC_CAN_ELIMINATE)
LOAD(output, 1, 2, BASE, COMPONENT, xx, NIR_INTRINSIC_CAN_ELIMINATE)
/* src[] = { vertex, offset }. const_index[] = { base, component } */
LOAD(per_vertex_output, 2, 1, BASE, COMPONENT, xx, NIR_INTRINSIC_CAN_ELIMINATE)
/* src[] = { offset }. const_index[] = { base } */

View File

@@ -28,6 +28,26 @@
* \file nir_opt_intrinsics.c
*/
static nir_ssa_def *
high_subgroup_mask(nir_builder *b,
nir_ssa_def *count,
uint64_t base_mask)
{
/* group_mask could probably be calculated more efficiently but we want to
* be sure not to shift by 64 if the subgroup size is 64 because the GLSL
* shift operator is undefined in that case. In any case, if we were worried
* about efficiency this should probably be done further down because the
* subgroup size is likely to be known at compile time.
*/
nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);
nir_ssa_def *all_bits = nir_imm_int64(b, ~0ull);
nir_ssa_def *shift = nir_isub(b, nir_imm_int(b, 64), subgroup_size);
nir_ssa_def *group_mask = nir_ushr(b, all_bits, shift);
nir_ssa_def *higher_bits = nir_ishl(b, nir_imm_int64(b, base_mask), count);
return nir_iand(b, higher_bits, group_mask);
}
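
A host-side model of the same computation makes the shift hazard easier to see (a sketch; count is the invocation index, so count < subgroup_size <= 64):

#include <stdint.h>

static uint64_t
high_subgroup_mask_model(unsigned count, unsigned subgroup_size,
                         uint64_t base_mask)
{
   /* ~0ull >> (64 - subgroup_size) keeps subgroup_size low bits set
    * without ever shifting a 64-bit value by 64, which is undefined
    * in C just as it is in GLSL. */
   uint64_t group_mask = ~0ull >> (64 - subgroup_size);
   return (base_mask << count) & group_mask;
}
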
static bool
opt_intrinsics_impl(nir_function_impl *impl)
{
@@ -95,10 +115,10 @@ opt_intrinsics_impl(nir_function_impl *impl)
replacement = nir_ishl(&b, nir_imm_int64(&b, 1ull), count);
break;
case nir_intrinsic_load_subgroup_ge_mask:
replacement = nir_ishl(&b, nir_imm_int64(&b, ~0ull), count);
replacement = high_subgroup_mask(&b, count, ~0ull);
break;
case nir_intrinsic_load_subgroup_gt_mask:
replacement = nir_ishl(&b, nir_imm_int64(&b, ~1ull), count);
replacement = high_subgroup_mask(&b, count, ~1ull);
break;
case nir_intrinsic_load_subgroup_le_mask:
replacement = nir_inot(&b, nir_ishl(&b, nir_imm_int64(&b, ~1ull), count));

View File

@@ -2802,7 +2802,8 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvOpMemoryModel:
assert(w[1] == SpvAddressingModelLogical);
assert(w[2] == SpvMemoryModelGLSL450);
assert(w[2] == SpvMemoryModelSimple ||
w[2] == SpvMemoryModelGLSL450);
break;
case SpvOpEntryPoint: {

View File

@@ -21,7 +21,9 @@
c_args_for_egl = []
link_for_egl = []
deps_for_egl = []
incs_for_egl = []
incs_for_egl = [
inc_include, inc_src, inc_loader, inc_gbm, include_directories('main'),
]
files_egl = files(
'main/eglapi.c',
'main/eglapi.h',
@@ -159,10 +161,7 @@ libegl = shared_library(
'-D_EGL_BUILT_IN_DRIVER_DRI2',
'-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_@0@'.format(egl_native_platform.to_upper()),
],
include_directories : [
incs_for_egl, inc_include, inc_src, inc_loader, inc_gbm,
include_directories('main'),
],
include_directories : incs_for_egl,
link_with : [link_for_egl, libloader, libxmlconfig, libglapi, libmesa_util],
link_args : [ld_args_bsymbolic, ld_args_gc_sections],
dependencies : [deps_for_egl, dep_dl, dep_libdrm, dep_clock, dep_thread],

View File

@@ -3,7 +3,7 @@ pkgconfig_DATA = wayland-egl.pc
AM_CFLAGS = $(DEFINES) \
$(VISIBILITY_CFLAGS) \
$(WAYLAND_SERVER_CFLAGS)
$(WAYLAND_CLIENT_CFLAGS)
lib_LTLIBRARIES = libwayland-egl.la
noinst_HEADERS = wayland-egl-backend.h

View File

@@ -24,6 +24,7 @@ libwayland_egl = shared_library(
'wayland-egl.c',
c_args : [c_vis_args],
link_args : ld_args_gc_sections,
dependencies : dep_wayland_client,
version : '1.0.0',
install : true,
)

View File

@@ -555,6 +555,7 @@ etna_try_rs_blit(struct pipe_context *pctx,
}
/* Set up color TS to source surface before blit, if needed */
bool source_ts_valid = false;
if (src->levels[blit_info->src.level].ts_size &&
src->levels[blit_info->src.level].ts_valid) {
struct etna_reloc reloc;
@@ -579,6 +580,8 @@ etna_try_rs_blit(struct pipe_context *pctx,
etna_set_state(ctx->stream, VIVS_TS_COLOR_CLEAR_VALUE,
src->levels[blit_info->src.level].clear_value);
source_ts_valid = true;
} else {
etna_set_state(ctx->stream, VIVS_TS_MEM_CONFIG, ts_mem_config);
}
@@ -593,6 +596,7 @@ etna_try_rs_blit(struct pipe_context *pctx,
.source_stride = src_lev->stride,
.source_padded_width = src_lev->padded_width,
.source_padded_height = src_lev->padded_height,
.source_ts_valid = source_ts_valid,
.dest_format = translate_rs_format(dst_format),
.dest_tiling = dst->layout,
.dest = dst->bo,

View File

@@ -171,6 +171,10 @@ etna_submit_rs_state(struct etna_context *ctx,
struct etna_cmd_stream *stream = ctx->stream;
struct etna_coalesce coalesce;
if (cs->RS_KICKER_INPLACE && !cs->source_ts_valid)
/* In-place resolve is a no-op if TS is not configured */
return;
ctx->stats.rs_operations++;
if (cs->RS_KICKER_INPLACE) {

View File

@@ -133,6 +133,7 @@ etna_compile_rs_state(struct etna_context *ctx, struct compiled_rs_state *cs,
/* Total number of tiles (same as for autodisable) */
cs->RS_KICKER_INPLACE = rs->source_padded_width * rs->source_padded_height / 16;
}
cs->source_ts_valid = rs->source_ts_valid;
}
void

View File

@@ -33,6 +33,7 @@
struct rs_state {
uint8_t downsample_x : 1; /* Downsample in x direction */
uint8_t downsample_y : 1; /* Downsample in y direction */
uint8_t source_ts_valid : 1;
uint8_t source_format; /* RS_FORMAT_XXX */
uint8_t source_tiling; /* ETNA_LAYOUT_XXX */
@@ -61,6 +62,7 @@ struct rs_state {
/* treat this as opaque structure */
struct compiled_rs_state {
uint8_t source_ts_valid : 1;
uint32_t RS_CONFIG;
uint32_t RS_SOURCE_STRIDE;
uint32_t RS_DEST_STRIDE;

View File

@@ -216,6 +216,23 @@ void i915_update_derived(struct i915_context *i915)
if (I915_DBG_ON(DBG_ATOMS))
i915_dump_dirty(i915, __FUNCTION__);
if (!i915->fs) {
i915->dirty &= ~(I915_NEW_FS_CONSTANTS | I915_NEW_FS);
i915->hardware_dirty &= ~(I915_HW_PROGRAM | I915_HW_CONSTANTS);
}
if (!i915->vs)
i915->dirty &= ~I915_NEW_VS;
if (!i915->blend)
i915->dirty &= ~I915_NEW_BLEND;
if (!i915->rasterizer)
i915->dirty &= ~I915_NEW_RASTERIZER;
if (!i915->depth_stencil)
i915->dirty &= ~I915_NEW_DEPTH_STENCIL;
for (i = 0; atoms[i]; i++)
if (atoms[i]->dirty & i915->dirty)
atoms[i]->update(i915);

View File

@@ -213,7 +213,8 @@ static void upload_STIPPLE(struct i915_context *i915)
/* I915_NEW_RASTERIZER
*/
st[1] |= i915->rasterizer->st;
if (i915->rasterizer)
st[1] |= i915->rasterizer->st;
/* I915_NEW_STIPPLE
*/

View File

@@ -168,11 +168,13 @@ static void upload_S6(struct i915_context *i915)
/* I915_NEW_BLEND
*/
LIS6 |= i915->blend->LIS6;
if (i915->blend)
LIS6 |= i915->blend->LIS6;
/* I915_NEW_DEPTH
*/
LIS6 |= i915->depth_stencil->depth_LIS6;
if (i915->depth_stencil)
LIS6 |= i915->depth_stencil->depth_LIS6;
set_immediate(i915, I915_IMMEDIATE_S6, LIS6);
}

View File

@@ -216,7 +216,7 @@ static void update_dst_buf_vars(struct i915_context *i915)
zformat = translate_depth_format(depth_surface->format);
if (is->is_i945 && tex->tiling != I915_TILE_NONE
&& !i915->fs->info.writes_z)
&& (i915->fs && !i915->fs->info.writes_z))
early_z = CLASSIC_EARLY_DEPTH;
} else
zformat = 0;

View File

@@ -711,22 +711,24 @@ void alu_group_tracker::update_flags(alu_node* n) {
}
int post_scheduler::run() {
run_on(sh.root);
return 0;
return run_on(sh.root) ? 0 : 1;
}
void post_scheduler::run_on(container_node* n) {
bool post_scheduler::run_on(container_node* n) {
bool r = true;
for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
if (I->is_container()) {
if (I->subtype == NST_BB) {
bb_node* bb = static_cast<bb_node*>(*I);
schedule_bb(bb);
r = schedule_bb(bb);
} else {
run_on(static_cast<container_node*>(*I));
r = run_on(static_cast<container_node*>(*I));
}
if (!r)
break;
}
}
return r;
}
void post_scheduler::init_uc_val(container_node *c, value *v) {
@@ -758,7 +760,7 @@ unsigned post_scheduler::init_ucm(container_node *c, node *n) {
return F == ucm.end() ? 0 : F->second;
}
void post_scheduler::schedule_bb(bb_node* bb) {
bool post_scheduler::schedule_bb(bb_node* bb) {
PSC_DUMP(
sblog << "scheduling BB " << bb->id << "\n";
if (!pending.empty())
@@ -791,8 +793,10 @@ void post_scheduler::schedule_bb(bb_node* bb) {
if (n->is_alu_clause()) {
n->remove();
process_alu(static_cast<container_node*>(n));
continue;
bool r = process_alu(static_cast<container_node*>(n));
if (r)
continue;
return false;
}
n->remove();
@@ -800,6 +804,7 @@ void post_scheduler::schedule_bb(bb_node* bb) {
}
this->cur_bb = NULL;
return true;
}
void post_scheduler::init_regmap() {
@@ -933,10 +938,10 @@ void post_scheduler::process_fetch(container_node *c) {
cur_bb->push_front(c);
}
void post_scheduler::process_alu(container_node *c) {
bool post_scheduler::process_alu(container_node *c) {
if (c->empty())
return;
return true;
ucm.clear();
alu.reset();
@@ -973,7 +978,7 @@ void post_scheduler::process_alu(container_node *c) {
}
}
schedule_alu(c);
return schedule_alu(c);
}
void post_scheduler::update_local_interferences() {
@@ -1135,15 +1140,20 @@ void post_scheduler::emit_clause() {
emit_index_registers();
}
void post_scheduler::schedule_alu(container_node *c) {
bool post_scheduler::schedule_alu(container_node *c) {
assert(!ready.empty() || !ready_copies.empty());
while (1) {
bool improving = true;
int last_pending = pending.count();
while (improving) {
prev_regmap = regmap;
if (!prepare_alu_group()) {
int new_pending = pending.count();
improving = (new_pending < last_pending) || (last_pending == 0);
last_pending = new_pending;
if (alu.current_idx[0] || alu.current_idx[1]) {
regmap = prev_regmap;
emit_clause();
@@ -1186,6 +1196,7 @@ void post_scheduler::schedule_alu(container_node *c) {
dump::dump_op_list(&pending);
assert(!"unscheduled pending instructions");
}
return improving;
}
void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
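
The void-to-bool conversion running through these hunks is the usual error-propagation shape, reduced to a sketch (hypothetical node type, not the sb classes themselves):

struct node { struct node *prev, *last_child; bool is_leaf; };
static bool schedule_leaf(struct node *c); /* stand-in for schedule_bb() */

static bool
run_on_model(struct node *n)
{
   for (struct node *c = n->last_child; c != NULL; c = c->prev) {
      bool ok = c->is_leaf ? schedule_leaf(c) : run_on_model(c);
      if (!ok)
         return false; /* propagate failure instead of asserting later */
   }
   return true;
}
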

View File

@@ -267,14 +267,14 @@ public:
live(), ucm(), alu(sh), regmap(), cleared_interf() {}
virtual int run();
void run_on(container_node *n);
void schedule_bb(bb_node *bb);
bool run_on(container_node *n);
bool schedule_bb(bb_node *bb);
void load_index_register(value *v, unsigned idx);
void process_fetch(container_node *c);
void process_alu(container_node *c);
void schedule_alu(container_node *c);
bool process_alu(container_node *c);
bool schedule_alu(container_node *c);
bool prepare_alu_group();
void release_op(node *n);

View File

@@ -182,8 +182,11 @@ void si_vid_join_surfaces(struct r600_common_context *rctx,
for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j)
surfaces[i]->u.legacy.level[j].offset += off;
} else
} else {
surfaces[i]->u.gfx9.surf_offset += off;
for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.gfx9.offset); ++j)
surfaces[i]->u.gfx9.offset[j] += off;
}
off += surfaces[i]->surf_size;
}

View File

@@ -6,5 +6,5 @@ DRI_CONF_SECTION_PERFORMANCE
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
DRI_CONF_RADEONSI_CLEAR_DB_META_BEFORE_CLEAR("false")
DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false")
DRI_CONF_SECTION_END

View File

@@ -901,16 +901,16 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
* corruption in ARK: Survival Evolved, but that may just be
* a coincidence and the root cause is elsewhere.
*
* The corruption can be fixed by putting the DB metadata flush
* before or after the depth clear. (suprisingly)
* The corruption can be fixed by putting the DB flush before
* or after the depth clear. (surprisingly)
*
* https://bugs.freedesktop.org/show_bug.cgi?id=102955 (apitrace)
*
* This hack decreases back-to-back ClearDepth performance.
*/
if (sctx->screen->clear_db_meta_before_clear)
sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META |
SI_CONTEXT_PS_PARTIAL_FLUSH;
if (sctx->screen->clear_db_cache_before_clear) {
sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
}
}
si_blitter_begin(ctx, SI_CLEAR);

View File

@@ -1072,8 +1072,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
sscreen->commutative_blend_add =
driQueryOptionb(config->options, "radeonsi_commutative_blend_add");
sscreen->clear_db_meta_before_clear =
driQueryOptionb(config->options, "radeonsi_clear_db_meta_before_clear");
sscreen->clear_db_cache_before_clear =
driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear");
sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
sscreen->b.family <= CHIP_POLARIS12) ||
sscreen->b.family == CHIP_VEGA10 ||

View File

@@ -98,7 +98,7 @@ struct si_screen {
bool has_out_of_order_rast;
bool assume_no_z_fights;
bool commutative_blend_add;
bool clear_db_meta_before_clear;
bool clear_db_cache_before_clear;
bool has_msaa_sample_loc_bug;
bool dpbb_allowed;
bool dfsm_allowed;

View File

@@ -2015,14 +2015,21 @@ static LLVMValueRef fetch_constant(
* code reducing SIMD wave occupancy from 8 to 2 in many cases.
*
* Using s_buffer_load_dword (x1) seems to be the best option right now.
*
* LLVM 5.0 on SI doesn't insert a required s_nop between SALU setting
* a descriptor and s_buffer_load_dword using it, so we can't expand
* the pointer into a full descriptor like below. We have to use
* s_load_dword instead. The only case when LLVM 5.0 would select
* s_buffer_load_dword (that we have to prevent) is when we use
* a literal offset where we don't need bounds checking.
*/
#if 0 /* keep this codepath disabled */
if (!reg->Register.Indirect) {
if (ctx->screen->b.chip_class == SI &&
HAVE_LLVM < 0x0600 &&
!reg->Register.Indirect) {
addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
return bitcast(bld_base, type, result);
}
#endif
/* Do the bounds checking with a descriptor, because
* doing computation and manual bounds checking of 64-bit

View File

@@ -302,8 +302,7 @@ void si_nir_scan_shader(const struct nir_shader *nir,
info->num_written_clipdistance = nir->info.clip_distance_array_size;
info->num_written_culldistance = nir->info.cull_distance_array_size;
info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
info->culldist_writemask = u_bit_consecutive(info->num_written_clipdistance,
info->num_written_culldistance);
info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance);
if (info->processor == PIPE_SHADER_FRAGMENT)
info->uses_kill = nir->info.fs.uses_discard;
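
For reference, u_bit_consecutive(start, count) yields count set bits starting at bit start; a minimal model (not the util/ implementation itself):

static unsigned
bit_consecutive(unsigned start, unsigned count)
{
   /* e.g. bit_consecutive(0, 2) == 0x3; count == 32 is special-cased
    * to avoid an undefined full-width shift. */
   return (count == 32 ? ~0u : (1u << count) - 1) << start;
}

So the fixed call bases the cull mask at bit 0 rather than at num_written_clipdistance.
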

View File

@@ -61,7 +61,7 @@ struct vc4_cl {
struct vc4_cl_out *next;
struct vc4_cl_out *reloc_next;
uint32_t size;
#ifdef DEBUG
#ifndef NDEBUG
uint32_t reloc_count;
#endif
};
@@ -163,8 +163,8 @@ static inline void
cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
{
assert(n == 1 || n == 2);
#ifdef DEBUG
assert(cl->reloc_count == 0);
#ifndef NDEBUG
cl->reloc_count = n;
#endif
@@ -177,8 +177,8 @@ cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
static inline struct vc4_cl_out *
cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
{
#ifdef DEBUG
assert(cl->reloc_count == 0);
#ifndef NDEBUG
cl->reloc_count = n;
#endif
cl->reloc_next = cl->next;
@@ -196,7 +196,7 @@ cl_reloc(struct vc4_job *job, struct vc4_cl *cl, struct vc4_cl_out **cl_out,
*(uint32_t *)cl->reloc_next = vc4_gem_hindex(job, bo);
cl_advance(&cl->reloc_next, 4);
#ifdef DEBUG
#ifndef NDEBUG
cl->reloc_count--;
#endif
@@ -211,7 +211,7 @@ cl_aligned_reloc(struct vc4_job *job, struct vc4_cl *cl,
*(uint32_t *)cl->reloc_next = vc4_gem_hindex(job, bo);
cl_advance(&cl->reloc_next, 4);
#ifdef DEBUG
#ifndef NDEBUG
cl->reloc_count--;
#endif
@@ -297,7 +297,7 @@ cl_pack_emit_reloc(struct vc4_cl *cl, const struct vc4_cl_reloc *reloc)
*(uint32_t *)cl->reloc_next = vc4_gem_hindex(cl->job, reloc->bo);
cl_advance(&cl->reloc_next, 4);
#ifdef DEBUG
#ifndef NDEBUG
cl->reloc_count--;
#endif
}
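
Since reloc_count exists only to feed assert(), and assert() is compiled out exactly when NDEBUG is defined, guarding the field with #ifndef NDEBUG keeps the bookkeeping and the checks in lockstep regardless of whether Mesa's own DEBUG flag is set. In outline (a sketch, not the vc4 header):

#include <assert.h>
#include <stdint.h>

struct cl_model {
#ifndef NDEBUG
   uint32_t reloc_count; /* maintained only while assert() is live */
#endif
};
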

View File

@@ -70,7 +70,6 @@ namespace {
make_kernel_args(const Module &mod, const std::string &kernel_name,
const clang::CompilerInstance &c) {
std::vector<module::argument> args;
const auto address_spaces = c.getTarget().getAddressSpaceMap();
const Function &f = *mod.getFunction(kernel_name);
::llvm::DataLayout dl(&mod);
const auto size_type =
@@ -128,8 +127,8 @@ namespace {
const unsigned address_space =
cast< ::llvm::PointerType>(actual_type)->getAddressSpace();
if (address_space == address_spaces[clang::LangAS::opencl_local
- compat::lang_as_offset]) {
if (address_space == compat::target_address_space(
c.getTarget(), clang::LangAS::opencl_local)) {
args.emplace_back(module::argument::local, arg_api_size,
target_size, target_align,
module::argument::zero_ext);

View File

@@ -69,11 +69,19 @@ namespace clover {
typedef ::llvm::TargetLibraryInfo target_library_info;
#endif
template<typename T, typename AS>
unsigned target_address_space(const T &target, const AS lang_as) {
const auto &map = target.getAddressSpaceMap();
#if HAVE_LLVM >= 0x0500
return map[static_cast<unsigned>(lang_as)];
#else
return map[lang_as - clang::LangAS::Offset];
#endif
}
#if HAVE_LLVM >= 0x0500
const auto lang_as_offset = 0;
const clang::InputKind ik_opencl = clang::InputKind::OpenCL;
#else
const auto lang_as_offset = clang::LangAS::Offset;
const clang::InputKind ik_opencl = clang::IK_OpenCL;
#endif
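
With the helper in place, a call site resolves the address space without a version check of its own, exactly as the invocation hunk above shows:

const unsigned local_as = compat::target_address_space(
   c.getTarget(), clang::LangAS::opencl_local);
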

View File

@@ -68,8 +68,9 @@ LOCAL_SHARED_LIBRARIES += $(sort $(GALLIUM_SHARED_LIBS))
ifneq ($(filter 5 6 7, $(MESA_ANDROID_MAJOR_VERSION)),)
LOCAL_POST_INSTALL_CMD := \
$(foreach l, lib $(if $(filter true,$(TARGET_IS_64_BIT)),lib64), \
mkdir -p $(TARGET_OUT)/$(l)/$(MESA_DRI_MODULE_REL_PATH); \
$(foreach d, $(GALLIUM_TARGET_DRIVERS), ln -sf gallium_dri.so $(TARGET_OUT)/$(l)/$(MESA_DRI_MODULE_REL_PATH)/$(d)_dri.so;) \
$(eval MESA_DRI_MODULE_PATH := $(TARGET_OUT_VENDOR)/$(l)/$(MESA_DRI_MODULE_REL_PATH)) \
mkdir -p $(MESA_DRI_MODULE_PATH); \
$(foreach d, $(GALLIUM_TARGET_DRIVERS), ln -sf gallium_dri.so $(MESA_DRI_MODULE_PATH)/$(d)_dri.so;) \
)
else
LOCAL_MODULE_SYMLINKS := $(foreach d, $(GALLIUM_TARGET_DRIVERS), $(d)_dri.so)

View File

@@ -1190,7 +1190,7 @@ brw_JMPI(struct brw_codegen *p, struct brw_reg index,
struct brw_reg ip = brw_ip_reg();
brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
brw_inst_set_pred_control(devinfo, inst, predicate_control);

View File

@@ -47,7 +47,8 @@ cat(struct string *dest, const struct string src)
static bool
contains(const struct string haystack, const struct string needle)
{
return memmem(haystack.str, haystack.len, needle.str, needle.len) != NULL;
return haystack.str && memmem(haystack.str, haystack.len,
needle.str, needle.len) != NULL;
}
#define CONTAINS(haystack, needle) \
contains(haystack, (struct string){needle, strlen(needle)})
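
The extra check matters because handing memmem() a null haystack pointer is not well-defined, even with a zero length. A quick model of the case being guarded (field order as used above):

struct string empty = { NULL, 0 };
/* contains(empty, needle) now returns false instead of passing a
 * NULL haystack to memmem(). */
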

View File

@@ -2092,10 +2092,10 @@ fs_visitor::assign_constant_locations()
*/
uint32_t *param = stage_prog_data->param;
stage_prog_data->nr_params = num_push_constants;
stage_prog_data->param = ralloc_array(NULL, uint32_t, num_push_constants);
stage_prog_data->param = ralloc_array(mem_ctx, uint32_t, num_push_constants);
if (num_pull_constants > 0) {
stage_prog_data->nr_pull_params = num_pull_constants;
stage_prog_data->pull_param = ralloc_array(NULL, uint32_t,
stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
num_pull_constants);
}
@@ -5013,7 +5013,9 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
{
return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
(inst->components_read(i) == 1 &&
lbld.dispatch_width() <= inst->exec_size));
lbld.dispatch_width() <= inst->exec_size)) ||
(inst->flags_written() &
flag_mask(inst->src[i], type_sz(inst->src[i].type)));
}
/**
@@ -6164,6 +6166,31 @@ fs_visitor::run_gs()
return !failed;
}
/* From the SKL PRM, Volume 16, Workarounds:
*
* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
* only header phases (R0-R2)
*
* WA: Enable a non-header phase (e.g. push constant) when dispatch would
* have been header only.
*
* Instead of enabling push constants one can alternatively enable one of the
* inputs. Here one simply chooses "layer", which shouldn't impose much
* overhead.
*/
static void
gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
{
if (wm_prog_data->num_varying_inputs)
return;
if (wm_prog_data->base.curb_read_length)
return;
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
wm_prog_data->num_varying_inputs = 1;
}
bool
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
{
@@ -6227,6 +6254,10 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
optimize();
assign_curb_setup();
if (devinfo->gen >= 9)
gen9_ps_header_only_workaround(wm_prog_data);
assign_urb_setup();
fixup_3src_null_dest();

View File

@@ -402,7 +402,6 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
{
/* Don't send AA data */
fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);

View File

@@ -173,14 +173,13 @@ anv_shader_compile_to_nir(struct anv_pipeline *pipeline,
NIR_PASS_V(nir, nir_propagate_invariant);
NIR_PASS_V(nir, nir_lower_io_to_temporaries,
entry_point->impl, true, false);
NIR_PASS_V(nir, nir_lower_system_values);
/* Vulkan uses the separate-shader linking model */
nir->info.separate_shader = true;
nir = brw_preprocess_nir(compiler, nir);
NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
NIR_PASS_V(nir, nir_lower_system_values);
if (stage == MESA_SHADER_FRAGMENT)
NIR_PASS_V(nir, anv_nir_lower_input_attachments);

View File

@@ -315,7 +315,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
src_format = dst_format = MESA_FORMAT_R_FLOAT32;
}
enum isl_format src_isl_format = brw_isl_format_for_mesa_format(src_format);
enum isl_format src_isl_format =
brw_blorp_to_isl_format(brw, src_format, false);
enum isl_aux_usage src_aux_usage =
intel_miptree_texture_aux_usage(brw, src_mt, src_isl_format);
/* We do format workarounds for some depth formats so we can't reliably
@@ -328,8 +329,10 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
intel_miptree_prepare_access(brw, src_mt, src_level, 1, src_layer, 1,
src_aux_usage, src_clear_supported);
enum isl_format dst_isl_format =
brw_blorp_to_isl_format(brw, dst_format, true);
enum isl_aux_usage dst_aux_usage =
intel_miptree_render_aux_usage(brw, dst_mt, encode_srgb, false);
intel_miptree_render_aux_usage(brw, dst_mt, dst_isl_format, false);
const bool dst_clear_supported = dst_aux_usage != ISL_AUX_USAGE_NONE;
intel_miptree_prepare_access(brw, dst_mt, dst_level, 1, dst_layer, 1,
dst_aux_usage, dst_clear_supported);
@@ -351,10 +354,9 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
struct blorp_batch batch;
blorp_batch_init(&brw->blorp, &batch, brw, 0);
blorp_blit(&batch, &src_surf, src_level, src_layer,
brw_blorp_to_isl_format(brw, src_format, false), src_isl_swizzle,
src_isl_format, src_isl_swizzle,
&dst_surf, dst_level, dst_layer,
brw_blorp_to_isl_format(brw, dst_format, true),
ISL_SWIZZLE_IDENTITY,
dst_isl_format, ISL_SWIZZLE_IDENTITY,
src_x0, src_y0, src_x1, src_y1,
dst_x0, dst_y0, dst_x1, dst_y1,
filter, mirror_x, mirror_y);
@@ -1157,6 +1159,7 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
mesa_format format = irb->Base.Base.Format;
if (!encode_srgb && _mesa_get_format_color_encoding(format) == GL_SRGB)
format = _mesa_get_srgb_format_linear(format);
enum isl_format isl_format = brw->mesa_to_isl_render_format[format];
x0 = fb->_Xmin;
x1 = fb->_Xmax;
@@ -1255,8 +1258,7 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
struct blorp_batch batch;
blorp_batch_init(&brw->blorp, &batch, brw, 0);
blorp_fast_clear(&batch, &surf,
brw->mesa_to_isl_render_format[format],
blorp_fast_clear(&batch, &surf, isl_format,
level, irb->mt_layer, num_layers,
x0, y0, x1, y1);
blorp_batch_finish(&batch);
@@ -1275,9 +1277,9 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
irb->mt, irb->mt_level, irb->mt_layer, num_layers);
enum isl_aux_usage aux_usage =
intel_miptree_render_aux_usage(brw, irb->mt, encode_srgb, false);
intel_miptree_render_aux_usage(brw, irb->mt, isl_format, false);
intel_miptree_prepare_render(brw, irb->mt, level, irb->mt_layer,
num_layers, encode_srgb, false);
num_layers, isl_format, false);
struct isl_surf isl_tmp[2];
struct blorp_surf surf;
@@ -1289,16 +1291,14 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
struct blorp_batch batch;
blorp_batch_init(&brw->blorp, &batch, brw, 0);
blorp_clear(&batch, &surf,
brw->mesa_to_isl_render_format[format],
ISL_SWIZZLE_IDENTITY,
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
level, irb->mt_layer, num_layers,
x0, y0, x1, y1,
clear_color, color_write_disable);
blorp_batch_finish(&batch);
intel_miptree_finish_render(brw, irb->mt, level, irb->mt_layer,
num_layers, encode_srgb, false);
num_layers, isl_format, false);
}
return;

View File

@@ -1072,6 +1072,12 @@ intelDestroyContext(__DRIcontext * driContextPriv)
if (brw->wm.base.scratch_bo)
brw_bo_unreference(brw->wm.base.scratch_bo);
brw_bo_unreference(brw->vs.base.push_const_bo);
brw_bo_unreference(brw->tcs.base.push_const_bo);
brw_bo_unreference(brw->tes.base.push_const_bo);
brw_bo_unreference(brw->gs.base.push_const_bo);
brw_bo_unreference(brw->wm.base.push_const_bo);
brw_destroy_hw_context(brw->bufmgr, brw->hw_ctx);
if (ctx->swrast_context) {

View File

@@ -25,6 +25,7 @@
#include <sys/errno.h>
#include "main/blend.h"
#include "main/context.h"
#include "main/condrender.h"
#include "main/samplerobj.h"
@@ -503,9 +504,13 @@ brw_predraw_resolve_framebuffer(struct brw_context *brw)
if (irb == NULL || irb->mt == NULL)
continue;
mesa_format mesa_format =
_mesa_get_render_format(ctx, intel_rb_format(irb));
enum isl_format isl_format = brw_isl_format_for_mesa_format(mesa_format);
intel_miptree_prepare_render(brw, irb->mt, irb->mt_level,
irb->mt_layer, irb->layer_count,
ctx->Color.sRGBEnabled,
isl_format,
ctx->Color.BlendEnabled & (1 << i));
}
}
@@ -571,10 +576,14 @@ brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
if (!irb)
continue;
mesa_format mesa_format =
_mesa_get_render_format(ctx, intel_rb_format(irb));
enum isl_format isl_format = brw_isl_format_for_mesa_format(mesa_format);
brw_render_cache_set_add_bo(brw, irb->mt->bo);
intel_miptree_finish_render(brw, irb->mt, irb->mt_level,
irb->mt_layer, irb->layer_count,
ctx->Color.sRGBEnabled,
isl_format,
ctx->Color.BlendEnabled & (1 << i));
}
}
@@ -866,7 +875,6 @@ brw_draw_prims(struct gl_context *ctx,
struct brw_context *brw = brw_context(ctx);
const struct gl_vertex_array **arrays = ctx->Array._DrawArrays;
int predicate_state = brw->predicate.state;
int combine_op = MI_PREDICATE_COMBINEOP_SET;
struct brw_transform_feedback_object *xfb_obj =
(struct brw_transform_feedback_object *) gl_xfb_obj;
@@ -910,49 +918,35 @@ brw_draw_prims(struct gl_context *ctx,
* to it.
*/
if (brw->draw.draw_params_count_bo &&
predicate_state == BRW_PREDICATE_STATE_USE_BIT) {
/* We need to empty the MI_PREDICATE_DATA register since it might
* already be set.
*/
BEGIN_BATCH(4);
OUT_BATCH(MI_PREDICATE_DATA);
OUT_BATCH(0u);
OUT_BATCH(MI_PREDICATE_DATA + 4);
OUT_BATCH(0u);
ADVANCE_BATCH();
/* We need to combine the results of both predicates.*/
combine_op = MI_PREDICATE_COMBINEOP_AND;
}
for (i = 0; i < nr_prims; i++) {
/* Implementation of ARB_indirect_parameters via predicates */
if (brw->draw.draw_params_count_bo) {
struct brw_bo *draw_id_bo = NULL;
uint32_t draw_id_offset;
intel_upload_data(brw, &prims[i].draw_id, 4, 4, &draw_id_bo,
&draw_id_offset);
brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
/* Upload the current draw count from the draw parameters buffer to
* MI_PREDICATE_SRC0.
*/
brw_load_register_mem(brw, MI_PREDICATE_SRC0,
brw->draw.draw_params_count_bo,
brw->draw.draw_params_count_offset);
brw_load_register_mem(brw, MI_PREDICATE_SRC1, draw_id_bo,
draw_id_offset);
/* Zero the top 32-bits of MI_PREDICATE_SRC0 */
brw_load_register_imm32(brw, MI_PREDICATE_SRC0 + 4, 0);
/* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
brw_load_register_imm64(brw, MI_PREDICATE_SRC1, prims[i].draw_id);
BEGIN_BATCH(1);
OUT_BATCH(GEN7_MI_PREDICATE |
MI_PREDICATE_LOADOP_LOADINV | combine_op |
MI_PREDICATE_COMPAREOP_DELTAS_EQUAL);
if (i == 0 && brw->predicate.state != BRW_PREDICATE_STATE_USE_BIT) {
OUT_BATCH(GEN7_MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
MI_PREDICATE_COMBINEOP_SET |
MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
} else {
OUT_BATCH(GEN7_MI_PREDICATE |
MI_PREDICATE_LOADOP_LOAD | MI_PREDICATE_COMBINEOP_XOR |
MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
}
ADVANCE_BATCH();
brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
brw_bo_unreference(draw_id_bo);
}
brw_draw_single_prim(ctx, arrays, &prims[i], i, xfb_obj, stream,
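
Taken together, the two GEN7_MI_PREDICATE forms above compute "draw_id < draw count" without ever touching MI_PREDICATE_DATA. A C model of the sequence (ignoring the interaction with an already-active conditional-render predicate):

#include <stdbool.h>
#include <stdint.h>

static bool
predicate_model(uint32_t draw_id, uint32_t draw_count, bool prev)
{
   bool equal = (draw_id == draw_count);  /* COMPAREOP_SRCS_EQUAL */
   if (draw_id == 0)
      return !equal;                      /* LOADINV + COMBINEOP_SET */
   return prev ^ equal;                   /* LOAD + COMBINEOP_XOR */
}

The predicate starts true (unless the count is zero), flips exactly once when draw_id reaches the count, and stays false from then on, so draws beyond the indirect parameter count are discarded.
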

View File

@@ -101,30 +101,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
OUT_BATCH(0);
ADVANCE_BATCH();
}
/* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
* 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
*
* On Gen6-7.5, we use an execbuf parameter to do this for us.
* However, the kernel ignores that when execlists are in use.
* Fortunately, we can just write the registers from userspace
* on Gen8+, and they're context saved/restored.
*/
if (devinfo->gen >= 9) {
BEGIN_BATCH(3);
OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
OUT_BATCH(CS_DEBUG_MODE2);
OUT_BATCH(REG_MASK(CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE) |
CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE);
ADVANCE_BATCH();
} else if (devinfo->gen == 8) {
BEGIN_BATCH(3);
OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
OUT_BATCH(INSTPM);
OUT_BATCH(REG_MASK(INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE) |
INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE);
ADVANCE_BATCH();
}
}
static inline const struct brw_tracked_state *

View File

@@ -213,11 +213,6 @@ gen6_update_renderbuffer_surface(struct brw_context *brw,
struct intel_renderbuffer *irb = intel_renderbuffer(rb);
struct intel_mipmap_tree *mt = irb->mt;
enum isl_aux_usage aux_usage =
brw->draw_aux_buffer_disabled[unit] ? ISL_AUX_USAGE_NONE :
intel_miptree_render_aux_usage(brw, mt, ctx->Color.sRGBEnabled,
ctx->Color.BlendEnabled & (1 << unit));
assert(brw_render_target_supported(brw, rb));
mesa_format rb_format = _mesa_get_render_format(ctx, intel_rb_format(irb));
@@ -225,9 +220,15 @@ gen6_update_renderbuffer_surface(struct brw_context *brw,
_mesa_problem(ctx, "%s: renderbuffer format %s unsupported\n",
__func__, _mesa_get_format_name(rb_format));
}
enum isl_format isl_format = brw->mesa_to_isl_render_format[rb_format];
enum isl_aux_usage aux_usage =
brw->draw_aux_buffer_disabled[unit] ? ISL_AUX_USAGE_NONE :
intel_miptree_render_aux_usage(brw, mt, isl_format,
ctx->Color.BlendEnabled & (1 << unit));
struct isl_view view = {
.format = brw->mesa_to_isl_render_format[rb_format],
.format = isl_format,
.base_level = irb->mt_level - irb->mt->first_level,
.levels = 1,
.base_array_layer = irb->mt_layer,

View File

@@ -241,6 +241,27 @@ intel_miptree_supports_hiz(const struct brw_context *brw,
}
}
/**
* Return true if the format that will be used to access the miptree is
* CCS_E-compatible with the miptree's linear/non-sRGB format.
*
* Why use the linear format? Well, although the miptree may be specified with
* an sRGB format, the usage of that color space/format can be toggled. Since
* our HW tends to support more linear formats than sRGB ones, we use this
* format variant to check for CCS_E compatibility.
*/
static bool
format_ccs_e_compat_with_miptree(const struct gen_device_info *devinfo,
const struct intel_mipmap_tree *mt,
enum isl_format access_format)
{
assert(mt->aux_usage == ISL_AUX_USAGE_CCS_E);
mesa_format linear_format = _mesa_get_srgb_format_linear(mt->format);
enum isl_format isl_format = brw_isl_format_for_mesa_format(linear_format);
return isl_formats_are_ccs_e_compatible(devinfo, isl_format, access_format);
}
static bool
intel_miptree_supports_ccs_e(struct brw_context *brw,
const struct intel_mipmap_tree *mt)
@@ -2549,6 +2570,7 @@ can_texture_with_ccs(struct brw_context *brw,
if (mt->aux_usage != ISL_AUX_USAGE_CCS_E)
return false;
/* TODO: Replace with format_ccs_e_compat_with_miptree for better perf. */
if (!isl_formats_are_ccs_e_compatible(&brw->screen->devinfo,
mt->surf.format, view_format)) {
perf_debug("Incompatible sampling format (%s) for rbc (%s)\n",
@@ -2654,7 +2676,8 @@ intel_miptree_prepare_image(struct brw_context *brw,
enum isl_aux_usage
intel_miptree_render_aux_usage(struct brw_context *brw,
struct intel_mipmap_tree *mt,
bool srgb_enabled, bool blend_enabled)
enum isl_format render_format,
bool blend_enabled)
{
switch (mt->aux_usage) {
case ISL_AUX_USAGE_MCS:
@@ -2665,12 +2688,11 @@ intel_miptree_render_aux_usage(struct brw_context *brw,
return mt->mcs_buf ? ISL_AUX_USAGE_CCS_D : ISL_AUX_USAGE_NONE;
case ISL_AUX_USAGE_CCS_E: {
mesa_format mesa_format =
srgb_enabled ? mt->format :_mesa_get_srgb_format_linear(mt->format);
enum isl_format isl_format = brw_isl_format_for_mesa_format(mesa_format);
/* If the format supports CCS_E, then we can just use it */
if (isl_format_supports_ccs_e(&brw->screen->devinfo, isl_format))
/* If the format supports CCS_E and is compatible with the miptree,
* then we can use it.
*/
if (format_ccs_e_compat_with_miptree(&brw->screen->devinfo,
mt, render_format))
return ISL_AUX_USAGE_CCS_E;
/* Otherwise, we have to fall back to CCS_D */
@@ -2679,8 +2701,8 @@ intel_miptree_render_aux_usage(struct brw_context *brw,
* formats. However, there are issues with blending where it doesn't
* properly apply the sRGB curve to the clear color when blending.
*/
if (blend_enabled && isl_format_is_srgb(isl_format) &&
!isl_color_value_is_zero_one(mt->fast_clear_color, isl_format))
if (blend_enabled && isl_format_is_srgb(render_format) &&
!isl_color_value_is_zero_one(mt->fast_clear_color, render_format))
return ISL_AUX_USAGE_NONE;
return ISL_AUX_USAGE_CCS_D;
@@ -2695,10 +2717,11 @@ void
intel_miptree_prepare_render(struct brw_context *brw,
struct intel_mipmap_tree *mt, uint32_t level,
uint32_t start_layer, uint32_t layer_count,
bool srgb_enabled, bool blend_enabled)
enum isl_format render_format,
bool blend_enabled)
{
enum isl_aux_usage aux_usage =
intel_miptree_render_aux_usage(brw, mt, srgb_enabled, blend_enabled);
intel_miptree_render_aux_usage(brw, mt, render_format, blend_enabled);
intel_miptree_prepare_access(brw, mt, level, 1, start_layer, layer_count,
aux_usage, aux_usage != ISL_AUX_USAGE_NONE);
}
@@ -2707,12 +2730,13 @@ void
intel_miptree_finish_render(struct brw_context *brw,
struct intel_mipmap_tree *mt, uint32_t level,
uint32_t start_layer, uint32_t layer_count,
bool srgb_enabled, bool blend_enabled)
enum isl_format render_format,
bool blend_enabled)
{
assert(_mesa_is_format_color_format(mt->format));
enum isl_aux_usage aux_usage =
intel_miptree_render_aux_usage(brw, mt, srgb_enabled, blend_enabled);
intel_miptree_render_aux_usage(brw, mt, render_format, blend_enabled);
intel_miptree_finish_write(brw, mt, level, start_layer, layer_count,
aux_usage);
}

View File

@@ -650,17 +650,20 @@ intel_miptree_prepare_image(struct brw_context *brw,
enum isl_aux_usage
intel_miptree_render_aux_usage(struct brw_context *brw,
struct intel_mipmap_tree *mt,
bool srgb_enabled, bool blend_enabled);
enum isl_format render_format,
bool blend_enabled);
void
intel_miptree_prepare_render(struct brw_context *brw,
struct intel_mipmap_tree *mt, uint32_t level,
uint32_t start_layer, uint32_t layer_count,
bool srgb_enabled, bool blend_enabled);
enum isl_format render_format,
bool blend_enabled);
void
intel_miptree_finish_render(struct brw_context *brw,
struct intel_mipmap_tree *mt, uint32_t level,
uint32_t start_layer, uint32_t layer_count,
bool srgb_enabled, bool blend_enabled);
enum isl_format render_format,
bool blend_enabled);
void
intel_miptree_prepare_depth(struct brw_context *brw,
struct intel_mipmap_tree *mt, uint32_t level,

View File

@@ -2523,7 +2523,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
screen->compiler = brw_compiler_create(screen, devinfo);
screen->compiler->shader_debug_log = shader_debug_log_mesa;
screen->compiler->shader_perf_log = shader_perf_log_mesa;
screen->compiler->constant_buffer_0_is_relative = devinfo->gen < 8;
screen->compiler->constant_buffer_0_is_relative = true;
screen->compiler->supports_pull_constants = true;
screen->has_exec_fence =

View File

@@ -3815,7 +3815,7 @@ bind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count,
}
set_buffer_multi_binding(ctx, buffers, i, caller,
binding, offset, size, !range,
binding, offset, size, range,
USAGE_UNIFORM_BUFFER);
}
@@ -3916,7 +3916,7 @@ bind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
}
set_buffer_multi_binding(ctx, buffers, i, caller,
binding, offset, size, !range,
binding, offset, size, range,
USAGE_SHADER_STORAGE_BUFFER);
}
@@ -4238,7 +4238,7 @@ bind_atomic_buffers(struct gl_context *ctx,
}
set_buffer_multi_binding(ctx, buffers, i, caller,
binding, offset, size, !range,
binding, offset, size, range,
USAGE_ATOMIC_COUNTER_BUFFER);
}

View File

@@ -330,6 +330,15 @@ get_fb0_attachment(struct gl_context *ctx, struct gl_framebuffer *fb,
return &fb->Attachment[BUFFER_BACK_LEFT];
case GL_BACK_RIGHT:
return &fb->Attachment[BUFFER_BACK_RIGHT];
case GL_BACK:
/* The ARB_ES3_1_compatibility spec says:
*
* "Since this command can only query a single framebuffer
* attachment, BACK is equivalent to BACK_LEFT."
*/
if (ctx->Extensions.ARB_ES3_1_compatibility)
return &fb->Attachment[BUFFER_BACK_LEFT];
return NULL;
case GL_AUX0:
if (fb->Visual.numAuxBuffers == 1) {
return &fb->Attachment[BUFFER_AUX0];

View File

@@ -1110,7 +1110,7 @@ disk_cache_get(struct disk_cache *cache, const cache_key key, size_t *size)
* TODO: pass the metadata back to the caller and do some basic
* validation.
*/
cache_item_md_size += sizeof(cache_key);
cache_item_md_size += num_keys * sizeof(cache_key);
ret = lseek(fd, num_keys * sizeof(cache_key), SEEK_CUR);
if (ret == -1)
goto fail;
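
The accounting now matches the adjacent lseek(): the GLSL metadata block carries num_keys cache_key entries, so both the size bookkeeping and the skip scale with num_keys. As a layout sketch (field names illustrative):

struct cache_item_metadata_layout {
   uint32_t num_keys;
   /* followed by num_keys cache_key entries; counted in
    * cache_item_md_size and skipped with lseek() above */
};
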

View File

@@ -264,7 +264,7 @@ TODO: document the other workarounds.
</device>
<device driver="radeonsi">
<application name="ARK: Survival Evolved (and unintentionally the UE4 demo template)" executable="ShooterGame">
<option name="radeonsi_clear_db_meta_before_clear" value="true" />
<option name="radeonsi_clear_db_cache_before_clear" value="true" />
</application>
</device>
</driconf>

View File

@@ -444,7 +444,7 @@ DRI_CONF_OPT_BEGIN_B(radeonsi_commutative_blend_add, def) \
DRI_CONF_DESC(en,gettext("Commutative additive blending optimizations (may cause rendering errors)")) \
DRI_CONF_OPT_END
#define DRI_CONF_RADEONSI_CLEAR_DB_META_BEFORE_CLEAR(def) \
DRI_CONF_OPT_BEGIN_B(radeonsi_clear_db_meta_before_clear, def) \
DRI_CONF_DESC(en,"Clear DB metadata cache before fast depth clear") \
#define DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR(def) \
DRI_CONF_OPT_BEGIN_B(radeonsi_clear_db_cache_before_clear, def) \
DRI_CONF_DESC(en,"Clear DB cache before fast depth clear") \
DRI_CONF_OPT_END