Compare commits

..

60 Commits

Author SHA1 Message Date
Emil Velikov
f2f487ebbb docs: add release notes for 13.0.1
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2016-11-14 10:58:11 +00:00
Emil Velikov
11b9cdfcf9 Update version to 13.0.1
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2016-11-14 10:43:20 +00:00
Darren Salt
42d221723b radv/pipeline: Don't dereference NULL dynamic state pointers
This is a port of commit a4a5917248:

   Add guards to prevent dereferencing NULL dynamic pipeline state. Asserts
   of pCreateInfo members are moved to the earliest points at which they
   should not be NULL.

This fixes a segfault, related to pColorBlendState, seen in Talos Principle;
I've observed it after startup has completed and when exiting the menus,
depending on when Vulkan rendering is selected.

v2: moved the NULL check in radv_pipeline_init_blend_state to after the
declarations.
Acked-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

(cherry picked from commit 9b121512ac)
2016-11-14 09:36:25 +00:00
Steven Toth
d6bcbfb36c gallium/hud: protect against an initialization race
In the event that multiple threads attempt to install a graph
concurrently, protect the shared list.

Signed-off-by: Steven Toth <stoth@kernellabs.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
(cherry picked from commit 381edca826)
2016-11-14 09:35:08 +00:00
Steven Toth
e19ed2971f gallium/hud: close a previously opened handle
We're missing the closedir() to the matching opendir().

Signed-off-by: Steven Toth <stoth@kernellabs.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
(cherry picked from commit 5a58323064)
2016-11-14 09:35:03 +00:00
Steven Toth
5fa2b384f0 gallium/hud: fix a problem where objects are free'd while in use.
Instead of trying to maintain a reference-counted list of valid HUD
objects, and freeing them accordingly, creating race conditions
between unanticipated multiple threads, simply accept that they're
allocated once and never released until the process terminates.

They're a shared resource between multiple threads, so accept that
they're always available for use.

Signed-off-by: Steven Toth <stoth@kernellabs.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
(cherry picked from commit 6ffed08679)
2016-11-14 09:34:12 +00:00
Kenneth Graunke
e7de2510e5 mesa: Fix pixel shader scratch space allocation on Gen9+ platforms.
We had missed a bit of errata - PS scratch needs to be computed as if
there were 4 subslices per slice, rather than 3.

                          Skylake      Broxton        Kabylake
                      GT1 GT2 GT3 GT4  2x6 3x6  GT1 GT1.5 GT2 GT3 GT4
Actual Slices          1   1   2   3    1   1    1    1    1   2   3
Total Subslices        3   3   6   9    2   3    2    3    3   6   9
Subsl. for PS Scratch  4   4   8   12   4   4    4    4    4   8   12

Note that Skylake GT1-3 already worked because we allocated 64 * 9
(trying to use a value that would work on GT4, with 9 subslices),
and the actual required values were 64 * 4 or 64 * 8.  However, all
others (Skylake GT4, Broxton, and Kabylake GT1-4) underallocated,
which can lead to scratch writes trashing random process memory,
and rendering corruption or GPU hangs.

Fixes GPU hangs and rendering corruption on Skylake GT4 in shaders that
spill.  Particularly, dEQP-GLES31.functional.ubo.all_per_block_buffers.*
now runs successfully with no hangs and renders correctly.  This may
fix problems on Broxton and Kabylake as well.
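
As a hedged sketch of the workaround (names here are illustrative, not the
actual Mesa code), the fix amounts to sizing PS scratch for four subslices
per slice regardless of the real topology:

   /* Errata: compute PS scratch as if each slice had 4 subslices. */
   unsigned effective_subslices = 4 * num_slices;
   unsigned scratch_ids = 64 * effective_subslices;   /* 64 IDs per subslice */
   size_t ps_scratch_size = (size_t)scratch_ids * per_thread_scratch;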

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
(cherry picked from commit aaee3daa90)
2016-11-11 22:20:07 +00:00
Jason Ekstrand
1a47251da4 anv/cmd_buffer: Enable a CS stall workaround for Sky Lake gt4
This fixes hangs in Dota2

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "12.0 13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit a6c3d0f92b)
2016-11-11 22:19:51 +00:00
Jason Ekstrand
77dc3a5b7c anv/cmd_buffer: Take a command buffer instead of a batch in two helpers
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "12.0 13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 1e3e347fd5)
2016-11-11 22:19:38 +00:00
Emil Velikov
3bb0415ab9 radv: Suffix the radeon_icd file with the host CPU
Port of the anv commit d96345de98 ("anv: Suffix the intel_icd file with
the host CPU").

v2: s/intel_icd/radeon_icd/ in commit summary (Gražvydas)

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Dave Airlie <airlied@redhat.com> (IRC)
(cherry picked from commit 0f434a68a3)

Squashed with commit:

radv: automake: list correct file in the EXTRA_DIST

An earlier commit renamed the file radeon_icd.json{,.in} but missed one
reference to the file - in EXTRA_DIST.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Fixes: 0f434a68a ("radv: Suffix the radeon_icd file with the host CPU")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit b359f62456)
2016-11-10 22:07:29 +00:00
Emil Velikov
a4bc03fdfe radv: use correct .specVersion for extensions
Analogous to previous commit.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Dave Airlie <airlied@redhat.com> (IRC)
(cherry picked from commit abe110df01)
2016-11-10 22:07:04 +00:00
Emil Velikov
d8eea63121 anv: use correct .specVersion for extensions
Vulkan has introduced the concept of .specVersion, which can be used to
attribute changes to the said extension.

The current loader does not check the value, thus it has gone unnoticed
that the driver exposes an old version of the following extensions:

VK_KHR_xcb_surface        (Rev 6)
VK_KHR_xlib_surface       (Rev 6)
VK_KHR_wayland_surface    (Rev 5)
- Updated the surface create function to take a pCreateInfo structure

VK_KHR_swapchain          (Rev 68)
- Moved the "validity" include for vkAcquireNextImage to be in its proper
  place, after the prototype and list of parameters.
...

According to the documentation:

  * pname:specVersion is the version of this extension.
    It is an integer, incremented with backward compatible changes.

Based on the history of vk.xml, the above (latest) revisions have been
available since Vulkan 1.0, so even if there were any backwards-
incompatible change(s) [as hinted by the revision log] those should be
safe.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit f373a91a52)
2016-11-10 22:07:04 +00:00
Emil Velikov
3e616f77bd amd/addrlib: limit fastcall/regparm to GCC i386
The use of regparm causes an error on arm/arm64 builds with clang.
fastcall is allowed, but still throws a warning. As both options only
have effect on 32-bit x86 builds, limit them to that case.

v2: keep the __i386__ within GCC (Nicolai)

Cc: 13.0 <mesa-stable@lists.freedesktop.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Nicolai Hähnle <nhaehnle@gmail.com>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Rob Herring <robh@kernel.org>
(cherry picked from commit 190bae7685)
2016-11-10 22:07:04 +00:00
Dave Airlie
49e093a2f5 radv: fix GetFenceStatus for signaled fences
If a fence is created pre-signaled, we should return that
in GetFenceStatus even if it hasn't been submitted.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Gustaw Smolarczyk <wielkiegie@gmail.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit fb50245ac1)
2016-11-09 23:48:41 +00:00
Dave Airlie
2bbf964af8 radv: enable conditional discard optimisation on radv.
This fixes a bunch of GPU hangs introduced in some CTS
tests like
dEQP-VK.memory.pipeline_barrier.host_write_uniform_buffer.65536

It works around an issue seen in the LLVM backend, but
also makes the radv code work more like the radeonsi stack.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>

Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 3c9af7578f)
2016-11-09 23:46:32 +00:00
Dave Airlie
fa6c02787e nir: add conditional discard optimisation (v4)
This is ported from GLSL and converts

if (cond)
	discard;

into
discard_if(cond);

This removes a block, but also is needed by radv
to workaround a bug in the LLVM backend.

v2: handle if (a) discard_if(b) (nha)
cleanup and drop pointless loop (Matt)
make sure there are no dependent phis (Eric)
v3: make sure only one instruction in the then block.
v4: remove sneaky tabs, add cursor init (Eric)
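
As a rough illustration of the guards the version notes describe (helper
names are hypothetical, not the actual NIR API):

   /* if (cond) { discard; }  ->  discard_if(cond); */
   if (then_block_has_single_instr(nif) &&   /* v3: nothing but the discard */
       then_instr_is_discard(nif) &&
       !block_after_if_has_phis(nif))        /* v2: no dependent phis */
      rewrite_as_discard_if(b, nif);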

Reviewed-by: Eric Anholt <eric@anholt.net>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit b16dff2d88)
2016-11-09 23:41:48 +00:00
Dave Airlie
a65b6e12f3 ac/nir: add support for discard_if intrinsic (v2)
We are going to start lowering to this in NIR code,
so prepare radv for it.

v2: handle conversion to kilp properly (nha)

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit dd77faeca2)
2016-11-09 23:39:46 +00:00
Kristian Høgsberg Kristensen
ce555a7d1f anv: Do relocations in userspace before execbuf ioctl
Since our surface state buffer is shared by all batches, the kernel does a
full stall and sync with the CPU between batches every time we call
execbuf2 because it refuses to do relocations on an active buffer.  Doing
them in userspace and passing the NO_RELOC flag to the kernel allows us to
perform the relocations without stalling.

This improves the performance of Dota 2 by around 30% on a Sky Lake GT2.
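
A hedged sketch of the submission side (see i915_drm.h for the real
structures; error handling omitted): once userspace has patched every
relocation and each exec object's offset matches what it wrote, the kernel
can be told to skip relocation processing:

   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t)exec_objects,
      .buffer_count = bo_count,
      .flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC,
   };
   /* Only valid because the addresses written into the batch already match
    * each object's offset field, so the kernel has nothing left to patch. */
   drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);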

v2 (Jason Ekstrand):
 - Better comments (Chris Wilson)
 - Fixed write_reloc for correct canonical form (Chris Wilson)

v3 (Jason Ekstrand):
 - Skip relocations which aren't needed
 - Provide an environment variable to always use the kernel
 - More comments about correctness (Chris Wilson)

v4 (Jason Ekstrand):
 - More comments (Chris Wilson)

v5 (Jason Ekstrand):
 - Rebase on top of moving execbuf2 setup to QueueSubmit

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit b3a29f2e9e)
2016-11-09 23:38:12 +00:00
Jason Ekstrand
621b048734 anv: Move relocation handling from EndCommandBuffer to QueueSubmit
Ever since the early days of the Vulkan driver, we've been setting up the
lists of relocations at EndCommandBuffer time.  The idea behind this was to
move some of the CPU load out of QueueSubmit which the client is required
to lock around and into command buffer building which could be done in
parallel.  Then QueueSubmit basically just becomes a bunch of execbuf2
calls.

Technically, this works.  However, when you start to do more in QueueSubmit
than just execbuf2, you start to run into problems.  In particular, if a
block pool is resized between EndCommandBuffer and QueueSubmit, the list of
anv_bo's and the execbuf2 object list can get out of sync.  This can cause
problems if, for instance, you wanted to do relocations in userspace.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 8b61c57049)
2016-11-09 23:36:32 +00:00
Jason Ekstrand
039a03d8d2 anv/batch: Move last_ss_pool_bo_offset to the command buffer
The original reason for putting it in the batch_bo was to allow primaries
to share it across secondaries or something like that.  However, the
relocation lists in secondary command buffers are always left alone and
copied into the primary command buffer's relocation list.  This means that
the offset really applies at the command buffer level and putting it in the
batch_bo doesn't make sense.  This fixes a couple of potential bugs around
re-submission of command buffers that are not likely to be hit but are bugs
nonetheless.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 595400d577)
2016-11-09 23:35:15 +00:00
Jason Ekstrand
c64f655408 anv: Add an anv_execbuf helper struct
This commit adds a little helper struct for storing everything we use to
build an execbuf2 call.  Since the add_bo function really has nothing to do
with a command buffer, it makes sense to break it out a bit.  This also
reduces some of the churn in the next commit.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 0fe6829427)
2016-11-09 23:33:42 +00:00
Jason Ekstrand
d22958eecb anv/batch_chain: Improve write_reloc
The old version wasn't properly handling large addresses where we have to
sign-extend to get it into the "canonical form" expected by the hardware.
Also, the new version is capable of doing a clflush of the newly written
reloc if requested.
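
The canonical form in question is the 48-bit virtual address convention,
where bit 47 must be sign-extended through the upper bits. A minimal
sketch of that step:

   /* Sign-extend a 48-bit GPU address into its canonical 64-bit form. */
   static inline uint64_t canonical_address(uint64_t addr)
   {
      return (uint64_t)((int64_t)(addr << 16) >> 16);
   }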

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 095c48a496)
2016-11-09 23:32:31 +00:00
Jason Ekstrand
ab3aeab297 anv: Initialize anv_bo::offset to -1
Since -1 is an invalid GPU address, this lets us know whether or not we
have a valid address for a buffer.  We don't get a valid address until the
first time that buffer is used in an execbuf2 ioctl.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit d46bfb6297)
2016-11-09 23:31:00 +00:00
Jason Ekstrand
5bdd4fc273 anv/allocator: Simplify anv_scratch_pool
The previous implementation was being overly clever and using the
anv_bo::size field as its mutex.  Scratch pool allocations don't happen
often, will happen at most a fixed number of times, and never happen in the
critical path (they only happen in shader compilation).  We can make this
much simpler by just using the device mutex.  This also means that we can
start using anv_bo_init_new directly on the bo and avoid setting fields
one-at-a-time.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit bd0f8d5070)
2016-11-09 23:29:42 +00:00
Jason Ekstrand
c4643f5f1e anv: Add a new bo_pool_init helper
This ensures that we're always setting all of the fields in anv_bo

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 6283b6d56a)
2016-11-09 23:28:03 +00:00
Jason Ekstrand
ceefe979c6 anv: Don't presume to know what address is in a surface relocation
Because our relocation processing happens at EndCommandBuffer time and
because RENDER_SURFACE_STATE objects may be shared by batches, we really
have no clue whatsoever what address is actually written to the relocation
offset in the BO.  We need to stop making such claims to the kernel and
just let it relocate for us.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit ba1eea4f95)
2016-11-09 23:26:39 +00:00
Jason Ekstrand
9eca84e052 anv: Add a cmd_buffer_execbuf helper
This puts the actual execbuf2 call in anv_batch_chain.c along with the
other relocation stuff.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit db9f4b2a2b)
2016-11-09 23:24:42 +00:00
Jason Ekstrand
6f55a66ad3 anv/device: Add an execbuf wrapper
This wrapper ensures that we always update all anv_bo::offset fields based
on the offsets returned by the kernel.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 07798c9c3e)
2016-11-09 22:05:22 +00:00
Anuj Phogat
678b4f6372 i965: Fix GPU hang related to multiple render targets and alpha testing
This patch should have been part of commit e592f7df.
In a situation where there are multiple render targets with alpha testing
enabled, if the fragment shader doesn't write to draw buffer zero, it causes
a GPU hang on SKL. No GPU hang is seen on HSW. The simulator gives a
warning for all gen6+ h/w:
"Illegal render target write message length 0xa expected 0xc"

This patch fixes the GPU hang as well as the simulator warning with
new piglit test fbo-mrt-alphatest-no-buffer-zero-write:
https://patchwork.freedesktop.org/patch/118212

No regressions in Jenkins CI system.

Cc: "12.0 13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
(cherry picked from commit b9df2251c1)
2016-11-09 14:17:31 +00:00
Dave Airlie
8ab9842d2e radv: emit correct last export when Z/stencil export is enabled
I was getting a random GPU hang in the renderpass simple tests;
it turns out radv sometimes emitted the wrong thing "last".

This fixes the logic to emit Z/stencil last if they occur,
and not mark a color output as last. Also this relies on the
Z/STENCIL being the first two fragment outputs, which they are
so yay.

Fixes: dEQP-VK.renderpass.simple.color_depth (random hangs)
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit bafc75b437)
2016-11-09 14:16:03 +00:00
Jason Ekstrand
dba0abdc91 intel/blorp: Emit all the binding tables
At least on Sky Lake, after emitting 3DSTATE_CONSTANT_*, you are required
to re-emit the 3DSTATE_BINDING_TABLE_POINTERS packet for the corresponding
stage.  If you don't, double-buffering may fail and you may get the wrong
constants.  It turns out that you need to do this even if you have no push
constants to speak of or else the next 3DSTATE_CONSTANT packet you emit for
that stage may not work correctly.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 406cd9d126)
2016-11-09 14:14:33 +00:00
Eric Anholt
a31947fbf9 vc4: Use Newton-Raphson on the 1/W write to fix glmark2 terrain.
The 1/W was apparently not accurate enough, and we were getting sparklies
in the distance.  The closed driver also did a N-R step here.
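
A Newton-Raphson step refines a reciprocal estimate x of 1/w as
x' = x * (2 - w * x), roughly doubling the number of accurate bits.
Sketched in C (the estimate intrinsic is hypothetical):

   float x = recip_estimate(w);      /* low-precision hardware 1/w */
   x = x * (2.0f - w * x);           /* one N-R step */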

Cc: <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 283d4d18e5)
2016-11-09 14:13:01 +00:00
Nicolai Hähnle
f7efc0f0fc st/mesa: fix the layer of VDPAU surface samplers
A (latent) bug in VDPAU interop was exposed by commit
e5cc84dd43.

Before that commit, the st_vdpau code created samplers with
first_layer == last_layer == 1 that the general texture handling code
would immediately delete and re-create, because the layer does not match
the information in the GL texture object.

This was correct behavior at least in the DMABUF case, because the imported
resource is supposed to have the correct offset already applied.  In the
non-DMABUF case, this was just plain wrong but apparently nobody noticed.

After that commit, the state tracker assumes that an existing sampler is
correct at all times.  Existing samplers are supposed to be deleted when
they may become invalid, and they will be created on-demand.  This meant
that the sampler with first_layer == last_layer == 1 stuck around, leading
to rendering artefacts (on radeonsi), command stream failures (on r600), and
assertions (in debug builds everywhere).

This patch fixes the problem by simply not creating a sampler at all in
st_vdpau_map_surface.  We rely on the generic texture code to do the right
thing, adding the layer_override to make the non-DMABUF case work.

v2: add the layer_override

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98512
Cc: 13.0 <mesa-stable@lists.freedesktop.org>
Cc: Christian König <deathsimple@vodafone.de>
Cc: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1)
Reviewed-by: Christian König <christian.koenig@amd.com>
(cherry picked from commit 322483f71b)
2016-11-09 14:11:18 +00:00
Dave Airlie
9c297c5487 Revert "st/vdpau: use linear layout for output surfaces"
This reverts commit d180de3532.

This is a radeon specific hack that causes problems on nouveau
when combined with the SHARED flag later. If radeonsi needs a fix
for this, please fix it in the driver.

[chk]
Using linear surfaces for this makes sense because tilling isn't
beneficial and the surfaces can potentially be shared with other GPUs
using the VDPAU OpenGL interop.

[airlied]
I think we need a flag that isn't SHARED/LINEAR that is more
SHARED_OTHER_GPU.

[mareko]
Does radeonsi need PIPE_BIND_VIDEO_DECODE_OUTPUT that it would translate
into linear?

[mareko]
My only concern is decoding performance. If the decoder works in 64x1
blocks, tiling will hurt. That's the theory. I don't know how the
decoder works.

Cc: 12.0 13.0 <mesa-stable@lists.freedesktop.org>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Tested-by: Ilia Mirkin <imirkin@alum.mit.edu>
Tested-by: Nayan Deshmukh <nayan26deshmukh@gmail.com> (I+A)
(cherry picked from commit d0d5f7600c)
2016-11-09 14:09:41 +00:00
Marek Olšák
aa947e7a63 radeonsi: fix an assertion failure in si_decompress_sampler_color_textures
This fixes a crash in Deus Ex: Mankind Divided. Release builds were
unaffected, so it's not too serious.

Cc: 11.2 12.0 13.0 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
(cherry picked from commit 00baaa4752)
2016-11-09 14:07:47 +00:00
Marek Olšák
d54699135f glx: make interop ABI visible again
This was broken when the GLAPI use was removed from mesa_glinterop.h.

Cc: 12.0 13.0 <mesa-stable@lists.freedesktop.org>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 64c2593a5c)
2016-11-09 14:06:27 +00:00
Marek Olšák
a0d11b190a egl: make interop ABI visible again
This was broken when the GLAPI use was removed from mesa_glinterop.h.

Cc: 12.0 13.0 <mesa-stable@lists.freedesktop.org>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit ee39d4456e)
2016-11-09 14:05:06 +00:00
Marek Olšák
aa60c7b1c1 egl: use util/macros.h
I need the definition of PUBLIC.

Cc: 12.0 13.0 <mesa-stable@lists.freedesktop.org>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit bf51b45313)
2016-11-09 14:03:39 +00:00
Nicolai Hähnle
b8f99c6b2f st/glsl_to_tgsi: fix dvec[34] loads from SSBO
When splitting up loads, we have to add 16 bytes to the offset for
the high components, just as already happens for stores.
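
A dvec3/dvec4 spans more than 16 bytes, so the access is split into two
16-byte halves; the missing "+ 16" on the second half was the bug. A
minimal sketch (hypothetical helper):

   /* dvec4 at `offset`: xy in the first 16 bytes, zw in the next 16. */
   emit_ssbo_load(dst_lo, buffer, offset);        /* components x, y */
   emit_ssbo_load(dst_hi, buffer, offset + 16);   /* components z, w */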

Fixes arb_gpu_shader_fp64@shader_storage@layout-std140-fp64-shader.

Cc: 13.0 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit e4b378800e)
2016-11-09 14:02:14 +00:00
Francisco Jerez
2789bfdbb5 nir: Flip gl_SamplePosition in nir_lower_wpos_ytransform().
Assuming the hardware is set up to use a screen coordinate system
flipped vertically with respect to the GL's window coordinate system,
the SYSTEM_VALUE_SAMPLE_POS vector will also be flipped vertically
with respect to the value expected by the GL, so we need to give it
the same treatment as gl_FragCoord.  Fixes the following CTS tests on
i965:

 ES31-CTS.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
 ES31-CTS.functional.shaders.sample_variables.sample_pos.correctness.default_framebuffer

when run with any multisample configuration, e.g. rgba8888d24s8ms4.
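
For a sample position, which lies in [0, 1) within the pixel, the flip is
the same one applied to gl_FragCoord and reduces to (a sketch, not the
pass's literal output):

   /* Flip a hardware sample position into GL's window coordinate system. */
   pos.y = 1.0f - pos.y;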

Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
(cherry picked from commit f3d387867f)
2016-11-09 14:00:31 +00:00
Andreas Boll
da1ac6bc46 glx/windows: Add wgl.h to the sources list
Otherwise it won't be picked up in the tarball and the build will fail.

Fixes: 533b3530c1 ("direct-to-native-GL for GLX clients on Cygwin
("Windows-DRI")")
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Andreas Boll <andreas.boll.dev@gmail.com>
Reviewed-by: Jon Turney <jon.turney@dronecode.org.uk>

(cherry picked from commit f792f0687f)
2016-11-09 13:59:00 +00:00
Nicolai Hähnle
996c20208f glsl: fix lowering of UBO references of named blocks
When a UBO reference has the form block_name.foo where block_name refers
to a block where the first member has a non-zero offset, the base offset
was incorrectly added to the reference.

Fixes an assertion triggered in debug builds by
GL45-CTS.enhanced_layouts.uniform_block_layout_qualifier_conflict. That test
doesn't properly check for correct execution in this case, so I am also
going to send out a piglit test.

Cc: 13.0 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
(cherry picked from commit 37d646c1b3)
2016-11-09 13:57:21 +00:00
Kenneth Graunke
9397899aed glsl: Update deref types when resizing implicitly sized arrays.
At link time, we resolve the size of implicitly sized arrays.
When doing so, we update the type of the ir_variables.  However,
we neglected to update the type of ir_dereference nodes which
reference those variables.

It turns out array_resize_visitor (for GS/TCS/TES interface array
handling) already did 2/3 of the cases for this, so we can simply
refactor the code and reuse it.

This fixes:
GL45-CTS.shader_storage_buffer_object.basic-syntax
GL45-CTS.shader_storage_buffer_object.basic-syntaxSSO

which have an SSBO containing an implicitly sized array, followed
by some other members.  setup_buffer_access uses the dereference
types to compute offsets to fields, and it had a stale type where
the implicitly sized array's length was still 0 instead of the
actual length.

While we're here, we can also fix update_array_sizes to properly
update deref types as well, fixing a FINISHME from 2010.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
(cherry picked from commit 8df4aebc94)
2016-11-09 13:55:41 +00:00
Timothy Arceri
bd3fde4068 mesa/glsl: delete previously linked shaders earlier when linking
This moves the delete linked shaders call to
_mesa_clear_shader_program_data() which makes sure we delete them
before returning due to any validation problems.

It also reduces some code duplication.

From the OpenGL 4.5 Core spec:

   "If LinkProgram failed, any information about a previous link of
   that program object is lost. Thus, a failed link does not restore
   the old state of program.

   ...

   If one of these commands is called with a program for which
   LinkProgram failed, no error is generated unless otherwise noted.
   Implementations may return information on variables and interface
   blocks that would have been active had the program been linked
   successfully. In cases where the link failed because the program
   required too many resources, these commands may help applications
   determine why limits were exceeded."

Therefore it's expected that we shouldn't be able to query the
program that failed to link and retrieve information about a
previously successful link.

Before this change the linker was doing validation before freeing
the previously linked shaders and therefore could exit on failure
before they were freed.

This change also fixes an issue in the compat profile where a program
with no shaders attached is expected to fall back to fixed function
but was instead trying to relink IR from a previous link.

Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97715
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit d2861d682a)
2016-11-09 13:53:50 +00:00
Fredrik Höglund
2478cfe41d radv: add support for anisotropic filtering on VI+
Ported from radeonsi.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit e7b9c5eb74)
2016-11-09 13:52:16 +00:00
Dave Airlie
4514ce8bc7 radv: fix dual source blending
Dolphin tried to use this, but we didn't have any proper tests for it.

All that is required is that the shader output format be set
for the 0 and 1 exports.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 73592b9284)
2016-11-09 13:50:47 +00:00
Adam Jackson
bc1d7a6ac4 glx/glvnd: Fix dispatch function names and indices
As this array was not actually sorted, FindGLXFunction's binary search
would only sometimes work.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
(cherry picked from commit 8bca8d89ef)
2016-11-09 13:49:18 +00:00
Adam Jackson
c08a62a0b1 glx/glvnd: Don't modify the dummy slot in the dispatch table
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
(cherry picked from commit deb0eb1660)
2016-11-09 13:47:50 +00:00
Jason Ekstrand
81df3f63cb anv/pipeline: Properly cache prog_data::param
Before we were caching the prog data but we weren't doing anything with
brw_stage_prog_data::param so anything with push constants wasn't getting
cached properly.  This commit fixes that.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98012
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 71cc1e188d)
2016-11-09 13:46:12 +00:00
Jason Ekstrand
e016945bdd anv/pipeline: Put actual pointers in anv_shader_bin
While we can simply calculate offsets to get to things such as the
prog_data and the key, it's much more user-friendly if there are just
pointers.  Also, it's a bit more fool-proof.

While we're at it, we rework the pipeline cache API to use the
brw_stage_prog_data type directly.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98012
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit ff3185e3ba)
2016-11-09 13:44:32 +00:00
Jason Ekstrand
78fbafedf1 intel/blorp: Pass a brw_stage_prog_data to upload_shader
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98012
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 4306c10a88)
2016-11-09 13:41:32 +00:00
Jason Ekstrand
5be463694b intel/blorp: Use wm_prog_data instead of hand-rolling our own
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98012
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 058304f081)
[Emil Velikov: brw_compile_fs() has different signature]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>

Conflicts:
	src/intel/blorp/blorp.c
2016-11-09 13:39:09 +00:00
Jason Ekstrand
88ebff8e25 anv: Better handle return codes from anv_physical_device_init
The case where we just want the loop to continue is INCOMPATIBLE_DRIVER
because that simply means that whatever FD we opened isn't a supported
Intel chip.  Other error codes such as OUT_OF_HOST_MEMORY are actual errors
and we should be returning early in that case.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit a5f8ff6ca1)
2016-11-09 13:20:37 +00:00
Jason Ekstrand
9c722e8a2e vulkan/wsi/x11: Clean up connections in finish_wsi
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit daeb21e478)
2016-11-09 13:16:22 +00:00
Jason Ekstrand
f622d33347 vulkan/wsi/x11: Better handle wsi_x11_connection_create failure
Without this fix, the function would still end up returning NULL but it
would put that NULL connection in the hash table which would be bad.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit fc0e9e3e40)
2016-11-09 13:14:48 +00:00
Chih-Wei Huang
dd5e802d33 android: avoid using libdrm with host modules
Note LOCAL_CFLAGS and LOCAL_SHARED_LIBRARIES in Android.common.mk
are used by both host and target modules. However, commit 112e988
moved libdrm related flags to common. It causes the errors like:

error: 'out/host/linux-x86/obj32/SHARED_LIBRARIES/libdrm_intermediates/export_includes',
needed by 'out/host/linux-x86/obj32/EXECUTABLES/mesa_gen_matypes_intermediates/import_includes',
missing and no known rule to make it

No reason to use libdrm with host modules.

Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Fixes: 112e988329 ("Android: move libdrm settings to top-level
Android.common.mk")
Signed-off-by: Chih-Wei Huang <cwhuang@linux.org.tw>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>

(cherry picked from commit e3e5b1a488)
2016-11-09 13:13:19 +00:00
Nicolai Hähnle
ea07a57fc0 radeonsi: fix BFE/BFI lowering for GLSL semantics
Fixes spec/arb_gpu_shader5/execution/built-in-functions/*-bitfield{Extract,Insert}
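
One GLSL requirement that a raw hardware BFE violates: bitfieldExtract(value,
offset, 32) must return the whole value, while BFE-style instructions
typically consume the width modulo 32. A sketch of the required signed
behaviour in C (illustrative, not the radeonsi lowering itself):

   static int bitfield_extract_signed(int value, int offset, int bits)
   {
      if (bits == 0)
         return 0;
      if (bits == 32)   /* the case a raw BFE would get wrong */
         return value;
      /* shift left to drop the high bits, then arithmetic-shift right
       * to sign-extend */
      return (int)((unsigned)value << (32 - bits - offset)) >> (32 - bits);
   }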

Cc: 13.0 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 5aef14932a)
2016-11-09 13:11:29 +00:00
Dave Airlie
620ef8e742 radv: expose xlib platform extension
I missed this when I added the xlib code; this allows
dolphin emu to start and crash later.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 9f0726f3e5)
2016-11-09 13:09:45 +00:00
Jason Ekstrand
2f8b48d274 anv/device: Return DEVICE_LOST if execbuf2 fails
This makes more sense than OUT_OF_HOST_MEMORY.  Technically, you can
recover from a failed execbuf2 but the batch you just submitted didn't
fully execute so things are in an ill-defined state.  The app doesn't want
to continue from that point anyway.

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit c41ec1679f)
2016-11-09 12:48:39 +00:00
Emil Velikov
405dd26860 docs: add sha256 checksums for 13.0.0
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2016-11-01 16:05:32 +00:00
63 changed files with 1417 additions and 756 deletions

View File

@@ -82,11 +82,13 @@ LOCAL_CFLAGS += \
 		-D__STDC_LIMIT_MACROS
 endif
 
+ifneq ($(LOCAL_IS_HOST_MODULE),true)
 # add libdrm if there are hardware drivers
 ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
 LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
+endif
 
 LOCAL_CPPFLAGS += \
 	$(if $(filter true,$(MESA_LOLLIPOP_BUILD)),-D_USING_LIBCXX) \

View File

@@ -1 +1 @@
-13.0.0
+13.0.1

View File

@@ -33,7 +33,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD.
+4a54d7cdc1a94a8dae05a75ccff48356406d51b0d6a64cbdc641c266e3e008eb  mesa-13.0.0.tar.gz
+94edb4ebff82066a68be79d9c2627f15995e1fe10f67ab3fc63deb842027d727  mesa-13.0.0.tar.xz
 </pre>

docs/relnotes/13.0.1.html (new file, 187 lines)
View File

@@ -0,0 +1,187 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Mesa Release Notes</title>
<link rel="stylesheet" type="text/css" href="../mesa.css">
</head>
<body>
<div class="header">
<h1>The Mesa 3D Graphics Library</h1>
</div>
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 13.0.1 Release Notes / November 14, 2016</h1>
<p>
Mesa 13.0.1 is a bug fix release which fixes bugs found since the 13.0.0 release.
</p>
<p>
Mesa 13.0.1 implements the OpenGL 4.4 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.4. OpenGL
4.4 is <strong>only</strong> available if requested at context creation
because compatibility contexts are not supported.
</p>
<h2>SHA256 checksums</h2>
<pre>
TBD
</pre>
<h2>New features</h2>
<p>None</p>
<h2>Bug fixes</h2>
<ul>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97715">Bug 97715</a> - [ILK,G45,G965] piglit.spec.arb_separate_shader_objects.misc api error checks</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98012">Bug 98012</a> - [IVB] Segfault when running Dolphin twice with Vulkan</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98512">Bug 98512</a> - radeon r600 vdpau: Invalid command stream: texture bo too small</li>
</ul>
<h2>Changes</h2>
<p>Adam Jackson (2):</p>
<ul>
<li>glx/glvnd: Don't modify the dummy slot in the dispatch table</li>
<li>glx/glvnd: Fix dispatch function names and indices</li>
</ul>
<p>Andreas Boll (1):</p>
<ul>
<li>glx/windows: Add wgl.h to the sources list</li>
</ul>
<p>Anuj Phogat (1):</p>
<ul>
<li>i965: Fix GPU hang related to multiple render targets and alpha testing</li>
</ul>
<p>Chih-Wei Huang (1):</p>
<ul>
<li>android: avoid using libdrm with host modules</li>
</ul>
<p>Darren Salt (1):</p>
<ul>
<li>radv/pipeline: Don't dereference NULL dynamic state pointers</li>
</ul>
<p>Dave Airlie (8):</p>
<ul>
<li>radv: expose xlib platform extension</li>
<li>radv: fix dual source blending</li>
<li>Revert "st/vdpau: use linear layout for output surfaces"</li>
<li>radv: emit correct last export when Z/stencil export is enabled</li>
<li>ac/nir: add support for discard_if intrinsic (v2)</li>
<li>nir: add conditional discard optimisation (v4)</li>
<li>radv: enable conditional discard optimisation on radv.</li>
<li>radv: fix GetFenceStatus for signaled fences</li>
</ul>
<p>Emil Velikov (6):</p>
<ul>
<li>docs: add sha256 checksums for 13.0.0</li>
<li>amd/addrlib: limit fastcall/regparm to GCC i386</li>
<li>anv: use correct .specVersion for extensions</li>
<li>radv: use correct .specVersion for extensions</li>
<li>radv: Suffix the radeon_icd file with the host CPU</li>
<li>Update version to 13.0.1</li>
</ul>
<p>Eric Anholt (1):</p>
<ul>
<li>vc4: Use Newton-Raphson on the 1/W write to fix glmark2 terrain.</li>
</ul>
<p>Francisco Jerez (1):</p>
<ul>
<li>nir: Flip gl_SamplePosition in nir_lower_wpos_ytransform().</li>
</ul>
<p>Fredrik Höglund (1):</p>
<ul>
<li>radv: add support for anisotropic filtering on VI+</li>
</ul>
<p>Jason Ekstrand (21):</p>
<ul>
<li>anv/device: Return DEVICE_LOST if execbuf2 fails</li>
<li>vulkan/wsi/x11: Better handle wsi_x11_connection_create failure</li>
<li>vulkan/wsi/x11: Clean up connections in finish_wsi</li>
<li>anv: Better handle return codes from anv_physical_device_init</li>
<li>intel/blorp: Use wm_prog_data instead of hand-rolling our own</li>
<li>intel/blorp: Pass a brw_stage_prog_data to upload_shader</li>
<li>anv/pipeline: Put actual pointers in anv_shader_bin</li>
<li>anv/pipeline: Properly cache prog_data::param</li>
<li>intel/blorp: Emit all the binding tables</li>
<li>anv/device: Add an execbuf wrapper</li>
<li>anv: Add a cmd_buffer_execbuf helper</li>
<li>anv: Don't presume to know what address is in a surface relocation</li>
<li>anv: Add a new bo_pool_init helper</li>
<li>anv/allocator: Simplify anv_scratch_pool</li>
<li>anv: Initialize anv_bo::offset to -1</li>
<li>anv/batch_chain: Improve write_reloc</li>
<li>anv: Add an anv_execbuf helper struct</li>
<li>anv/batch: Move last_ss_pool_bo_offset to the command buffer</li>
<li>anv: Move relocation handling from EndCommandBuffer to QueueSubmit</li>
<li>anv/cmd_buffer: Take a command buffer instead of a batch in two helpers</li>
<li>anv/cmd_buffer: Enable a CS stall workaround for Sky Lake gt4</li>
</ul>
<p>Kenneth Graunke (2):</p>
<ul>
<li>glsl: Update deref types when resizing implicitly sized arrays.</li>
<li>mesa: Fix pixel shader scratch space allocation on Gen9+ platforms.</li>
</ul>
<p>Kristian Høgsberg (1):</p>
<ul>
<li>anv: Do relocations in userspace before execbuf ioctl</li>
</ul>
<p>Marek Olšák (4):</p>
<ul>
<li>egl: use util/macros.h</li>
<li>egl: make interop ABI visible again</li>
<li>glx: make interop ABI visible again</li>
<li>radeonsi: fix an assertion failure in si_decompress_sampler_color_textures</li>
</ul>
<p>Nicolai Hähnle (4):</p>
<ul>
<li>radeonsi: fix BFE/BFI lowering for GLSL semantics</li>
<li>glsl: fix lowering of UBO references of named blocks</li>
<li>st/glsl_to_tgsi: fix dvec[34] loads from SSBO</li>
<li>st/mesa: fix the layer of VDPAU surface samplers</li>
</ul>
<p>Steven Toth (3):</p>
<ul>
<li>gallium/hud: fix a problem where objects are free'd while in use.</li>
<li>gallium/hud: close a previously opened handle</li>
<li>gallium/hud: protect against an initialization race</li>
</ul>
<p>Timothy Arceri (1):</p>
<ul>
<li>mesa/glsl: delete previously linked shaders earlier when linking</li>
</ul>
</div>
</body>
</html>

View File

@@ -88,7 +88,11 @@ typedef int INT;
 
 #ifndef ADDR_FASTCALL
 #if defined(__GNUC__)
-#define ADDR_FASTCALL __attribute__((regparm(0)))
+#if defined(__i386__)
+#define ADDR_FASTCALL __attribute__((regparm(0)))
+#else
+#define ADDR_FASTCALL
+#endif
 #else
 #define ADDR_FASTCALL __fastcall
 #endif

View File

@@ -2609,6 +2609,24 @@ static void emit_barrier(struct nir_to_llvm_context *ctx)
 			    ctx->voidt, NULL, 0, 0);
 }
 
+static void emit_discard_if(struct nir_to_llvm_context *ctx,
+			    nir_intrinsic_instr *instr)
+{
+	LLVMValueRef cond;
+	ctx->shader_info->fs.can_discard = true;
+	cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+			     get_src(ctx, instr->src[0]),
+			     ctx->i32zero, "");
+
+	cond = LLVMBuildSelect(ctx->builder, cond,
+			       LLVMConstReal(ctx->f32, -1.0f),
+			       ctx->f32zero, "");
+	emit_llvm_intrinsic(ctx, "llvm.AMDGPU.kill",
+			    LLVMVoidTypeInContext(ctx->context),
+			    &cond, 1, 0);
+}
+
 static LLVMValueRef
 visit_load_local_invocation_index(struct nir_to_llvm_context *ctx)
 {
@@ -2921,6 +2939,9 @@ static void visit_intrinsic(struct nir_to_llvm_context *ctx,
 				    LLVMVoidTypeInContext(ctx->context),
 				    NULL, 0, 0);
 		break;
+	case nir_intrinsic_discard_if:
+		emit_discard_if(ctx, instr);
+		break;
 	case nir_intrinsic_memory_barrier:
 		emit_waitcnt(ctx);
 		break;
@@ -4352,12 +4373,10 @@ handle_fs_outputs_post(struct nir_to_llvm_context *ctx,
 
 	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
 		LLVMValueRef values[4];
-		bool last;
 
 		if (!(ctx->output_mask & (1ull << i)))
 			continue;
 
-		last = ctx->output_mask <= ((1ull << (i + 1)) - 1);
-
 		if (i == FRAG_RESULT_DEPTH) {
 			ctx->shader_info->fs.writes_z = true;
 			depth = to_float(ctx, LLVMBuildLoad(ctx->builder,
@@ -4367,10 +4386,14 @@ handle_fs_outputs_post(struct nir_to_llvm_context *ctx,
 			stencil = to_float(ctx, LLVMBuildLoad(ctx->builder,
 							      ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
 		} else {
+			bool last = false;
 			for (unsigned j = 0; j < 4; j++)
 				values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
 							ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
 
+			if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil)
+				last = ctx->output_mask <= ((1ull << (i + 1)) - 1);
+
 			si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + index, last);
 			index++;
 		}

View File

@@ -4,3 +4,4 @@
 /radv_timestamp.h
 /dev_icd.json
 /vk_format_table.c
+/radeon_icd.*.json

View File

@@ -131,11 +131,11 @@ vk_format_table.c: vk_format_table.py \
 	$(PYTHON2) $(srcdir)/vk_format_table.py $(srcdir)/vk_format_layout.csv > $@
 
 BUILT_SOURCES = $(VULKAN_GENERATED_FILES)
-CLEANFILES = $(BUILT_SOURCES) dev_icd.json radv_timestamp.h
+CLEANFILES = $(BUILT_SOURCES) dev_icd.json radeon_icd.@host_cpu@.json
 EXTRA_DIST = \
 	$(top_srcdir)/include/vulkan/vk_icd.h \
 	dev_icd.json.in \
-	radeon_icd.json \
+	radeon_icd.json.in \
 	radv_entrypoints_gen.py \
 	vk_format_layout.csv \
 	vk_format_parse.py \
@@ -155,7 +155,7 @@ libvulkan_radeon_la_LDFLAGS = \
 
 icdconfdir = @VULKAN_ICD_INSTALL_DIR@
-icdconf_DATA = radeon_icd.json
+icdconf_DATA = radeon_icd.@host_cpu@.json
 
 # The following is used for development purposes, by setting VK_ICD_FILENAMES.
 noinst_DATA = dev_icd.json
@@ -164,4 +164,9 @@ dev_icd.json : dev_icd.json.in
 		-e "s#@build_libdir@#${abs_top_builddir}/${LIB_DIR}#" \
 		< $(srcdir)/dev_icd.json.in > $@
 
+radeon_icd.@host_cpu@.json : radeon_icd.json.in
+	$(AM_V_GEN) $(SED) \
+		-e "s#@install_libdir@#${libdir}#" \
+		< $(srcdir)/radeon_icd.json.in > $@
+
 include $(top_srcdir)/install-lib-links.mk

View File

@@ -1,7 +1,7 @@
 {
     "file_format_version": "1.0.0",
     "ICD": {
-        "library_path": "libvulkan_radeon.so",
+        "library_path": "@install_libdir@/libvulkan_radeon.so",
         "api_version": "1.0.3"
     }
 }

View File

@@ -113,13 +113,19 @@ static const VkExtensionProperties global_extensions[] = {
 #ifdef VK_USE_PLATFORM_XCB_KHR
 	{
 		.extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME,
-		.specVersion = 5,
+		.specVersion = 6,
 	},
 #endif
+#ifdef VK_USE_PLATFORM_XLIB_KHR
+	{
+		.extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME,
+		.specVersion = 6,
+	},
+#endif
 #ifdef VK_USE_PLATFORM_WAYLAND_KHR
 	{
 		.extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME,
-		.specVersion = 4,
+		.specVersion = 5,
 	},
 #endif
 };
@@ -127,7 +133,7 @@ static const VkExtensionProperties global_extensions[] = {
 static const VkExtensionProperties device_extensions[] = {
 	{
 		.extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
-		.specVersion = 67,
+		.specVersion = 68,
 	},
 };
@@ -1166,6 +1172,8 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence)
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_fence, fence, _fence);
 
+	if (fence->signalled)
+		return VK_SUCCESS;
 	if (!fence->submitted)
 		return VK_NOT_READY;
@@ -1728,26 +1736,50 @@ radv_tex_bordercolor(VkBorderColor bcolor)
 	return 0;
 }
 
+static unsigned
+radv_tex_aniso_filter(unsigned filter)
+{
+	if (filter < 2)
+		return 0;
+	if (filter < 4)
+		return 1;
+	if (filter < 8)
+		return 2;
+	if (filter < 16)
+		return 3;
+	return 4;
+}
+
 static void
 radv_init_sampler(struct radv_device *device,
 		  struct radv_sampler *sampler,
 		  const VkSamplerCreateInfo *pCreateInfo)
 {
-	uint32_t max_aniso = 0;
-	uint32_t max_aniso_ratio = 0;//TODO
+	uint32_t max_aniso = pCreateInfo->anisotropyEnable && pCreateInfo->maxAnisotropy > 1.0 ?
+			     (uint32_t) pCreateInfo->maxAnisotropy : 0;
+	uint32_t max_aniso_ratio = radv_tex_aniso_filter(max_aniso);
+	bool is_vi;
+
+	is_vi = (device->instance->physicalDevice.rad_info.chip_class >= VI);
+
+	if (!is_vi && max_aniso > 0) {
+		radv_finishme("Anisotropic filtering must be disabled manually "
+			      "by the shader on SI-CI when BASE_LEVEL == LAST_LEVEL\n");
+		max_aniso = max_aniso_ratio = 0;
+	}
 
 	sampler->state[0] = (S_008F30_CLAMP_X(radv_tex_wrap(pCreateInfo->addressModeU)) |
 			     S_008F30_CLAMP_Y(radv_tex_wrap(pCreateInfo->addressModeV)) |
 			     S_008F30_CLAMP_Z(radv_tex_wrap(pCreateInfo->addressModeW)) |
 			     S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
 			     S_008F30_DEPTH_COMPARE_FUNC(radv_tex_compare(pCreateInfo->compareOp)) |
 			     S_008F30_FORCE_UNNORMALIZED(pCreateInfo->unnormalizedCoordinates ? 1 : 0) |
 			     S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) |
 			     S_008F30_ANISO_BIAS(max_aniso_ratio) |
 			     S_008F30_DISABLE_CUBE_WRAP(0) |
 			     S_008F30_COMPAT_MODE(is_vi));
 	sampler->state[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(pCreateInfo->minLod, 0, 15), 8)) |
-			     S_008F34_MAX_LOD(S_FIXED(CLAMP(pCreateInfo->maxLod, 0, 15), 8)));
+			     S_008F34_MAX_LOD(S_FIXED(CLAMP(pCreateInfo->maxLod, 0, 15), 8)) |
+			     S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
 	sampler->state[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(pCreateInfo->mipLodBias, -16, 16), 8)) |
 			     S_008F38_XY_MAG_FILTER(radv_tex_filter(pCreateInfo->magFilter, max_aniso)) |
 			     S_008F38_XY_MIN_FILTER(radv_tex_filter(pCreateInfo->minFilter, max_aniso)) |

View File

@@ -144,6 +144,7 @@ radv_optimize_nir(struct nir_shader *shader)
 		NIR_PASS(progress, shader, nir_opt_algebraic);
 		NIR_PASS(progress, shader, nir_opt_constant_folding);
 		NIR_PASS(progress, shader, nir_opt_undef);
+		NIR_PASS(progress, shader, nir_opt_conditional_discard);
 	} while (progress);
 }
@@ -642,7 +643,8 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
 					const VkGraphicsPipelineCreateInfo *pCreateInfo,
 					uint32_t blend_enable,
 					uint32_t blend_need_alpha,
-					bool single_cb_enable)
+					bool single_cb_enable,
+					bool blend_mrt0_is_dual_src)
 {
 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
@@ -664,6 +666,8 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
 
 	blend->cb_shader_mask = si_get_cb_shader_mask(col_format);
 
+	if (blend_mrt0_is_dual_src)
+		col_format |= (col_format & 0xf) << 4;
 	if (!col_format)
 		col_format |= V_028714_SPI_SHADER_32_R;
 	blend->spi_shader_col_format = col_format;
@@ -715,8 +719,13 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
 	struct radv_blend_state *blend = &pipeline->graphics.blend;
 	unsigned mode = V_028808_CB_NORMAL;
 	uint32_t blend_enable = 0, blend_need_alpha = 0;
+	bool blend_mrt0_is_dual_src = false;
 	int i;
 	bool single_cb_enable = false;
+
+	if (!vkblend)
+		return;
+
 	if (extra && extra->custom_blend_mode) {
 		single_cb_enable = true;
 		mode = extra->custom_blend_mode;
@@ -755,7 +764,9 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
 		}
 		if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA))
 			radv_finishme("dual source blending");
+		if (i == 0)
+			blend_mrt0_is_dual_src = true;
 		if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) {
 			srcRGB = VK_BLEND_FACTOR_ONE;
 			dstRGB = VK_BLEND_FACTOR_ONE;
@@ -797,7 +808,7 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
 		blend->cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
 
 	radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo,
-						blend_enable, blend_need_alpha, single_cb_enable);
+						blend_enable, blend_need_alpha, single_cb_enable, blend_mrt0_is_dual_src);
 }
 
 static uint32_t si_translate_stencil_op(enum VkStencilOp op)
@@ -1069,18 +1080,27 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 	struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
 
-	dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
-	if (states & (1 << VK_DYNAMIC_STATE_VIEWPORT)) {
-		typed_memcpy(dynamic->viewport.viewports,
-			     pCreateInfo->pViewportState->pViewports,
-			     pCreateInfo->pViewportState->viewportCount);
-	}
-
-	dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
-	if (states & (1 << VK_DYNAMIC_STATE_SCISSOR)) {
-		typed_memcpy(dynamic->scissor.scissors,
-			     pCreateInfo->pViewportState->pScissors,
-			     pCreateInfo->pViewportState->scissorCount);
+	/* Section 9.2 of the Vulkan 1.0.15 spec says:
+	 *
+	 *    pViewportState is [...] NULL if the pipeline
+	 *    has rasterization disabled.
+	 */
+	if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable) {
+		assert(pCreateInfo->pViewportState);
+
+		dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
+		if (states & (1 << VK_DYNAMIC_STATE_VIEWPORT)) {
+			typed_memcpy(dynamic->viewport.viewports,
+				     pCreateInfo->pViewportState->pViewports,
+				     pCreateInfo->pViewportState->viewportCount);
+		}
+
+		dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
+		if (states & (1 << VK_DYNAMIC_STATE_SCISSOR)) {
+			typed_memcpy(dynamic->scissor.scissors,
+				     pCreateInfo->pViewportState->pScissors,
+				     pCreateInfo->pViewportState->scissorCount);
+		}
 	}
 
 	if (states & (1 << VK_DYNAMIC_STATE_LINE_WIDTH)) {
@@ -1098,7 +1118,21 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 			pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
 	}
 
-	if (states & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS)) {
+	/* Section 9.2 of the Vulkan 1.0.15 spec says:
+	 *
+	 *    pColorBlendState is [...] NULL if the pipeline has rasterization
+	 *    disabled or if the subpass of the render pass the pipeline is
+	 *    created against does not use any color attachments.
+	 */
+	bool uses_color_att = false;
+	for (unsigned i = 0; i < subpass->color_count; ++i) {
+		if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) {
+			uses_color_att = true;
+			break;
+		}
+	}
+
+	if (uses_color_att && states & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS)) {
 		assert(pCreateInfo->pColorBlendState);
 		typed_memcpy(dynamic->blend_constants,
 			     pCreateInfo->pColorBlendState->blendConstants, 4);
@@ -1110,14 +1144,17 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 	 * no need to override the depthstencil defaults in
 	 * radv_pipeline::dynamic_state when there is no depthstencil attachment.
 	 *
-	 * From the Vulkan spec (20 Oct 2015, git-aa308cb):
+	 * Section 9.2 of the Vulkan 1.0.15 spec says:
 	 *
-	 *    pDepthStencilState [...] may only be NULL if renderPass and subpass
-	 *    specify a subpass that has no depth/stencil attachment.
+	 *    pDepthStencilState is [...] NULL if the pipeline has rasterization
+	 *    disabled or if the subpass of the render pass the pipeline is created
+	 *    against does not use a depth/stencil attachment.
 	 */
-	if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+	if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
+	    subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+		assert(pCreateInfo->pDepthStencilState);
 		if (states & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS)) {
-			assert(pCreateInfo->pDepthStencilState);
 			dynamic->depth_bounds.min =
 				pCreateInfo->pDepthStencilState->minDepthBounds;
 			dynamic->depth_bounds.max =
@@ -1125,7 +1162,6 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 		}
 
 		if (states & (1 << VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK)) {
-			assert(pCreateInfo->pDepthStencilState);
 			dynamic->stencil_compare_mask.front =
 				pCreateInfo->pDepthStencilState->front.compareMask;
 			dynamic->stencil_compare_mask.back =
@@ -1133,7 +1169,6 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 		}
 
 		if (states & (1 << VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) {
-			assert(pCreateInfo->pDepthStencilState);
 			dynamic->stencil_write_mask.front =
 				pCreateInfo->pDepthStencilState->front.writeMask;
 			dynamic->stencil_write_mask.back =
@@ -1141,7 +1176,6 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
 		}
 
 		if (states & (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE)) {
-			assert(pCreateInfo->pDepthStencilState);
 			dynamic->stencil_reference.front =
 				pCreateInfo->pDepthStencilState->front.reference;
 			dynamic->stencil_reference.back =

View File

@@ -227,6 +227,7 @@ NIR_FILES = \
 	nir/nir_metadata.c \
 	nir/nir_move_vec_src_uses_to_dest.c \
 	nir/nir_normalize_cubemap_coords.c \
+	nir/nir_opt_conditional_discard.c \
 	nir/nir_opt_constant_folding.c \
 	nir/nir_opt_copy_propagate.c \
 	nir/nir_opt_cse.c \

View File

@@ -181,7 +181,43 @@ private:
 };
 
-class array_resize_visitor : public ir_hierarchical_visitor {
+/**
+ * A visitor helper that provides methods for updating the types of
+ * ir_dereferences. Classes that update variable types (say, updating
+ * array sizes) will want to use this so that dereference types stay in sync.
+ */
+class deref_type_updater : public ir_hierarchical_visitor {
+public:
+   virtual ir_visitor_status visit(ir_dereference_variable *ir)
+   {
+      ir->type = ir->var->type;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_dereference_array *ir)
+   {
+      const glsl_type *const vt = ir->array->type;
+      if (vt->is_array())
+         ir->type = vt->fields.array;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_dereference_record *ir)
+   {
+      for (unsigned i = 0; i < ir->record->type->length; i++) {
+         const struct glsl_struct_field *field =
+            &ir->record->type->fields.structure[i];
+         if (strcmp(field->name, ir->field) == 0) {
+            ir->type = field->type;
+            break;
+         }
+      }
+      return visit_continue;
+   }
+};
+
+
+class array_resize_visitor : public deref_type_updater {
 public:
    unsigned num_vertices;
    gl_shader_program *prog;
@@ -240,24 +276,6 @@ public:
       return visit_continue;
    }
 
-   /* Dereferences of input variables need to be updated so that their type
-    * matches the newly assigned type of the variable they are accessing. */
-   virtual ir_visitor_status visit(ir_dereference_variable *ir)
-   {
-      ir->type = ir->var->type;
-      return visit_continue;
-   }
-
-   /* Dereferences of 2D input arrays need to be updated so that their type
-    * matches the newly assigned type of the array they are accessing. */
-   virtual ir_visitor_status visit_leave(ir_dereference_array *ir)
-   {
-      const glsl_type *const vt = ir->array->type;
-      if (vt->is_array())
-         ir->type = vt->fields.array;
-      return visit_continue;
-   }
 };
 
 /**
@@ -1353,7 +1371,7 @@ move_non_declarations(exec_list *instructions, exec_node *last,
  * it inside that function leads to compiler warnings with some versions of
  * gcc.
  */
-class array_sizing_visitor : public ir_hierarchical_visitor {
+class array_sizing_visitor : public deref_type_updater {
 public:
    array_sizing_visitor()
      : mem_ctx(ralloc_context(NULL)),
@@ -2273,6 +2291,8 @@ update_array_sizes(struct gl_shader_program *prog)
       if (prog->_LinkedShaders[i] == NULL)
          continue;
 
+      bool types_were_updated = false;
+
       foreach_in_list(ir_instruction, node, prog->_LinkedShaders[i]->ir) {
          ir_variable *const var = node->as_variable();
@@ -2328,11 +2348,15 @@ update_array_sizes(struct gl_shader_program *prog)
             var->type = glsl_type::get_array_instance(var->type->fields.array,
                                                       size + 1);
-            /* FINISHME: We should update the types of array
-             * dereferences of this variable now.
-             */
+            types_were_updated = true;
          }
       }
+
+      /* Update the types of dereferences in case we changed any. */
+      if (types_were_updated) {
+         deref_type_updater v;
+         v.run(prog->_LinkedShaders[i]->ir);
+      }
    }
 }
@@ -4785,14 +4809,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                       "type of shader\n");
    }
 
-   for (unsigned int i = 0; i < MESA_SHADER_STAGES; i++) {
-      if (prog->_LinkedShaders[i] != NULL) {
-         _mesa_delete_linked_shader(ctx, prog->_LinkedShaders[i]);
-      }
-
-      prog->_LinkedShaders[i] = NULL;
-   }
-
    /* Link all shaders for a particular stage and validate the result.
    */
   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {


@@ -107,7 +107,6 @@ public:
struct gl_linked_shader *shader;
bool clamp_block_indices;
struct gl_uniform_buffer_variable *ubo_var;
const struct glsl_struct_field *struct_field;
ir_variable *variable;
ir_rvalue *uniform_block;
@@ -308,8 +307,11 @@ lower_ubo_reference_visitor::setup_for_load_or_store(void *mem_ctx,
this->uniform_block = index;
}
this->ubo_var = var->is_interface_instance()
? &blocks[i]->Uniforms[0] : &blocks[i]->Uniforms[var->data.location];
if (var->is_interface_instance()) {
*const_offset = 0;
} else {
*const_offset = blocks[i]->Uniforms[var->data.location].Offset;
}
break;
}
@@ -317,8 +319,6 @@ lower_ubo_reference_visitor::setup_for_load_or_store(void *mem_ctx,
assert(this->uniform_block);
*const_offset = ubo_var->Offset;
this->struct_field = NULL;
setup_buffer_access(mem_ctx, deref, offset, const_offset, row_major,
matrix_columns, &this->struct_field, packing);


@@ -421,7 +421,7 @@ standalone_compile_shader(const struct standalone_options *_options,
}
if ((status == EXIT_SUCCESS) && options->do_link) {
_mesa_clear_shader_program_data(whole_program);
_mesa_clear_shader_program_data(ctx, whole_program);
link_shaders(ctx, whole_program);
status = (whole_program->LinkStatus) ? EXIT_SUCCESS : EXIT_FAILURE;


@@ -123,8 +123,16 @@ _mesa_delete_linked_shader(struct gl_context *ctx,
}
void
_mesa_clear_shader_program_data(struct gl_shader_program *shProg)
_mesa_clear_shader_program_data(struct gl_context *ctx,
struct gl_shader_program *shProg)
{
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (shProg->_LinkedShaders[i] != NULL) {
_mesa_delete_linked_shader(ctx, shProg->_LinkedShaders[i]);
shProg->_LinkedShaders[i] = NULL;
}
}
shProg->NumUniformStorage = 0;
shProg->UniformStorage = NULL;
shProg->NumUniformRemapTable = 0;


@@ -56,7 +56,8 @@ _mesa_delete_linked_shader(struct gl_context *ctx,
struct gl_linked_shader *sh);
extern "C" void
_mesa_clear_shader_program_data(struct gl_shader_program *);
_mesa_clear_shader_program_data(struct gl_context *ctx,
struct gl_shader_program *);
extern "C" void
_mesa_shader_debug(struct gl_context *ctx, GLenum type, GLuint *id,


@@ -2625,6 +2625,8 @@ bool nir_opt_remove_phis(nir_shader *shader);
bool nir_opt_undef(nir_shader *shader);
bool nir_opt_conditional_discard(nir_shader *shader);
void nir_sweep(nir_shader *shader);
nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val);


@@ -272,6 +272,26 @@ lower_interp_var_at_offset(lower_wpos_ytransform_state *state,
flip_y)));
}
static void
lower_load_sample_pos(lower_wpos_ytransform_state *state,
nir_intrinsic_instr *intr)
{
nir_builder *b = &state->b;
b->cursor = nir_after_instr(&intr->instr);
nir_ssa_def *pos = &intr->dest.ssa;
nir_ssa_def *scale = nir_channel(b, get_transform(state), 0);
nir_ssa_def *neg_scale = nir_channel(b, get_transform(state), 2);
/* Either y or 1-y for scale equal to 1 or -1 respectively. */
nir_ssa_def *flipped_y =
nir_fadd(b, nir_fmax(b, neg_scale, nir_imm_float(b, 0.0)),
nir_fmul(b, nir_channel(b, pos, 1), scale));
nir_ssa_def *flipped_pos = nir_vec2(b, nir_channel(b, pos, 0), flipped_y);
nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, nir_src_for_ssa(flipped_pos),
flipped_pos->parent_instr);
}
static void
lower_wpos_ytransform_block(lower_wpos_ytransform_state *state, nir_block *block)
{
@@ -287,6 +307,10 @@ lower_wpos_ytransform_block(lower_wpos_ytransform_state *state, nir_block *block
/* gl_FragCoord should not have array/struct deref's: */
assert(dvar->deref.child == NULL);
lower_fragcoord(state, intr);
} else if (var->data.mode == nir_var_system_value &&
var->data.location == SYSTEM_VALUE_SAMPLE_POS) {
assert(dvar->deref.child == NULL);
lower_load_sample_pos(state, intr);
}
} else if (intr->intrinsic == nir_intrinsic_interp_var_at_offset) {
lower_interp_var_at_offset(state, intr);

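The flip expression built above, flipped_y = max(neg_scale, 0) + y * scale, yields y when (scale, neg_scale) = (1, 0) and 1 - y when (scale, neg_scale) = (-1, 1), so one select-free formula handles both orientations. A standalone arithmetic check (a sketch, not driver code):

#include <assert.h>
#include <math.h>

static float flip_sample_y(float y, float scale, float neg_scale)
{
    /* max(neg_scale, 0) + y * scale */
    return fmaxf(neg_scale, 0.0f) + y * scale;
}

int main(void)
{
    assert(flip_sample_y(0.25f, 1.0f, 0.0f) == 0.25f);   /* no flip: y */
    assert(flip_sample_y(0.25f, -1.0f, 1.0f) == 0.75f);  /* flip: 1 - y */
    return 0;
}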

@@ -0,0 +1,125 @@
/*
* Copyright © 2016 Red Hat
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "nir.h"
#include "nir_builder.h"
/** @file nir_opt_conditional_discard.c
*
* Lowers if (cond) discard to discard_if(cond).
*/
static bool
nir_opt_conditional_discard_block(nir_block *block, void *mem_ctx)
{
nir_builder bld;
if (nir_cf_node_is_first(&block->cf_node))
return false;
nir_cf_node *prev_node = nir_cf_node_prev(&block->cf_node);
if (prev_node->type != nir_cf_node_if)
return false;
nir_if *if_stmt = nir_cf_node_as_if(prev_node);
nir_block *then_block = nir_if_first_then_block(if_stmt);
nir_block *else_block = nir_if_first_else_block(if_stmt);
/* check there is only one else block and it is empty */
if (nir_if_last_else_block(if_stmt) != else_block)
return false;
if (!exec_list_is_empty(&else_block->instr_list))
return false;
/* check there is only one then block and it has only one instruction in it */
if (nir_if_last_then_block(if_stmt) != then_block)
return false;
if (exec_list_is_empty(&then_block->instr_list))
return false;
if (exec_list_length(&then_block->instr_list) > 1)
return false;
/*
* make sure no subsequent phi nodes point at this if.
*/
nir_block *after = nir_cf_node_as_block(nir_cf_node_next(&if_stmt->cf_node));
nir_foreach_instr_safe(instr, after) {
if (instr->type != nir_instr_type_phi)
break;
nir_phi_instr *phi = nir_instr_as_phi(instr);
nir_foreach_phi_src(phi_src, phi) {
if (phi_src->pred == then_block ||
phi_src->pred == else_block)
return false;
}
}
/* Get the first instruction in the then block and confirm it is
* a discard or a discard_if
*/
nir_instr *instr = nir_block_first_instr(then_block);
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_discard &&
intrin->intrinsic != nir_intrinsic_discard_if)
return false;
nir_src cond;
nir_builder_init(&bld, mem_ctx);
bld.cursor = nir_before_cf_node(prev_node);
if (intrin->intrinsic == nir_intrinsic_discard)
cond = if_stmt->condition;
else
cond = nir_src_for_ssa(nir_iand(&bld,
nir_ssa_for_src(&bld, if_stmt->condition, 1),
nir_ssa_for_src(&bld, intrin->src[0], 1)));
nir_intrinsic_instr *discard_if =
nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_discard_if);
nir_src_copy(&discard_if->src[0], &cond, discard_if);
nir_instr_insert_before_cf(prev_node, &discard_if->instr);
nir_instr_remove(&intrin->instr);
nir_cf_node_remove(&if_stmt->cf_node);
return true;
}
bool
nir_opt_conditional_discard(nir_shader *shader)
{
bool progress = false;
nir_foreach_function(function, shader) {
if (function->impl) {
void *mem_ctx = ralloc_parent(function->impl);
nir_foreach_block_safe(block, function->impl) {
progress |= nir_opt_conditional_discard_block(block, mem_ctx);
}
}
}
return progress;
}

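Conceptually the pass turns "if (c) discard;" into "discard_if(c)", and a "discard_if(d)" nested under "if (c)" into "discard_if(c && d)" via the iand built above. A scalar model of that condition folding (hypothetical helper, not NIR API):

#include <stdbool.h>

/* An unconditional discard under if(outer) folds to discard_if(outer);
 * a nested discard_if(inner) folds to discard_if(outer && inner). */
static bool folded_discard_cond(bool outer, bool inner_is_discard_if,
                                bool inner)
{
    return inner_is_discard_if ? (outer && inner) : outer;
}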

@@ -80,8 +80,6 @@
#include "eglimage.h"
#include "eglsync.h"
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
struct wl_buffer;
struct dri2_egl_driver


@@ -2384,7 +2384,7 @@ _eglLockDisplayInterop(EGLDisplay dpy, EGLContext context,
return MESA_GLINTEROP_SUCCESS;
}
int
PUBLIC int
MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context,
struct mesa_glinterop_device_info *out)
{
@@ -2406,7 +2406,7 @@ MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context,
return ret;
}
int
PUBLIC int
MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context,
struct mesa_glinterop_export_in *in,
struct mesa_glinterop_export_out *out)

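PUBLIC marks these interop entry points as exported; with hidden visibility as the build default they would otherwise be absent from the shared library's dynamic symbol table. A sketch of how such an export macro is commonly shaped (an assumption about its definition, not copied from Mesa):

#if defined(__GNUC__)
#  define PUBLIC __attribute__((visibility("default")))  /* exported */
#else
#  define PUBLIC
#endif

/* Hypothetical example symbol: with -fvisibility=hidden, only functions
 * tagged PUBLIC appear in the DSO's exported symbol table. */
PUBLIC int
example_exported_entry_point(void)
{
    return 0;
}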

@@ -34,6 +34,8 @@
#ifndef EGLDEFINES_INCLUDED
#define EGLDEFINES_INCLUDED
#include "util/macros.h"
#ifdef __cplusplus
extern "C" {
#endif
@@ -48,9 +50,6 @@ extern "C" {
#define _EGL_VENDOR_STRING "Mesa Project"
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#define MIN2(A, B) (((A) < (B)) ? (A) : (B))
#ifdef __cplusplus
}
#endif

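The duplicate ARRAY_SIZE definitions are dropped in favor of the shared one pulled in from util/macros.h. The macro is the classic sizeof ratio, which is only valid for true arrays; applied to a pointer it silently computes sizeof(ptr)/sizeof(elem). A minimal sketch:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
    int modes[] = { 1, 2, 3, 4 };
    printf("%zu\n", ARRAY_SIZE(modes)); /* prints 4 */
    return 0;
}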

@@ -36,6 +36,7 @@
#include "hud/hud_private.h"
#include "util/list.h"
#include "os/os_time.h"
#include "os/os_thread.h"
#include "util/u_memory.h"
#include <stdio.h>
#include <unistd.h>
@@ -61,6 +62,7 @@ struct cpufreq_info
static int gcpufreq_count = 0;
static struct list_head gcpufreq_list;
pipe_static_mutex(gcpufreq_mutex);
static struct cpufreq_info *
find_cfi_by_index(int cpu_index, int mode)
@@ -112,14 +114,6 @@ query_cfi_load(struct hud_graph *gr)
}
}
static void
free_query_data(void *p)
{
struct cpufreq_info *cfi = (struct cpufreq_info *)p;
list_del(&cfi->list);
FREE(cfi);
}
/**
* Create and initialize a new object for a specific CPU.
* \param pane parent context.
@@ -162,11 +156,6 @@ hud_cpufreq_graph_install(struct hud_pane *pane, int cpu_index,
gr->query_data = cfi;
gr->query_new_value = query_cfi_load;
/* Don't use free() as our callback as that messes up Gallium's
* memory debugger. Use simple free_query_data() wrapper.
*/
gr->free_query_data = free_query_data;
hud_pane_add_graph(pane, gr);
hud_pane_set_max_value(pane, 3000000 /* 3 GHz */);
}
@@ -199,16 +188,21 @@ hud_get_num_cpufreq(bool displayhelp)
int cpu_index;
/* Return the number of CPU metrics we support. */
if (gcpufreq_count)
pipe_mutex_lock(gcpufreq_mutex);
if (gcpufreq_count) {
pipe_mutex_unlock(gcpufreq_mutex);
return gcpufreq_count;
}
/* Scan /sys/devices.../cpu, for every object type we support, create
* and persist an object to represent its different metrics.
*/
list_inithead(&gcpufreq_list);
DIR *dir = opendir("/sys/devices/system/cpu");
if (!dir)
if (!dir) {
pipe_mutex_unlock(gcpufreq_mutex);
return 0;
}
while ((dp = readdir(dir)) != NULL) {
@@ -238,6 +232,7 @@ hud_get_num_cpufreq(bool displayhelp)
snprintf(fn, sizeof(fn), "%s/cpufreq/scaling_max_freq", basename);
add_object(dp->d_name, fn, CPUFREQ_MAXIMUM, cpu_index);
}
closedir(dir);
if (displayhelp) {
list_for_each_entry(struct cpufreq_info, cfi, &gcpufreq_list, list) {
@@ -251,6 +246,7 @@ hud_get_num_cpufreq(bool displayhelp)
}
}
pipe_mutex_unlock(gcpufreq_mutex);
return gcpufreq_count;
}

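The same locking discipline is applied to the diskstat, NIC, and sensor HUD sources below: take the mutex before the "already initialized" fast-path check, and release it on every exit path, including the early returns. A self-contained pthread model of the pattern (pthreads standing in for pipe_static_mutex; names hypothetical):

#include <pthread.h>

static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
static int g_count; /* 0 until the one-time scan has run */

static int scan_devices(void) { return 4; /* pretend we found 4 */ }

int get_num_devices(void)
{
    pthread_mutex_lock(&g_mutex);
    if (g_count) {               /* another thread already initialized */
        pthread_mutex_unlock(&g_mutex);
        return g_count;
    }
    g_count = scan_devices();    /* one-time init, now race-free */
    pthread_mutex_unlock(&g_mutex);
    return g_count;
}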

@@ -35,6 +35,7 @@
#include "hud/hud_private.h"
#include "util/list.h"
#include "os/os_time.h"
#include "os/os_thread.h"
#include "util/u_memory.h"
#include <stdio.h>
#include <unistd.h>
@@ -81,6 +82,7 @@ struct diskstat_info
*/
static int gdiskstat_count = 0;
static struct list_head gdiskstat_list;
pipe_static_mutex(gdiskstat_mutex);
static struct diskstat_info *
find_dsi_by_name(const char *n, int mode)
@@ -162,14 +164,6 @@ query_dsi_load(struct hud_graph *gr)
}
}
static void
free_query_data(void *p)
{
struct diskstat_info *nic = (struct diskstat_info *) p;
list_del(&nic->list);
FREE(nic);
}
/**
* Create and initialize a new object for a specific block I/O device.
* \param pane parent context.
@@ -208,11 +202,6 @@ hud_diskstat_graph_install(struct hud_pane *pane, const char *dev_name,
gr->query_data = dsi;
gr->query_new_value = query_dsi_load;
/* Don't use free() as our callback as that messes up Gallium's
* memory debugger. Use simple free_query_data() wrapper.
*/
gr->free_query_data = free_query_data;
hud_pane_add_graph(pane, gr);
hud_pane_set_max_value(pane, 100);
}
@@ -257,16 +246,21 @@ hud_get_num_disks(bool displayhelp)
char name[64];
/* Return the number of block devices and partitions. */
if (gdiskstat_count)
pipe_mutex_lock(gdiskstat_mutex);
if (gdiskstat_count) {
pipe_mutex_unlock(gdiskstat_mutex);
return gdiskstat_count;
}
/* Scan /sys/block, for every object type we support, create and
* persist an object to represent its different statistics.
*/
list_inithead(&gdiskstat_list);
DIR *dir = opendir("/sys/block/");
if (!dir)
if (!dir) {
pipe_mutex_unlock(gdiskstat_mutex);
return 0;
}
while ((dp = readdir(dir)) != NULL) {
@@ -290,8 +284,11 @@ hud_get_num_disks(bool displayhelp)
/* Add any partitions */
struct dirent *dpart;
DIR *pdir = opendir(basename);
if (!pdir)
if (!pdir) {
pipe_mutex_unlock(gdiskstat_mutex);
closedir(dir);
return 0;
}
while ((dpart = readdir(pdir)) != NULL) {
/* Avoid 'lo' and '..' and '.' */
@@ -311,6 +308,7 @@ hud_get_num_disks(bool displayhelp)
add_object_part(basename, dpart->d_name, DISKSTAT_WR);
}
}
closedir(dir);
if (displayhelp) {
list_for_each_entry(struct diskstat_info, dsi, &gdiskstat_list, list) {
@@ -322,6 +320,7 @@ hud_get_num_disks(bool displayhelp)
puts(line);
}
}
pipe_mutex_unlock(gdiskstat_mutex);
return gdiskstat_count;
}


@@ -35,6 +35,7 @@
#include "hud/hud_private.h"
#include "util/list.h"
#include "os/os_time.h"
#include "os/os_thread.h"
#include "util/u_memory.h"
#include <stdio.h>
#include <unistd.h>
@@ -66,6 +67,7 @@ struct nic_info
*/
static int gnic_count = 0;
static struct list_head gnic_list;
pipe_static_mutex(gnic_mutex);
static struct nic_info *
find_nic_by_name(const char *n, int mode)
@@ -234,14 +236,6 @@ query_nic_load(struct hud_graph *gr)
}
}
static void
free_query_data(void *p)
{
struct nic_info *nic = (struct nic_info *) p;
list_del(&nic->list);
FREE(nic);
}
/**
* Create and initialize a new object for a specific network interface dev.
* \param pane parent context.
@@ -284,11 +278,6 @@ hud_nic_graph_install(struct hud_pane *pane, const char *nic_name,
gr->query_data = nic;
gr->query_new_value = query_nic_load;
/* Don't use free() as our callback as that messes up Gallium's
* memory debugger. Use simple free_query_data() wrapper.
*/
gr->free_query_data = free_query_data;
hud_pane_add_graph(pane, gr);
hud_pane_set_max_value(pane, 100);
}
@@ -342,16 +331,21 @@ hud_get_num_nics(bool displayhelp)
char name[64];
/* Return the number of network interfaces. */
if (gnic_count)
pipe_mutex_lock(gnic_mutex);
if (gnic_count) {
pipe_mutex_unlock(gnic_mutex);
return gnic_count;
}
/* Scan /sys/class/net, for every object type we support, create and
* persist an object to represent its different statistics.
*/
list_inithead(&gnic_list);
DIR *dir = opendir("/sys/class/net/");
if (!dir)
if (!dir) {
pipe_mutex_unlock(gnic_mutex);
return 0;
}
while ((dp = readdir(dir)) != NULL) {
@@ -412,6 +406,7 @@ hud_get_num_nics(bool displayhelp)
}
}
closedir(dir);
list_for_each_entry(struct nic_info, nic, &gnic_list, list) {
char line[64];
@@ -424,6 +419,7 @@ hud_get_num_nics(bool displayhelp)
}
pipe_mutex_unlock(gnic_mutex);
return gnic_count;
}


@@ -32,6 +32,7 @@
#include "hud/hud_private.h"
#include "util/list.h"
#include "os/os_time.h"
#include "os/os_thread.h"
#include "util/u_memory.h"
#include <stdio.h>
#include <unistd.h>
@@ -49,6 +50,7 @@
*/
static int gsensors_temp_count = 0;
static struct list_head gsensors_temp_list;
pipe_static_mutex(gsensor_temp_mutex);
struct sensors_temp_info
{
@@ -189,17 +191,6 @@ query_sti_load(struct hud_graph *gr)
}
}
static void
free_query_data(void *p)
{
struct sensors_temp_info *sti = (struct sensors_temp_info *) p;
list_del(&sti->list);
if (sti->chip)
sensors_free_chip_name(sti->chip);
FREE(sti);
sensors_cleanup();
}
/**
* Create and initialize a new object for a specific sensor interface dev.
* \param pane parent context.
@@ -237,11 +228,6 @@ hud_sensors_temp_graph_install(struct hud_pane *pane, const char *dev_name,
gr->query_data = sti;
gr->query_new_value = query_sti_load;
/* Don't use free() as our callback as that messes up Gallium's
* memory debugger. Use simple free_query_data() wrapper.
*/
gr->free_query_data = free_query_data;
hud_pane_add_graph(pane, gr);
switch (sti->mode) {
case SENSORS_TEMP_CURRENT:
@@ -338,12 +324,17 @@ int
hud_get_num_sensors(bool displayhelp)
{
/* Return the number of sensors detected. */
if (gsensors_temp_count)
pipe_mutex_lock(gsensor_temp_mutex);
if (gsensors_temp_count) {
pipe_mutex_unlock(gsensor_temp_mutex);
return gsensors_temp_count;
}
int ret = sensors_init(NULL);
if (ret)
if (ret) {
pipe_mutex_unlock(gsensor_temp_mutex);
return 0;
}
list_inithead(&gsensors_temp_list);
@@ -377,6 +368,7 @@ hud_get_num_sensors(bool displayhelp)
}
}
pipe_mutex_unlock(gsensor_temp_mutex);
return gsensors_temp_count;
}


@@ -487,7 +487,9 @@ si_decompress_sampler_color_textures(struct si_context *sctx,
assert(view);
tex = (struct r600_texture *)view->texture;
assert(tex->cmask.size || tex->fmask.size || tex->dcc_offset);
/* CMASK or DCC can be discarded and we can still end up here. */
if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
continue;
si_blit_decompress_color(&sctx->b.b, tex,
view->u.tex.first_level, view->u.tex.last_level,


@@ -459,6 +459,8 @@ static void emit_bfi(const struct lp_build_tgsi_action *action,
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef bfi_args[3];
LLVMValueRef bfi_sm5;
LLVMValueRef cond;
// Calculate the bitmask: ((1 << src3) - 1) << src2
bfi_args[0] = LLVMBuildShl(builder,
@@ -478,11 +480,40 @@ static void emit_bfi(const struct lp_build_tgsi_action *action,
* (arg0 & arg1) | (~arg0 & arg2) = arg2 ^ (arg0 & (arg1 ^ arg2))
* Use the right-hand side, which the LLVM backend can convert to V_BFI.
*/
emit_data->output[emit_data->chan] =
bfi_sm5 =
LLVMBuildXor(builder, bfi_args[2],
LLVMBuildAnd(builder, bfi_args[0],
LLVMBuildXor(builder, bfi_args[1], bfi_args[2],
""), ""), "");
/* Since shifts of >= 32 bits are undefined in LLVM IR, the backend
* uses the convenient V_BFI lowering for the above, which follows SM5
* and disagrees with GLSL semantics when bits (src3) is 32.
*/
cond = LLVMBuildICmp(builder, LLVMIntUGE, emit_data->args[3],
lp_build_const_int32(gallivm, 32), "");
emit_data->output[emit_data->chan] =
LLVMBuildSelect(builder, cond, emit_data->args[1], bfi_sm5, "");
}
static void emit_bfe(const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
struct lp_build_emit_data *emit_data)
{
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef bfe_sm5;
LLVMValueRef cond;
bfe_sm5 = lp_build_intrinsic(builder, action->intr_name,
emit_data->dst_type, emit_data->args,
emit_data->arg_count, LLVMReadNoneAttribute);
/* Correct for GLSL semantics. */
cond = LLVMBuildICmp(builder, LLVMIntUGE, emit_data->args[2],
lp_build_const_int32(gallivm, 32), "");
emit_data->output[emit_data->chan] =
LLVMBuildSelect(builder, cond, emit_data->args[0], bfe_sm5, "");
}
/* this is ffs in C */
@@ -783,7 +814,7 @@ void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base)
bld_base->op_actions[TGSI_OPCODE_FSLT].emit = emit_fcmp;
bld_base->op_actions[TGSI_OPCODE_FSNE].emit = emit_fcmp;
bld_base->op_actions[TGSI_OPCODE_IABS].emit = emit_iabs;
bld_base->op_actions[TGSI_OPCODE_IBFE].emit = build_tgsi_intrinsic_nomem;
bld_base->op_actions[TGSI_OPCODE_IBFE].emit = emit_bfe;
bld_base->op_actions[TGSI_OPCODE_IBFE].intr_name = "llvm.AMDGPU.bfe.i32";
bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv;
bld_base->op_actions[TGSI_OPCODE_IMAX].emit = emit_minmax_int;
@@ -835,7 +866,7 @@ void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base)
bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem;
bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.trunc.f32";
bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd;
bld_base->op_actions[TGSI_OPCODE_UBFE].emit = build_tgsi_intrinsic_nomem;
bld_base->op_actions[TGSI_OPCODE_UBFE].emit = emit_bfe;
bld_base->op_actions[TGSI_OPCODE_UBFE].intr_name = "llvm.AMDGPU.bfe.u32";
bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv;
bld_base->op_actions[TGSI_OPCODE_UMAX].emit = emit_minmax_int;

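GLSL defines bitfieldInsert/bitfieldExtract for bits == 32, while a shift by 32 or more is undefined in LLVM IR and the V_BFI lowering follows SM5, so the code above selects the pass-through operand whenever bits >= 32. Reference semantics of the insert case in plain C (a sketch mirroring the LLVMBuildSelect, not the driver code):

#include <assert.h>
#include <stdint.h>

static uint32_t bitfield_insert(uint32_t base, uint32_t insert,
                                uint32_t offset, uint32_t bits)
{
    if (bits >= 32)
        return insert;                      /* whole-word insert: GLSL case */
    uint32_t mask = ((1u << bits) - 1u) << offset;
    return (base & ~mask) | ((insert << offset) & mask);
}

int main(void)
{
    assert(bitfield_insert(0xffffffffu, 0x5u, 4, 4) == 0xffffff5fu);
    /* bits == 32: SM5-style hardware would disagree; GLSL wants this: */
    assert(bitfield_insert(0x12345678u, 0xdeadbeefu, 0, 32) == 0xdeadbeefu);
    return 0;
}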

@@ -1370,7 +1370,7 @@ emit_vert_end(struct vc4_compile *c,
struct vc4_varying_slot *fs_inputs,
uint32_t num_fs_inputs)
{
struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]);
struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
emit_stub_vpm_read(c);


@@ -82,7 +82,7 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
res_tmpl.depth0 = 1;
res_tmpl.array_size = 1;
res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET |
PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
PIPE_BIND_SHARED;
res_tmpl.usage = PIPE_USAGE_DEFAULT;
pipe_mutex_lock(dev->mutex);


@@ -17,16 +17,19 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = {
#define __ATTRIB(field) \
[DI_##field] = "glX"#field
__ATTRIB(BindSwapBarrierSGIX),
__ATTRIB(BindTexImageEXT),
// glXChooseFBConfig implemented by libglvnd
__ATTRIB(ChooseFBConfigSGIX),
// glXChooseVisual implemented by libglvnd
// glXCopyContext implemented by libglvnd
__ATTRIB(CopySubBufferMESA),
// glXCreateContext implemented by libglvnd
__ATTRIB(CreateContextAttribsARB),
__ATTRIB(CreateContextWithConfigSGIX),
__ATTRIB(CreateGLXPbufferSGIX),
// glXCreateGLXPixmap implemented by libglvnd
__ATTRIB(CreateGLXPixmapMESA),
__ATTRIB(CreateGLXPixmapWithConfigSGIX),
// glXCreateNewContext implemented by libglvnd
// glXCreatePbuffer implemented by libglvnd
@@ -51,54 +54,50 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = {
__ATTRIB(GetFBConfigAttribSGIX),
__ATTRIB(GetFBConfigFromVisualSGIX),
// glXGetFBConfigs implemented by libglvnd
__ATTRIB(GetMscRateOML),
// glXGetProcAddress implemented by libglvnd
// glXGetProcAddressARB implemented by libglvnd
__ATTRIB(GetScreenDriver),
// glXGetSelectedEvent implemented by libglvnd
__ATTRIB(GetSelectedEventSGIX),
__ATTRIB(GetSwapIntervalMESA),
__ATTRIB(GetSyncValuesOML),
__ATTRIB(GetVideoSyncSGI),
// glXGetVisualFromFBConfig implemented by libglvnd
__ATTRIB(GetVisualFromFBConfigSGIX),
// glXImportContextEXT implemented by libglvnd
// glXIsDirect implemented by libglvnd
__ATTRIB(JoinSwapGroupSGIX),
// glXMakeContextCurrent implemented by libglvnd
// glXMakeCurrent implemented by libglvnd
// glXQueryContext implemented by libglvnd
__ATTRIB(QueryContextInfoEXT),
__ATTRIB(QueryCurrentRendererIntegerMESA),
__ATTRIB(QueryCurrentRendererStringMESA),
// glXQueryDrawable implemented by libglvnd
// glXQueryExtension implemented by libglvnd
// glXQueryExtensionsString implemented by libglvnd
__ATTRIB(QueryGLXPbufferSGIX),
__ATTRIB(QueryMaxSwapBarriersSGIX),
__ATTRIB(QueryRendererIntegerMESA),
__ATTRIB(QueryRendererStringMESA),
// glXQueryServerString implemented by libglvnd
// glXQueryVersion implemented by libglvnd
__ATTRIB(ReleaseBuffersMESA),
__ATTRIB(ReleaseTexImageEXT),
// glXSelectEvent implemented by libglvnd
__ATTRIB(SelectEventSGIX),
// glXSwapBuffers implemented by libglvnd
__ATTRIB(SwapBuffersMscOML),
__ATTRIB(SwapIntervalMESA),
__ATTRIB(SwapIntervalSGI),
// glXUseXFont implemented by libglvnd
__ATTRIB(WaitForMscOML),
__ATTRIB(WaitForSbcOML),
// glXWaitGL implemented by libglvnd
__ATTRIB(WaitVideoSyncSGI),
// glXWaitX implemented by libglvnd
__ATTRIB(glXBindSwapBarrierSGIX),
__ATTRIB(glXCopySubBufferMESA),
__ATTRIB(glXCreateGLXPixmapMESA),
__ATTRIB(glXGetMscRateOML),
__ATTRIB(glXGetScreenDriver),
__ATTRIB(glXGetSwapIntervalMESA),
__ATTRIB(glXGetSyncValuesOML),
__ATTRIB(glXJoinSwapGroupSGIX),
__ATTRIB(glXQueryCurrentRendererIntegerMESA),
__ATTRIB(glXQueryCurrentRendererStringMESA),
__ATTRIB(glXQueryMaxSwapBarriersSGIX),
__ATTRIB(glXQueryRendererIntegerMESA),
__ATTRIB(glXQueryRendererStringMESA),
__ATTRIB(glXReleaseBuffersMESA),
__ATTRIB(glXSwapBuffersMscOML),
__ATTRIB(glXSwapIntervalMESA),
__ATTRIB(glXWaitForMscOML),
__ATTRIB(glXWaitForSbcOML),
#undef __ATTRIB
};
@@ -557,49 +556,49 @@ static int dispatch_WaitVideoSyncSGI(int divisor, int remainder,
static void dispatch_glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable,
static void dispatch_BindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable,
int barrier)
{
PFNGLXBINDSWAPBARRIERSGIXPROC pglXBindSwapBarrierSGIX;
PFNGLXBINDSWAPBARRIERSGIXPROC pBindSwapBarrierSGIX;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return;
__FETCH_FUNCTION_PTR(glXBindSwapBarrierSGIX);
if (pglXBindSwapBarrierSGIX == NULL)
__FETCH_FUNCTION_PTR(BindSwapBarrierSGIX);
if (pBindSwapBarrierSGIX == NULL)
return;
(*pglXBindSwapBarrierSGIX)(dpy, drawable, barrier);
(*pBindSwapBarrierSGIX)(dpy, drawable, barrier);
}
static void dispatch_glXCopySubBufferMESA(Display *dpy, GLXDrawable drawable,
static void dispatch_CopySubBufferMESA(Display *dpy, GLXDrawable drawable,
int x, int y, int width, int height)
{
PFNGLXCOPYSUBBUFFERMESAPROC pglXCopySubBufferMESA;
PFNGLXCOPYSUBBUFFERMESAPROC pCopySubBufferMESA;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return;
__FETCH_FUNCTION_PTR(glXCopySubBufferMESA);
if (pglXCopySubBufferMESA == NULL)
__FETCH_FUNCTION_PTR(CopySubBufferMESA);
if (pCopySubBufferMESA == NULL)
return;
(*pglXCopySubBufferMESA)(dpy, drawable, x, y, width, height);
(*pCopySubBufferMESA)(dpy, drawable, x, y, width, height);
}
static GLXPixmap dispatch_glXCreateGLXPixmapMESA(Display *dpy,
static GLXPixmap dispatch_CreateGLXPixmapMESA(Display *dpy,
XVisualInfo *visinfo,
Pixmap pixmap, Colormap cmap)
{
PFNGLXCREATEGLXPIXMAPMESAPROC pglXCreateGLXPixmapMESA;
PFNGLXCREATEGLXPIXMAPMESAPROC pCreateGLXPixmapMESA;
__GLXvendorInfo *dd;
GLXPixmap ret;
@@ -607,11 +606,11 @@ static GLXPixmap dispatch_glXCreateGLXPixmapMESA(Display *dpy,
if (dd == NULL)
return None;
__FETCH_FUNCTION_PTR(glXCreateGLXPixmapMESA);
if (pglXCreateGLXPixmapMESA == NULL)
__FETCH_FUNCTION_PTR(CreateGLXPixmapMESA);
if (pCreateGLXPixmapMESA == NULL)
return None;
ret = (*pglXCreateGLXPixmapMESA)(dpy, visinfo, pixmap, cmap);
ret = (*pCreateGLXPixmapMESA)(dpy, visinfo, pixmap, cmap);
if (AddDrawableMapping(dpy, ret, dd)) {
/* XXX: Call glXDestroyGLXPixmap which lives in libglvnd. If we're not
* allowed to call it from here, should we extend __glXDispatchTableIndices?
@@ -624,47 +623,47 @@ static GLXPixmap dispatch_glXCreateGLXPixmapMESA(Display *dpy,
static GLboolean dispatch_glXGetMscRateOML(Display *dpy, GLXDrawable drawable,
static GLboolean dispatch_GetMscRateOML(Display *dpy, GLXDrawable drawable,
int32_t *numerator, int32_t *denominator)
{
PFNGLXGETMSCRATEOMLPROC pglXGetMscRateOML;
PFNGLXGETMSCRATEOMLPROC pGetMscRateOML;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return GL_FALSE;
__FETCH_FUNCTION_PTR(glXGetMscRateOML);
if (pglXGetMscRateOML == NULL)
__FETCH_FUNCTION_PTR(GetMscRateOML);
if (pGetMscRateOML == NULL)
return GL_FALSE;
return (*pglXGetMscRateOML)(dpy, drawable, numerator, denominator);
return (*pGetMscRateOML)(dpy, drawable, numerator, denominator);
}
static const char *dispatch_glXGetScreenDriver(Display *dpy, int scrNum)
static const char *dispatch_GetScreenDriver(Display *dpy, int scrNum)
{
typedef const char *(*fn_glXGetScreenDriver_ptr)(Display *dpy, int scrNum);
fn_glXGetScreenDriver_ptr pglXGetScreenDriver;
fn_glXGetScreenDriver_ptr pGetScreenDriver;
__GLXvendorInfo *dd;
dd = __VND->getDynDispatch(dpy, scrNum);
if (dd == NULL)
return NULL;
__FETCH_FUNCTION_PTR(glXGetScreenDriver);
if (pglXGetScreenDriver == NULL)
__FETCH_FUNCTION_PTR(GetScreenDriver);
if (pGetScreenDriver == NULL)
return NULL;
return (*pglXGetScreenDriver)(dpy, scrNum);
return (*pGetScreenDriver)(dpy, scrNum);
}
static int dispatch_glXGetSwapIntervalMESA(void)
static int dispatch_GetSwapIntervalMESA(void)
{
PFNGLXGETSWAPINTERVALMESAPROC pglXGetSwapIntervalMESA;
PFNGLXGETSWAPINTERVALMESAPROC pGetSwapIntervalMESA;
__GLXvendorInfo *dd;
if (!__VND->getCurrentContext())
@@ -674,57 +673,57 @@ static int dispatch_glXGetSwapIntervalMESA(void)
if (dd == NULL)
return 0;
__FETCH_FUNCTION_PTR(glXGetSwapIntervalMESA);
if (pglXGetSwapIntervalMESA == NULL)
__FETCH_FUNCTION_PTR(GetSwapIntervalMESA);
if (pGetSwapIntervalMESA == NULL)
return 0;
return (*pglXGetSwapIntervalMESA)();
return (*pGetSwapIntervalMESA)();
}
static Bool dispatch_glXGetSyncValuesOML(Display *dpy, GLXDrawable drawable,
static Bool dispatch_GetSyncValuesOML(Display *dpy, GLXDrawable drawable,
int64_t *ust, int64_t *msc, int64_t *sbc)
{
PFNGLXGETSYNCVALUESOMLPROC pglXGetSyncValuesOML;
PFNGLXGETSYNCVALUESOMLPROC pGetSyncValuesOML;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXGetSyncValuesOML);
if (pglXGetSyncValuesOML == NULL)
__FETCH_FUNCTION_PTR(GetSyncValuesOML);
if (pGetSyncValuesOML == NULL)
return False;
return (*pglXGetSyncValuesOML)(dpy, drawable, ust, msc, sbc);
return (*pGetSyncValuesOML)(dpy, drawable, ust, msc, sbc);
}
static void dispatch_glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable,
static void dispatch_JoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable,
GLXDrawable member)
{
PFNGLXJOINSWAPGROUPSGIXPROC pglXJoinSwapGroupSGIX;
PFNGLXJOINSWAPGROUPSGIXPROC pJoinSwapGroupSGIX;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return;
__FETCH_FUNCTION_PTR(glXJoinSwapGroupSGIX);
if (pglXJoinSwapGroupSGIX == NULL)
__FETCH_FUNCTION_PTR(JoinSwapGroupSGIX);
if (pJoinSwapGroupSGIX == NULL)
return;
(*pglXJoinSwapGroupSGIX)(dpy, drawable, member);
(*pJoinSwapGroupSGIX)(dpy, drawable, member);
}
static Bool dispatch_glXQueryCurrentRendererIntegerMESA(int attribute,
static Bool dispatch_QueryCurrentRendererIntegerMESA(int attribute,
unsigned int *value)
{
PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC pglXQueryCurrentRendererIntegerMESA;
PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC pQueryCurrentRendererIntegerMESA;
__GLXvendorInfo *dd;
if (!__VND->getCurrentContext())
@@ -734,18 +733,18 @@ static Bool dispatch_glXQueryCurrentRendererIntegerMESA(int attribute,
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXQueryCurrentRendererIntegerMESA);
if (pglXQueryCurrentRendererIntegerMESA == NULL)
__FETCH_FUNCTION_PTR(QueryCurrentRendererIntegerMESA);
if (pQueryCurrentRendererIntegerMESA == NULL)
return False;
return (*pglXQueryCurrentRendererIntegerMESA)(attribute, value);
return (*pQueryCurrentRendererIntegerMESA)(attribute, value);
}
static const char *dispatch_glXQueryCurrentRendererStringMESA(int attribute)
static const char *dispatch_QueryCurrentRendererStringMESA(int attribute)
{
PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC pglXQueryCurrentRendererStringMESA;
PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC pQueryCurrentRendererStringMESA;
__GLXvendorInfo *dd;
if (!__VND->getCurrentContext())
@@ -755,114 +754,114 @@ static const char *dispatch_glXQueryCurrentRendererStringMESA(int attribute)
if (dd == NULL)
return NULL;
__FETCH_FUNCTION_PTR(glXQueryCurrentRendererStringMESA);
if (pglXQueryCurrentRendererStringMESA == NULL)
__FETCH_FUNCTION_PTR(QueryCurrentRendererStringMESA);
if (pQueryCurrentRendererStringMESA == NULL)
return NULL;
return (*pglXQueryCurrentRendererStringMESA)(attribute);
return (*pQueryCurrentRendererStringMESA)(attribute);
}
static Bool dispatch_glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen,
static Bool dispatch_QueryMaxSwapBarriersSGIX(Display *dpy, int screen,
int *max)
{
PFNGLXQUERYMAXSWAPBARRIERSSGIXPROC pglXQueryMaxSwapBarriersSGIX;
PFNGLXQUERYMAXSWAPBARRIERSSGIXPROC pQueryMaxSwapBarriersSGIX;
__GLXvendorInfo *dd;
dd = __VND->getDynDispatch(dpy, screen);
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXQueryMaxSwapBarriersSGIX);
if (pglXQueryMaxSwapBarriersSGIX == NULL)
__FETCH_FUNCTION_PTR(QueryMaxSwapBarriersSGIX);
if (pQueryMaxSwapBarriersSGIX == NULL)
return False;
return (*pglXQueryMaxSwapBarriersSGIX)(dpy, screen, max);
return (*pQueryMaxSwapBarriersSGIX)(dpy, screen, max);
}
static Bool dispatch_glXQueryRendererIntegerMESA(Display *dpy, int screen,
static Bool dispatch_QueryRendererIntegerMESA(Display *dpy, int screen,
int renderer, int attribute,
unsigned int *value)
{
PFNGLXQUERYRENDERERINTEGERMESAPROC pglXQueryRendererIntegerMESA;
PFNGLXQUERYRENDERERINTEGERMESAPROC pQueryRendererIntegerMESA;
__GLXvendorInfo *dd;
dd = __VND->getDynDispatch(dpy, screen);
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXQueryRendererIntegerMESA);
if (pglXQueryRendererIntegerMESA == NULL)
__FETCH_FUNCTION_PTR(QueryRendererIntegerMESA);
if (pQueryRendererIntegerMESA == NULL)
return False;
return (*pglXQueryRendererIntegerMESA)(dpy, screen, renderer, attribute, value);
return (*pQueryRendererIntegerMESA)(dpy, screen, renderer, attribute, value);
}
static const char *dispatch_glXQueryRendererStringMESA(Display *dpy, int screen,
static const char *dispatch_QueryRendererStringMESA(Display *dpy, int screen,
int renderer, int attribute)
{
PFNGLXQUERYRENDERERSTRINGMESAPROC pglXQueryRendererStringMESA;
PFNGLXQUERYRENDERERSTRINGMESAPROC pQueryRendererStringMESA;
__GLXvendorInfo *dd = NULL;
dd = __VND->getDynDispatch(dpy, screen);
if (dd == NULL)
return NULL;
__FETCH_FUNCTION_PTR(glXQueryRendererStringMESA);
if (pglXQueryRendererStringMESA == NULL)
__FETCH_FUNCTION_PTR(QueryRendererStringMESA);
if (pQueryRendererStringMESA == NULL)
return NULL;
return (*pglXQueryRendererStringMESA)(dpy, screen, renderer, attribute);
return (*pQueryRendererStringMESA)(dpy, screen, renderer, attribute);
}
static Bool dispatch_glXReleaseBuffersMESA(Display *dpy, GLXDrawable d)
static Bool dispatch_ReleaseBuffersMESA(Display *dpy, GLXDrawable d)
{
PFNGLXRELEASEBUFFERSMESAPROC pglXReleaseBuffersMESA;
PFNGLXRELEASEBUFFERSMESAPROC pReleaseBuffersMESA;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, d);
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXReleaseBuffersMESA);
if (pglXReleaseBuffersMESA == NULL)
__FETCH_FUNCTION_PTR(ReleaseBuffersMESA);
if (pReleaseBuffersMESA == NULL)
return False;
return (*pglXReleaseBuffersMESA)(dpy, d);
return (*pReleaseBuffersMESA)(dpy, d);
}
static int64_t dispatch_glXSwapBuffersMscOML(Display *dpy, GLXDrawable drawable,
static int64_t dispatch_SwapBuffersMscOML(Display *dpy, GLXDrawable drawable,
int64_t target_msc, int64_t divisor,
int64_t remainder)
{
PFNGLXSWAPBUFFERSMSCOMLPROC pglXSwapBuffersMscOML;
PFNGLXSWAPBUFFERSMSCOMLPROC pSwapBuffersMscOML;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return 0;
__FETCH_FUNCTION_PTR(glXSwapBuffersMscOML);
if (pglXSwapBuffersMscOML == NULL)
__FETCH_FUNCTION_PTR(SwapBuffersMscOML);
if (pSwapBuffersMscOML == NULL)
return 0;
return (*pglXSwapBuffersMscOML)(dpy, drawable, target_msc, divisor, remainder);
return (*pSwapBuffersMscOML)(dpy, drawable, target_msc, divisor, remainder);
}
static int dispatch_glXSwapIntervalMESA(unsigned int interval)
static int dispatch_SwapIntervalMESA(unsigned int interval)
{
PFNGLXSWAPINTERVALMESAPROC pglXSwapIntervalMESA;
PFNGLXSWAPINTERVALMESAPROC pSwapIntervalMESA;
__GLXvendorInfo *dd;
if (!__VND->getCurrentContext())
@@ -872,52 +871,52 @@ static int dispatch_glXSwapIntervalMESA(unsigned int interval)
if (dd == NULL)
return 0;
__FETCH_FUNCTION_PTR(glXSwapIntervalMESA);
if (pglXSwapIntervalMESA == NULL)
__FETCH_FUNCTION_PTR(SwapIntervalMESA);
if (pSwapIntervalMESA == NULL)
return 0;
return (*pglXSwapIntervalMESA)(interval);
return (*pSwapIntervalMESA)(interval);
}
static Bool dispatch_glXWaitForMscOML(Display *dpy, GLXDrawable drawable,
static Bool dispatch_WaitForMscOML(Display *dpy, GLXDrawable drawable,
int64_t target_msc, int64_t divisor,
int64_t remainder, int64_t *ust,
int64_t *msc, int64_t *sbc)
{
PFNGLXWAITFORMSCOMLPROC pglXWaitForMscOML;
PFNGLXWAITFORMSCOMLPROC pWaitForMscOML;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXWaitForMscOML);
if (pglXWaitForMscOML == NULL)
__FETCH_FUNCTION_PTR(WaitForMscOML);
if (pWaitForMscOML == NULL)
return False;
return (*pglXWaitForMscOML)(dpy, drawable, target_msc, divisor, remainder, ust, msc, sbc);
return (*pWaitForMscOML)(dpy, drawable, target_msc, divisor, remainder, ust, msc, sbc);
}
static Bool dispatch_glXWaitForSbcOML(Display *dpy, GLXDrawable drawable,
static Bool dispatch_WaitForSbcOML(Display *dpy, GLXDrawable drawable,
int64_t target_sbc, int64_t *ust,
int64_t *msc, int64_t *sbc)
{
PFNGLXWAITFORSBCOMLPROC pglXWaitForSbcOML;
PFNGLXWAITFORSBCOMLPROC pWaitForSbcOML;
__GLXvendorInfo *dd;
dd = GetDispatchFromDrawable(dpy, drawable);
if (dd == NULL)
return False;
__FETCH_FUNCTION_PTR(glXWaitForSbcOML);
if (pglXWaitForSbcOML == NULL)
__FETCH_FUNCTION_PTR(WaitForSbcOML);
if (pWaitForSbcOML == NULL)
return False;
return (*pglXWaitForSbcOML)(dpy, drawable, target_sbc, ust, msc, sbc);
return (*pWaitForSbcOML)(dpy, drawable, target_sbc, ust, msc, sbc);
}
#undef __FETCH_FUNCTION_PTR
@@ -928,45 +927,44 @@ const void * const __glXDispatchFunctions[DI_LAST_INDEX + 1] = {
#define __ATTRIB(field) \
[DI_##field] = (void *)dispatch_##field
__ATTRIB(BindTexImageEXT),
__ATTRIB(BindSwapBarrierSGIX),
__ATTRIB(BindTexImageEXT),
__ATTRIB(ChooseFBConfigSGIX),
__ATTRIB(CopySubBufferMESA),
__ATTRIB(CreateContextAttribsARB),
__ATTRIB(CreateContextWithConfigSGIX),
__ATTRIB(CreateGLXPbufferSGIX),
__ATTRIB(CreateGLXPixmapMESA),
__ATTRIB(CreateGLXPixmapWithConfigSGIX),
__ATTRIB(DestroyGLXPbufferSGIX),
__ATTRIB(GetContextIDEXT),
__ATTRIB(GetCurrentDisplayEXT),
__ATTRIB(GetFBConfigAttribSGIX),
__ATTRIB(GetFBConfigFromVisualSGIX),
__ATTRIB(GetMscRateOML),
__ATTRIB(GetScreenDriver),
__ATTRIB(GetSelectedEventSGIX),
__ATTRIB(GetSwapIntervalMESA),
__ATTRIB(GetSyncValuesOML),
__ATTRIB(GetVideoSyncSGI),
__ATTRIB(GetVisualFromFBConfigSGIX),
__ATTRIB(JoinSwapGroupSGIX),
__ATTRIB(QueryContextInfoEXT),
__ATTRIB(QueryCurrentRendererIntegerMESA),
__ATTRIB(QueryCurrentRendererStringMESA),
__ATTRIB(QueryGLXPbufferSGIX),
__ATTRIB(QueryMaxSwapBarriersSGIX),
__ATTRIB(QueryRendererIntegerMESA),
__ATTRIB(QueryRendererStringMESA),
__ATTRIB(ReleaseBuffersMESA),
__ATTRIB(ReleaseTexImageEXT),
__ATTRIB(SelectEventSGIX),
__ATTRIB(SwapBuffersMscOML),
__ATTRIB(SwapIntervalMESA),
__ATTRIB(SwapIntervalSGI),
__ATTRIB(WaitForMscOML),
__ATTRIB(WaitForSbcOML),
__ATTRIB(WaitVideoSyncSGI),
__ATTRIB(glXBindSwapBarrierSGIX),
__ATTRIB(glXCopySubBufferMESA),
__ATTRIB(glXCreateGLXPixmapMESA),
__ATTRIB(glXGetMscRateOML),
__ATTRIB(glXGetScreenDriver),
__ATTRIB(glXGetSwapIntervalMESA),
__ATTRIB(glXGetSyncValuesOML),
__ATTRIB(glXJoinSwapGroupSGIX),
__ATTRIB(glXQueryCurrentRendererIntegerMESA),
__ATTRIB(glXQueryCurrentRendererStringMESA),
__ATTRIB(glXQueryMaxSwapBarriersSGIX),
__ATTRIB(glXQueryRendererIntegerMESA),
__ATTRIB(glXQueryRendererStringMESA),
__ATTRIB(glXReleaseBuffersMESA),
__ATTRIB(glXSwapBuffersMscOML),
__ATTRIB(glXSwapIntervalMESA),
__ATTRIB(glXWaitForMscOML),
__ATTRIB(glXWaitForSbcOML),
[DI_LAST_INDEX] = NULL,
#undef __ATTRIB

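With the glX prefix stripped from the DI_* tokens, a single field token can now generate both the "glX"-prefixed name string and the dispatch_* function slot via pasting and stringization. A minimal standalone model of the __ATTRIB trick (one made-up entry, not the real table):

#include <stdio.h>

enum { DI_GetScreenDriver, DI_LAST_INDEX };

static const char *dispatch_GetScreenDriver(void) { return "driver"; }

/* "glX" #field  -> the string "glXGetScreenDriver"
 * dispatch_##field -> the function dispatch_GetScreenDriver */
static const char *const names[DI_LAST_INDEX] = {
#define __ATTRIB(field) [DI_##field] = "glX" #field
    __ATTRIB(GetScreenDriver),
#undef __ATTRIB
};

static const void *const funcs[DI_LAST_INDEX + 1] = {
#define __ATTRIB(field) [DI_##field] = (void *)dispatch_##field
    __ATTRIB(GetScreenDriver),
#undef __ATTRIB
    [DI_LAST_INDEX] = NULL,
};

int main(void)
{
    printf("%s -> %p\n", names[DI_GetScreenDriver],
           (void *)funcs[DI_GetScreenDriver]);
    return 0;
}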

@@ -6,16 +6,19 @@
#define __glxlibglvnd_dispatchindex_h__
typedef enum __GLXdispatchIndex {
DI_BindSwapBarrierSGIX,
DI_BindTexImageEXT,
// ChooseFBConfig implemented by libglvnd
DI_ChooseFBConfigSGIX,
// ChooseVisual implemented by libglvnd
// CopyContext implemented by libglvnd
DI_CopySubBufferMESA,
// CreateContext implemented by libglvnd
DI_CreateContextAttribsARB,
DI_CreateContextWithConfigSGIX,
DI_CreateGLXPbufferSGIX,
// CreateGLXPixmap implemented by libglvnd
DI_CreateGLXPixmapMESA,
DI_CreateGLXPixmapWithConfigSGIX,
// CreateNewContext implemented by libglvnd
// CreatePbuffer implemented by libglvnd
@@ -40,6 +43,7 @@ typedef enum __GLXdispatchIndex {
DI_GetFBConfigAttribSGIX,
DI_GetFBConfigFromVisualSGIX,
// GetFBConfigs implemented by libglvnd
DI_GetMscRateOML,
// GetProcAddress implemented by libglvnd
// GetProcAddressARB implemented by libglvnd
// GetSelectedEvent implemented by libglvnd
@@ -47,45 +51,41 @@ typedef enum __GLXdispatchIndex {
DI_GetVideoSyncSGI,
// GetVisualFromFBConfig implemented by libglvnd
DI_GetVisualFromFBConfigSGIX,
DI_GetScreenDriver,
DI_GetSwapIntervalMESA,
DI_GetSyncValuesOML,
// ImportContextEXT implemented by libglvnd
// IsDirect implemented by libglvnd
DI_JoinSwapGroupSGIX,
// MakeContextCurrent implemented by libglvnd
// MakeCurrent implemented by libglvnd
// QueryContext implemented by libglvnd
DI_QueryContextInfoEXT,
DI_QueryCurrentRendererIntegerMESA,
DI_QueryCurrentRendererStringMESA,
// QueryDrawable implemented by libglvnd
// QueryExtension implemented by libglvnd
// QueryExtensionsString implemented by libglvnd
DI_QueryGLXPbufferSGIX,
DI_QueryMaxSwapBarriersSGIX,
DI_QueryRendererIntegerMESA,
DI_QueryRendererStringMESA,
// QueryServerString implemented by libglvnd
// QueryVersion implemented by libglvnd
DI_ReleaseBuffersMESA,
DI_ReleaseTexImageEXT,
// SelectEvent implemented by libglvnd
DI_SelectEventSGIX,
// SwapBuffers implemented by libglvnd
DI_SwapBuffersMscOML,
DI_SwapIntervalMESA,
DI_SwapIntervalSGI,
// UseXFont implemented by libglvnd
// WaitGL implemented by libglvnd
DI_WaitForMscOML,
DI_WaitForSbcOML,
DI_WaitVideoSyncSGI,
// WaitX implemented by libglvnd
DI_glXBindSwapBarrierSGIX,
DI_glXCopySubBufferMESA,
DI_glXCreateGLXPixmapMESA,
DI_glXGetMscRateOML,
DI_glXGetScreenDriver,
DI_glXGetSwapIntervalMESA,
DI_glXGetSyncValuesOML,
DI_glXJoinSwapGroupSGIX,
DI_glXQueryCurrentRendererIntegerMESA,
DI_glXQueryCurrentRendererStringMESA,
DI_glXQueryMaxSwapBarriersSGIX,
DI_glXQueryRendererIntegerMESA,
DI_glXQueryRendererStringMESA,
DI_glXReleaseBuffersMESA,
DI_glXSwapBuffersMscOML,
DI_glXSwapIntervalMESA,
DI_glXWaitForMscOML,
DI_glXWaitForSbcOML,
DI_LAST_INDEX
} __GLXdispatchIndex;


@@ -2713,7 +2713,7 @@ __glXGetUST(int64_t * ust)
#if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL)
int
PUBLIC int
MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context,
struct mesa_glinterop_device_info *out)
{
@@ -2737,7 +2737,7 @@ MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context,
return ret;
}
int
PUBLIC int
MesaGLInteropGLXExportObject(Display *dpy, GLXContext context,
struct mesa_glinterop_export_in *in,
struct mesa_glinterop_export_out *out)


@@ -50,6 +50,9 @@ static void __glXGLVNDSetDispatchIndex(const GLubyte *procName, int index)
{
unsigned internalIndex = FindGLXFunction(procName);
if (internalIndex == DI_FUNCTION_COUNT)
return; /* unknown or static dispatch */
__glXDispatchTableIndices[internalIndex] = index;
}


@@ -16,7 +16,8 @@ libwindowsglx_la_SOURCES = \
windowsgl.h \
windowsgl_internal.h \
windows_drawable.c \
wgl.c
wgl.c \
wgl.h
libwindowsglx_la_CFLAGS = \
-I$(top_srcdir)/include \


@@ -169,7 +169,7 @@ blorp_compile_fs(struct blorp_context *blorp, void *mem_ctx,
struct nir_shader *nir,
const struct brw_wm_prog_key *wm_key,
bool use_repclear,
struct brw_blorp_prog_data *prog_data,
struct brw_wm_prog_data *wm_prog_data,
unsigned *program_size)
{
const struct brw_compiler *compiler = blorp->compiler;
@@ -177,15 +177,14 @@ blorp_compile_fs(struct blorp_context *blorp, void *mem_ctx,
nir->options =
compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions;
struct brw_wm_prog_data wm_prog_data;
memset(&wm_prog_data, 0, sizeof(wm_prog_data));
memset(wm_prog_data, 0, sizeof(*wm_prog_data));
wm_prog_data.base.nr_params = 0;
wm_prog_data.base.param = NULL;
wm_prog_data->base.nr_params = 0;
wm_prog_data->base.param = NULL;
/* BLORP always just uses the first two binding table entries */
wm_prog_data.binding_table.render_target_start = BLORP_RENDERBUFFER_BT_INDEX;
wm_prog_data.base.binding_table.texture_start = BLORP_TEXTURE_BT_INDEX;
wm_prog_data->binding_table.render_target_start = BLORP_RENDERBUFFER_BT_INDEX;
wm_prog_data->base.binding_table.texture_start = BLORP_TEXTURE_BT_INDEX;
nir = brw_preprocess_nir(compiler, nir);
nir_remove_dead_variables(nir, nir_var_shader_in);
@@ -206,22 +205,9 @@ blorp_compile_fs(struct blorp_context *blorp, void *mem_ctx,
const unsigned *program =
brw_compile_fs(compiler, blorp->driver_ctx, mem_ctx,
wm_key, &wm_prog_data, nir,
wm_key, wm_prog_data, nir,
NULL, -1, -1, false, use_repclear, program_size, NULL);
/* Copy the relevant bits of wm_prog_data over into the blorp prog data */
prog_data->dispatch_8 = wm_prog_data.dispatch_8;
prog_data->dispatch_16 = wm_prog_data.dispatch_16;
prog_data->first_curbe_grf_0 = wm_prog_data.base.dispatch_grf_start_reg;
prog_data->first_curbe_grf_2 = wm_prog_data.dispatch_grf_start_reg_2;
prog_data->ksp_offset_2 = wm_prog_data.prog_offset_2;
prog_data->persample_msaa_dispatch = wm_prog_data.persample_dispatch;
prog_data->flat_inputs = wm_prog_data.flat_inputs;
prog_data->num_varying_inputs = wm_prog_data.num_varying_inputs;
prog_data->inputs_read = nir->info.inputs_read;
assert(wm_prog_data.base.nr_params == 0);
return program;
}


@@ -30,7 +30,7 @@
#include "isl/isl.h"
struct brw_context;
struct brw_wm_prog_key;
struct brw_stage_prog_data;
#ifdef __cplusplus
extern "C" {
@@ -58,7 +58,8 @@ struct blorp_context {
void (*upload_shader)(struct blorp_context *blorp,
const void *key, uint32_t key_size,
const void *kernel, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
uint32_t *kernel_out, void *prog_data_out);
void (*exec)(struct blorp_batch *batch, const struct blorp_params *params);
};


@@ -1237,7 +1237,7 @@ brw_blorp_get_blit_kernel(struct blorp_context *blorp,
const unsigned *program;
unsigned program_size;
struct brw_blorp_prog_data prog_data;
struct brw_wm_prog_data prog_data;
/* Try and compile with NIR first. If that fails, fall back to the old
* method of building shaders manually.
@@ -1255,7 +1255,7 @@ brw_blorp_get_blit_kernel(struct blorp_context *blorp,
blorp->upload_shader(blorp, prog_key, sizeof(*prog_key),
program, program_size,
&prog_data, sizeof(prog_data),
&prog_data.base, sizeof(prog_data),
&params->wm_prog_kernel, &params->wm_prog_data);
ralloc_free(mem_ctx);


@@ -74,7 +74,7 @@ blorp_params_get_clear_kernel(struct blorp_context *blorp,
struct brw_wm_prog_key wm_key;
brw_blorp_init_wm_prog_key(&wm_key);
struct brw_blorp_prog_data prog_data;
struct brw_wm_prog_data prog_data;
unsigned program_size;
const unsigned *program =
blorp_compile_fs(blorp, mem_ctx, b.shader, &wm_key, use_replicated_data,
@@ -82,7 +82,7 @@ blorp_params_get_clear_kernel(struct blorp_context *blorp,
blorp->upload_shader(blorp, &blorp_key, sizeof(blorp_key),
program, program_size,
&prog_data, sizeof(prog_data),
&prog_data.base, sizeof(prog_data),
&params->wm_prog_kernel, &params->wm_prog_data);
ralloc_free(mem_ctx);

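Passing &prog_data.base to upload_shader works because the stage-common data is embedded as the first member of the WM prog data, so a pointer to the base subobject aliases the start of the whole struct while sizeof(prog_data) still covers all of it. A sketch of that embedding pattern with hypothetical struct names:

#include <assert.h>
#include <stddef.h>

struct stage_prog_data { unsigned nr_params; };

struct wm_prog_data {
    struct stage_prog_data base;   /* must be the first member */
    unsigned dispatch_grf_start_reg;
};

static void upload(const struct stage_prog_data *pd, size_t size)
{
    (void)pd; (void)size; /* would copy size bytes starting at pd */
}

int main(void)
{
    struct wm_prog_data wm = { .base.nr_params = 0 };
    /* Base subobject starts at the same address as the full struct. */
    assert((void *)&wm == (void *)&wm.base);
    upload(&wm.base, sizeof(wm));  /* mirrors upload_shader(..., &prog_data.base, sizeof(prog_data), ...) */
    return 0;
}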

@@ -207,7 +207,8 @@ blorp_emit_input_varying_data(struct blorp_batch *batch,
for (unsigned i = 0; i < max_num_varyings; i++) {
const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
if (!(params->wm_prog_data->inputs_read & (1ull << attr)))
const int input_index = params->wm_prog_data->urb_setup[attr];
if (input_index < 0)
continue;
memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
@@ -401,7 +402,7 @@ static void
blorp_emit_sf_config(struct blorp_batch *batch,
const struct blorp_params *params)
{
const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
/* 3DSTATE_SF
*
@@ -502,7 +503,7 @@ static void
blorp_emit_ps_config(struct blorp_batch *batch,
const struct blorp_params *params)
{
const struct brw_blorp_prog_data *prog_data = params->wm_prog_data;
const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
/* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
* nonzero to prevent the GPU from hanging. While the documentation doesn't
@@ -527,16 +528,16 @@ blorp_emit_ps_config(struct blorp_batch *batch,
if (prog_data) {
ps.DispatchGRFStartRegisterForConstantSetupData0 =
prog_data->first_curbe_grf_0;
prog_data->base.dispatch_grf_start_reg;
ps.DispatchGRFStartRegisterForConstantSetupData2 =
prog_data->first_curbe_grf_2;
prog_data->dispatch_grf_start_reg_2;
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
ps.KernelStartPointer0 = params->wm_prog_kernel;
ps.KernelStartPointer2 =
params->wm_prog_kernel + prog_data->ksp_offset_2;
params->wm_prog_kernel + prog_data->prog_offset_2;
}
/* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
@@ -577,7 +578,7 @@ blorp_emit_ps_config(struct blorp_batch *batch,
if (prog_data) {
psx.PixelShaderValid = true;
psx.AttributeEnable = prog_data->num_varying_inputs > 0;
psx.PixelShaderIsPerSample = prog_data->persample_msaa_dispatch;
psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
}
if (params->src.enabled)
@@ -612,7 +613,7 @@ blorp_emit_ps_config(struct blorp_batch *batch,
if (params->dst.surf.samples > 1) {
wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
wm.MultisampleDispatchMode =
(prog_data && prog_data->persample_msaa_dispatch) ?
(prog_data && prog_data->persample_dispatch) ?
MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
} else {
wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -630,13 +631,13 @@ blorp_emit_ps_config(struct blorp_batch *batch,
if (prog_data) {
ps.DispatchGRFStartRegisterforConstantSetupData0 =
prog_data->first_curbe_grf_0;
prog_data->base.dispatch_grf_start_reg;
ps.DispatchGRFStartRegisterforConstantSetupData2 =
prog_data->first_curbe_grf_2;
prog_data->dispatch_grf_start_reg_2;
ps.KernelStartPointer0 = params->wm_prog_kernel;
ps.KernelStartPointer2 =
params->wm_prog_kernel + prog_data->ksp_offset_2;
params->wm_prog_kernel + prog_data->prog_offset_2;
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
@@ -692,13 +693,13 @@ blorp_emit_ps_config(struct blorp_batch *batch,
wm.ThreadDispatchEnable = true;
wm.DispatchGRFStartRegisterforConstantSetupData0 =
prog_data->first_curbe_grf_0;
prog_data->base.dispatch_grf_start_reg;
wm.DispatchGRFStartRegisterforConstantSetupData2 =
prog_data->first_curbe_grf_2;
prog_data->dispatch_grf_start_reg_2;
wm.KernelStartPointer0 = params->wm_prog_kernel;
wm.KernelStartPointer2 =
params->wm_prog_kernel + prog_data->ksp_offset_2;
params->wm_prog_kernel + prog_data->prog_offset_2;
wm._8PixelDispatchEnable = prog_data->dispatch_8;
wm._16PixelDispatchEnable = prog_data->dispatch_16;
@@ -714,7 +715,7 @@ blorp_emit_ps_config(struct blorp_batch *batch,
if (params->dst.surf.samples > 1) {
wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
wm.MultisampleDispatchMode =
(prog_data && prog_data->persample_msaa_dispatch) ?
(prog_data && prog_data->persample_dispatch) ?
MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
} else {
wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
@@ -1116,6 +1117,11 @@ blorp_emit_surface_states(struct blorp_batch *batch,
}
#if GEN_GEN >= 7
blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);
blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
bt.PointertoPSBindingTable = bind_offset;
}


@@ -138,33 +138,8 @@ struct brw_blorp_wm_inputs
uint32_t pad[1];
};
struct brw_blorp_prog_data
{
bool dispatch_8;
bool dispatch_16;
uint8_t first_curbe_grf_0;
uint8_t first_curbe_grf_2;
uint32_t ksp_offset_2;
/**
* True if the WM program should be run in MSDISPMODE_PERSAMPLE with more
* than one sample per pixel.
*/
bool persample_msaa_dispatch;
/**
* Mask of which FS inputs are marked flat by the shader source. This is
* needed for setting up 3DSTATE_SF/SBE.
*/
uint32_t flat_inputs;
unsigned num_varying_inputs;
uint64_t inputs_read;
};
static inline unsigned
brw_blorp_get_urb_length(const struct brw_blorp_prog_data *prog_data)
brw_blorp_get_urb_length(const struct brw_wm_prog_data *prog_data)
{
if (prog_data == NULL)
return 1;
@@ -197,7 +172,7 @@ struct blorp_params
unsigned num_draw_buffers;
unsigned num_layers;
uint32_t wm_prog_kernel;
struct brw_blorp_prog_data *wm_prog_data;
struct brw_wm_prog_data *wm_prog_data;
};
void blorp_params_init(struct blorp_params *params);
@@ -314,7 +289,7 @@ blorp_compile_fs(struct blorp_context *blorp, void *mem_ctx,
struct nir_shader *nir,
const struct brw_wm_prog_key *wm_key,
bool use_repclear,
struct brw_blorp_prog_data *prog_data,
struct brw_wm_prog_data *wm_prog_data,
unsigned *program_size);
/** \} */


@@ -335,7 +335,6 @@ static const struct gen_device_info gen_device_info_chv = {
.max_gs_threads = 336, \
.max_tcs_threads = 336, \
.max_tes_threads = 336, \
.max_wm_threads = 64 * 9, \
.max_cs_threads = 56, \
.urb = { \
.size = 384, \
@@ -388,7 +387,6 @@ static const struct gen_device_info gen_device_info_bxt = {
.max_tcs_threads = 112,
.max_tes_threads = 112,
.max_gs_threads = 112,
.max_wm_threads = 64 * 3,
.max_cs_threads = 6 * 6,
.urb = {
.size = 192,
@@ -411,7 +409,6 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
.max_tcs_threads = 56, /* XXX: guess */
.max_tes_threads = 56,
.max_gs_threads = 56,
.max_wm_threads = 64 * 2,
.max_cs_threads = 6 * 6,
.urb = {
.size = 128,
@@ -427,18 +424,11 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
* There's no KBL entry. Using the default SKL (GEN9) GS entries value.
*/
/*
* Both SKL and KBL support a maximum of 64 threads per
* Pixel Shader Dispatch (PSD) unit.
*/
#define KBL_MAX_THREADS_PER_PSD 64
static const struct gen_device_info gen_device_info_kbl_gt1 = {
GEN9_FEATURES,
.gt = 1,
.max_cs_threads = 7 * 6,
.max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
.urb.size = 192,
.num_slices = 1,
};
@@ -448,7 +438,6 @@ static const struct gen_device_info gen_device_info_kbl_gt1_5 = {
.gt = 1,
.max_cs_threads = 7 * 6,
.max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
.num_slices = 1,
};
@@ -456,7 +445,6 @@ static const struct gen_device_info gen_device_info_kbl_gt2 = {
GEN9_FEATURES,
.gt = 2,
.max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
.num_slices = 1,
};
@@ -464,7 +452,6 @@ static const struct gen_device_info gen_device_info_kbl_gt3 = {
GEN9_FEATURES,
.gt = 3,
.max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
.num_slices = 2,
};
@@ -472,7 +459,6 @@ static const struct gen_device_info gen_device_info_kbl_gt4 = {
GEN9_FEATURES,
.gt = 4,
.max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
/*
* From the "L3 Allocation and Programming" documentation:
*
@@ -500,6 +486,25 @@ gen_get_device_info(int devid, struct gen_device_info *devinfo)
return false;
}
/* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
*
* "Scratch Space per slice is computed based on 4 sub-slices. SW must
* allocate scratch space enough so that each slice has 4 slices allowed."
*
* The equivalent internal documentation says that this programming note
* applies to all Gen9+ platforms.
*
* The hardware typically calculates the scratch space pointer by taking
* the base address, and adding per-thread-scratch-space * thread ID.
* Extra padding can be necessary depending on how the thread IDs are
* calculated for a particular shader stage.
*/
if (devinfo->gen >= 9) {
devinfo->max_wm_threads = 64 /* threads-per-PSD */
* devinfo->num_slices
* 4; /* effective subslices per slice */
}
return true;
}
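To make the override concrete, here is a minimal sketch (plain C, slice counts taken from the table in the commit message; the names are illustrative only) of what max_wm_threads becomes on a few Gen9 parts:

#include <stdio.h>

/* Worked example of the Gen9+ rule above:
 * max_wm_threads = 64 threads-per-PSD * num_slices * 4 effective subslices.
 */
int main(void)
{
   const struct { const char *name; int num_slices; } parts[] = {
      { "Skylake GT2", 1 },  /* 64 * 1 * 4 = 256 */
      { "Skylake GT3", 2 },  /* 64 * 2 * 4 = 512 */
      { "Skylake GT4", 3 },  /* 64 * 3 * 4 = 768; the old 64 * 9 = 576 was too small */
   };
   for (int i = 0; i < 3; i++)
      printf("%s: max_wm_threads = %d\n", parts[i].name,
             64 * parts[i].num_slices * 4);
   return 0;
}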

View File

@@ -253,10 +253,7 @@ anv_block_pool_init(struct anv_block_pool *pool,
assert(util_is_power_of_two(block_size));
pool->device = device;
pool->bo.gem_handle = 0;
pool->bo.offset = 0;
pool->bo.size = 0;
pool->bo.is_winsys_bo = false;
anv_bo_init(&pool->bo, 0, 0);
pool->block_size = block_size;
pool->free_list = ANV_FREE_LIST_EMPTY;
pool->back_free_list = ANV_FREE_LIST_EMPTY;
@@ -463,10 +460,8 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
* values back into pool. */
pool->map = map + center_bo_offset;
pool->center_bo_offset = center_bo_offset;
pool->bo.gem_handle = gem_handle;
pool->bo.size = size;
anv_bo_init(&pool->bo, gem_handle, size);
pool->bo.map = map;
pool->bo.index = 0;
done:
pthread_mutex_unlock(&pool->device->mutex);
@@ -892,9 +887,9 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
{
for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
for (unsigned i = 0; i < 16; i++) {
struct anv_bo *bo = &pool->bos[i][s];
if (bo->size > 0)
anv_gem_close(device, bo->gem_handle);
struct anv_scratch_bo *bo = &pool->bos[i][s];
if (bo->exists > 0)
anv_gem_close(device, bo->bo.gem_handle);
}
}
}
@@ -909,70 +904,59 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
assert(scratch_size_log2 < 16);
struct anv_bo *bo = &pool->bos[scratch_size_log2][stage];
struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage];
/* From now on, we go into a critical section. In order to remain
* thread-safe, we use the bo size as a lock. A value of 0 means we don't
* have a valid BO yet. A value of 1 means locked. A value greater than 1
* means we have a bo of the given size.
*/
/* We can use "exists" to shortcut and ignore the critical section */
if (bo->exists)
return &bo->bo;
pthread_mutex_lock(&device->mutex);
__sync_synchronize();
if (bo->exists)
return &bo->bo;
const struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
const struct gen_device_info *devinfo = &physical_device->info;
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating which
* subslice, EU within a subslice, and thread within an EU it is.
* There's a maximum of two slices and two subslices, so these can be
* stored with a single bit. Even though there are only 10 EUs per
* subslice, this is stored in 4 bits, so there's an effective maximum
* value of 16 EUs. Similarly, although there are only 7 threads per EU,
* this is stored in a 3 bit number, giving an effective maximum value
* of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
const unsigned subslices = MAX2(physical_device->subslice_total, 1);
const unsigned scratch_ids_per_subslice =
device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;
if (bo->size > 1)
return bo;
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices,
};
uint64_t size = __sync_val_compare_and_swap(&bo->size, 0, 1);
if (size == 0) {
/* We own the lock. Allocate a buffer */
uint32_t size = per_thread_scratch * max_threads[stage];
const struct anv_physical_device *physical_device =
&device->instance->physicalDevice;
const struct gen_device_info *devinfo = &physical_device->info;
anv_bo_init_new(&bo->bo, device, size);
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating which
* subslice, EU within a subslice, and thread within an EU it is.
* There's a maximum of two slices and two subslices, so these can be
* stored with a single bit. Even though there are only 10 EUs per
* subslice, this is stored in 4 bits, so there's an effective maximum
* value of 16 EUs. Similarly, although there are only 7 threads per EU,
* this is stored in a 3 bit number, giving an effective maximum value
* of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
const unsigned subslices = MAX2(physical_device->subslice_total, 1);
const unsigned scratch_ids_per_subslice =
device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;
/* Set the exists last because it may be read by other threads */
__sync_synchronize();
bo->exists = true;
uint32_t max_threads[] = {
[MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
[MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
[MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
[MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
[MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
[MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices,
};
pthread_mutex_unlock(&device->mutex);
size = per_thread_scratch * max_threads[stage];
struct anv_bo new_bo;
anv_bo_init_new(&new_bo, device, size);
bo->gem_handle = new_bo.gem_handle;
/* Set the size last because we use it as a lock */
__sync_synchronize();
bo->size = size;
futex_wake((uint32_t *)&bo->size, INT_MAX);
} else {
/* Someone else got here first */
while (bo->size == 1)
futex_wait((uint32_t *)&bo->size, 1);
}
return bo;
return &bo->bo;
}
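The allocation path above is a double-checked publish: "exists" is read without the lock, re-checked under it, and set only after a full barrier so the BO's fields are visible before the flag. A minimal self-contained sketch of the same pattern, with hypothetical names and the same GCC __sync builtin (unlike the diff above, this sketch unlocks on every path):

#include <pthread.h>
#include <stdbool.h>

struct lazy_resource {
   bool exists;   /* published last, after 'payload' is fully written */
   int payload;   /* stands in for the real BO */
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

int *lazy_get(struct lazy_resource *res)
{
   if (res->exists)            /* fast path: no lock taken */
      return &res->payload;

   pthread_mutex_lock(&lock);
   if (!res->exists) {         /* re-check under the lock */
      res->payload = 42;       /* the expensive one-time allocation */
      __sync_synchronize();    /* order payload writes before the flag */
      res->exists = true;
   }
   pthread_mutex_unlock(&lock);
   return &res->payload;
}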

View File

@@ -32,6 +32,8 @@
#include "genxml/gen7_pack.h"
#include "genxml/gen8_pack.h"
#include "util/debug.h"
/** \file anv_batch_chain.c
*
* This file contains functions related to anv_cmd_buffer as a data
@@ -297,8 +299,6 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
bbo->length = other_bbo->length;
memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length);
bbo->last_ss_pool_bo_offset = other_bbo->last_ss_pool_bo_offset;
*bbo_out = bbo;
return VK_SUCCESS;
@@ -318,7 +318,6 @@ anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
batch->next = batch->start = bbo->bo.map;
batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
batch->relocs = &bbo->relocs;
bbo->last_ss_pool_bo_offset = 0;
bbo->relocs.num_relocs = 0;
}
@@ -620,13 +619,10 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
&cmd_buffer->pool->alloc);
if (result != VK_SUCCESS)
goto fail_bt_blocks;
cmd_buffer->last_ss_pool_center = 0;
anv_cmd_buffer_new_binding_table_block(cmd_buffer);
cmd_buffer->execbuf2.objects = NULL;
cmd_buffer->execbuf2.bos = NULL;
cmd_buffer->execbuf2.array_length = 0;
return VK_SUCCESS;
fail_bt_blocks:
@@ -658,9 +654,6 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
&cmd_buffer->batch_bos, link) {
anv_batch_bo_destroy(bbo, cmd_buffer);
}
vk_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.objects);
vk_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.bos);
}
void
@@ -688,6 +681,7 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer->bt_next = 0;
cmd_buffer->surface_relocs.num_relocs = 0;
cmd_buffer->last_ss_pool_center = 0;
/* Reset the list of seen buffers */
cmd_buffer->seen_bbos.head = 0;
@@ -857,56 +851,83 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
&secondary->surface_relocs, 0);
}
struct anv_execbuf {
struct drm_i915_gem_execbuffer2 execbuf;
struct drm_i915_gem_exec_object2 * objects;
uint32_t bo_count;
struct anv_bo ** bos;
/* Allocated length of the 'objects' and 'bos' arrays */
uint32_t array_length;
};
static void
anv_execbuf_init(struct anv_execbuf *exec)
{
memset(exec, 0, sizeof(*exec));
}
static void
anv_execbuf_finish(struct anv_execbuf *exec,
const VkAllocationCallbacks *alloc)
{
vk_free(alloc, exec->objects);
vk_free(alloc, exec->bos);
}
static VkResult
anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer,
struct anv_bo *bo,
struct anv_reloc_list *relocs)
anv_execbuf_add_bo(struct anv_execbuf *exec,
struct anv_bo *bo,
struct anv_reloc_list *relocs,
const VkAllocationCallbacks *alloc)
{
struct drm_i915_gem_exec_object2 *obj = NULL;
if (bo->index < cmd_buffer->execbuf2.bo_count &&
cmd_buffer->execbuf2.bos[bo->index] == bo)
obj = &cmd_buffer->execbuf2.objects[bo->index];
if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
obj = &exec->objects[bo->index];
if (obj == NULL) {
/* We've never seen this one before. Add it to the list and assign
* an id that we can use later.
*/
if (cmd_buffer->execbuf2.bo_count >= cmd_buffer->execbuf2.array_length) {
uint32_t new_len = cmd_buffer->execbuf2.objects ?
cmd_buffer->execbuf2.array_length * 2 : 64;
if (exec->bo_count >= exec->array_length) {
uint32_t new_len = exec->objects ? exec->array_length * 2 : 64;
struct drm_i915_gem_exec_object2 *new_objects =
vk_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_objects),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
vk_alloc(alloc, new_len * sizeof(*new_objects),
8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (new_objects == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
struct anv_bo **new_bos =
vk_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_bos),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
vk_alloc(alloc, new_len * sizeof(*new_bos),
8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (new_bos == NULL) {
vk_free(&cmd_buffer->pool->alloc, new_objects);
vk_free(alloc, new_objects);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
}
if (cmd_buffer->execbuf2.objects) {
memcpy(new_objects, cmd_buffer->execbuf2.objects,
cmd_buffer->execbuf2.bo_count * sizeof(*new_objects));
memcpy(new_bos, cmd_buffer->execbuf2.bos,
cmd_buffer->execbuf2.bo_count * sizeof(*new_bos));
if (exec->objects) {
memcpy(new_objects, exec->objects,
exec->bo_count * sizeof(*new_objects));
memcpy(new_bos, exec->bos,
exec->bo_count * sizeof(*new_bos));
}
cmd_buffer->execbuf2.objects = new_objects;
cmd_buffer->execbuf2.bos = new_bos;
cmd_buffer->execbuf2.array_length = new_len;
vk_free(alloc, exec->objects);
vk_free(alloc, exec->bos);
exec->objects = new_objects;
exec->bos = new_bos;
exec->array_length = new_len;
}
assert(cmd_buffer->execbuf2.bo_count < cmd_buffer->execbuf2.array_length);
assert(exec->bo_count < exec->array_length);
bo->index = cmd_buffer->execbuf2.bo_count++;
obj = &cmd_buffer->execbuf2.objects[bo->index];
cmd_buffer->execbuf2.bos[bo->index] = bo;
bo->index = exec->bo_count++;
obj = &exec->objects[bo->index];
exec->bos[bo->index] = bo;
obj->handle = bo->gem_handle;
obj->relocation_count = 0;
@@ -929,7 +950,7 @@ anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer,
for (size_t i = 0; i < relocs->num_relocs; i++) {
/* A quick sanity check on relocations */
assert(relocs->relocs[i].offset < bo->size);
anv_cmd_buffer_add_bo(cmd_buffer, relocs->reloc_bos[i], NULL);
anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, alloc);
}
}
@@ -940,82 +961,62 @@ static void
anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
struct anv_reloc_list *list)
{
struct anv_bo *bo;
/* If the kernel supports I915_EXEC_NO_RELOC, it will compare offset in
* struct drm_i915_gem_exec_object2 against the bo's current offset and, if
* none of the bos have moved, it will skip relocation processing altogether.
* If I915_EXEC_NO_RELOC is not supported, the kernel ignores the incoming
* value of offset so we can set it either way. For that to work we need
* to make sure all relocs use the same presumed offset.
*/
for (size_t i = 0; i < list->num_relocs; i++) {
bo = list->reloc_bos[i];
if (bo->offset != list->relocs[i].presumed_offset)
cmd_buffer->execbuf2.need_reloc = true;
list->relocs[i].target_handle = bo->index;
}
}
static uint64_t
read_reloc(const struct anv_device *device, const void *p)
{
if (device->info.gen >= 8)
return *(uint64_t *)p;
else
return *(uint32_t *)p;
for (size_t i = 0; i < list->num_relocs; i++)
list->relocs[i].target_handle = list->reloc_bos[i]->index;
}
static void
write_reloc(const struct anv_device *device, void *p, uint64_t v)
write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
{
if (device->info.gen >= 8)
*(uint64_t *)p = v;
else
unsigned reloc_size = 0;
if (device->info.gen >= 8) {
/* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress:
*
* "This field specifies the address of the memory location where the
* register value specified in the DWord above will read from. The
* address specifies the DWord location of the data. Range =
* GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress
* [63:48] are ignored by the HW and assumed to be in correct
* canonical form [63:48] == [47]."
*/
const int shift = 63 - 47;
reloc_size = sizeof(uint64_t);
*(uint64_t *)p = (((int64_t)v) << shift) >> shift;
} else {
reloc_size = sizeof(uint32_t);
*(uint32_t *)p = v;
}
if (flush && !device->info.has_llc)
anv_clflush_range(p, reloc_size);
}
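The shift trick in write_reloc replicates bit 47 into bits 63:48, producing the 48-bit canonical form the PRM quote describes. A small stand-alone check with illustrative addresses (it mirrors the driver's expression, so like the driver it relies on arithmetic right shift of signed values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const int shift = 63 - 47;            /* 16, as in write_reloc */
   uint64_t lo = 0x00007fffdeadbeefull;  /* bit 47 clear */
   uint64_t hi = 0x0000800000000000ull;  /* bit 47 set */

   printf("%016llx\n",
          (unsigned long long)((((int64_t)lo) << shift) >> shift));
   /* prints 00007fffdeadbeef: already canonical, unchanged */
   printf("%016llx\n",
          (unsigned long long)((((int64_t)hi) << shift) >> shift));
   /* prints ffff800000000000: bits [63:48] now copy bit 47 */
   return 0;
}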
static void
adjust_relocations_from_block_pool(struct anv_block_pool *pool,
struct anv_reloc_list *relocs)
adjust_relocations_from_state_pool(struct anv_block_pool *pool,
struct anv_reloc_list *relocs,
uint32_t last_pool_center_bo_offset)
{
assert(last_pool_center_bo_offset <= pool->center_bo_offset);
uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset;
for (size_t i = 0; i < relocs->num_relocs; i++) {
/* In general, we don't know how stale the relocated value is. It
* may have been used last time or it may not. Since we don't want
* to stomp it while the GPU may be accessing it, we haven't updated
* it anywhere else in the code. Instead, we just set the presumed
* offset to what it is now based on the delta and the data in the
* block pool. Then the kernel will update it for us if needed.
*/
assert(relocs->relocs[i].offset < pool->state.end);
const void *p = pool->map + relocs->relocs[i].offset;
/* We're reading back the relocated value from potentially incoherent
* memory here. However, any change to the value will be from the kernel
* writing out relocations, which will keep the CPU cache up to date.
*/
relocs->relocs[i].presumed_offset =
read_reloc(pool->device, p) - relocs->relocs[i].delta;
/* All of the relocations from this block pool to other BOs should
* have been emitted relative to the surface block pool center. We
* need to add the center offset to make them relative to the
* beginning of the actual GEM bo.
*/
relocs->relocs[i].offset += pool->center_bo_offset;
relocs->relocs[i].offset += delta;
}
}
static void
adjust_relocations_to_block_pool(struct anv_block_pool *pool,
adjust_relocations_to_state_pool(struct anv_block_pool *pool,
struct anv_bo *from_bo,
struct anv_reloc_list *relocs,
uint32_t *last_pool_center_bo_offset)
uint32_t last_pool_center_bo_offset)
{
assert(*last_pool_center_bo_offset <= pool->center_bo_offset);
uint32_t delta = pool->center_bo_offset - *last_pool_center_bo_offset;
assert(last_pool_center_bo_offset <= pool->center_bo_offset);
uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset;
/* When we initially emit relocations into a block pool, we don't
* actually know what the final center_bo_offset will be so we just emit
@@ -1040,37 +1041,147 @@ adjust_relocations_to_block_pool(struct anv_block_pool *pool,
assert(relocs->relocs[i].offset < from_bo->size);
write_reloc(pool->device, from_bo->map + relocs->relocs[i].offset,
relocs->relocs[i].presumed_offset +
relocs->relocs[i].delta);
relocs->relocs[i].delta, false);
}
}
*last_pool_center_bo_offset = pool->center_bo_offset;
}
void
anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
static void
anv_reloc_list_apply(struct anv_device *device,
struct anv_reloc_list *list,
struct anv_bo *bo,
bool always_relocate)
{
for (size_t i = 0; i < list->num_relocs; i++) {
struct anv_bo *target_bo = list->reloc_bos[i];
if (list->relocs[i].presumed_offset == target_bo->offset &&
!always_relocate)
continue;
void *p = bo->map + list->relocs[i].offset;
write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
list->relocs[i].presumed_offset = target_bo->offset;
}
}
/**
* This function applies the relocations for a command buffer and writes the
* actual addresses into the buffers as per what we were told by the kernel on
* the previous execbuf2 call. This should be safe to do because, for each
* relocated address, we have two cases:
*
* 1) The target BO is inactive (as seen by the kernel). In this case, it is
* not in use by the GPU so updating the address is 100% ok. It won't be
* in-use by the GPU (from our context) again until the next execbuf2
* happens. If the kernel decides to move it in the next execbuf2, it
* will have to do the relocations itself, but that's ok because it should
* have all of the information needed to do so.
*
* 2) The target BO is active (as seen by the kernel). In this case, it
* hasn't moved since the last execbuffer2 call because GTT shuffling
* *only* happens when the BO is idle. (From our perspective, it only
* happens inside the execbuffer2 ioctl, but the shuffling may be
* triggered by another ioctl or by memory pressure; with full-ppgtt it
* is limited to execbuffer2 ioctls on the same context.) Since the
* target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
* address and the relocated value we are writing into the BO will be the
* same as the value that is already there.
*
* There is also a possibility that the target BO is active but the exact
* RENDER_SURFACE_STATE object we are writing the relocation into isn't in
* use. In this case, the address currently in the RENDER_SURFACE_STATE
* may be stale but it's still safe to write the relocation because that
* particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
* won't be until the next execbuf2 call.
*
* By doing relocations on the CPU, we can tell the kernel that it doesn't
* need to bother. We want to do this because the surface state buffer is
* used by every command buffer so, if the kernel does the relocations, it
* will always be busy and the kernel will always stall. This is also
* probably the fastest mechanism for doing relocations since the kernel would
* have to make a full copy of all the relocation lists.
*/
static bool
relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
struct anv_execbuf *exec)
{
static int userspace_relocs = -1;
if (userspace_relocs < 0)
userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
if (!userspace_relocs)
return false;
/* First, we have to check to see whether or not we can even do the
* relocation. New buffers which have never been submitted to the kernel
* don't have a valid offset so we need to let the kernel do relocations so
* that we can get offsets for them. On future execbuf2 calls, those
* buffers will have offsets and we will be able to skip relocating.
* Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
*/
for (uint32_t i = 0; i < exec->bo_count; i++) {
if (exec->bos[i]->offset == (uint64_t)-1)
return false;
}
/* Since surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't know
* what address is actually written in the surface state object at any
* given time. The only option is to always relocate them.
*/
anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
&cmd_buffer->device->surface_state_block_pool.bo,
true /* always relocate surface states */);
/* Since we own all of the batch buffers, we know what values are stored
* in the relocated addresses and only have to update them if the offsets
* have changed.
*/
struct anv_batch_bo **bbo;
u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
anv_reloc_list_apply(cmd_buffer->device,
&(*bbo)->relocs, &(*bbo)->bo, false);
}
for (uint32_t i = 0; i < exec->bo_count; i++)
exec->objects[i].offset = exec->bos[i]->offset;
return true;
}
VkResult
anv_cmd_buffer_execbuf(struct anv_device *device,
struct anv_cmd_buffer *cmd_buffer)
{
struct anv_batch *batch = &cmd_buffer->batch;
struct anv_block_pool *ss_pool =
&cmd_buffer->device->surface_state_block_pool;
cmd_buffer->execbuf2.bo_count = 0;
cmd_buffer->execbuf2.need_reloc = false;
struct anv_execbuf execbuf;
anv_execbuf_init(&execbuf);
adjust_relocations_from_block_pool(ss_pool, &cmd_buffer->surface_relocs);
anv_cmd_buffer_add_bo(cmd_buffer, &ss_pool->bo, &cmd_buffer->surface_relocs);
adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
cmd_buffer->last_ss_pool_center);
anv_execbuf_add_bo(&execbuf, &ss_pool->bo, &cmd_buffer->surface_relocs,
&cmd_buffer->pool->alloc);
/* First, we walk over all of the bos we've seen and add them and their
* relocations to the validate list.
*/
struct anv_batch_bo **bbo;
u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
adjust_relocations_to_block_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
&(*bbo)->last_ss_pool_bo_offset);
adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
cmd_buffer->last_ss_pool_center);
anv_cmd_buffer_add_bo(cmd_buffer, &(*bbo)->bo, &(*bbo)->relocs);
anv_execbuf_add_bo(&execbuf, &(*bbo)->bo, &(*bbo)->relocs,
&cmd_buffer->pool->alloc);
}
/* Now that we've adjusted all of the surface state relocations, we need to
* record the surface state pool center so future executions of the command
* buffer can adjust correctly.
*/
cmd_buffer->last_ss_pool_center = ss_pool->center_bo_offset;
struct anv_batch_bo *first_batch_bo =
list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
@@ -1079,20 +1190,19 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
* corresponding to the first batch_bo in the chain with the last
* element in the list.
*/
if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
if (first_batch_bo->bo.index != execbuf.bo_count - 1) {
uint32_t idx = first_batch_bo->bo.index;
uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;
uint32_t last_idx = execbuf.bo_count - 1;
struct drm_i915_gem_exec_object2 tmp_obj =
cmd_buffer->execbuf2.objects[idx];
assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);
struct drm_i915_gem_exec_object2 tmp_obj = execbuf.objects[idx];
assert(execbuf.bos[idx] == &first_batch_bo->bo);
cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
cmd_buffer->execbuf2.bos[idx]->index = idx;
execbuf.objects[idx] = execbuf.objects[last_idx];
execbuf.bos[idx] = execbuf.bos[last_idx];
execbuf.bos[idx]->index = idx;
cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
execbuf.objects[last_idx] = tmp_obj;
execbuf.bos[last_idx] = &first_batch_bo->bo;
first_batch_bo->bo.index = last_idx;
}
@@ -1113,9 +1223,9 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
}
}
cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
.buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
.buffer_count = cmd_buffer->execbuf2.bo_count,
execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
.buffers_ptr = (uintptr_t) execbuf.objects,
.buffer_count = execbuf.bo_count,
.batch_start_offset = 0,
.batch_len = batch->next - batch->start,
.cliprects_ptr = 0,
@@ -1128,6 +1238,49 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
.rsvd2 = 0,
};
if (!cmd_buffer->execbuf2.need_reloc)
cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC;
if (relocate_cmd_buffer(cmd_buffer, &execbuf)) {
/* If we were able to successfully relocate everything, tell the kernel
* that it can skip doing relocations. The requirement for using
* NO_RELOC is:
*
* 1) The addresses written in the objects must match the corresponding
* reloc.presumed_offset which in turn must match the corresponding
* execobject.offset.
*
* 2) To avoid stalling, execobject.offset should match the current
* address of that object within the active context.
*
* In order to satisfy all of the invariants that make userspace
* relocations safe (see relocate_cmd_buffer()), we need to
* further ensure that the addresses we use match those used by the
* kernel for the most recent execbuf2.
*
* The kernel may still choose to do relocations anyway if something has
* moved in the GTT. In this case, the relocation list still needs to be
* valid. All relocations on the batch buffers are already valid and
* kept up-to-date. For surface state relocations, by applying the
* relocations in relocate_cmd_buffer, we ensured that the address in
* the RENDER_SURFACE_STATE matches presumed_offset, so it should be
* safe for the kernel to relocate them as needed.
*/
execbuf.execbuf.flags |= I915_EXEC_NO_RELOC;
} else {
/* In the case where we fall back to doing kernel relocations, we need
* to ensure that the relocation list is valid. All relocations on the
* batch buffers are already valid and kept up-to-date. Since surface
* states are shared between command buffers and we don't know what
* order they will be submitted to the kernel, we don't know what
* address is actually written in the surface state object at any given
* time. The only option is to set a bogus presumed offset and let the
* kernel relocate them.
*/
for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
}
VkResult result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos);
anv_execbuf_finish(&execbuf, &cmd_buffer->pool->alloc);
return result;
}

View File

@@ -44,8 +44,7 @@ lookup_blorp_shader(struct blorp_context *blorp,
anv_shader_bin_unref(device, bin);
*kernel_out = bin->kernel.offset;
*(const struct brw_stage_prog_data **)prog_data_out =
anv_shader_bin_get_prog_data(bin);
*(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data;
return true;
}
@@ -54,7 +53,8 @@ static void
upload_blorp_shader(struct blorp_context *blorp,
const void *key, uint32_t key_size,
const void *kernel, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
uint32_t *kernel_out, void *prog_data_out)
{
struct anv_device *device = blorp->driver_ctx;
@@ -78,8 +78,7 @@ upload_blorp_shader(struct blorp_context *blorp,
anv_shader_bin_unref(device, bin);
*kernel_out = bin->kernel.offset;
*(const struct brw_stage_prog_data **)prog_data_out =
anv_shader_bin_get_prog_data(bin);
*(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data;
}
void

View File

@@ -658,7 +658,7 @@ anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
struct anv_push_constants *data =
cmd_buffer->state.push_constants[stage];
const struct brw_stage_prog_data *prog_data =
anv_shader_bin_get_prog_data(cmd_buffer->state.pipeline->shaders[stage]);
cmd_buffer->state.pipeline->shaders[stage]->prog_data;
/* If we don't actually have any push constants, bail. */
if (data == NULL || prog_data == NULL || prog_data->nr_params == 0)

View File

@@ -203,19 +203,19 @@ static const VkExtensionProperties global_extensions[] = {
#ifdef VK_USE_PLATFORM_XCB_KHR
{
.extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME,
.specVersion = 5,
.specVersion = 6,
},
#endif
#ifdef VK_USE_PLATFORM_XLIB_KHR
{
.extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME,
.specVersion = 5,
.specVersion = 6,
},
#endif
#ifdef VK_USE_PLATFORM_WAYLAND_KHR
{
.extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME,
.specVersion = 4,
.specVersion = 5,
},
#endif
};
@@ -223,7 +223,7 @@ static const VkExtensionProperties global_extensions[] = {
static const VkExtensionProperties device_extensions[] = {
{
.extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
.specVersion = 67,
.specVersion = 68,
},
};
@@ -350,7 +350,7 @@ VkResult anv_EnumeratePhysicalDevices(
snprintf(path, sizeof(path), "/dev/dri/renderD%d", 128 + i);
result = anv_physical_device_init(&instance->physicalDevice,
instance, path);
if (result == VK_SUCCESS)
if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
break;
}
@@ -770,7 +770,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
{
struct drm_i915_gem_execbuffer2 execbuf;
struct drm_i915_gem_exec_object2 exec2_objects[1];
struct anv_bo bo;
struct anv_bo bo, *exec_bos[1];
VkResult result = VK_SUCCESS;
uint32_t size;
int64_t timeout;
@@ -786,6 +786,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
if (!device->info.has_llc)
anv_clflush_range(bo.map, size);
exec_bos[0] = &bo;
exec2_objects[0].handle = bo.gem_handle;
exec2_objects[0].relocation_count = 0;
exec2_objects[0].relocs_ptr = 0;
@@ -809,18 +810,15 @@ anv_device_submit_simple_batch(struct anv_device *device,
execbuf.rsvd1 = device->context_id;
execbuf.rsvd2 = 0;
ret = anv_gem_execbuffer(device, &execbuf);
if (ret != 0) {
/* We don't know the real error. */
result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
result = anv_device_execbuf(device, &execbuf, exec_bos);
if (result != VK_SUCCESS)
goto fail;
}
timeout = INT64_MAX;
ret = anv_gem_wait(device, bo.gem_handle, &timeout);
if (ret != 0) {
/* We don't know the real error. */
result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m");
result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
goto fail;
}
@@ -1070,6 +1068,24 @@ void anv_GetDeviceQueue(
*pQueue = anv_queue_to_handle(&device->queue);
}
VkResult
anv_device_execbuf(struct anv_device *device,
struct drm_i915_gem_execbuffer2 *execbuf,
struct anv_bo **execbuf_bos)
{
int ret = anv_gem_execbuffer(device, execbuf);
if (ret != 0) {
/* We don't know the real error. */
return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
}
struct drm_i915_gem_exec_object2 *objects = (void *)execbuf->buffers_ptr;
for (uint32_t k = 0; k < execbuf->buffer_count; k++)
execbuf_bos[k]->offset = objects[k].offset;
return VK_SUCCESS;
}
VkResult anv_QueueSubmit(
VkQueue _queue,
uint32_t submitCount,
@@ -1079,7 +1095,34 @@ VkResult anv_QueueSubmit(
ANV_FROM_HANDLE(anv_queue, queue, _queue);
ANV_FROM_HANDLE(anv_fence, fence, _fence);
struct anv_device *device = queue->device;
int ret;
VkResult result = VK_SUCCESS;
/* We lock around QueueSubmit for three main reasons:
*
* 1) When a block pool is resized, we create a new gem handle with a
* different size and, in the case of surface states, possibly a
* different center offset, but we re-use the same anv_bo struct when
* we do so. If this happens in the middle of setting up an execbuf,
* we could end up with our list of BOs out of sync with our list of
* gem handles.
*
* 2) The algorithm we use for building the list of unique buffers isn't
* thread-safe. While the client is supposed to synchronize around
* QueueSubmit, this would be extremely difficult to debug if it ever
* came up in the wild due to a broken app. It's better to play it
* safe and just lock around QueueSubmit.
*
* 3) The anv_cmd_buffer_execbuf function may perform relocations in
* userspace. Due to the fact that the surface state buffer is shared
* between batches, we can't afford to have that happen from multiple
* threads at the same time. Even though the user is supposed to
* ensure this doesn't happen, we play it safe as in (2) above.
*
* Since the only other things that ever take the device lock, such as
* block pool resizes, happen rarely, the lock will almost never be
* contended, so taking it isn't really an expensive operation here.
*/
pthread_mutex_lock(&device->mutex);
for (uint32_t i = 0; i < submitCount; i++) {
for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
@@ -1087,28 +1130,23 @@ VkResult anv_QueueSubmit(
pSubmits[i].pCommandBuffers[j]);
assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
ret = anv_gem_execbuffer(device, &cmd_buffer->execbuf2.execbuf);
if (ret != 0) {
/* We don't know the real error. */
return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
"execbuf2 failed: %m");
}
for (uint32_t k = 0; k < cmd_buffer->execbuf2.bo_count; k++)
cmd_buffer->execbuf2.bos[k]->offset = cmd_buffer->execbuf2.objects[k].offset;
result = anv_cmd_buffer_execbuf(device, cmd_buffer);
if (result != VK_SUCCESS)
goto out;
}
}
if (fence) {
ret = anv_gem_execbuffer(device, &fence->execbuf);
if (ret != 0) {
/* We don't know the real error. */
return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
"execbuf2 failed: %m");
}
struct anv_bo *fence_bo = &fence->bo;
result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
if (result != VK_SUCCESS)
goto out;
}
return VK_SUCCESS;
out:
pthread_mutex_unlock(&device->mutex);
return result;
}
VkResult anv_QueueWaitIdle(
@@ -1138,15 +1176,11 @@ VkResult anv_DeviceWaitIdle(
VkResult
anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size)
{
bo->gem_handle = anv_gem_create(device, size);
if (!bo->gem_handle)
uint32_t gem_handle = anv_gem_create(device, size);
if (!gem_handle)
return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
bo->map = NULL;
bo->index = 0;
bo->offset = 0;
bo->size = size;
bo->is_winsys_bo = false;
anv_bo_init(bo, gem_handle, size);
return VK_SUCCESS;
}

View File

@@ -49,16 +49,15 @@ VkResult anv_CreateDmaBufImageINTEL(
if (mem == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
mem->bo.gem_handle = anv_gem_fd_to_handle(device, pCreateInfo->fd);
if (!mem->bo.gem_handle) {
uint32_t gem_handle = anv_gem_fd_to_handle(device, pCreateInfo->fd);
if (!gem_handle) {
result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
goto fail;
}
mem->bo.map = NULL;
mem->bo.index = 0;
mem->bo.offset = 0;
mem->bo.size = pCreateInfo->strideInBytes * pCreateInfo->extent.height;
uint64_t size = pCreateInfo->strideInBytes * pCreateInfo->extent.height;
anv_bo_init(&mem->bo, gem_handle, size);
anv_image_create(_device,
&(struct anv_image_create_info) {

View File

@@ -388,7 +388,8 @@ anv_pipeline_upload_kernel(struct anv_pipeline *pipeline,
struct anv_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
const void *kernel_data, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
const struct anv_pipeline_bind_map *bind_map)
{
if (cache) {
@@ -399,7 +400,8 @@ anv_pipeline_upload_kernel(struct anv_pipeline *pipeline,
} else {
return anv_shader_bin_create(pipeline->device, key_data, key_size,
kernel_data, kernel_size,
prog_data, prog_data_size, bind_map);
prog_data, prog_data_size,
prog_data->param, bind_map);
}
}
@@ -476,7 +478,8 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
shader_code, code_size,
&prog_data, sizeof(prog_data), &map);
&prog_data.base.base, sizeof(prog_data),
&map);
if (!bin) {
ralloc_free(mem_ctx);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -486,7 +489,7 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
}
const struct brw_vs_prog_data *vs_prog_data =
(const struct brw_vs_prog_data *)anv_shader_bin_get_prog_data(bin);
(const struct brw_vs_prog_data *)bin->prog_data;
if (vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
pipeline->vs_simd8 = bin->kernel.offset;
@@ -563,7 +566,8 @@ anv_pipeline_compile_gs(struct anv_pipeline *pipeline,
/* TODO: SIMD8 GS */
bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
shader_code, code_size,
&prog_data, sizeof(prog_data), &map);
&prog_data.base.base, sizeof(prog_data),
&map);
if (!bin) {
ralloc_free(mem_ctx);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -686,7 +690,8 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
shader_code, code_size,
&prog_data, sizeof(prog_data), &map);
&prog_data.base, sizeof(prog_data),
&map);
if (!bin) {
ralloc_free(mem_ctx);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -758,7 +763,8 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
shader_code, code_size,
&prog_data, sizeof(prog_data), &map);
&prog_data.base, sizeof(prog_data),
&map);
if (!bin) {
ralloc_free(mem_ctx);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

View File

@@ -26,13 +26,9 @@
#include "util/debug.h"
#include "anv_private.h"
struct shader_bin_key {
uint32_t size;
uint8_t data[0];
};
static size_t
anv_shader_bin_size(uint32_t prog_data_size, uint32_t key_size,
anv_shader_bin_size(uint32_t prog_data_size, uint32_t nr_params,
uint32_t key_size,
uint32_t surface_count, uint32_t sampler_count)
{
const uint32_t binding_data_size =
@@ -40,28 +36,21 @@ anv_shader_bin_size(uint32_t prog_data_size, uint32_t key_size,
return align_u32(sizeof(struct anv_shader_bin), 8) +
align_u32(prog_data_size, 8) +
align_u32(nr_params * sizeof(void *), 8) +
align_u32(sizeof(uint32_t) + key_size, 8) +
align_u32(binding_data_size, 8);
}
static inline const struct shader_bin_key *
anv_shader_bin_get_key(const struct anv_shader_bin *shader)
{
const void *data = shader;
data += align_u32(sizeof(struct anv_shader_bin), 8);
data += align_u32(shader->prog_data_size, 8);
return data;
}
struct anv_shader_bin *
anv_shader_bin_create(struct anv_device *device,
const void *key_data, uint32_t key_size,
const void *kernel_data, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size, const void *prog_data_param,
const struct anv_pipeline_bind_map *bind_map)
{
const size_t size =
anv_shader_bin_size(prog_data_size, key_size,
anv_shader_bin_size(prog_data_size, prog_data->nr_params, key_size,
bind_map->surface_count, bind_map->sampler_count);
struct anv_shader_bin *shader =
@@ -82,10 +71,20 @@ anv_shader_bin_create(struct anv_device *device,
void *data = shader;
data += align_u32(sizeof(struct anv_shader_bin), 8);
shader->prog_data = data;
struct brw_stage_prog_data *new_prog_data = data;
memcpy(data, prog_data, prog_data_size);
data += align_u32(prog_data_size, 8);
struct shader_bin_key *key = data;
assert(prog_data->nr_pull_params == 0);
assert(prog_data->nr_image_params == 0);
new_prog_data->param = data;
uint32_t param_size = prog_data->nr_params * sizeof(void *);
memcpy(data, prog_data_param, param_size);
data += align_u32(param_size, 8);
shader->key = data;
struct anv_shader_bin_key *key = data;
key->size = key_size;
memcpy(key->data, key_data, key_size);
data += align_u32(sizeof(*key) + key_size, 8);
@@ -115,7 +114,7 @@ static size_t
anv_shader_bin_data_size(const struct anv_shader_bin *shader)
{
return anv_shader_bin_size(shader->prog_data_size,
anv_shader_bin_get_key(shader)->size,
shader->prog_data->nr_params, shader->key->size,
shader->bind_map.surface_count,
shader->bind_map.sampler_count) +
align_u32(shader->kernel_size, 8);
@@ -126,7 +125,7 @@ anv_shader_bin_write_data(const struct anv_shader_bin *shader, void *data)
{
size_t struct_size =
anv_shader_bin_size(shader->prog_data_size,
anv_shader_bin_get_key(shader)->size,
shader->prog_data->nr_params, shader->key->size,
shader->bind_map.surface_count,
shader->bind_map.sampler_count);
@@ -151,14 +150,14 @@ anv_shader_bin_write_data(const struct anv_shader_bin *shader, void *data)
static uint32_t
shader_bin_key_hash_func(const void *void_key)
{
const struct shader_bin_key *key = void_key;
const struct anv_shader_bin_key *key = void_key;
return _mesa_hash_data(key->data, key->size);
}
static bool
shader_bin_key_compare_func(const void *void_a, const void *void_b)
{
const struct shader_bin_key *a = void_a, *b = void_b;
const struct anv_shader_bin_key *a = void_a, *b = void_b;
if (a->size != b->size)
return false;
@@ -230,7 +229,7 @@ anv_pipeline_cache_search_locked(struct anv_pipeline_cache *cache,
const void *key_data, uint32_t key_size)
{
uint32_t vla[1 + DIV_ROUND_UP(key_size, sizeof(uint32_t))];
struct shader_bin_key *key = (void *)vla;
struct anv_shader_bin_key *key = (void *)vla;
key->size = key_size;
memcpy(key->data, key_data, key_size);
@@ -266,7 +265,9 @@ static struct anv_shader_bin *
anv_pipeline_cache_add_shader(struct anv_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
const void *kernel_data, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
const void *prog_data_param,
const struct anv_pipeline_bind_map *bind_map)
{
struct anv_shader_bin *shader =
@@ -277,11 +278,12 @@ anv_pipeline_cache_add_shader(struct anv_pipeline_cache *cache,
struct anv_shader_bin *bin =
anv_shader_bin_create(cache->device, key_data, key_size,
kernel_data, kernel_size,
prog_data, prog_data_size, bind_map);
prog_data, prog_data_size, prog_data_param,
bind_map);
if (!bin)
return NULL;
_mesa_hash_table_insert(cache->cache, anv_shader_bin_get_key(bin), bin);
_mesa_hash_table_insert(cache->cache, bin->key, bin);
return bin;
}
@@ -290,7 +292,8 @@ struct anv_shader_bin *
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
const void *kernel_data, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
const struct anv_pipeline_bind_map *bind_map)
{
if (cache->cache) {
@@ -299,7 +302,8 @@ anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
struct anv_shader_bin *bin =
anv_pipeline_cache_add_shader(cache, key_data, key_size,
kernel_data, kernel_size,
prog_data, prog_data_size, bind_map);
prog_data, prog_data_size,
prog_data->param, bind_map);
pthread_mutex_unlock(&cache->mutex);
@@ -311,7 +315,8 @@ anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
/* In this case, we're not caching it so the caller owns it entirely */
return anv_shader_bin_create(cache->device, key_data, key_size,
kernel_data, kernel_size,
prog_data, prog_data_size, bind_map);
prog_data, prog_data_size,
prog_data->param, bind_map);
}
}
@@ -366,10 +371,16 @@ anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
memcpy(&bin, p, sizeof(bin));
p += align_u32(sizeof(struct anv_shader_bin), 8);
const void *prog_data = p;
const struct brw_stage_prog_data *prog_data = p;
p += align_u32(bin.prog_data_size, 8);
if (p > end)
break;
struct shader_bin_key key;
uint32_t param_size = prog_data->nr_params * sizeof(void *);
const void *prog_data_param = p;
p += align_u32(param_size, 8);
struct anv_shader_bin_key key;
if (p + sizeof(key) > end)
break;
memcpy(&key, p, sizeof(key));
@@ -392,7 +403,7 @@ anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
anv_pipeline_cache_add_shader(cache, key_data, key.size,
kernel_data, bin.kernel_size,
prog_data, bin.prog_data_size,
&bin.bind_map);
prog_data_param, &bin.bind_map);
}
}
@@ -532,11 +543,11 @@ VkResult anv_MergePipelineCaches(
struct hash_entry *entry;
hash_table_foreach(src->cache, entry) {
struct anv_shader_bin *bin = entry->data;
if (_mesa_hash_table_search(dst->cache, anv_shader_bin_get_key(bin)))
if (_mesa_hash_table_search(dst->cache, bin->key))
continue;
anv_shader_bin_ref(bin);
_mesa_hash_table_insert(dst->cache, anv_shader_bin_get_key(bin), bin);
_mesa_hash_table_insert(dst->cache, bin->key, bin);
}
}

View File

@@ -267,6 +267,17 @@ struct anv_bo {
bool is_winsys_bo;
};
static inline void
anv_bo_init(struct anv_bo *bo, uint32_t gem_handle, uint64_t size)
{
bo->gem_handle = gem_handle;
bo->index = 0;
bo->offset = -1;
bo->size = size;
bo->map = NULL;
bo->is_winsys_bo = false;
}
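Note the offset of -1: it is the "never submitted" sentinel that relocate_cmd_buffer tests before attempting userspace relocations, and it stays in place until anv_device_execbuf writes back a kernel-assigned offset. A hedged illustration (check_fresh_bo is a hypothetical helper, not driver code, and assumes assert.h):

/* Hypothetical helper: a freshly initialized BO carries offset == -1,
 * so userspace relocation is skipped until the first execbuf2 call
 * fills in a real offset (see anv_device_execbuf).
 */
static inline void check_fresh_bo(struct anv_bo *bo,
                                  uint32_t gem_handle, uint64_t size)
{
   anv_bo_init(bo, gem_handle, size);
   assert(bo->offset == (uint64_t)-1);
}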
/* Represents a lock-free linked list of "free" things. This is used by
* both the block pool and the state pools. Unfortunately, in order to
* solve the ABA problem, we can't use a single uint32_t head.
@@ -439,9 +450,14 @@ VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo,
uint32_t size);
void anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo);
struct anv_scratch_bo {
bool exists;
struct anv_bo bo;
};
struct anv_scratch_pool {
/* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */
struct anv_bo bos[16][MESA_SHADER_STAGES];
struct anv_scratch_bo bos[16][MESA_SHADER_STAGES];
};
void anv_scratch_pool_init(struct anv_device *device,
@@ -518,7 +534,8 @@ struct anv_shader_bin *
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
const void *kernel_data, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
const struct anv_pipeline_bind_map *bind_map);
struct anv_device {
@@ -567,6 +584,10 @@ void anv_device_get_cache_uuid(void *uuid);
void anv_device_init_blorp(struct anv_device *device);
void anv_device_finish_blorp(struct anv_device *device);
VkResult anv_device_execbuf(struct anv_device *device,
struct drm_i915_gem_execbuffer2 *execbuf,
struct anv_bo **execbuf_bos);
void* anv_gem_mmap(struct anv_device *device,
uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
void anv_gem_munmap(void *p, uint64_t size);
@@ -617,9 +638,6 @@ struct anv_batch_bo {
/* Bytes actually consumed in this batch BO */
size_t length;
/* Last seen surface state block pool bo offset */
uint32_t last_ss_pool_bo_offset;
struct anv_reloc_list relocs;
};
@@ -1153,24 +1171,10 @@ struct anv_cmd_buffer {
*/
struct u_vector bt_blocks;
uint32_t bt_next;
struct anv_reloc_list surface_relocs;
/* Information needed for execbuf
*
* These fields are generated by anv_cmd_buffer_prepare_execbuf().
*/
struct {
struct drm_i915_gem_execbuffer2 execbuf;
struct drm_i915_gem_exec_object2 * objects;
uint32_t bo_count;
struct anv_bo ** bos;
/* Allocated length of the 'objects' and 'bos' arrays */
uint32_t array_length;
bool need_reloc;
} execbuf2;
/** Last seen surface state block pool center bo offset */
uint32_t last_ss_pool_center;
/* Serial for tracking buffer completion */
uint32_t serial;
@@ -1192,6 +1196,8 @@ void anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer);
void anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
struct anv_cmd_buffer *secondary);
void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer);
VkResult anv_cmd_buffer_execbuf(struct anv_device *device,
struct anv_cmd_buffer *cmd_buffer);
VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer);
@@ -1299,24 +1305,33 @@ struct anv_pipeline_bind_map {
struct anv_pipeline_binding * sampler_to_descriptor;
};
struct anv_shader_bin_key {
uint32_t size;
uint8_t data[0];
};
struct anv_shader_bin {
uint32_t ref_cnt;
const struct anv_shader_bin_key *key;
struct anv_state kernel;
uint32_t kernel_size;
struct anv_pipeline_bind_map bind_map;
const struct brw_stage_prog_data *prog_data;
uint32_t prog_data_size;
/* Prog data follows, then the key, both aligned to 8-bytes */
struct anv_pipeline_bind_map bind_map;
/* Prog data follows, then params, then the key, all aligned to 8-bytes */
};
struct anv_shader_bin *
anv_shader_bin_create(struct anv_device *device,
const void *key, uint32_t key_size,
const void *kernel, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size, const void *prog_data_param,
const struct anv_pipeline_bind_map *bind_map);
void
@@ -1337,14 +1352,6 @@ anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader)
anv_shader_bin_destroy(device, shader);
}
static inline const struct brw_stage_prog_data *
anv_shader_bin_get_prog_data(const struct anv_shader_bin *shader)
{
const void *data = shader;
data += align_u32(sizeof(struct anv_shader_bin), 8);
return data;
}
struct anv_pipeline {
struct anv_device * device;
struct anv_batch batch;
@@ -1411,7 +1418,7 @@ get_##prefix##_prog_data(struct anv_pipeline *pipeline) \
{ \
if (anv_pipeline_has_stage(pipeline, stage)) { \
return (const struct brw_##prefix##_prog_data *) \
anv_shader_bin_get_prog_data(pipeline->shaders[stage]); \
pipeline->shaders[stage]->prog_data; \
} else { \
return NULL; \
} \

View File

@@ -200,20 +200,9 @@ genX(EndCommandBuffer)(
VkCommandBuffer commandBuffer)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_device *device = cmd_buffer->device;
anv_cmd_buffer_end_batch_buffer(cmd_buffer);
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
/* The algorithm used to compute the validate list is not thread-safe, as
* it uses the bo->index field. We have to lock the device around it.
* Fortunately, the chances for contention here are probably very low.
*/
pthread_mutex_lock(&device->mutex);
anv_cmd_buffer_prepare_execbuf(cmd_buffer);
pthread_mutex_unlock(&device->mutex);
}
return VK_SUCCESS;
}
@@ -1883,22 +1872,25 @@ void genX(CmdEndRenderPass)(
}
static void
emit_ps_depth_count(struct anv_batch *batch,
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
struct anv_bo *bo, uint32_t offset)
{
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.DestinationAddressType = DAT_PPGTT;
pc.PostSyncOperation = WritePSDepthCount;
pc.DepthStallEnable = true;
pc.Address = (struct anv_address) { bo, offset };
if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
pc.CommandStreamerStallEnable = true;
}
}
static void
emit_query_availability(struct anv_batch *batch,
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
struct anv_bo *bo, uint32_t offset)
{
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.DestinationAddressType = DAT_PPGTT;
pc.PostSyncOperation = WriteImmediateData;
pc.Address = (struct anv_address) { bo, offset };
@@ -1931,7 +1923,7 @@ void genX(CmdBeginQuery)(
switch (pool->type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
emit_ps_depth_count(cmd_buffer, &pool->bo,
query * sizeof(struct anv_query_pool_slot));
break;
@@ -1951,10 +1943,10 @@ void genX(CmdEndQuery)(
switch (pool->type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
emit_ps_depth_count(cmd_buffer, &pool->bo,
query * sizeof(struct anv_query_pool_slot) + 8);
emit_query_availability(&cmd_buffer->batch, &pool->bo,
emit_query_availability(cmd_buffer, &pool->bo,
query * sizeof(struct anv_query_pool_slot) + 16);
break;
@@ -1996,11 +1988,14 @@ void genX(CmdWriteTimestamp)(
pc.DestinationAddressType = DAT_PPGTT;
pc.PostSyncOperation = WriteTimestamp;
pc.Address = (struct anv_address) { &pool->bo, offset };
if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
pc.CommandStreamerStallEnable = true;
}
break;
}
emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
emit_query_availability(cmd_buffer, &pool->bo, query + 16);
}
#if GEN_GEN > 7 || GEN_IS_HASWELL

View File

@@ -52,7 +52,8 @@ static void
brw_blorp_upload_shader(struct blorp_context *blorp,
const void *key, uint32_t key_size,
const void *kernel, uint32_t kernel_size,
const void *prog_data, uint32_t prog_data_size,
const struct brw_stage_prog_data *prog_data,
uint32_t prog_data_size,
uint32_t *kernel_out, void *prog_data_out)
{
struct brw_context *brw = blorp->driver_ctx;

View File

@@ -3673,6 +3673,12 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
*/
setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
length++;
} else if (key->replicate_alpha && inst->target != 0) {
/* Handle the case where the fragment shader doesn't write to draw buffer
* zero. No need to call setup_color_payload() for src0_alpha because
* alpha value will be undefined.
*/
length++;
}
setup_color_payload(bld, key, &sources[length], color0, components);

View File

@@ -291,12 +291,18 @@ _mesa_new_shader_program(GLuint name)
* Clear (free) the shader program state that gets produced by linking.
*/
void
_mesa_clear_shader_program_data(struct gl_shader_program *shProg)
_mesa_clear_shader_program_data(struct gl_context *ctx,
struct gl_shader_program *shProg)
{
unsigned i;
for (gl_shader_stage sh = 0; sh < MESA_SHADER_STAGES; sh++) {
if (shProg->_LinkedShaders[sh] != NULL) {
_mesa_delete_linked_shader(ctx, shProg->_LinkedShaders[sh]);
shProg->_LinkedShaders[sh] = NULL;
}
}
if (shProg->UniformStorage) {
for (i = 0; i < shProg->NumUniformStorage; ++i)
for (unsigned i = 0; i < shProg->NumUniformStorage; ++i)
_mesa_uniform_detach_all_driver_storage(&shProg->UniformStorage[i]);
ralloc_free(shProg->UniformStorage);
shProg->NumUniformStorage = 0;
@@ -347,11 +353,10 @@ _mesa_free_shader_program_data(struct gl_context *ctx,
struct gl_shader_program *shProg)
{
GLuint i;
gl_shader_stage sh;
assert(shProg->Type == GL_SHADER_PROGRAM_MESA);
_mesa_clear_shader_program_data(shProg);
_mesa_clear_shader_program_data(ctx, shProg);
if (shProg->AttributeBindings) {
string_to_uint_map_dtor(shProg->AttributeBindings);
@@ -385,14 +390,6 @@ _mesa_free_shader_program_data(struct gl_context *ctx,
shProg->TransformFeedback.VaryingNames = NULL;
shProg->TransformFeedback.NumVarying = 0;
for (sh = 0; sh < MESA_SHADER_STAGES; sh++) {
if (shProg->_LinkedShaders[sh] != NULL) {
_mesa_delete_linked_shader(ctx, shProg->_LinkedShaders[sh]);
shProg->_LinkedShaders[sh] = NULL;
}
}
free(shProg->Label);
shProg->Label = NULL;
}

View File

@@ -99,7 +99,8 @@ extern struct gl_shader_program *
_mesa_new_shader_program(GLuint name);
extern void
_mesa_clear_shader_program_data(struct gl_shader_program *shProg);
_mesa_clear_shader_program_data(struct gl_context *ctx,
struct gl_shader_program *shProg);
extern void
_mesa_free_shader_program_data(struct gl_context *ctx,

View File

@@ -3052,7 +3052,7 @@ _mesa_glsl_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
unsigned int i;
_mesa_clear_shader_program_data(prog);
_mesa_clear_shader_program_data(ctx, prog);
prog->LinkStatus = GL_TRUE;

View File

@@ -772,9 +772,9 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
int i = u_bit_scan(&writemask);
/* before emitting the instruction, see if we have to adjust store
/* before emitting the instruction, see if we have to adjust load / store
* address */
if (i > 1 && inst->op == TGSI_OPCODE_STORE &&
if (i > 1 && (inst->op == TGSI_OPCODE_LOAD || inst->op == TGSI_OPCODE_STORE) &&
addr.file == PROGRAM_UNDEFINED) {
/* We have to advance the buffer address by 16 */
addr = get_temp(glsl_type::uint_type);
@@ -782,7 +782,6 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
inst->src[0], st_src_reg_for_int(16));
}
/* first time use previous instruction */
if (dinst == NULL) {
dinst = inst;
@@ -802,11 +801,10 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
dinst->dst[j].index = initial_dst_idx[j];
if (i > 1) {
if (dinst->op == TGSI_OPCODE_STORE) {
if (dinst->op == TGSI_OPCODE_LOAD || dinst->op == TGSI_OPCODE_STORE)
dinst->src[0] = addr;
} else {
if (dinst->op != TGSI_OPCODE_STORE)
dinst->dst[j].index++;
}
}
} else {
/* if we aren't writing to a double, just get the bit of the initial writemask

View File

@@ -430,8 +430,12 @@ st_create_texture_sampler_view_from_stobj(struct st_context *st,
templ.u.tex.first_level = stObj->base.MinLevel + stObj->base.BaseLevel;
templ.u.tex.last_level = last_level(stObj);
assert(templ.u.tex.first_level <= templ.u.tex.last_level);
templ.u.tex.first_layer = stObj->base.MinLayer;
templ.u.tex.last_layer = last_layer(stObj);
if (stObj->layer_override) {
templ.u.tex.first_layer = templ.u.tex.last_layer = stObj->layer_override;
} else {
templ.u.tex.first_layer = stObj->base.MinLayer;
templ.u.tex.last_layer = last_layer(stObj);
}
assert(templ.u.tex.first_layer <= templ.u.tex.last_layer);
templ.target = gl_target_to_pipe(stObj->base.Target);
}
@@ -478,8 +482,11 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
assert(stObj->base.MinLevel + stObj->base.BaseLevel ==
view->u.tex.first_level);
assert(last_level(stObj) == view->u.tex.last_level);
assert(stObj->base.MinLayer == view->u.tex.first_layer);
assert(last_layer(stObj) == view->u.tex.last_layer);
assert(stObj->layer_override || stObj->base.MinLayer == view->u.tex.first_layer);
assert(stObj->layer_override || last_layer(stObj) == view->u.tex.last_layer);
assert(!stObj->layer_override ||
(stObj->layer_override == view->u.tex.first_layer &&
stObj->layer_override == view->u.tex.last_layer));
}
}
else {

View File

@@ -108,6 +108,15 @@ struct st_texture_object
*/
enum pipe_format surface_format;
/* When non-zero, samplers should use this layer instead of the one
* specified by the GL state.
*
* This is used for VDPAU interop, where imported pipe_resources may be
* array textures (containing layers with different fields) even though the
* GL state describes one non-array texture per field.
*/
uint layer_override;
/** The glsl version of the shader seen during the previous validation */
unsigned prev_glsl_version;
/** The value of the sampler's sRGBDecode state at the previous validation */

View File

@@ -189,8 +189,8 @@ st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
struct st_texture_image *stImage = st_texture_image(texImage);
struct pipe_resource *res;
struct pipe_sampler_view templ, **sampler_view;
mesa_format texFormat;
uint layer_override = 0;
if (output) {
res = st_vdpau_output_surface_dma_buf(ctx, vdpSurface);
@@ -201,8 +201,10 @@ st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
} else {
res = st_vdpau_video_surface_dma_buf(ctx, vdpSurface, index);
if (!res)
if (!res) {
res = st_vdpau_video_surface_gallium(ctx, vdpSurface, index);
layer_override = index & 1;
}
}
if (!res) {
@@ -233,18 +235,8 @@ st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
st_texture_release_all_sampler_views(st, stObj);
pipe_resource_reference(&stImage->pt, res);
u_sampler_view_default_template(&templ, res, res->format);
templ.u.tex.first_layer = index & 1;
templ.u.tex.last_layer = index & 1;
templ.swizzle_r = GET_SWZ(stObj->base._Swizzle, 0);
templ.swizzle_g = GET_SWZ(stObj->base._Swizzle, 1);
templ.swizzle_b = GET_SWZ(stObj->base._Swizzle, 2);
templ.swizzle_a = GET_SWZ(stObj->base._Swizzle, 3);
sampler_view = st_texture_get_sampler_view(st, stObj);
*sampler_view = st->pipe->create_sampler_view(st->pipe, res, &templ);
stObj->surface_format = res->format;
stObj->layer_override = layer_override;
_mesa_dirty_texobj(ctx, texObj);
pipe_resource_reference(&res, NULL);
@@ -264,6 +256,8 @@ st_vdpau_unmap_surface(struct gl_context *ctx, GLenum target, GLenum access,
st_texture_release_all_sampler_views(st, stObj);
pipe_resource_reference(&stImage->pt, NULL);
stObj->layer_override = 0;
_mesa_dirty_texobj(ctx, texObj);
st_flush(st, NULL, 0);

View File

@@ -117,6 +117,8 @@ wsi_x11_get_connection(struct wsi_device *wsi_dev,
struct wsi_x11_connection *wsi_conn =
wsi_x11_connection_create(alloc, conn);
if (!wsi_conn)
return NULL;
pthread_mutex_lock(&wsi->mutex);
@@ -889,6 +891,10 @@ wsi_x11_finish_wsi(struct wsi_device *wsi_device,
(struct wsi_x11 *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_XCB];
if (wsi) {
struct hash_entry *entry;
hash_table_foreach(wsi->connections, entry)
wsi_x11_connection_destroy(alloc, entry->data);
_mesa_hash_table_destroy(wsi->connections, NULL);
pthread_mutex_destroy(&wsi->mutex);