docs/relnotes/19.2.8: Add SHA256 sum

VERSION: bump for 19.2.8
docs: add relnotes for 19.2.8
2019-12-18 11:23:15 -08:00 · 2019-12-18 11:02:09 -08:00 · 2019-12-18 11:01:53 -08:00 · 2019-12-17 09:10:49 -08:00 · 2019-12-17 09:08:49 -08:00 · 2019-12-16 16:09:30 -08:00
31 changed files with 596 additions and 179 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-19.2.6
+19.2.8
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -27,6 +27,8 @@ bcd9224728dcb8d8fe4bcddc4bd9b2c36fcfe9dd
 869e32593a9096b845dd6106f8f86e1c41fac968
 a2c3c65a31de90fdb55f76f2894860dfbafe2043
 bb0c5c487e63e88acbb792f092dd8f392bad8540
+937b9055698be0dfdb7d2e0673a989e2ecc05912
+21376cffb37018160ad3eef38b5a640ba1675a4f

 # This is reverted shortly after it was landed
 4432a2d14d80081d062f7939a950d65ea3a16eed
@@ -35,6 +37,15 @@ bb0c5c487e63e88acbb792f092dd8f392bad8540
 1a05811936dd8d0c3a367c6f00629624ef39d537
 911a8261419f48dcd756f78832fa5a5f4c5b8d93

-# This was manuall backported
+# This was manually backported
 2afeed301010917c4eae55dcd2544f9d329df934
 4b392ced2d744fccffe95490ff57e6b41033c266
+
+# This is not being backported to 19.2 due to causing build regressions for
+# downstream projects
+eaf43966027cf9654e91ca57aecc8f5a65b58f49
+
+# Invalid sha warnings
+023282a4f667695ea1dbbe9fbe1cd3a9d550a426
+2fca325ea65f068043d4c18c9cd0fe7f25bde8f7
+7564c5fc6d79a2ddec49a19f67183fb3be799fe5
--- a/docs/relnotes/19.2.7.html
+++ b/docs/relnotes/19.2.7.html
@@ -0,0 +1,96 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<title>Mesa Release Notes</title>
+<link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+<h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 19.2.7 Release Notes / 2019-12-04</h1>
+
+<p>
+    Mesa 19.2.7 is a bug fix release which fixes bugs found since the 19.2.6 release.
+</p>
+<p>
+Mesa 19.2.7 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5. OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+<p>
+Mesa 19.2.7 implements the Vulkan 1.1 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+</p>
+
+<h2>SHA256 checksum</h2>
+<pre>
+    e3799fb7896fd9ed2f90f651fb907b95cdebfbd494968ff116e6bf1be143579e  mesa-19.2.7.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+
+<ul>
+    <li>None</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+<ul>
+    <li>ld.lld: error: duplicate symbol (mesa-19.3.0-rc1)</li>
+    <li>triangle strip clipping with GL_FIRST_VERTEX_CONVENTION causes wrong vertex&#x27;s attribute to be broadcasted for flat interpolation</li>
+    <li>[bisected][regression][g45,g965,ilk] piglit arb_fragment_program kil failures</li>
+</ul>
+
+<h2>Changes</h2>
+
+<ul>
+    <p>Bas Nieuwenhuizen (2):</p>
+    <li>      radv: Allocate cmdbuffer space for buffer marker write.</li>
+    <li>      radv: Unify max_descriptor_set_size.</li>
+    <p></p>
+    <p>Boris Brezillon (1):</p>
+    <li>      gallium: Fix the -&gt;set_damage_region() implementation</li>
+    <p></p>
+    <p>Ian Romanick (1):</p>
+    <li>      intel/fs: Disable conditional discard optimization on Gen4 and Gen5</li>
+    <p></p>
+    <p>Jason Ekstrand (1):</p>
+    <li>      anv: Set up SBE_SWIZ properly for gl_Viewport</li>
+    <p></p>
+    <p>Jonathan Gray (2):</p>
+    <li>      winsys/amdgpu: avoid double simple_mtx_unlock()</li>
+    <li>      i965: update Makefile.sources for perf changes</li>
+    <p></p>
+    <p>Rhys Perry (1):</p>
+    <li>      radv: set writes_memory for global memory stores/atomics</li>
+    <p></p>
+    <p>Samuel Pitoiset (3):</p>
+    <li>      radv: fix enabling sample shading with SampleID/SamplePosition</li>
+    <li>      radv/gfx10: fix implementation of exclusive scans</li>
+    <li>      radv: fix compute pipeline keys when optimizations are disabled</li>
+    <p></p>
+    <p>Yevhenii Kolesnikov (1):</p>
+    <li>      meson: Fix linkage of libgallium_nine with libgalliumvl</li>
+    <p></p>
+    <p>Zebediah Figura (1):</p>
+    <li>      Revert &quot;draw: revert using correct order for prim decomposition.&quot;</li>
+    <p></p>
+    <p></p>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/19.2.8.html
+++ b/docs/relnotes/19.2.8.html
@@ -0,0 +1,108 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<title>Mesa Release Notes</title>
+<link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+<h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 19.2.8 Release Notes / 2019-12-18</h1>
+
+<p>
+    Mesa 19.2.8 is a bug fix release which fixes bugs found since the 19.2.7 release.
+</p>
+<p>
+Mesa 19.2.8 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5. OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+<p>
+Mesa 19.2.8 implements the Vulkan 1.1 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+</p>
+
+<h2>SHA256 checksum</h2>
+<pre>
+    cffa8fa755c7422ce014c39ca0b770a092d9e0bbae537ceb2609c106916e5a57  mesa-19.2.8.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+
+<ul>
+    <li>None</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+<ul>
+    <li>i965/iris: assert when destroy GL context with active query</li>
+</ul>
+
+<h2>Changes</h2>
+
+<ul>
+    <p>Alyssa Rosenzweig (1):</p>
+    <li>      gallium/util: Support POLYGON in u_stream_outputs_for_vertices</li>
+    <p></p>
+    <p>Bas Nieuwenhuizen (2):</p>
+    <li>      amd/common: Always use addrlib for HTILE tc-compat.</li>
+    <li>      amd/common: Fix tcCompatible degradation on Stoney.</li>
+    <p></p>
+    <p>Dylan Baker (4):</p>
+    <li>      docs: Add SHA256 sums for 19.2.7</li>
+    <li>      meson/broadcom: libbroadcom_cle needs expat headers</li>
+    <li>      meson/broadcom: libbroadcom_cle also needs zlib</li>
+    <li>      cherry-ignore: Update for 19.2.8</li>
+    <p></p>
+    <p>Gert Wollny (1):</p>
+    <li>      virgl: Increase the shader transfer buffer by doubling the size</li>
+    <p></p>
+    <p>Iván Briano (1):</p>
+    <li>      anv: Export filter_minmax support only when it&#x27;s really supported</li>
+    <p></p>
+    <p>Jason Ekstrand (2):</p>
+    <li>      anv: Re-emit all compute state on pipeline switch</li>
+    <li>      anv: Don&#x27;t leak when set_tiling fails</li>
+    <p></p>
+    <p>Kenneth Graunke (1):</p>
+    <li>      iris: Default to X-tiling for scanout buffers without modifiers</li>
+    <p></p>
+    <p>Lionel Landwerlin (7):</p>
+    <li>      intel/perf: fix invalid hw_id in query results</li>
+    <li>      intel/perf: set read buffer len to 0 to identify empty buffer</li>
+    <li>      intel/perf: take into account that reports read can be fairly old</li>
+    <li>      intel/perf: simplify the processing of OA reports</li>
+    <li>      intel/perf: fix improper pointer access</li>
+    <li>      anv: fix fence underlying primitive checks</li>
+    <li>      mesa: avoid triggering assert in implementation</li>
+    <p></p>
+    <p>Nanley Chery (2):</p>
+    <li>      gallium/dri2: Fix creation of multi-planar modifier images</li>
+    <li>      gallium: Store the image format in winsys_handle</li>
+    <p></p>
+    <p>Rob Clark (1):</p>
+    <li>      nir/lower_clip: Fix incorrect driver loc for clipdist outputs</li>
+    <p></p>
+    <p>Timothy Arceri (1):</p>
+    <li>      glsl/nir: iterate the system values list when adding varyings</li>
+    <p></p>
+    <p></p>
+</ul>
+
+</div>
+</body>
+</html>
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -4218,8 +4218,43 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
 {
 	LLVMValueRef result, tmp;

-	if (ctx->chip_class >= GFX10) {
-		result = inclusive ? src : identity;
+	if (inclusive) {
+		result = src;
+	} else if (ctx->chip_class >= GFX10) {
+		/* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
+		LLVMValueRef active, tmp1, tmp2;
+		LLVMValueRef tid = ac_get_thread_id(ctx);
+
+		tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
+
+		tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
+
+		if (maxprefix > 32) {
+			active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
+					       LLVMConstInt(ctx->i32, 32, false), "");
+
+			tmp2 = LLVMBuildSelect(ctx->builder, active,
+					       ac_build_readlane(ctx, src,
+								 LLVMConstInt(ctx->i32, 31, false)),
+					       tmp2, "");
+
+			active = LLVMBuildOr(ctx->builder, active,
+					     LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+							   LLVMBuildAnd(ctx->builder, tid,
+									LLVMConstInt(ctx->i32, 0x1f, false), ""),
+							   LLVMConstInt(ctx->i32, 0x10, false), ""), "");
+			src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+		} else if (maxprefix > 16) {
+			active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
+					       LLVMConstInt(ctx->i32, 16, false), "");
+
+			src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+		}
+
+		result = src;
+	} else if (ctx->chip_class >= GFX8) {
+		src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
+		result = src;
 	} else {
 		if (!inclusive)
 			src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
@@ -4249,33 +4284,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
 		return result;

 	if (ctx->chip_class >= GFX10) {
-		/* dpp_row_bcast{15,31} are not supported on gfx10. */
-		LLVMBuilderRef builder = ctx->builder;
 		LLVMValueRef tid = ac_get_thread_id(ctx);
-		LLVMValueRef cc;
-		/* TODO-GFX10: Can we get better code-gen by putting this into
-		 * a branch so that LLVM generates EXEC mask manipulations? */
-		if (inclusive)
-			tmp = result;
-		else
-			tmp = ac_build_alu_op(ctx, result, src, op);
-		tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
-		tmp = ac_build_alu_op(ctx, result, tmp, op);
-		cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
-		cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
-		result = LLVMBuildSelect(builder, cc, tmp, result, "");
+		LLVMValueRef active;
+
+		tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
+
+		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+				       LLVMBuildAnd(ctx->builder, tid,
+						    LLVMConstInt(ctx->i32, 16, false), ""),
+				       ctx->i32_0, "");
+
+		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+
+		result = ac_build_alu_op(ctx, result, tmp, op);
+
 		if (maxprefix <= 32)
 			return result;

-		if (inclusive)
-			tmp = result;
-		else
-			tmp = ac_build_alu_op(ctx, result, src, op);
-		tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
-		tmp = ac_build_alu_op(ctx, result, tmp, op);
-		cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
-				   LLVMConstInt(ctx->i32, 32, false), "");
-		result = LLVMBuildSelect(builder, cc, tmp, result, "");
+		tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
+
+		active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid,
+				       LLVMConstInt(ctx->i32, 32, false), "");
+
+		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+
+		result = ac_build_alu_op(ctx, result, tmp, op);
 		return result;
 	}

--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -343,7 +343,7 @@ static int gfx6_compute_level(ADDR_HANDLE addrlib,
 	    AddrSurfInfoIn->flags.depth &&
 	    surf_level->mode == RADEON_SURF_MODE_2D &&
 	    level == 0) {
-		AddrHtileIn->flags.tcCompatible = AddrSurfInfoIn->flags.tcCompatible;
+		AddrHtileIn->flags.tcCompatible = AddrSurfInfoOut->tcCompatible;
 		AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
 		AddrHtileIn->height = AddrSurfInfoOut->height;
 		AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
@@ -777,19 +777,12 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib,
 			if (level > 0)
 				continue;

-			/* Check that we actually got a TC-compatible HTILE if
-			 * we requested it (only for level 0, since we're not
-			 * supporting HTILE on higher mip levels anyway). */
-			assert(AddrSurfInfoOut.tcCompatible ||
-			       !AddrSurfInfoIn.flags.tcCompatible ||
-			       AddrSurfInfoIn.flags.matchStencilTileCfg);
+			if (!AddrSurfInfoOut.tcCompatible) {
+				AddrSurfInfoIn.flags.tcCompatible = 0;
+				surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
+			}

 			if (AddrSurfInfoIn.flags.matchStencilTileCfg) {
-				if (!AddrSurfInfoOut.tcCompatible) {
-					AddrSurfInfoIn.flags.tcCompatible = 0;
-					surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
-				}
-
 				AddrSurfInfoIn.flags.matchStencilTileCfg = 0;
 				AddrSurfInfoIn.tileIndex = AddrSurfInfoOut.tileIndex;
 				stencil_tile_idx = AddrSurfInfoOut.stencilTileIdx;
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -6001,6 +6001,8 @@ void radv_CmdWriteBufferMarkerAMD(

 	si_emit_cache_flush(cmd_buffer);

+	ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
+
 	if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
@@ -6020,4 +6022,6 @@ void radv_CmdWriteBufferMarkerAMD(
 					   va, marker,
 					   cmd_buffer->gfx9_eop_bug_va);
 	}
+
+	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1075,6 +1075,24 @@ void radv_GetPhysicalDeviceFeatures2(
 	return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 }

+static size_t
+radv_max_descriptor_set_size(void)
+{
+	/* Make sure that the entire descriptor set is addressable with a signed
+	 * 32-bit int, so the sum of all limits scaled by descriptor size has to
+	 * be at most 2 GiB. A combined image & sampler object counts as one of
+	 * both. This limit is for the pipeline layout, not for the set layout, but
+	 * there is no set limit, so we just set a pipeline limit. I don't think
+	 * any app is going to hit this soon. */
+	return ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS
+	                     - MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
+	          (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
+	           32 /* storage buffer, 32 due to potential space wasted on alignment */ +
+	           32 /* sampler, largest when combined with image */ +
+	           64 /* sampled image */ +
+	           64 /* storage image */);
+}
+
 void radv_GetPhysicalDeviceProperties(
 	VkPhysicalDevice                            physicalDevice,
 	VkPhysicalDeviceProperties*                 pProperties)
@@ -1082,18 +1100,7 @@ void radv_GetPhysicalDeviceProperties(
 	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
 	VkSampleCountFlags sample_counts = 0xf;

-	/* make sure that the entire descriptor set is addressable with a signed
-	 * 32-bit int. So the sum of all limits scaled by descriptor size has to
-	 * be at most 2 GiB. the combined image & samples object count as one of
-	 * both. This limit is for the pipeline layout, not for the set layout, but
-	 * there is no set limit, so we just set a pipeline limit. I don't think
-	 * any app is going to hit this soon. */
-	size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
-	          (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
-	           32 /* storage buffer, 32 due to potential space wasted on alignment */ +
-	           32 /* sampler, largest when combined with image */ +
-	           64 /* sampled image */ +
-	           64 /* storage image */);
+	size_t max_descriptor_set_size = radv_max_descriptor_set_size();

 	VkPhysicalDeviceLimits limits = {
 		.maxImageDimension1D                      = (1 << 14),
@@ -1362,13 +1369,7 @@ void radv_GetPhysicalDeviceProperties2(
 			properties->robustBufferAccessUpdateAfterBind = false;
 			properties->quadDivergentImplicitLod = false;

-			size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS -
-				MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
-			          (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
-			           32 /* storage buffer, 32 due to potential space wasted on alignment */ +
-			           32 /* sampler, largest when combined with image */ +
-			           64 /* sampled image */ +
-			           64 /* storage image */);
+			size_t max_descriptor_set_size = radv_max_descriptor_set_size();
 			properties->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
 			properties->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
 			properties->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1122,15 +1122,32 @@ radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
 	int ps_iter_samples = 1;
 	uint32_t mask = 0xffff;

-	if (vkms)
+	if (vkms) {
 		ms->num_samples = vkms->rasterizationSamples;
-	else
-		ms->num_samples = 1;

-	if (vkms)
-		ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
-	if (vkms && !vkms->sampleShadingEnable && pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.force_persample) {
-		ps_iter_samples = ms->num_samples;
+		/* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
+		 *
+		 * "Sample shading is enabled for a graphics pipeline:
+		 *
+		 * - If the interface of the fragment shader entry point of the
+		 *   graphics pipeline includes an input variable decorated
+		 *   with SampleId or SamplePosition. In this case
+		 *   minSampleShadingFactor takes the value 1.0.
+		 * - Else if the sampleShadingEnable member of the
+		 *   VkPipelineMultisampleStateCreateInfo structure specified
+		 *   when creating the graphics pipeline is set to VK_TRUE. In
+		 *   this case minSampleShadingFactor takes the value of
+		 *   VkPipelineMultisampleStateCreateInfo::minSampleShading.
+		 *
+		 * Otherwise, sample shading is considered disabled."
+		 */
+		if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.force_persample) {
+			ps_iter_samples = ms->num_samples;
+		} else {
+			ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
+		}
+	} else {
+		ms->num_samples = 1;
 	}

 	const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
@@ -4738,6 +4755,19 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
 	assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
 }

+static struct radv_pipeline_key
+radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
+				   const VkComputePipelineCreateInfo *pCreateInfo)
+{
+	/* Build a zero-filled key; only the "optimisations disabled" bit is
+	 * derived from the create flags. The pipeline argument is unused. */
+	struct radv_pipeline_key result;
+	memset(&result, 0, sizeof(result));
+	result.optimisations_disabled =
+		!!(pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT);
+	return result;
+}
+
 static VkResult radv_compute_pipeline_create(
 	VkDevice                                    _device,
 	VkPipelineCache                             _cache,
@@ -4770,7 +4800,11 @@ static VkResult radv_compute_pipeline_create(
 		stage_feedbacks[MESA_SHADER_COMPUTE] = &creation_feedback->pPipelineStageCreationFeedbacks[0];

 	pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
-	radv_create_shaders(pipeline, device, cache, &(struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks);
+
+	struct radv_pipeline_key key =
+		radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
+
+	radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks);

 	pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
 	pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -151,6 +151,13 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
 			((wrmask >> (i * 4)) & 0xf) << comp;
 }

+static void
+set_writes_memory(const nir_shader *nir, struct radv_shader_info *info)
+{
+	if (nir->info.stage == MESA_SHADER_FRAGMENT) /* recorded for fragment shaders only, in info->ps */
+		info->ps.writes_memory = true;
+}
+
 static void
 gather_intrinsic_store_deref_info(const nir_shader *nir,
 				const nir_intrinsic_instr *instr,
@@ -304,8 +311,7 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
 		    instr->intrinsic == nir_intrinsic_image_deref_atomic_xor ||
 		    instr->intrinsic == nir_intrinsic_image_deref_atomic_exchange ||
 		    instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) {
-			if (nir->info.stage == MESA_SHADER_FRAGMENT)
-				info->ps.writes_memory = true;
+			set_writes_memory(nir, info);
 		}
 		break;
 	}
@@ -320,15 +326,28 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
 	case nir_intrinsic_ssbo_atomic_xor:
 	case nir_intrinsic_ssbo_atomic_exchange:
 	case nir_intrinsic_ssbo_atomic_comp_swap:
-		if (nir->info.stage == MESA_SHADER_FRAGMENT)
-			info->ps.writes_memory = true;
+		set_writes_memory(nir, info);
 		break;
 	case nir_intrinsic_load_deref:
 		gather_intrinsic_load_deref_info(nir, instr, info);
 		break;
 	case nir_intrinsic_store_deref:
 		gather_intrinsic_store_deref_info(nir, instr, info);
+		/* fallthrough */
+	case nir_intrinsic_deref_atomic_add:
+	case nir_intrinsic_deref_atomic_imin:
+	case nir_intrinsic_deref_atomic_umin:
+	case nir_intrinsic_deref_atomic_imax:
+	case nir_intrinsic_deref_atomic_umax:
+	case nir_intrinsic_deref_atomic_and:
+	case nir_intrinsic_deref_atomic_or:
+	case nir_intrinsic_deref_atomic_xor:
+	case nir_intrinsic_deref_atomic_exchange:
+	case nir_intrinsic_deref_atomic_comp_swap: {
+		if (nir_src_as_deref(instr->src[0])->mode & (nir_var_mem_global | nir_var_mem_ssbo))
+			set_writes_memory(nir, info);
 		break;
+	}
 	default:
 		break;
 	}
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -58,6 +58,6 @@ libbroadcom_cle = static_library(
  'v3d_decoder.c',
  include_directories : [inc_common, inc_broadcom],
  c_args : [c_vis_args, no_override_init_args],
-  dependencies : [dep_libdrm, dep_valgrind],
+  dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib],
  build_by_default : false,
 )
--- a/src/compiler/glsl/gl_nir_linker.c
+++ b/src/compiler/glsl/gl_nir_linker.c
@@ -34,32 +34,11 @@
 */

 static bool
-add_interface_variables(const struct gl_context *cts,
-                        struct gl_shader_program *prog,
-                        struct set *resource_set,
-                        unsigned stage, GLenum programInterface)
+add_vars_from_list(const struct gl_context *ctx,
+                   struct gl_shader_program *prog, struct set *resource_set,
+                   const struct exec_list *var_list, unsigned stage,
+                   GLenum programInterface)
 {
-   const struct exec_list *var_list = NULL;
-
-   struct gl_linked_shader *sh = prog->_LinkedShaders[stage];
-   if (!sh)
-      return true;
-
-   nir_shader *nir = sh->Program->nir;
-   assert(nir);
-
-   switch (programInterface) {
-   case GL_PROGRAM_INPUT:
-      var_list = &nir->inputs;
-      break;
-   case GL_PROGRAM_OUTPUT:
-      var_list = &nir->outputs;
-      break;
-   default:
-      assert("!Should not get here");
-      break;
-   }
-
   nir_foreach_variable(var, var_list) {
      if (var->data.how_declared == nir_var_hidden)
         continue;
@@ -108,6 +87,38 @@ add_interface_variables(const struct gl_context *cts,
   return true;
 }

+static bool
+add_interface_variables(const struct gl_context *ctx,
+                        struct gl_shader_program *prog,
+                        struct set *resource_set,
+                        unsigned stage, GLenum programInterface)
+{
+   struct gl_linked_shader *sh = prog->_LinkedShaders[stage];
+   if (!sh)
+      return true; /* no linked shader for this stage: nothing to add */
+
+   nir_shader *nir = sh->Program->nir;
+   assert(nir);
+   /* Dispatch to add_vars_from_list() for the list(s) matching programInterface. */
+   switch (programInterface) {
+   case GL_PROGRAM_INPUT: { /* inputs also cover the system-value list */
+      bool result = add_vars_from_list(ctx, prog, resource_set,
+                                       &nir->inputs, stage, programInterface);
+      result &= add_vars_from_list(ctx, prog, resource_set, &nir->system_values,
+                                   stage, programInterface);
+      return result;
+   }
+   case GL_PROGRAM_OUTPUT:
+      return add_vars_from_list(ctx, prog, resource_set, &nir->outputs, stage,
+                                programInterface);
+   default:
+      assert(!"Should not get here"); /* '!' outside the literal so this can fire */
+      break;
+   }
+   /* Only reachable when asserts are compiled out: report failure. */
+   return false;
+}
+
 /* TODO: as we keep adding features, this method is becoming more and more
 * similar to its GLSL counterpart at linker.cpp. Eventually it would be good
 * to check if they could be refactored, and reduce code duplication somehow
--- a/src/compiler/nir/nir_lower_clip.c
+++ b/src/compiler/nir/nir_lower_clip.c
@@ -262,6 +262,17 @@ nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables, bool use_vars)
   if (!ucp_enables)
      return false;

+   /* find the highest driver location used by the existing outputs: */
+   nir_foreach_variable(var, &shader->outputs) {
+      int loc = var->data.driver_location;
+
+      /* keep track of last used driver-location.. we'll be
+       * appending CLIP_DIST0/CLIP_DIST1 after last existing
+       * output:
+       */
+      maxloc = MAX2(maxloc, loc);
+   }
+
   nir_builder_init(&b, impl);

   /* NIR should ensure that, even in case of loops/if-else, there
--- a/src/gallium/auxiliary/draw/draw_pt_decompose.h
+++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h
@@ -3,8 +3,6 @@
   const boolean quads_flatshade_last =      \
      draw->quads_always_flatshade_last;     \
   const boolean last_vertex_last =          \
-      !(draw->rasterizer->flatshade &&       \
-        draw->rasterizer->flatshade_first);
-/* FIXME: the draw->rasterizer->flatshade part is really wrong */
+      !draw->rasterizer->flatshade_first;

 #include "draw_decompose_tmp.h"
--- a/src/gallium/auxiliary/util/u_prim.h
+++ b/src/gallium/auxiliary/util/u_prim.h
@@ -338,7 +338,14 @@ u_stream_outputs_for_vertices(enum pipe_prim_type primitive, unsigned nr)
   /* Extraneous vertices don't contribute to stream outputs */
   u_trim_pipe_prim(primitive, &nr);

-   /* Consider how many primitives are actually generated */
+   /* Polygons are special, since they are a single primitive with many
+    * vertices. In this case, we just have an output for each vertex (after
+    * trimming) */
+
+   if (primitive == PIPE_PRIM_POLYGON)
+      return nr;
+
+   /* Normally, consider how many primitives are actually generated */
   unsigned prims = u_decomposed_prims_for_vertices(primitive, nr);

   /* One output per vertex after decomposition */
--- a/src/gallium/drivers/iris/iris_resource.c
+++ b/src/gallium/drivers/iris/iris_resource.c
@@ -739,6 +739,8 @@ iris_resource_create_with_modifiers(struct pipe_screen *pscreen,
      if (templ->usage == PIPE_USAGE_STAGING ||
          templ->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR) )
         tiling_flags = ISL_TILING_LINEAR_BIT;
+      else if (templ->bind & PIPE_BIND_SCANOUT)
+         tiling_flags = ISL_TILING_X_BIT;
   }

   isl_surf_usage_flags_t usage = pipe_bind_to_isl_usage(templ->bind);
--- a/src/gallium/drivers/virgl/virgl_encode.c
+++ b/src/gallium/drivers/virgl/virgl_encode.c
@@ -492,12 +492,13 @@ int virgl_encode_shader_state(struct virgl_context *ctx,
         if (virgl_debug & VIRGL_DEBUG_VERBOSE)
            debug_printf("Failed to translate shader in available space - trying again\n");
         old_size = str_total_size;
-         str_total_size = 65536 * ++retry_size;
+         str_total_size = 65536 * retry_size;
+         retry_size *= 2;
         str = REALLOC(str, old_size, str_total_size);
         if (!str)
            return -1;
      }
-   } while (bret == false && retry_size < 10);
+   } while (bret == false && retry_size < 1024);

   if (bret == false)
      return -1;
--- a/src/gallium/include/state_tracker/winsys_handle.h
+++ b/src/gallium/include/state_tracker/winsys_handle.h
@@ -49,6 +49,12 @@ struct winsys_handle
    */
   unsigned offset;

+   /**
+    * Input to resource_from_handle.
+    * Output from resource_get_handle.
+    */
+   uint64_t format;
+
   /**
    * Input to resource_from_handle.
    * Output from resource_get_handle.
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -529,6 +529,7 @@ dri2_allocate_textures(struct dri_context *ctx,
         whandle.handle = buf->name;
         whandle.stride = buf->pitch;
         whandle.offset = 0;
+         whandle.format = format;
         whandle.modifier = DRM_FORMAT_MOD_INVALID;
         if (screen->can_share_buffer)
            whandle.type = WINSYS_HANDLE_TYPE_SHARED;
@@ -759,18 +760,12 @@ dri2_create_image_from_winsys(__DRIscreen *_screen,
   for (i = num_handles - 1; i >= 0; i--) {
      struct pipe_resource *tex;

-      if (whandle[i].modifier == DRM_FORMAT_MOD_INVALID) {
-         templ.width0 = width >> map->planes[i].width_shift;
-         templ.height0 = height >> map->planes[i].height_shift;
-         if (is_yuv)
-            templ.format = dri2_get_pipe_format_for_dri_format(map->planes[i].dri_format);
-         else
-            templ.format = map->pipe_format;
-      } else {
-         templ.width0 = width;
-         templ.height0 = height;
+      templ.width0 = width >> map->planes[i].width_shift;
+      templ.height0 = height >> map->planes[i].height_shift;
+      if (is_yuv)
+         templ.format = dri2_get_pipe_format_for_dri_format(map->planes[i].dri_format);
+      else
         templ.format = map->pipe_format;
-      }
      assert(templ.format != PIPE_FORMAT_NONE);

      tex = pscreen->resource_from_handle(pscreen,
@@ -808,6 +803,7 @@ dri2_create_image_from_name(__DRIscreen *_screen,
   memset(&whandle, 0, sizeof(whandle));
   whandle.type = WINSYS_HANDLE_TYPE_SHARED;
   whandle.handle = name;
+   whandle.format = map->pipe_format;
   whandle.modifier = DRM_FORMAT_MOD_INVALID;

   whandle.stride = pitch * util_format_get_blocksize(map->pipe_format);
@@ -826,8 +822,13 @@ dri2_create_image_from_name(__DRIscreen *_screen,
 }

 static unsigned
-dri2_get_modifier_num_planes(uint64_t modifier)
+dri2_get_modifier_num_planes(uint64_t modifier, int fourcc)
 {
+   const struct dri2_format_mapping *map = dri2_get_mapping_by_fourcc(fourcc);
+
+   if (!map)
+      return 0;
+
   switch (modifier) {
   case I915_FORMAT_MOD_Y_TILED_CCS:
      return 2;
@@ -849,8 +850,8 @@ dri2_get_modifier_num_planes(uint64_t modifier)
   /* FD_FORMAT_MOD_QCOM_TILED is not in drm_fourcc.h */
   case I915_FORMAT_MOD_X_TILED:
   case I915_FORMAT_MOD_Y_TILED:
-      return 1;
   case DRM_FORMAT_MOD_INVALID:
+      return map->nplanes;
   default:
      return 0;
   }
@@ -868,15 +869,13 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
   __DRIimage *img = NULL;
   unsigned err = __DRI_IMAGE_ERROR_SUCCESS;
   int i, expected_num_fds;
-   uint64_t mod_planes = dri2_get_modifier_num_planes(modifier);
+   int num_handles = dri2_get_modifier_num_planes(modifier, fourcc);

-   if (!map || (modifier != DRM_FORMAT_MOD_INVALID && mod_planes == 0)) {
+   if (!map || num_handles == 0) {
      err = __DRI_IMAGE_ERROR_BAD_MATCH;
      goto exit;
   }

-   int num_handles = mod_planes > 0 ? mod_planes : map->nplanes;
-
   switch (fourcc) {
   case __DRI_IMAGE_FOURCC_YUYV:
   case __DRI_IMAGE_FOURCC_UYVY:
@@ -896,7 +895,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen,

   for (i = 0; i < num_handles; i++) {
      int fdnum = i >= num_fds ? 0 : i;
-      int index = mod_planes > 0 ? i : map->planes[i].buffer_index;
+      int index = i >= map->nplanes ? i : map->planes[i].buffer_index;
      if (fds[fdnum] < 0) {
         err = __DRI_IMAGE_ERROR_BAD_ALLOC;
         goto exit;
@@ -906,6 +905,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
      whandles[i].handle = (unsigned)fds[fdnum];
      whandles[i].stride = (unsigned)strides[index];
      whandles[i].offset = (unsigned)offsets[index];
+      whandles[i].format = map->pipe_format;
      whandles[i].modifier = modifier;
      whandles[i].plane = index;
   }
@@ -1296,6 +1296,7 @@ dri2_from_names(__DRIscreen *screen, int width, int height, int format,
   whandle.handle = names[0];
   whandle.stride = strides[0];
   whandle.offset = offsets[0];
+   whandle.format = map->pipe_format;
   whandle.modifier = DRM_FORMAT_MOD_INVALID;

   img = dri2_create_image_from_winsys(screen, width, height, map,
@@ -1393,7 +1394,7 @@ dri2_query_dma_buf_format_modifier_attribs(__DRIscreen *_screen,
 {
   switch (attrib) {
   case __DRI_IMAGE_FORMAT_MODIFIER_ATTRIB_PLANE_COUNT: {
-      uint64_t mod_planes = dri2_get_modifier_num_planes(modifier);
+      uint64_t mod_planes = dri2_get_modifier_num_planes(modifier, fourcc);
      if (mod_planes > 0)
         *value = mod_planes;
      return mod_planes > 0;
@@ -1861,8 +1862,6 @@ static void
 dri2_set_damage_region(__DRIdrawable *dPriv, unsigned int nrects, int *rects)
 {
   struct dri_drawable *drawable = dri_drawable(dPriv);
-   struct pipe_resource *resource = drawable->textures[ST_ATTACHMENT_BACK_LEFT];
-   struct pipe_screen *screen = resource->screen;
   struct pipe_box *boxes = NULL;

   if (nrects) {
@@ -1876,8 +1875,25 @@ dri2_set_damage_region(__DRIdrawable *dPriv, unsigned int nrects, int *rects)
      }
   }

-   screen->set_damage_region(screen, resource, nrects, boxes);
-   FREE(boxes);
+   FREE(drawable->damage_rects);
+   drawable->damage_rects = boxes;
+   drawable->num_damage_rects = nrects;
+
+   /* Only apply the damage region if the BACK_LEFT texture is up-to-date. */
+   if (drawable->texture_stamp == drawable->dPriv->lastStamp &&
+       (drawable->texture_mask & (1 << ST_ATTACHMENT_BACK_LEFT))) {
+      struct pipe_screen *screen = drawable->screen->base.screen;
+      struct pipe_resource *resource;
+
+      if (drawable->stvis.samples > 1)
+         resource = drawable->msaa_textures[ST_ATTACHMENT_BACK_LEFT];
+      else
+         resource = drawable->textures[ST_ATTACHMENT_BACK_LEFT];
+
+      screen->set_damage_region(screen, resource,
+                                drawable->num_damage_rects,
+                                drawable->damage_rects);
+   }
 }

 static __DRI2bufferDamageExtension dri2BufferDamageExtension = {
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -95,6 +95,18 @@ dri_st_framebuffer_validate(struct st_context_iface *stctx,
      }
   } while (lastStamp != drawable->dPriv->lastStamp);

+   /* Flush the pending set_damage_region request. */
+   struct pipe_screen *pscreen = screen->base.screen;
+
+   if (new_mask & (1 << ST_ATTACHMENT_BACK_LEFT) &&
+       pscreen->set_damage_region) {
+      struct pipe_resource *resource = textures[ST_ATTACHMENT_BACK_LEFT];
+
+      pscreen->set_damage_region(pscreen, resource,
+                                 drawable->num_damage_rects,
+                                 drawable->damage_rects);
+   }
+
   if (!out)
      return true;

@@ -202,6 +214,7 @@ dri_destroy_buffer(__DRIdrawable * dPriv)
   /* Notify the st manager that this drawable is no longer valid */
   stapi->destroy_drawable(stapi, &drawable->base);

+   FREE(drawable->damage_rects);
   FREE(drawable);
 }

--- a/src/gallium/state_trackers/dri/dri_drawable.h
+++ b/src/gallium/state_trackers/dri/dri_drawable.h
@@ -56,6 +56,9 @@ struct dri_drawable
   unsigned old_w;
   unsigned old_h;

+   struct pipe_box *damage_rects;
+   unsigned int num_damage_rects;
+
   struct pipe_resource *textures[ST_ATTACHMENT_COUNT];
   struct pipe_resource *msaa_textures[ST_ATTACHMENT_COUNT];
   unsigned int texture_mask, texture_stamp;
--- a/src/gallium/targets/d3dadapter9/meson.build
+++ b/src/gallium/targets/d3dadapter9/meson.build
@@ -28,12 +28,24 @@ nine_version = ['1', '0', '0']
 gallium_nine_c_args = []
 gallium_nine_ld_args = []
 gallium_nine_link_depends = []
+gallium_nine_link_with = [
+    libgallium, libnine_st,
+    libpipe_loader_static, libws_null, libwsw, libswdri,
+    libswkmsdri,
+]

 if with_ld_version_script
  gallium_nine_ld_args += ['-Wl,--version-script', join_paths(meson.current_source_dir(), 'd3dadapter9.sym')]
  gallium_nine_link_depends += files('d3dadapter9.sym')
 endif

+if (with_gallium_va or with_gallium_vdpau or with_gallium_omx != 'disabled' or
+    with_gallium_xvmc or with_dri)
+  gallium_nine_link_with += libgalliumvl
+else
+  gallium_nine_link_with += libgalliumvl_stub
+endif
+
 libgallium_nine = shared_library(
  'd3dadapter9',
  files('description.c', 'getproc.c', 'drm.c'),
@@ -47,11 +59,7 @@ libgallium_nine = shared_library(
  cpp_args : [cpp_vis_args],
  link_args : [ld_args_gc_sections, gallium_nine_ld_args],
  link_depends : gallium_nine_link_depends,
-  link_with : [
-    libgalliumvl_stub, libgallium, libnine_st,
-    libpipe_loader_static, libws_null, libwsw, libswdri,
-    libswkmsdri, libnir,
-  ],
+  link_with : gallium_nine_link_with,
  dependencies : [
    dep_selinux, dep_libdrm, dep_llvm, dep_thread, idep_xmlconfig, idep_mesautil,
    driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau,
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -326,7 +326,6 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
   aws = util_hash_table_get(dev_tab, dev);
   if (aws) {
      pipe_reference(NULL, &aws->reference);
-      simple_mtx_unlock(&dev_tab_mutex);

      /* Release the device handle, because we don't need it anymore.
       * This function is returning an existing winsys instance, which
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3368,7 +3368,14 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,

         if (alu != NULL &&
             alu->op != nir_op_bcsel &&
-             alu->op != nir_op_inot) {
+             alu->op != nir_op_inot &&
+             (devinfo->gen > 5 ||
+              (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE ||
+              alu->op == nir_op_fne32 || alu->op == nir_op_feq32 ||
+              alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
+              alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
+              alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
+              alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
            /* Re-emit the instruction that generated the Boolean value, but
             * do not store it.  Since this instruction will be conditional,
             * other instructions that want to use the real Boolean value may
--- a/src/intel/perf/gen_perf.c
+++ b/src/intel/perf/gen_perf.c
@@ -69,6 +69,8 @@
 #define MAP_READ  (1 << 0)
 #define MAP_WRITE (1 << 1)

+#define OA_REPORT_INVALID_CTX_ID (0xffffffff)
+
 /**
 * Periodic OA samples are read() into these buffer structures via the
 * i915 perf kernel interface and appended to the
@@ -997,7 +999,9 @@ query_result_accumulate(struct gen_perf_query_result *result,
 {
   int i, idx = 0;

-   result->hw_id = start[2];
+   if (result->hw_id == OA_REPORT_INVALID_CTX_ID &&
+       start[2] != OA_REPORT_INVALID_CTX_ID)
+      result->hw_id = start[2];
   result->reports_accumulated++;

   switch (query->oa_format) {
@@ -1035,7 +1039,7 @@ static void
 query_result_clear(struct gen_perf_query_result *result)
 {
   memset(result, 0, sizeof(*result));
-   result->hw_id = 0xffffffff; /* invalid */
+   result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
 }

 static void
@@ -1316,8 +1320,8 @@ get_free_sample_buf(struct gen_perf_context *perf_ctx)

      exec_node_init(&buf->link);
      buf->refcount = 0;
-      buf->len = 0;
   }
+   buf->len = 0;

   return buf;
 }
@@ -1834,7 +1838,8 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx,
      exec_list_get_tail(&perf_ctx->sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);
-   uint32_t last_timestamp = tail_buf->last_timestamp;
+   uint32_t last_timestamp =
+      tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp;

   while (1) {
      struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
@@ -1849,12 +1854,13 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx,
         exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);

         if (len < 0) {
-            if (errno == EAGAIN)
-               return ((last_timestamp - start_timestamp) >=
+            if (errno == EAGAIN) {
+               return ((last_timestamp - start_timestamp) < INT32_MAX &&
+                       (last_timestamp - start_timestamp) >=
                       (end_timestamp - start_timestamp)) ?
                      OA_READ_STATUS_FINISHED :
                      OA_READ_STATUS_UNFINISHED;
-            else {
+            } else {
               DBG("Error reading i915 perf samples: %m\n");
            }
         } else
@@ -2070,6 +2076,17 @@ discard_all_queries(struct gen_perf_context *perf_ctx)
   }
 }

+/* Checks dword 0's valid bit for the context ID (dword 2) of an OA report. */
+static bool
+oa_report_ctx_id_valid(const struct gen_device_info *devinfo,
+                       const uint32_t *report)
+{
+   assert(devinfo->gen >= 8);
+   if (devinfo->gen == 8)
+      return (report[0] & (1 << 25)) != 0;
+   return (report[0] & (1 << 16)) != 0;
+}
+
 /**
 * Accumulate raw OA counter values based on deltas between pairs of
 * OA reports.
@@ -2097,7 +2114,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
   uint32_t *last;
   uint32_t *end;
   struct exec_node *first_samples_node;
-   bool in_ctx = true;
+   bool last_report_ctx_match = true;
   int out_duration = 0;

   assert(query->oa.map != NULL);
@@ -2126,7 +2143,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
   first_samples_node = query->oa.samples_head->next;

   foreach_list_typed_from(struct oa_sample_buf, buf, link,
-                           &perf_ctx.sample_buffers,
+                           &perf_ctx->sample_buffers,
                           first_samples_node)
   {
      int offset = 0;
@@ -2143,6 +2160,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
         switch (header->type) {
         case DRM_I915_PERF_RECORD_SAMPLE: {
            uint32_t *report = (uint32_t *)(header + 1);
+            bool report_ctx_match = true;
            bool add = true;

            /* Ignore reports that come before the start marker.
@@ -2171,35 +2189,30 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
             * of OA counters while any other context is acctive.
             */
            if (devinfo->gen >= 8) {
-               if (in_ctx && report[2] != query->oa.result.hw_id) {
-                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
-                  in_ctx = false;
+               /* Consider that the current report matches our context only if
+                * the report says its context ID is valid.
+                */
+               report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
+                  report[2] == start[2];
+               if (report_ctx_match)
                  out_duration = 0;
-               } else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
-                  DBG("i915 perf: Switch TO\n");
-                  in_ctx = true;
-
-                  /* From experimentation in IGT, we found that the OA unit
-                   * might label some report as "idle" (using an invalid
-                   * context ID), right after a report for a given context.
-                   * Deltas generated by those reports actually belong to the
-                   * previous context, even though they're not labelled as
-                   * such.
-                   *
-                   * We didn't *really* Switch AWAY in the case that we e.g.
-                   * saw a single periodic report while idle...
-                   */
-                  if (out_duration >= 1)
-                     add = false;
-               } else if (in_ctx) {
-                  assert(report[2] == query->oa.result.hw_id);
-                  DBG("i915 perf: Continuation IN\n");
-               } else {
-                  assert(report[2] != query->oa.result.hw_id);
-                  DBG("i915 perf: Continuation OUT\n");
-                  add = false;
+               else
                  out_duration++;
-               }
+
+               /* Only add the delta between <last, report> if the last report
+                * was clearly identified as our context, or if we have at most
+                * 1 report without a matching ID.
+                *
+                * The OA unit will sometimes label reports with an invalid
+                * context ID when i915 rewrites the execlist submit register
+                * with the same context as the one currently running. This
+                * happens when i915 wants to notify the HW of ringbuffer tail
+                * register update. We have to consider this report as part of
+                * our context as the 3d pipeline behind the OACS unit is still
+                * processing the operations started at the previous execlist
+                * submission.
+                */
+               add = last_report_ctx_match && out_duration < 2;
            }

            if (add) {
@@ -2208,6 +2221,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
            }

            last = report;
+            last_report_ctx_match = report_ctx_match;

            break;
         }
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1640,7 +1640,7 @@ void anv_GetPhysicalDeviceProperties2(
         VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *properties =
            (VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *)ext;
         properties->filterMinmaxImageComponentMapping = pdevice->info.gen >= 9;
-         properties->filterMinmaxSingleComponentFormats = true;
+         properties->filterMinmaxSingleComponentFormats = pdevice->info.gen >= 9;
         break;
      }

@@ -3098,9 +3098,10 @@ VkResult anv_AllocateMemory(
                                      i915_tiling);
         if (ret) {
            anv_bo_cache_release(device, &device->bo_cache, mem->bo);
-            return vk_errorf(device->instance, NULL,
-                             VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                             "failed to set BO tiling: %m");
+            result = vk_errorf(device->instance, NULL,
+                               VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                               "failed to set BO tiling: %m");
+            goto fail;
         }
      }
   }
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -681,7 +681,11 @@ anv_wait_for_fences(struct anv_device *device,
   if (fenceCount <= 1 || waitAll) {
      for (uint32_t i = 0; i < fenceCount; i++) {
         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-         switch (fence->permanent.type) {
+         struct anv_fence_impl *impl =
+            fence->temporary.type != ANV_FENCE_TYPE_NONE ?
+            &fence->temporary : &fence->permanent;
+
+         switch (impl->type) {
         case ANV_FENCE_TYPE_BO:
            result = anv_wait_for_bo_fences(device, 1, &pFences[i],
                                            true, abs_timeout);
@@ -716,7 +720,10 @@ static bool anv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences)
 {
   for (uint32_t i = 0; i < fenceCount; ++i) {
      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-      if (fence->permanent.type != ANV_FENCE_TYPE_SYNCOBJ)
+      struct anv_fence_impl *impl =
+         fence->temporary.type != ANV_FENCE_TYPE_NONE ?
+         &fence->temporary : &fence->permanent;
+      if (impl->type != ANV_FENCE_TYPE_SYNCOBJ)
         return false;
   }
   return true;
@@ -726,7 +733,10 @@ static bool anv_all_fences_bo(uint32_t fenceCount, const VkFence *pFences)
 {
   for (uint32_t i = 0; i < fenceCount; ++i) {
      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-      if (fence->permanent.type != ANV_FENCE_TYPE_BO)
+      struct anv_fence_impl *impl =
+         fence->temporary.type != ANV_FENCE_TYPE_NONE ?
+         &fence->temporary : &fence->permanent;
+      if (impl->type != ANV_FENCE_TYPE_BO)
         return false;
   }
   return true;
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -3803,6 +3803,13 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
         vfe.NumberofURBEntries     = 2;
         vfe.URBEntryAllocationSize = 2;
      }
+
+      /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
+       * invalid. Set the compute pipeline to dirty to force a re-emit of the
+       * pipeline in case we get back-to-back dispatch calls with the same
+       * pipeline and a PIPELINE_SELECT in between.
+       */
+      cmd_buffer->state.compute.pipeline_dirty = true;
   }
 #endif

--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -369,8 +369,8 @@ emit_3dstate_sbe(struct anv_pipeline *pipeline)
      if (input_index < 0)
         continue;

-      /* gl_Layer is stored in the VUE header */
-      if (attr == VARYING_SLOT_LAYER) {
+      /* gl_ViewportIndex and gl_Layer are stored in the VUE header */
+      if (attr == VARYING_SLOT_VIEWPORT || attr == VARYING_SLOT_LAYER) {
         urb_entry_read_offset = 0;
         continue;
      }
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -35,9 +35,7 @@ i965_FILES = \
 	brw_object_purgeable.c \
 	brw_pipe_control.c \
 	brw_pipe_control.h \
-	brw_performance_query.h \
 	brw_performance_query.c \
-	brw_performance_query_metrics.h \
 	brw_program.c \
 	brw_program.h \
 	brw_program_binary.c \
--- a/src/mesa/main/performance_query.c
+++ b/src/mesa/main/performance_query.c
@@ -48,6 +48,12 @@ free_performance_query(GLuint key, void *data, void *user)
   struct gl_perf_query_object *m = data;
   struct gl_context *ctx = user;

+   /* Don't confuse the implementation by deleting an active query. We can
+    * toggle Active/Used to false because we're tearing down the GL context
+    * and it's already idle (see _mesa_free_context_data).
+    */
+   m->Active = false;
+   m->Used = false;
   ctx->Driver.DeletePerfQuery(ctx, m);
 }
@@ -1 +1 @@
 .2.6
 .2.8