docs: add release notes for 17.1.4

Signed-off-by: Andres Gomez <agomez@igalia.com>
Update version to 17.1.4
2017-06-30 20:19:05 +03:00 · 2017-06-30 20:09:50 +03:00 · 2017-06-28 20:17:13 +03:00 · 2017-06-28 20:17:13 +03:00 · 2017-06-28 20:17:13 +03:00 · 2017-06-28 20:17:13 +03:00
96 changed files with 1335 additions and 502 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -59,6 +59,7 @@ LOCAL_CFLAGS += \
 	-DHAVE_PTHREAD=1 \
 	-DHAVE_DLOPEN \
 	-DHAVE_DL_ITERATE_PHDR \
+	-DMAJOR_IN_SYSMACROS \
 	-fvisibility=hidden \
 	-Wno-sign-compare

--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-17.1.2
+17.1.4
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -1,4 +1,14 @@
-# This commit depends on 9fd9a7d0ba3 and 678d568c7b2, neither of which is in branch.
+# stable: This commit depends on 9fd9a7d0ba3 and 678d568c7b2, neither
+#         of which is in branch.
 b84b631c6381d9b36bca5d0e7cc67dd23af188c1 radeonsi: load patch_id for TES-as-ES when exporting for PS
-# This commit addressed an earlier commit 126d5ad which did not land in branch.
+# fixes:  This commit addressed an earlier commit 126d5ad which did not
+#         land in branch.
 9da104593386f6e8ddec8f0d9d288aceb8908fe1 radv: fix regression in descriptor set freeing.
+# stable: This commit addressed an earlier commit 944455217b which did
+#         not land in branch.
+b28938ffce0580e89e6012826900da2b6013b0df st/glsl_to_tgsi: use correct writemask when converting generic intrinsics
+# stable: This commit depends on 330d0607e and 61d8f3387d, neither of
+#         which is in branch.
+c12f8305a8ae4fd5d78a9ab8bbda790a711d5bed nv50,nvc0: remove IDX from bufctx immediately, to avoid conflicts with clear
+# fixes:  Genuine false positive.
+5d87667fed1bd5ab850abdfb3a10db8c8c21c330 bin/get-fixes-pick-list.sh: better identify multiple "fixes:" tags
--- a/configure.ac
+++ b/configure.ac
@@ -97,7 +97,7 @@ XSHMFENCE_REQUIRED=1.1
 XVMC_REQUIRED=1.0.6
 PYTHON_MAKO_REQUIRED=0.8.0
 LIBSENSORS_REQUIRED=4.0.0
-ZLIB_REQUIRED=1.2.8
+ZLIB_REQUIRED=1.2.3

 dnl LLVM versions
 LLVM_REQUIRED_GALLIUM=3.3.0
@@ -837,6 +837,11 @@ dnl is not valid for that platform.
 if test "x$android" = xno; then
    test -z "$PTHREAD_LIBS" && PTHREAD_LIBS="-lpthread"
 fi
+dnl According to the manual when using pthreads, one should add -pthread to
+dnl both compile and link-time arguments.
+dnl In practice that should be sufficient for all platforms, since any
+dnl platforms built with GCC and Clang support the flag.
+PTHREAD_LIBS="$PTHREAD_LIBS -pthread"

 dnl pthread-stubs is mandatory on BSD platforms, due to the nature of the
 dnl project. Even then there's a notable issue as described in the project README
@@ -2476,10 +2481,10 @@ if test -n "$with_gallium_drivers"; then
        xswr)
            llvm_require_version $LLVM_REQUIRED_SWR "swr"

-            swr_require_cxx_feature_flags "C++14" "__cplusplus >= 201402L" \
-                "-std=c++14" \
-                SWR_CXX14_CXXFLAGS
-            AC_SUBST([SWR_CXX14_CXXFLAGS])
+            swr_require_cxx_feature_flags "C++11" "__cplusplus >= 201103L" \
+                ",-std=c++11" \
+                SWR_CXX11_CXXFLAGS
+            AC_SUBST([SWR_CXX11_CXXFLAGS])

            swr_require_cxx_feature_flags "AVX" "defined(__AVX__)" \
                ",-mavx,-march=core-avx" \
--- a/docs/relnotes/17.1.2.html
+++ b/docs/relnotes/17.1.2.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+0d2020c2115db0d13a5be0075abf0da143290f69f5817a2f277861e89166a3e1  mesa-17.1.2.tar.gz
+0937804f43746339b1f9540d8f9c8b4a1bb3d3eec0e4020eac283b8799798239  mesa-17.1.2.tar.xz
 </pre>


--- a/docs/relnotes/17.1.3.html
+++ b/docs/relnotes/17.1.3.html
@@ -0,0 +1,156 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.1.3 Release Notes / June 19, 2017</h1>
+
+<p>
+Mesa 17.1.3 is a bug fix release which fixes bugs found since the 17.1.2 release.
+</p>
+<p>
+Mesa 17.1.3 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+81ae9127286ff8d631e466d258608d6dea9854fe7bee2e8521da44c7544f01e5  mesa-17.1.3.tar.gz
+5f1ee9a8aea2880f887884df2dea0c16dd1b13eb42fd2e52265db0dc1b380e8c  mesa-17.1.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100988">Bug 100988</a> - glXGetCurrentDisplay() no longer works for FakeGLX contexts?</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Bas Nieuwenhuizen (3):</p>
+<ul>
+  <li>radv: Set both compute and graphics SGPRS on descriptor set flush.</li>
+  <li>radv: Dirty all descriptors sets when changing the pipeline.</li>
+  <li>radv: Remove SI num RB override for occlusion queries.</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>xlib: fix glXGetCurrentDisplay() failure</li>
+</ul>
+
+<p>Chad Versace (1):</p>
+<ul>
+  <li>i965/dri: Fix bad GL error in intel_create_winsys_renderbuffer()</li>
+</ul>
+
+<p>Chuck Atkins (1):</p>
+<ul>
+  <li>configure.ac: Reduce zlib requirement from 1.2.8 to 1.2.3.</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>radv: expose integrated device type for APUs.</li>
+  <li>radv: set fmask state to all 0s when no fmask. (v2)</li>
+  <li>glsl/lower_distance: only set max_array_access for 1D clip dist arrays</li>
+</ul>
+
+<p>Emil Velikov (1):</p>
+<ul>
+  <li>Update version to 17.1.3</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>radv: fix trace dumping for !use_ib_bos</li>
+</ul>
+
+<p>Jason Ekstrand (4):</p>
+<ul>
+  <li>i965/blorp: Take a layer range in intel_hiz_exec</li>
+  <li>i965: Move the pre-depth-clear flush/stalls to intel_hiz_exec</li>
+  <li>i965: Perform HiZ flush/stall prior to HiZ resolves</li>
+  <li>i965: Mark depth surfaces as needing a HiZ resolve after blitting</li>
+</ul>
+
+<p>José Fonseca (1):</p>
+<ul>
+  <li>automake: Link all libGL.so variants with -Bsymbolic.</li>
+</ul>
+
+<p>Juan A. Suarez Romero (1):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.1.2</li>
+</ul>
+
+<p>Lucas Stach (1):</p>
+<ul>
+  <li>etnaviv: always do cpu_fini in transfer_unmap</li>
+</ul>
+
+<p>Lyude (1):</p>
+<ul>
+  <li>nvc0: disable BGRA8 images on Fermi</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>st/mesa: don't load cached TGSI shaders on demand</li>
+  <li>radeonsi: fix a GPU hang with tessellation on 2-CU configs</li>
+  <li>radeonsi: disable the patch ID workaround on SI when the patch ID isn't used (v2)</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>radv: fewer than 8 RBs are possible</li>
+</ul>
+
+<p>Nicolas Dechesne (1):</p>
+<ul>
+  <li>util/rand_xor: add missing include statements</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>egl: fix _eglQuerySurface in EGL_BUFFER_AGE_EXT case</li>
+</ul>
+
+<p>Thomas Hellstrom (1):</p>
+<ul>
+  <li>dri3/GLX: Fix drawable invalidation v2</li>
+</ul>
+
+<p>Tim Rowley (1):</p>
+<ul>
+  <li>swr: relax c++ requirement from c++14 to c++11</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/17.1.4.html
+++ b/docs/relnotes/17.1.4.html
@@ -0,0 +1,219 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 17.1.4 Release Notes / June 30, 2017</h1>
+
+<p>
+Mesa 17.1.4 is a bug fix release which fixes bugs found since the 17.1.3 release.
+</p>
+<p>
+Mesa 17.1.4 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=77240">Bug 77240</a> - khrplatform.h not installed if EGL is disabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95530">Bug 95530</a> - Stellaris - colored overlay of sectors doesn't render on i965</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96958">Bug 96958</a> - [SKL] Improper rendering in Europa Universalis IV</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99467">Bug 99467</a> - [radv] DOOM 2016 + wine. Green screen everywhere (but can be started)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101071">Bug 101071</a> - compiling glsl fails with undefined reference to `pthread_create'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101252">Bug 101252</a> - eglGetDisplay() is not thread safe</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101294">Bug 101294</a> - radeonsi minecraft forge splash freeze since 17.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101451">Bug 101451</a> - [G33] ES2-CTS.functional.clipping.polygon regression</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>radeonsi: add new polaris12 pci id</li>
+</ul>
+
+<p>Andres Gomez (3):</p>
+<ul>
+  <li>cherry-ignore: 17.1.4 rejected commits</li>
+  <li>cherry-ignore: bin/get-fixes-pick-list.sh: better identify multiple "fixes:" tags</li>
+  <li>Update version to 17.1.4</li>
+</ul>
+
+<p>Anuj Phogat (2):</p>
+<ul>
+  <li>i965: Add and initialize l3_banks field for gen7+</li>
+  <li>i965: Fix broxton 2x6 l3 config</li>
+</ul>
+
+<p>Ben Crocker (1):</p>
+<ul>
+  <li>egl_dri2: swrastGetDrawableInfo: set *x, *y [v2]</li>
+</ul>
+
+<p>Brian Paul (2):</p>
+<ul>
+  <li>svga: check return value from svga_set_shader( SVGA3D_SHADERTYPE_GS, NULL)</li>
+  <li>gallium/vbuf: avoid segfault when we get invalid glDrawRangeElements()</li>
+</ul>
+
+<p>Chad Versace (1):</p>
+<ul>
+  <li>egl/android: Change order of EGLConfig generation (v2)</li>
+</ul>
+
+<p>Chandu Babu N (1):</p>
+<ul>
+  <li>change va max_entrypoints</li>
+</ul>
+
+<p>Charmaine Lee (1):</p>
+<ul>
+  <li>svga: use the winsys interface to invalidate surface</li>
+</ul>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 17.1.3</li>
+  <li>configure.ac: add -pthread to PTHREAD_LIBS</li>
+  <li>radeonsi: include ac_binary.h for struct ac_shader_binary</li>
+</ul>
+
+<p>Eric Engestrom (3):</p>
+<ul>
+  <li>egl: properly count configs</li>
+  <li>egl/display: only detect the platform once</li>
+  <li>egl/display: make platform detection thread-safe</li>
+</ul>
+
+<p>Eric Le Bihan (1):</p>
+<ul>
+  <li>Fix khrplatform.h not installed if EGL is disabled.</li>
+</ul>
+
+<p>Iago Toral Quiroga (1):</p>
+<ul>
+  <li>i965: update MaxTextureRectSize to match PRMs and comply with OpenGL 4.1+</li>
+</ul>
+
+<p>Ilia Mirkin (2):</p>
+<ul>
+  <li>nv50/ir: fetch indirect sources BEFORE the op that uses them</li>
+  <li>nv50/ir: fix combineLd/St to update existing records as necessary</li>
+</ul>
+
+<p>Jason Ekstrand (10):</p>
+<ul>
+  <li>i965: Flush around state base address</li>
+  <li>i965: Take a uint64_t immediate in emit_pipe_control_write</li>
+  <li>i965: Unify the two emit_pipe_control functions</li>
+  <li>i965: Do an end-of-pipe sync prior to STATE_BASE_ADDRESS</li>
+  <li>i965/blorp: Do an end-of-pipe sync around CCS ops</li>
+  <li>i965: Do an end-of-pipe sync after flushes</li>
+  <li>i965: Disable the interleaved vertex optimization when instancing</li>
+  <li>i965: Set step_rate = 0 for interleaved vertex buffers</li>
+  <li>spirv: Work around the Doom shader bug</li>
+  <li>i965: Clamp clear colors to the representable range</li>
+</ul>
+
+<p>Jonas Kulla (1):</p>
+<ul>
+  <li>anv: Fix L3 cache programming on Bay Trail</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>i965: Ignore anisotropic filtering in nearest mode.</li>
+</ul>
+
+<p>Lucas Stach (7):</p>
+<ul>
+  <li>etnaviv: don't try RS blit if blit region is unaligned</li>
+  <li>etnaviv: use padded width/height for resource copies</li>
+  <li>etnaviv: remove bogus assert</li>
+  <li>etnaviv: replace translate_clear_color with util_pack_color</li>
+  <li>etnaviv: mask correct channel for RB swapped rendertargets</li>
+  <li>etnaviv: advertise correct max LOD bias</li>
+  <li>etnaviv: only flush resource to self if no scanout buffer exists</li>
+</ul>
+
+<p>Marek Olšák (4):</p>
+<ul>
+  <li>winsys/amdgpu: fix a deadlock when waiting for submission_in_progress</li>
+  <li>mesa: flush vertices before changing viewports</li>
+  <li>mesa: flush vertices before updating ctx-&gt;_Shader</li>
+  <li>st/mesa: fix pipe_rasterizer_state::scissor with multiple viewports</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>gallium/util: Break recursion in pipe_resource_reference</li>
+</ul>
+
+<p>Nicolai Hähnle (2):</p>
+<ul>
+  <li>gallium/radeon/gfx9: fix PBO texture uploads to compressed textures</li>
+  <li>amd/common: fix off-by-one in sid_tables.py</li>
+</ul>
+
+<p>Pierre Moreau (1):</p>
+<ul>
+  <li>nv50/ir: Properly fold constants in SPLIT operation</li>
+</ul>
+
+<p>Rob Herring (1):</p>
+<ul>
+  <li>Android: major/minor/makedev live in &lt;sys/sysmacros.h&gt;</li>
+</ul>
+
+<p>Topi Pohjolainen (2):</p>
+<ul>
+  <li>i965: Add an end-of-pipe sync helper</li>
+  <li>i965/gen4: Set depth offset when there is stencil attachment only</li>
+</ul>
+
+<p>Ville Syrjälä (2):</p>
+<ul>
+  <li>i915: Fix gl_Fragcoord interpolation</li>
+  <li>i915: Fix wpos_tex vs. -1 comparison</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -213,6 +213,7 @@ CHIPSET(0x6985, POLARIS12_, POLARIS12)
 CHIPSET(0x6986, POLARIS12_, POLARIS12)
 CHIPSET(0x6987, POLARIS12_, POLARIS12)
 CHIPSET(0x6995, POLARIS12_, POLARIS12)
+CHIPSET(0x6997, POLARIS12_, POLARIS12)
 CHIPSET(0x699F, POLARIS12_, POLARIS12)

 CHIPSET(0x6860, VEGA10_, VEGA10)
--- a/src/amd/common/sid_tables.py
+++ b/src/amd/common/sid_tables.py
@@ -110,7 +110,7 @@ class IntTable:
        [static] const typename name[] = { ... };
        to filp.
        """
-        idxs = sorted(self.idxs) + [-1]
+        idxs = sorted(self.idxs) + [len(self.table)]

        fragments = [
            ('\t/* %s */ %s' % (
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1268,38 +1268,39 @@ emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,

 static void
 radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
-				  struct radv_pipeline *pipeline,
 				  VkShaderStageFlags stages,
 				  struct radv_descriptor_set *set,
 				  unsigned idx)
 {
-	if (stages & VK_SHADER_STAGE_FRAGMENT_BIT)
-		emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
-						   idx, set->va,
-						   MESA_SHADER_FRAGMENT);
+	if (cmd_buffer->state.pipeline) {
+		if (stages & VK_SHADER_STAGE_FRAGMENT_BIT)
+			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
+							   idx, set->va,
+							   MESA_SHADER_FRAGMENT);

-	if (stages & VK_SHADER_STAGE_VERTEX_BIT)
-		emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
-						   idx, set->va,
-						   MESA_SHADER_VERTEX);
+		if (stages & VK_SHADER_STAGE_VERTEX_BIT)
+			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
+							   idx, set->va,
+							   MESA_SHADER_VERTEX);

-	if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(pipeline))
-		emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
-						   idx, set->va,
-						   MESA_SHADER_GEOMETRY);
+		if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(cmd_buffer->state.pipeline))
+			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
+							   idx, set->va,
+							   MESA_SHADER_GEOMETRY);

-	if ((stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) && radv_pipeline_has_tess(pipeline))
-		emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
-						   idx, set->va,
-						   MESA_SHADER_TESS_CTRL);
+		if ((stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) && radv_pipeline_has_tess(cmd_buffer->state.pipeline))
+			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
+							   idx, set->va,
+							   MESA_SHADER_TESS_CTRL);

-	if ((stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) && radv_pipeline_has_tess(pipeline))
-		emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
-						   idx, set->va,
-						   MESA_SHADER_TESS_EVAL);
+		if ((stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) && radv_pipeline_has_tess(cmd_buffer->state.pipeline))
+			emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
+							   idx, set->va,
+							   MESA_SHADER_TESS_EVAL);
+	}

-	if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
-		emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
+	if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT))
+		emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.compute_pipeline,
 						   idx, set->va,
 						   MESA_SHADER_COMPUTE);
 }
@@ -1324,7 +1325,6 @@ radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer)

 static void
 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
-		       struct radv_pipeline *pipeline,
 		       VkShaderStageFlags stages)
 {
 	unsigned i;
@@ -1345,7 +1345,7 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
 		if (!set)
 			continue;

-		radv_emit_descriptor_set_userdata(cmd_buffer, pipeline, stages, set, i);
+		radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i);
 	}
 	cmd_buffer->state.descriptors_dirty = 0;
 	cmd_buffer->state.push_descriptors_dirty = false;
@@ -1515,8 +1515,7 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer,

 	radv_emit_primitive_reset_state(cmd_buffer, indexed_draw);

-	radv_flush_descriptors(cmd_buffer, cmd_buffer->state.pipeline,
-			       VK_SHADER_STAGE_ALL_GRAPHICS);
+	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 	radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline,
 			     VK_SHADER_STAGE_ALL_GRAPHICS);

@@ -2153,6 +2152,13 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }

+static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer)
+{
+	for (unsigned i = 0; i < MAX_SETS; i++) {
+		if (cmd_buffer->state.descriptors[i])
+			cmd_buffer->state.descriptors_dirty |= (1u << i);
+	}
+}

 void radv_CmdBindPipeline(
 	VkCommandBuffer                             commandBuffer,
@@ -2162,10 +2168,7 @@ void radv_CmdBindPipeline(
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

-	for (unsigned i = 0; i < MAX_SETS; i++) {
-		if (cmd_buffer->state.descriptors[i])
-			cmd_buffer->state.descriptors_dirty |= (1 << i);
-	}
+	radv_mark_descriptor_sets_dirty(cmd_buffer);

 	switch (pipelineBindPoint) {
 	case VK_PIPELINE_BIND_POINT_COMPUTE:
@@ -2174,6 +2177,9 @@ void radv_CmdBindPipeline(
 		break;
 	case VK_PIPELINE_BIND_POINT_GRAPHICS:
 		cmd_buffer->state.pipeline = pipeline;
+		if (!pipeline)
+			break;
+
 		cmd_buffer->state.vertex_descriptors_dirty = true;
 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
 		cmd_buffer->push_constant_stages |= pipeline->active_stages;
@@ -2336,7 +2342,6 @@ void radv_CmdSetStencilReference(
 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
 }

-
 void radv_CmdExecuteCommands(
 	VkCommandBuffer                             commandBuffer,
 	uint32_t                                    commandBufferCount,
@@ -2381,6 +2386,7 @@ void radv_CmdExecuteCommands(
 		primary->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL;
 		primary->state.last_primitive_reset_en = -1;
 		primary->state.last_primitive_reset_index = 0;
+		radv_mark_descriptor_sets_dirty(primary);
 	}
 }

@@ -2757,8 +2763,7 @@ static void
 radv_flush_compute_state(struct radv_cmd_buffer *cmd_buffer)
 {
 	radv_emit_compute_pipeline(cmd_buffer);
-	radv_flush_descriptors(cmd_buffer, cmd_buffer->state.compute_pipeline,
-			       VK_SHADER_STAGE_COMPUTE_BIT);
+	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
 	radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
 			     VK_SHADER_STAGE_COMPUTE_BIT);
 	si_emit_cache_flush(cmd_buffer);
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -676,7 +676,7 @@ void radv_GetPhysicalDeviceProperties(
 		.driverVersion = radv_get_driver_version(),
 		.vendorID = 0x1002,
 		.deviceID = pdevice->rad_info.pci_id,
-		.deviceType = VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU,
+		.deviceType = pdevice->rad_info.has_dedicated_vram ? VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU : VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
 		.limits = limits,
 		.sparseProperties = {0},
 	};
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -382,7 +382,8 @@ si_make_texture_descriptor(struct radv_device *device,
 			S_008F24_LAST_ARRAY(last_layer);
 		fmask_state[6] = 0;
 		fmask_state[7] = 0;
-	}
+	} else if (fmask_state)
+		memset(fmask_state, 0, 8 * 4);
 }

 static void
--- a/src/amd/vulkan/radv_meta.c
+++ b/src/amd/vulkan/radv_meta.c
@@ -51,10 +51,10 @@ void
 radv_meta_restore(const struct radv_meta_saved_state *state,
 		  struct radv_cmd_buffer *cmd_buffer)
 {
-	cmd_buffer->state.pipeline = state->old_pipeline;
+	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_GRAPHICS,
+			     radv_pipeline_to_handle(state->old_pipeline));

 	cmd_buffer->state.descriptors[0] = state->old_descriptor_set0;
-	cmd_buffer->state.descriptors_dirty |= (1u << 0);
 	memcpy(cmd_buffer->state.vertex_bindings, state->old_vertex_bindings,
 	       sizeof(state->old_vertex_bindings));

@@ -114,7 +114,6 @@ radv_meta_restore_compute(const struct radv_meta_saved_compute_state *state,
 			     radv_pipeline_to_handle(state->old_pipeline));

 	cmd_buffer->state.descriptors[0] = state->old_descriptor_set0;
-	cmd_buffer->state.descriptors_dirty |= (1u << 0);

 	if (push_constant_size) {
 		memcpy(cmd_buffer->push_constants, state->push_constants, push_constant_size);
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -44,11 +44,6 @@ static unsigned get_max_db(struct radv_device *device)
 	unsigned num_db = device->physical_device->rad_info.num_render_backends;
 	MAYBE_UNUSED unsigned rb_mask = device->physical_device->rad_info.enabled_rb_mask;

-	if (device->physical_device->rad_info.chip_class == SI)
-		num_db = 8;
-	else
-		num_db = MAX2(8, num_db);
-
 	/* Otherwise we need to change the query reset procedure */
 	assert(rb_mask == ((1ull << num_db) - 1));

--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -931,6 +931,9 @@ static void *radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr)
 {
 	struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
 	void *ret = NULL;
+
+	if (!cs->ib_buffer)
+		return NULL;
 	for (unsigned i = 0; i <= cs->num_old_ib_buffers; ++i) {
 		struct radv_amdgpu_winsys_bo *bo;

@@ -949,10 +952,15 @@ static void radv_amdgpu_winsys_cs_dump(struct radeon_winsys_cs *_cs,
                                       uint32_t trace_id)
 {
 	struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
+	void *ib = cs->base.buf;
+	int num_dw = cs->base.cdw;

-	ac_parse_ib(file,
-		    radv_amdgpu_winsys_get_cpu_addr(cs, cs->ib.ib_mc_address),
-		    cs->ib.size, trace_id,  "main IB", cs->ws->info.chip_class,
+	if (cs->ws->use_ib_bos) {
+		ib = radv_amdgpu_winsys_get_cpu_addr(cs, cs->ib.ib_mc_address);
+		num_dw = cs->ib.size;
+	}
+	assert(ib);
+	ac_parse_ib(file, ib, num_dw, trace_id, "main IB", cs->ws->info.chip_class,
 		    radv_amdgpu_winsys_get_cpu_addr, cs);
 }

--- a/src/compiler/glsl/lower_distance.cpp
+++ b/src/compiler/glsl/lower_distance.cpp
@@ -167,7 +167,6 @@ lower_distance_visitor::visit(ir_variable *ir)
      /* Clone the old var so that we inherit all of its properties */
      *new_var = ir->clone(ralloc_parent(ir), NULL);
      (*new_var)->name = ralloc_strdup(*new_var, GLSL_CLIP_VAR_NAME);
-      (*new_var)->data.max_array_access = new_size - 1;
      (*new_var)->data.location = VARYING_SLOT_CLIP_DIST0;

      if (!ir->type->fields.array->is_array()) {
@@ -182,6 +181,7 @@ lower_distance_visitor::visit(ir_variable *ir)
                  this->shader_stage == MESA_SHADER_GEOMETRY)));

         assert (ir->type->fields.array == glsl_type::float_type);
+         (*new_var)->data.max_array_access = new_size - 1;

         /* And change the properties that we need to change */
         (*new_var)->type = glsl_type::get_array_instance(glsl_type::vec4_type,
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -288,6 +288,20 @@ struct vtn_variable {
   nir_variable *var;
   nir_variable **members;

+   /**
+    * In some early released versions of GLSLang, it implemented all function
+    * calls by making copies of all parameters into temporary variables and
+    * passing those variables into the function.  It even did so for samplers
+    * and images which violates the SPIR-V spec.  Unfortunately, two games
+    * (Talos Principle and Doom) shipped with this old version of GLSLang and
+    * also happen to pass samplers into functions.  Talos Principle received
+    * an update fairly shortly after release with an updated GLSLang.  Doom,
+    * on the other hand, has never received an update so we need to work
+    * around this GLSLang issue in SPIR-V -> NIR.  Hopefully, we can drop this
+    * hack at some point in the future.
+    */
+   struct vtn_access_chain *copy_prop_sampler;
+
   struct vtn_access_chain chain;
 };

--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -96,6 +96,10 @@ rewrite_deref_types(nir_deref *deref, const struct glsl_type *type)
 nir_deref_var *
 vtn_access_chain_to_deref(struct vtn_builder *b, struct vtn_access_chain *chain)
 {
+   /* Do on-the-fly copy propagation for samplers. */
+   if (chain->var->copy_prop_sampler)
+      return vtn_access_chain_to_deref(b, chain->var->copy_prop_sampler);
+
   nir_deref_var *deref_var;
   if (chain->var->var) {
      deref_var = nir_deref_var_create(b, chain->var->var);
@@ -1609,6 +1613,16 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
   case SpvOpStore: {
      struct vtn_access_chain *dest =
         vtn_value(b, w[1], vtn_value_type_access_chain)->access_chain;
+
+      if (glsl_type_is_sampler(dest->var->type->type)) {
+         vtn_warn("OpStore of a sampler detected.  Doing on-the-fly copy "
+                  "propagation to workaround the problem.");
+         assert(dest->var->copy_prop_sampler == NULL);
+         dest->var->copy_prop_sampler =
+            vtn_value(b, w[2], vtn_value_type_access_chain)->access_chain;
+         break;
+      }
+
      struct vtn_ssa_value *src = vtn_ssa_value(b, w[2]);
      vtn_variable_store(b, src, dest);
      break;
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -163,9 +163,6 @@ pkgconfigdir = $(libdir)/pkgconfig

 pkgconfig_DATA = main/egl.pc

-khrdir = $(includedir)/KHR
-khr_HEADERS = $(top_srcdir)/include/KHR/khrplatform.h
-
 egldir = $(includedir)/EGL
 egl_HEADERS = \
 	$(top_srcdir)/include/EGL/eglext.h \
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -609,10 +609,10 @@ droid_query_buffer_age(_EGLDriver *drv,

   if (update_buffers(dri2_surf) < 0) {
      _eglError(EGL_BAD_ALLOC, "droid_query_buffer_age");
-      return 0;
+      return -1;
   }

-   return dri2_surf->back->age;
+   return dri2_surf->back ? dri2_surf->back->age : 0;
 }

 static EGLBoolean
@@ -1005,20 +1005,39 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
   unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 };
   int count, i, j;

+   /* The nesting of loops is significant here. Also significant is the order
+    * of the HAL pixel formats. Many Android apps (such as Google's official
+    * NDK GLES2 example app), and even portions of the core framework code (such
+    * as SystemServiceManager in Nougat), incorrectly choose their EGLConfig.
+    * They neglect to match the EGLConfig's EGL_NATIVE_VISUAL_ID against the
+    * window's native format, and instead choose the first EGLConfig whose
+    * channel sizes match those of the native window format while ignoring the
+    * channel *ordering*.
+    *
+    * We can detect such buggy clients in logcat when they call
+    * eglCreateSurface, by detecting the mismatch between the EGLConfig's
+    * format and the window's format.
+    *
+    * As a workaround, we generate EGLConfigs such that all EGLConfigs for HAL
+    * pixel format i precede those for HAL pixel format i+1. In my
+    * (chadversary) testing on Android Nougat, this was good enough to pacify
+    * the buggy clients.
+    */
   count = 0;
-   for (i = 0; dri2_dpy->driver_configs[i]; i++) {
+   for (i = 0; i < ARRAY_SIZE(visuals); i++) {
      const EGLint surface_type = EGL_WINDOW_BIT | EGL_PBUFFER_BIT;
      struct dri2_egl_config *dri2_conf;

-      for (j = 0; j < ARRAY_SIZE(visuals); j++) {
-         config_attrs[1] = visuals[j].format;
-         config_attrs[3] = visuals[j].format;
+      for (j = 0; dri2_dpy->driver_configs[j]; j++) {
+         config_attrs[1] = visuals[i].format;
+         config_attrs[3] = visuals[i].format;

-         dri2_conf = dri2_add_config(dpy, dri2_dpy->driver_configs[i],
-               count + 1, surface_type, config_attrs, visuals[j].rgba_masks);
+         dri2_conf = dri2_add_config(dpy, dri2_dpy->driver_configs[j],
+               count + 1, surface_type, config_attrs, visuals[i].rgba_masks);
         if (dri2_conf) {
-            count++;
-            format_count[j]++;
+            if (dri2_conf->base.ConfigID == count + 1)
+               count++;
+            format_count[i]++;
         }
      }
   }
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -463,7 +463,7 @@ dri2_drm_query_buffer_age(_EGLDriver *drv,

   if (get_back_bo(dri2_surf) < 0) {
      _eglError(EGL_BAD_ALLOC, "dri2_query_buffer_age");
-      return 0;
+      return -1;
   }

   return dri2_surf->back->age;
@@ -630,7 +630,8 @@ drm_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *disp)
         dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i],
               count + 1, EGL_WINDOW_BIT, attr_list, NULL);
         if (dri2_conf) {
-            count++;
+            if (dri2_conf->base.ConfigID == count + 1)
+               count++;
            format_count[j]++;
         }
      }
--- a/src/egl/drivers/dri2/platform_surfaceless.c
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -212,7 +212,8 @@ surfaceless_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
               count + 1, EGL_PBUFFER_BIT, NULL, visuals[j].rgba_masks);

         if (dri2_conf) {
-            count++;
+            if (dri2_conf->base.ConfigID == count + 1)
+               count++;
            format_count[j]++;
         }
      }
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -808,7 +808,7 @@ dri2_wl_query_buffer_age(_EGLDriver *drv,

   if (get_back_bo(dri2_surf) < 0) {
      _eglError(EGL_BAD_ALLOC, "dri2_query_buffer_age");
-      return 0;
+      return -1;
   }

   return dri2_surf->back->age;
@@ -1128,7 +1128,8 @@ dri2_wl_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *disp)
         dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i],
               count + 1, EGL_WINDOW_BIT, NULL, visuals[j].rgba_masks);
         if (dri2_conf) {
-            count++;
+            if (dri2_conf->base.ConfigID == count + 1)
+               count++;
            format_count[j]++;
         }
      }
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -110,7 +110,7 @@ swrastGetDrawableInfo(__DRIdrawable * draw,
   xcb_get_geometry_reply_t *reply;
   xcb_generic_error_t *error;

-   *w = *h = 0;
+   *x = *y = *w = *h = 0;
   cookie = xcb_get_geometry (dri2_dpy->conn, dri2_surf->drawable);
   reply = xcb_get_geometry_reply (dri2_dpy->conn, cookie, &error);
   if (reply == NULL)
@@ -120,6 +120,8 @@ swrastGetDrawableInfo(__DRIdrawable * draw,
      _eglLog(_EGL_WARNING, "error in xcb_get_geometry");
      free(error);
   } else {
+      *x = reply->x;
+      *y = reply->y;
      *w = reply->width;
      *h = reply->height;
   }
@@ -772,7 +774,8 @@ dri2_x11_add_configs_for_visuals(struct dri2_egl_display *dri2_dpy,
            dri2_conf = dri2_add_config(disp, config, count + 1, surface_type,
                                        config_attrs, rgba_masks);
            if (dri2_conf)
-               count++;
+               if (dri2_conf->base.ConfigID == count + 1)
+                  count++;

            /* Allow a 24-bit RGB visual to match a 32-bit RGBA EGLConfig.
             * Otherwise it will only match a 32-bit RGBA visual.  On a
@@ -787,7 +790,8 @@ dri2_x11_add_configs_for_visuals(struct dri2_egl_display *dri2_dpy,
               dri2_conf = dri2_add_config(disp, config, count + 1, surface_type,
                                           config_attrs, rgba_masks);
               if (dri2_conf)
-                  count++;
+                  if (dri2_conf->base.ConfigID == count + 1)
+                     count++;
            }
 	 }
      }
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -36,6 +36,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "c11/threads.h"
+#include "util/u_atomic.h"

 #include "eglcontext.h"
 #include "eglcurrent.h"
@@ -180,25 +181,32 @@ _eglNativePlatformDetectNativeDisplay(void *nativeDisplay)
 _EGLPlatformType
 _eglGetNativePlatform(void *nativeDisplay)
 {
-   static _EGLPlatformType native_platform;
-   char *detection_method;
+   static _EGLPlatformType native_platform = _EGL_INVALID_PLATFORM;
+   _EGLPlatformType detected_platform = native_platform;

-   native_platform = _eglGetNativePlatformFromEnv();
-   detection_method = "environment overwrite";
+   if (detected_platform == _EGL_INVALID_PLATFORM) {
+      const char *detection_method;

-   if (native_platform == _EGL_INVALID_PLATFORM) {
-      native_platform = _eglNativePlatformDetectNativeDisplay(nativeDisplay);
-      detection_method = "autodetected";
+      detected_platform = _eglGetNativePlatformFromEnv();
+      detection_method = "environment overwrite";
+
+      if (detected_platform == _EGL_INVALID_PLATFORM) {
+         detected_platform = _eglNativePlatformDetectNativeDisplay(nativeDisplay);
+         detection_method = "autodetected";
+      }
+
+      if (detected_platform == _EGL_INVALID_PLATFORM) {
+         detected_platform = _EGL_NATIVE_PLATFORM;
+         detection_method = "build-time configuration";
+      }
+
+      _eglLog(_EGL_DEBUG, "Native platform type: %s (%s)",
+              egl_platforms[detected_platform].name, detection_method);
+
+      p_atomic_cmpxchg(&native_platform, _EGL_INVALID_PLATFORM,
+                       detected_platform);
   }

-   if (native_platform == _EGL_INVALID_PLATFORM) {
-      native_platform = _EGL_NATIVE_PLATFORM;
-      detection_method = "build-time configuration";
-   }
-
-   _eglLog(_EGL_DEBUG, "Native platform type: %s (%s)",
-           egl_platforms[native_platform].name, detection_method);
-
   return native_platform;
 }

--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -409,7 +409,11 @@ _eglQuerySurface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
         _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
         return EGL_FALSE;
      }
-      *value = drv->API.QueryBufferAge(drv, dpy, surface);
+      EGLint result = drv->API.QueryBufferAge(drv, dpy, surface);
+      /* error happened */
+      if (result < 0)
+         return EGL_FALSE;
+      *value = result;
      break;
   default:
      _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -137,8 +137,14 @@ pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)

   if (pipe_reference_described(&(*ptr)->reference, &tex->reference, 
                                (debug_reference_descriptor)debug_describe_resource)) {
-      pipe_resource_reference(&old_tex->next, NULL);
-      old_tex->screen->resource_destroy(old_tex->screen, old_tex);
+      /* Avoid recursion, which would prevent inlining this function */
+      do {
+         struct pipe_resource *next = old_tex->next;
+
+         old_tex->screen->resource_destroy(old_tex->screen, old_tex);
+         old_tex = next;
+      } while (pipe_reference_described(&old_tex->reference, NULL,
+                                        (debug_reference_descriptor)debug_describe_resource));
   }
   *ptr = tex;
 }
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -423,8 +423,22 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
         unsigned size = vb->stride ? num_vertices * vb->stride
                                    : sizeof(double)*4;

-         if (offset+size > vb->buffer->width0) {
+         if (offset + size > vb->buffer->width0) {
+            /* Don't try to map past end of buffer.  This often happens when
+             * we're translating an attribute that's at offset > 0 from the
+             * start of the vertex.  If we'd subtract attrib's offset from
+             * the size, this probably wouldn't happen.
+             */
            size = vb->buffer->width0 - offset;
+
+            /* Also adjust num_vertices.  A common user error is to call
+             * glDrawRangeElements() with incorrect 'end' argument.  The 'end'
+             * value should be the max index value, but people often
+             * accidentally add one to this value.  This adjustment avoids
+             * crashing (by reading past the end of a hardware buffer mapping)
+             * when people do that.
+             */
+            num_vertices = (size + vb->stride - 1) / vb->stride;
         }

         map = pipe_buffer_map_range(mgr->pipe, vb->buffer, offset, size,
--- a/src/gallium/drivers/etnaviv/etnaviv_blend.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_blend.c
@@ -48,7 +48,7 @@ etna_blend_state_create(struct pipe_context *pctx,
    * - NOT source factor is ONE and destination factor ZERO for both rgb and
    *   alpha (which would mean that blending is effectively disabled)
    */
-   bool enable = rt0->blend_enable &&
+   co->enable = rt0->blend_enable &&
                 !(rt0->rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
                   rt0->rgb_dst_factor == PIPE_BLENDFACTOR_ZERO &&
                   rt0->alpha_src_factor == PIPE_BLENDFACTOR_ONE &&
@@ -59,17 +59,11 @@ etna_blend_state_create(struct pipe_context *pctx,
    * - NOT source factor is equal to destination factor for both rgb abd
    *   alpha (which would effectively that mean alpha is not separate)
    */
-   bool separate_alpha = enable &&
+   bool separate_alpha = co->enable &&
                         !(rt0->rgb_src_factor == rt0->alpha_src_factor &&
                           rt0->rgb_dst_factor == rt0->alpha_dst_factor);

-   /* If the complete render target is written, set full_overwrite:
-    * - The color mask is 1111
-    * - No blending is used
-    */
-   bool full_overwrite = (rt0->colormask == 15) && !enable;
-
-   if (enable) {
+   if (co->enable) {
      co->PE_ALPHA_CONFIG =
         VIVS_PE_ALPHA_CONFIG_BLEND_ENABLE_COLOR |
         COND(separate_alpha, VIVS_PE_ALPHA_CONFIG_BLEND_SEPARATE_ALPHA) |
@@ -83,10 +77,6 @@ etna_blend_state_create(struct pipe_context *pctx,
      co->PE_ALPHA_CONFIG = 0;
   }

-   co->PE_COLOR_FORMAT =
-         VIVS_PE_COLOR_FORMAT_COMPONENTS(rt0->colormask) |
-         COND(full_overwrite, VIVS_PE_COLOR_FORMAT_OVERWRITE);
-
   co->PE_LOGIC_OP =
         VIVS_PE_LOGIC_OP_OP(so->logicop_enable ? so->logicop_func : LOGIC_OP_COPY) |
         0x000E4000 /* ??? */;
@@ -107,3 +97,35 @@ etna_blend_state_create(struct pipe_context *pctx,

   return co;
 }
+
+bool
+etna_update_blend(struct etna_context *ctx)
+{
+   struct pipe_framebuffer_state *pfb = &ctx->framebuffer_s;
+   struct pipe_blend_state *pblend = ctx->blend;
+   struct etna_blend_state *blend = etna_blend_state(pblend);
+   const struct pipe_rt_blend_state *rt0 = &pblend->rt[0];
+   uint32_t colormask;
+
+   if (pfb->cbufs[0] &&
+       translate_rs_format_rb_swap(pfb->cbufs[0]->texture->format)) {
+      colormask = rt0->colormask & (PIPE_MASK_A | PIPE_MASK_G);
+      if (rt0->colormask & PIPE_MASK_R)
+         colormask |= PIPE_MASK_B;
+      if (rt0->colormask & PIPE_MASK_B)
+         colormask |= PIPE_MASK_R;
+   } else {
+      colormask = rt0->colormask;
+   }
+
+   /* If the complete render target is written, set full_overwrite:
+    * - The color mask is 1111
+    * - No blending is used
+    */
+   bool full_overwrite = (rt0->colormask == 0xf) && !blend->enable;
+   blend->PE_COLOR_FORMAT =
+            VIVS_PE_COLOR_FORMAT_COMPONENTS(colormask) |
+            COND(full_overwrite, VIVS_PE_COLOR_FORMAT_OVERWRITE);
+
+   return true;
+}
--- a/src/gallium/drivers/etnaviv/etnaviv_blend.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_blend.h
@@ -30,9 +30,13 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"

+struct etna_context;
+
 struct etna_blend_state {
   struct pipe_blend_state base;

+   bool enable;
+
   uint32_t PE_ALPHA_CONFIG;
   uint32_t PE_COLOR_FORMAT;
   uint32_t PE_LOGIC_OP;
@@ -49,4 +53,7 @@ void *
 etna_blend_state_create(struct pipe_context *pctx,
                        const struct pipe_blend_state *so);

+bool
+etna_update_blend(struct etna_context *ctx);
+
 #endif
--- a/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c
@@ -100,13 +100,24 @@ etna_rs_gen_clear_surface(struct etna_context *ctx, struct etna_surface *surf,
   });
 }

+static inline uint32_t
+pack_rgba(enum pipe_format format, const float *rgba)
+{
+   union util_color uc;
+   util_pack_color(rgba, format, &uc);
+   if (util_format_get_blocksize(format) == 2)
+      return uc.ui[0] << 16 | uc.ui[0];
+   else
+      return uc.ui[0];
+}
+
 static void
 etna_blit_clear_color(struct pipe_context *pctx, struct pipe_surface *dst,
                      const union pipe_color_union *color)
 {
   struct etna_context *ctx = etna_context(pctx);
   struct etna_surface *surf = etna_surface(dst);
-   uint32_t new_clear_value = translate_clear_color(surf->base.format, color);
+   uint32_t new_clear_value = pack_rgba(surf->base.format, color->f);

   if (surf->surf.ts_size) { /* TS: use precompiled clear command */
      ctx->framebuffer.TS_COLOR_CLEAR_VALUE = new_clear_value;
@@ -287,8 +298,6 @@ etna_resource_copy_region(struct pipe_context *pctx, struct pipe_resource *dst,

   /* The resource must be of the same format. */
   assert(src->format == dst->format);
-   /* Resources with nr_samples > 1 are not allowed. */
-   assert(src->nr_samples <= 1 && dst->nr_samples <= 1);

   /* XXX we can use the RS as a literal copy engine here
    * the only complexity is tiling; the size of the boxes needs to be aligned
@@ -448,7 +457,8 @@ etna_try_rs_blit(struct pipe_context *pctx,
   if (width > src_lev->padded_width ||
       width > dst_lev->padded_width * msaa_xscale ||
       height > src_lev->padded_height ||
-       height > dst_lev->padded_height * msaa_yscale)
+       height > dst_lev->padded_height * msaa_yscale ||
+       width & (w_align - 1) || height & (h_align - 1))
      goto manual;

   if (src->base.nr_samples > 1) {
@@ -593,10 +603,11 @@ etna_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
 {
   struct etna_resource *rsc = etna_resource(prsc);

-   if (rsc->scanout &&
-       etna_resource_older(etna_resource(rsc->scanout->prime), rsc)) {
-      etna_copy_resource(pctx, rsc->scanout->prime, prsc, 0, 0);
-      etna_resource(rsc->scanout->prime)->seqno = rsc->seqno;
+   if (rsc->scanout) {
+      if (etna_resource_older(etna_resource(rsc->scanout->prime), rsc)) {
+         etna_copy_resource(pctx, rsc->scanout->prime, prsc, 0, 0);
+         etna_resource(rsc->scanout->prime)->seqno = rsc->seqno;
+      }
   } else if (etna_resource_needs_flush(rsc)) {
      etna_copy_resource(pctx, prsc, prsc, 0, 0);
      rsc->flush_seqno = rsc->seqno;
@@ -627,9 +638,9 @@ etna_copy_resource(struct pipe_context *pctx, struct pipe_resource *dst,
   for (int level = first_level; level <= last_level; level++) {
      blit.src.level = blit.dst.level = level;
      blit.src.box.width = blit.dst.box.width =
-         MIN2(src_priv->levels[level].width, dst_priv->levels[level].width);
+         MIN2(src_priv->levels[level].padded_width, dst_priv->levels[level].padded_width);
      blit.src.box.height = blit.dst.box.height =
-         MIN2(src_priv->levels[level].height, dst_priv->levels[level].height);
+         MIN2(src_priv->levels[level].padded_height, dst_priv->levels[level].padded_height);

      for (int layer = 0; layer < dst->array_size; layer++) {
         blit.src.box.z = blit.dst.box.z = layer;
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -341,6 +341,8 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 static float
 etna_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 {
+   struct etna_screen *screen = etna_screen(pscreen);
+
   switch (param) {
   case PIPE_CAPF_MAX_LINE_WIDTH:
   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
@@ -350,7 +352,7 @@ etna_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
      return 16.0f;
   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-      return 16.0f;
+      return util_last_bit(screen->specs.max_texture_size);
   case PIPE_CAPF_GUARD_BAND_LEFT:
   case PIPE_CAPF_GUARD_BAND_TOP:
   case PIPE_CAPF_GUARD_BAND_RIGHT:
--- a/src/gallium/drivers/etnaviv/etnaviv_state.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_state.c
@@ -29,6 +29,7 @@

 #include "hw/common.xml.h"

+#include "etnaviv_blend.h"
 #include "etnaviv_clear_blit.h"
 #include "etnaviv_context.h"
 #include "etnaviv_format.h"
@@ -624,6 +625,9 @@ static const struct etna_state_updater etna_state_updates[] = {
   },
   {
      etna_shader_link, ETNA_DIRTY_SHADER,
+   },
+   {
+      etna_update_blend, ETNA_DIRTY_BLEND | ETNA_DIRTY_FRAMEBUFFER
   }
 };

--- a/src/gallium/drivers/etnaviv/etnaviv_transfer.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_transfer.c
@@ -70,6 +70,9 @@ etna_transfer_unmap(struct pipe_context *pctx, struct pipe_transfer *ptrans)
   if (rsc->texture && !etna_resource_newer(rsc, etna_resource(rsc->texture)))
      rsc = etna_resource(rsc->texture); /* switch to using the texture resource */

+   if (trans->rsc)
+      etna_bo_cpu_fini(etna_resource(trans->rsc)->bo);
+
   if (ptrans->usage & PIPE_TRANSFER_WRITE) {
      if (trans->rsc) {
         /* We have a temporary resource due to either tile status or
@@ -105,15 +108,15 @@ etna_transfer_unmap(struct pipe_context *pctx, struct pipe_transfer *ptrans)
      }

      rsc->seqno++;
-      etna_bo_cpu_fini(rsc->bo);

      if (rsc->base.bind & PIPE_BIND_SAMPLER_VIEW) {
-         /* XXX do we need to flush the CPU cache too or start a write barrier
-          * to make sure the GPU sees it? */
         ctx->dirty |= ETNA_DIRTY_TEXTURE_CACHES;
      }
   }

+   if (!trans->rsc)
+      etna_bo_cpu_fini(rsc->bo);
+
   pipe_resource_reference(&trans->rsc, NULL);
   pipe_resource_reference(&ptrans->resource, NULL);
   slab_free(&ctx->transfer_pool, trans);
--- a/src/gallium/drivers/etnaviv/etnaviv_translate.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_translate.h
@@ -405,53 +405,6 @@ etna_layout_multiple(unsigned layout, unsigned pixel_pipes, bool rs_align,
   }
 }

-/* return 32-bit clear pattern for color */
-static inline uint32_t
-translate_clear_color(enum pipe_format format,
-                      const union pipe_color_union *color)
-{
-   uint32_t clear_value = 0;
-
-   // XXX util_pack_color
-   switch (format) {
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-   case PIPE_FORMAT_B8G8R8X8_UNORM:
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-   case PIPE_FORMAT_R8G8B8X8_UNORM:
-      clear_value = etna_cfloat_to_uintN(color->f[2], 8) |
-                    (etna_cfloat_to_uintN(color->f[1], 8) << 8) |
-                    (etna_cfloat_to_uintN(color->f[0], 8) << 16) |
-                    (etna_cfloat_to_uintN(color->f[3], 8) << 24);
-      break;
-   case PIPE_FORMAT_B4G4R4X4_UNORM:
-   case PIPE_FORMAT_B4G4R4A4_UNORM:
-      clear_value = etna_cfloat_to_uintN(color->f[2], 4) |
-                    (etna_cfloat_to_uintN(color->f[1], 4) << 4) |
-                    (etna_cfloat_to_uintN(color->f[0], 4) << 8) |
-                    (etna_cfloat_to_uintN(color->f[3], 4) << 12);
-      clear_value |= clear_value << 16;
-      break;
-   case PIPE_FORMAT_B5G5R5X1_UNORM:
-   case PIPE_FORMAT_B5G5R5A1_UNORM:
-      clear_value = etna_cfloat_to_uintN(color->f[2], 5) |
-                    (etna_cfloat_to_uintN(color->f[1], 5) << 5) |
-                    (etna_cfloat_to_uintN(color->f[0], 5) << 10) |
-                    (etna_cfloat_to_uintN(color->f[3], 1) << 15);
-      clear_value |= clear_value << 16;
-      break;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-      clear_value = etna_cfloat_to_uintN(color->f[2], 5) |
-                    (etna_cfloat_to_uintN(color->f[1], 6) << 5) |
-                    (etna_cfloat_to_uintN(color->f[0], 5) << 11);
-      clear_value |= clear_value << 16;
-      break;
-   default:
-      DBG("Unhandled pipe format for color clear: %i", format);
-   }
-
-   return clear_value;
-}
-
 static inline uint32_t
 translate_clear_depth_stencil(enum pipe_format format, float depth,
                              unsigned stencil)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2628,6 +2628,10 @@ Converter::handleLOAD(Value *dst0[4])
   const int r = tgsi.getSrc(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, ldv, def;
+   Value *ind = NULL;
+
+   if (tgsi.getSrc(0).isIndirect(0))
+      ind = fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0);

   switch (tgsi.getSrc(0).getFile()) {
   case TGSI_FILE_BUFFER:
@@ -2654,8 +2658,8 @@ Converter::handleLOAD(Value *dst0[4])

         Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
         ld->cache = tgsi.getCacheMode();
-         if (tgsi.getSrc(0).isIndirect(0))
-            ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
+         if (ind)
+            ld->setIndirect(0, 1, ind);
      }
      break;
   case TGSI_FILE_IMAGE: {
@@ -2677,8 +2681,8 @@ Converter::handleLOAD(Value *dst0[4])
      ld->tex.mask = tgsi.getDst(0).getMask();
      ld->tex.format = getImageFormat(code, r);
      ld->cache = tgsi.getCacheMode();
-      if (tgsi.getSrc(0).isIndirect(0))
-         ld->setIndirectR(fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, NULL));
+      if (ind)
+         ld->setIndirectR(ind);

      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
         if (dst0[c] != def[c])
@@ -2766,6 +2770,10 @@ Converter::handleSTORE()
   const int r = tgsi.getDst(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, dummy;
+   Value *ind = NULL;
+
+   if (tgsi.getDst(0).isIndirect(0))
+      ind = fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0);

   switch (tgsi.getDst(0).getFile()) {
   case TGSI_FILE_BUFFER:
@@ -2788,8 +2796,8 @@ Converter::handleSTORE()

         Instruction *st = mkStore(OP_STORE, TYPE_U32, sym, off, fetchSrc(1, c));
         st->cache = tgsi.getCacheMode();
-         if (tgsi.getDst(0).isIndirect(0))
-            st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
+         if (ind)
+            st->setIndirect(0, 1, ind);
      }
      break;
   case TGSI_FILE_IMAGE: {
@@ -2807,8 +2815,8 @@ Converter::handleSTORE()
      st->tex.mask = tgsi.getDst(0).getMask();
      st->tex.format = getImageFormat(code, r);
      st->cache = tgsi.getCacheMode();
-      if (tgsi.getDst(0).isIndirect(0))
-         st->setIndirectR(fetchSrc(tgsi.getDst(0).getIndirect(0), 0, NULL));
+      if (ind)
+         st->setIndirectR(ind);
      }
      break;
   default:
@@ -2877,6 +2885,10 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
   std::vector<Value *> srcv;
   std::vector<Value *> defv;
   LValue *dst = getScratch();
+   Value *ind = NULL;
+
+   if (tgsi.getSrc(0).isIndirect(0))
+      ind = fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0);

   switch (tgsi.getSrc(0).getFile()) {
   case TGSI_FILE_BUFFER:
@@ -2886,23 +2898,21 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
            continue;

         Instruction *insn;
-         Value *off = fetchSrc(1, c), *off2 = NULL;
+         Value *off = fetchSrc(1, c);
         Value *sym;
         if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE)
            sym = makeSym(tgsi.getSrc(0).getFile(), r, -1, c,
                          tgsi.getSrc(1).getValueU32(c, info));
         else
            sym = makeSym(tgsi.getSrc(0).getFile(), r, -1, c, 0);
-         if (tgsi.getSrc(0).isIndirect(0))
-            off2 = fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0);
         if (subOp == NV50_IR_SUBOP_ATOM_CAS)
            insn = mkOp3(OP_ATOM, ty, dst, sym, fetchSrc(2, c), fetchSrc(3, c));
         else
            insn = mkOp2(OP_ATOM, ty, dst, sym, fetchSrc(2, c));
         if (tgsi.getSrc(1).getFile() != TGSI_FILE_IMMEDIATE)
            insn->setIndirect(0, 0, off);
-         if (off2)
-            insn->setIndirect(0, 1, off2);
+         if (ind)
+            insn->setIndirect(0, 1, ind);
         insn->subOp = subOp;
      }
      for (int c = 0; c < 4; ++c)
@@ -2925,8 +2935,8 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
      tex->tex.mask = 1;
      tex->tex.format = getImageFormat(code, r);
      tex->setType(ty);
-      if (tgsi.getSrc(0).isIndirect(0))
-         tex->setIndirectR(fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, NULL));
+      if (ind)
+         tex->setIndirectR(ind);

      for (int c = 0; c < 4; ++c)
         if (dst0[c])
@@ -3798,12 +3808,14 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
      break;
   case TGSI_OPCODE_RESQ:
      if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER) {
+         Value *ind = NULL;
+         if (tgsi.getSrc(0).isIndirect(0))
+            ind = fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0);
         geni = mkOp1(OP_BUFQ, TYPE_U32, dst0[0],
                      makeSym(tgsi.getSrc(0).getFile(),
                              tgsi.getSrc(0).getIndex(0), -1, 0, 0));
-         if (tgsi.getSrc(0).isIndirect(0))
-            geni->setIndirect(0, 1,
-                              fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
+         if (ind)
+            geni->setIndirect(0, 1, ind);
      } else {
         assert(tgsi.getSrc(0).getFile() == TGSI_FILE_IMAGE);

@@ -3816,10 +3828,11 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
         }
         texi->tex.r = tgsi.getSrc(0).getIndex(0);
         texi->tex.target = getImageTarget(code, texi->tex.r);
-         bb->insertTail(texi);

         if (tgsi.getSrc(0).isIndirect(0))
            texi->setIndirectR(fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, NULL));
+
+         bb->insertTail(texi);
      }
      break;
   case TGSI_OPCODE_IBFE:
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -938,8 +938,9 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
      bld.setPosition(i, false);

      uint8_t size = i->getDef(0)->reg.size;
-      uint32_t mask = (1ULL << size) - 1;
-      assert(size <= 32);
+      uint8_t bitsize = size * 8;
+      uint32_t mask = (1ULL << bitsize) - 1;
+      assert(bitsize <= 32);

      uint64_t val = imm0.reg.data.u64;
      for (int8_t d = 0; i->defExists(d); ++d) {
@@ -947,7 +948,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
         assert(def->reg.size == size);

         newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
-         val >>= size;
+         val >>= bitsize;
      }
      delete_Instruction(prog, i);
      break;
@@ -2485,6 +2486,10 @@ MemoryOpt::combineLd(Record *rec, Instruction *ld)

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

+   // lock any stores that overlap with the load being merged into the
+   // existing record.
+   lockStores(ld);
+
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
@@ -2541,6 +2546,10 @@ MemoryOpt::combineSt(Record *rec, Instruction *st)
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

+   // remove any existing load/store records for the store being merged into
+   // the existing record.
+   purgeRecords(st, DATA_FILE_COUNT);
+
   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -90,11 +90,20 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
   bindings &= ~(PIPE_BIND_LINEAR |
                 PIPE_BIND_SHARED);

-   if (bindings & PIPE_BIND_SHADER_IMAGE && sample_count > 1 &&
-       nouveau_screen(pscreen)->class_3d >= GM107_3D_CLASS) {
-      /* MS images are currently unsupported on Maxwell because they have to
-       * be handled explicitly. */
-      return false;
+   if (bindings & PIPE_BIND_SHADER_IMAGE) {
+      if (sample_count > 1 &&
+          nouveau_screen(pscreen)->class_3d >= GM107_3D_CLASS) {
+         /* MS images are currently unsupported on Maxwell because they have to
+          * be handled explicitly. */
+         return false;
+      }
+
+      if (format == PIPE_FORMAT_B8G8R8A8_UNORM &&
+          nouveau_screen(pscreen)->class_3d < NVE4_3D_CLASS) {
+         /* This should work on Fermi, but for currently unknown reasons it
+          * does not and results in breaking reads from pbos. */
+         return false;
+      }
   }

   return (( nvc0_format_table[format].usage |
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -1965,6 +1965,8 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
 	unsigned level = templ->u.tex.level;
 	unsigned width = u_minify(tex->width0, level);
 	unsigned height = u_minify(tex->height0, level);
+	unsigned width0 = tex->width0;
+	unsigned height0 = tex->height0;

 	if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
 		const struct util_format_description *tex_desc
@@ -1983,11 +1985,14 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,

 			width = nblks_x * templ_desc->block.width;
 			height = nblks_y * templ_desc->block.height;
+
+			width0 = util_format_get_nblocksx(tex->format, width0);
+			height0 = util_format_get_nblocksy(tex->format, height0);
 		}
 	}

 	return r600_create_surface_custom(pipe, tex, templ,
-					  tex->width0, tex->height0,
+					  width0, height0,
 					  width, height);
 }

--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -366,6 +366,7 @@ struct si_context {
 	struct si_shader_selector *last_tcs;
 	int			last_num_tcs_input_cp;
 	int			last_tes_sh_base;
+	bool			last_tess_uses_primid;
 	unsigned		last_num_patches;

 	/* Debug state. */
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -72,9 +72,9 @@
 #include <llvm-c/TargetMachine.h>
 #include "tgsi/tgsi_scan.h"
 #include "util/u_queue.h"
-#include "si_state.h"

-struct ac_shader_binary;
+#include "ac_binary.h"
+#include "si_state.h"

 #define SI_MAX_VS_OUTPUTS	40

--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -101,6 +101,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
 		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
+	unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tcs_tes_uses_prim_id;
+	bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
+					 sctx->b.screen->info.max_se == 1;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
 	unsigned num_tcs_input_cp = info->vertices_per_patch;
 	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
@@ -114,7 +117,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->last_ls == ls->current &&
 	    sctx->last_tcs == tcs &&
 	    sctx->last_tes_sh_base == tes_sh_base &&
-	    sctx->last_num_tcs_input_cp == num_tcs_input_cp) {
+	    sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
+	    (!has_primid_instancing_bug ||
+	     (sctx->last_tess_uses_primid == tess_uses_primid))) {
 		*num_patches = sctx->last_num_patches;
 		return;
 	}
@@ -123,6 +128,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	sctx->last_tcs = tcs;
 	sctx->last_tes_sh_base = tes_sh_base;
 	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+	sctx->last_tess_uses_primid = tess_uses_primid;

 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
@@ -155,8 +161,12 @@ static void si_emit_derived_tess_state(struct si_context *sctx,

 	/* Make sure that the data fits in LDS. This assumes the shaders only
 	 * use LDS for the inputs and outputs.
+	 *
+	 * While CIK can use 64K per threadgroup, there is a hang on Stoney
+	 * with 2 CUs if we use more than 32K. The closed Vulkan driver also
+	 * uses 32K at most on all GCN chips.
 	 */
-	hardware_lds_size = sctx->b.chip_class >= CIK ? 65536 : 32768;
+	hardware_lds_size = 32768;
 	*num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size +
 	                                                       output_patch_size));

@@ -174,22 +184,24 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == SI) {
 		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 		*num_patches = MIN2(*num_patches, one_wave);
-
-		if (sctx->screen->b.info.max_se == 1) {
-			/* The VGT HS block increments the patch ID unconditionally
-			 * within a single threadgroup. This results in incorrect
-			 * patch IDs when instanced draws are used.
-			 *
-			 * The intended solution is to restrict threadgroups to
-			 * a single instance by setting SWITCH_ON_EOI, which
-			 * should cause IA to split instances up. However, this
-			 * doesn't work correctly on SI when there is no other
-			 * SE to switch to.
-			 */
-			*num_patches = 1;
-		}
 	}

+	/* The VGT HS block increments the patch ID unconditionally
+	 * within a single threadgroup. This results in incorrect
+	 * patch IDs when instanced draws are used.
+	 *
+	 * The intended solution is to restrict threadgroups to
+	 * a single instance by setting SWITCH_ON_EOI, which
+	 * should cause IA to split instances up. However, this
+	 * doesn't work correctly on SI when there is no other
+	 * SE to switch to.
+	 *
+	 * The restriction is only applied when a tessellation shader
+	 * uses the primitive ID (tess_uses_primid).
+	 */
+	if (has_primid_instancing_bug && tess_uses_primid)
+		*num_patches = 1;
+
 	sctx->last_num_patches = *num_patches;

 	output_patch0_offset = input_patch_size * *num_patches;
--- a/src/gallium/drivers/svga/svga_screen_cache.c
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -362,7 +362,21 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
         /* It is now safe to invalidate the surface content.
          * It will be done using the current context.
          */
-         svga->swc->surface_invalidate(svga->swc, entry->handle);
+         if (svga->swc->surface_invalidate(svga->swc, entry->handle) != PIPE_OK) {
+            enum pipe_error ret;
+
+            /* Even though this surface invalidation is issued after the
+             * command buffer has been flushed, it can still fail:
+             * enough of these invalidate commands may have accumulated
+             * to fill up the command buffer.  In that case, call the
+             * winsys flush directly to flush the buffer and retry.
+             * Note, we don't want to call svga_context_flush() here because
+             * this function itself is called inside svga_context_flush().
+             */
+            svga->swc->flush(svga->swc, NULL);
+            ret = svga->swc->surface_invalidate(svga->swc, entry->handle);
+            assert(ret == PIPE_OK);
+         }

         /* add the entry to the invalidated list */
         LIST_ADD(&entry->head, &cache->invalidated);
--- a/src/gallium/drivers/svga/svga_state_gs.c
+++ b/src/gallium/drivers/svga/svga_state_gs.c
@@ -190,6 +190,8 @@ emit_hw_gs(struct svga_context *svga, unsigned dirty)
          *  Needs to unbind the geometry shader.
          */
         ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_GS, NULL);
+         if (ret != PIPE_OK)
+            goto done;
         svga->state.hw_draw.gs = NULL;
      }
      goto done;
--- a/src/gallium/drivers/svga/svga_surface.c
+++ b/src/gallium/drivers/svga/svga_surface.c
@@ -502,10 +502,10 @@ svga_validate_surface_view(struct svga_context *svga, struct svga_surface *s)
          * need to update the host-side copy with the invalid
          * content when the associated mob is first bound to the surface.
          */
-         ret = SVGA3D_InvalidateGBSurface(svga->swc, stex->handle);
-         if (ret != PIPE_OK) {
-            s = NULL;
-            goto done;
+         if (svga->swc->surface_invalidate(svga->swc, stex->handle) != PIPE_OK) {
+            svga_context_flush(svga, NULL);
+            ret = svga->swc->surface_invalidate(svga->swc, stex->handle);
+            assert(ret == PIPE_OK);
         }
         stex->validated = TRUE;
      }
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -394,7 +394,7 @@ struct svga_winsys_context
   /**
    * Invalidate the content of this surface
    */
-   void
+   enum pipe_error
   (*surface_invalidate)(struct svga_winsys_context *swc,
                         struct svga_winsys_surface *surface);

--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -22,7 +22,7 @@
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc

-AM_CXXFLAGS = $(GALLIUM_DRIVER_CFLAGS) $(SWR_CXX14_CXXFLAGS)
+AM_CXXFLAGS = $(GALLIUM_DRIVER_CFLAGS) $(SWR_CXX11_CXXFLAGS)

 noinst_LTLIBRARIES = libmesaswr.la

@@ -32,7 +32,7 @@ COMMON_CXXFLAGS = \
 	-fno-strict-aliasing \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(LLVM_CXXFLAGS) \
-	$(SWR_CXX14_CXXFLAGS) \
+	$(SWR_CXX11_CXXFLAGS) \
 	-I$(builddir)/rasterizer/codegen \
 	-I$(builddir)/rasterizer/jitter \
 	-I$(builddir)/rasterizer/archrast \
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -38,7 +38,7 @@ loadersource = env.ParseSourceList('Makefile.sources', [

 if not env['msvc'] :
    env.Append(CCFLAGS = [
-        '-std=c++14',
+        '-std=c++11',
    ])

 swrroot = '#src/gallium/drivers/swr/'
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -953,26 +953,27 @@ public:


 private:
+    template <typename MaskT>
+    INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max) // @llvm_func_start
+    {
+        __m128i vMin = _mm_set1_epi32(*min);
+        __m128i vMax = _mm_set1_epi32(*max);
+        return _simd_blend4_epi32<MaskT::value>(vMin, vMax);
+    }  // @llvm_func_end
+
    INLINE void CalcTileSampleOffsets(int numSamples)   // @llvm_func_start
-    {                                                                      
-        auto expandThenBlend4 = [](uint32_t* min, uint32_t* max, auto mask)
-        {
-            __m128i vMin = _mm_set1_epi32(*min);
-            __m128i vMax = _mm_set1_epi32(*max);
-            return _simd_blend4_epi32<decltype(mask)::value>(vMin, vMax);
-        };
-                                                                           
+    {
        auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]);
        auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]);
-        std::integral_constant<int, 0xA> xMask;
+        using xMask = std::integral_constant<int, 0xA>;
        // BR(max),    BL(min),    UR(max),    UL(min)
-        tileSampleOffsetsX = expandThenBlend4(minXi, maxXi, xMask);
-        
+        tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
+
        auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]);
        auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]);
-        std::integral_constant<int, 0xC> yMask;
+        using yMask = std::integral_constant<int, 0xC>;
        // BR(max),    BL(min),    UR(max),    UL(min)
-        tileSampleOffsetsY = expandThenBlend4(minYi, maxYi, yMask);
+        tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
    };  // @llvm_func_end
    // scalar sample values
    uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES];
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -101,6 +101,8 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile,
   if (num_entrypoints == 0)
      return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;

+   assert(*num_entrypoints <= ctx->max_entrypoints);
+
   return VA_STATUS_SUCCESS;
 }

--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -169,7 +169,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
   *ctx->vtable = vtable;
   *ctx->vtable_vpp = vtable_vpp;
   ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN;
-   ctx->max_entrypoints = 1;
+   ctx->max_entrypoints = 2;
   ctx->max_attributes = 1;
   ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS;
   ctx->max_subpic_formats = 1;
--- a/src/gallium/targets/libgl-xlib/Makefile.am
+++ b/src/gallium/targets/libgl-xlib/Makefile.am
@@ -54,6 +54,7 @@ lib@GL_LIB@_la_SOURCES = xlib.c
 lib@GL_LIB@_la_LDFLAGS = \
 	-no-undefined \
 	-version-number $(GL_MAJOR):$(GL_MINOR):$(GL_TINY) \
+	$(BSYMBOLIC) \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)

--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -747,10 +747,13 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
      p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references);
      amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL);
   }
+   for (i = 0; i < cs->num_fence_dependencies; i++)
+      amdgpu_fence_reference(&cs->fence_dependencies[i], NULL);

   cs->num_real_buffers = 0;
   cs->num_slab_buffers = 0;
   cs->num_sparse_buffers = 0;
+   cs->num_fence_dependencies = 0;
   amdgpu_fence_reference(&cs->fence, NULL);

   memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
@@ -765,7 +768,7 @@ static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
   FREE(cs->handles);
   FREE(cs->slab_buffers);
   FREE(cs->sparse_buffers);
-   FREE(cs->request.dependencies);
+   FREE(cs->fence_dependencies);
 }


@@ -976,7 +979,6 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
 {
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_winsys_bo *bo = buffer->bo;
-   struct amdgpu_cs_fence *dep;
   unsigned new_num_fences = 0;

   for (unsigned j = 0; j < bo->num_fences; ++j) {
@@ -998,21 +1000,21 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
      if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
         continue;

-      if (bo_fence->submission_in_progress)
-         os_wait_until_zero(&bo_fence->submission_in_progress,
-                            PIPE_TIMEOUT_INFINITE);
-
-      idx = cs->request.number_of_dependencies++;
-      if (idx >= cs->max_dependencies) {
+      idx = cs->num_fence_dependencies++;
+      if (idx >= cs->max_fence_dependencies) {
         unsigned size;
+         const unsigned increment = 8;

-         cs->max_dependencies = idx + 8;
-         size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
-         cs->request.dependencies = realloc(cs->request.dependencies, size);
+         cs->max_fence_dependencies = idx + increment;
+         size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]);
+         cs->fence_dependencies = realloc(cs->fence_dependencies, size);
+         /* Clear the newly-allocated elements. */
+         memset(cs->fence_dependencies + idx, 0,
+                increment * sizeof(cs->fence_dependencies[0]));
      }

-      dep = &cs->request.dependencies[idx];
-      memcpy(dep, &bo_fence->fence, sizeof(*dep));
+      amdgpu_fence_reference(&cs->fence_dependencies[idx],
+                             (struct pipe_fence_handle*)bo_fence);
   }

   for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
@@ -1083,7 +1085,7 @@ static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
 {
   struct amdgpu_cs_context *cs = acs->csc;

-   cs->request.number_of_dependencies = 0;
+   cs->num_fence_dependencies = 0;

   amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
   amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
@@ -1131,7 +1133,30 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
   struct amdgpu_winsys *ws = acs->ctx->ws;
   struct amdgpu_cs_context *cs = acs->cst;
   int i, r;
+   struct amdgpu_cs_fence *dependencies = NULL;

+   /* Set dependencies (input fences). */
+   if (cs->num_fence_dependencies) {
+      dependencies = alloca(sizeof(dependencies[0]) *
+                            cs->num_fence_dependencies);
+      unsigned num = 0;
+
+      for (i = 0; i < cs->num_fence_dependencies; i++) {
+         struct amdgpu_fence *fence =
+            (struct amdgpu_fence*)cs->fence_dependencies[i];
+
+         /* Past fences can't be unsubmitted because we have only 1 CS thread. */
+         assert(!fence->submission_in_progress);
+         memcpy(&dependencies[num++], &fence->fence, sizeof(dependencies[0]));
+      }
+      cs->request.dependencies = dependencies;
+      cs->request.number_of_dependencies = num;
+   } else {
+      cs->request.dependencies = NULL;
+      cs->request.number_of_dependencies = 0;
+   }
+
+   /* Set the output fence. */
   cs->request.fence_info.handle = NULL;
   if (amdgpu_cs_has_user_fence(cs)) {
 	cs->request.fence_info.handle = acs->ctx->user_fence_bo;
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -105,7 +105,9 @@ struct amdgpu_cs_context {
   unsigned                    last_added_bo_usage;
   uint64_t                    last_added_bo_priority_usage;

-   unsigned                    max_dependencies;
+   struct pipe_fence_handle    **fence_dependencies;
+   unsigned                    num_fence_dependencies;
+   unsigned                    max_fence_dependencies;

   struct pipe_fence_handle    *fence;

--- a/src/gallium/winsys/svga/drm/vmw_surface.c
+++ b/src/gallium/winsys/svga/drm/vmw_surface.c
@@ -176,7 +176,7 @@ vmw_svga_winsys_surface_unmap(struct svga_winsys_context *swc,
   mtx_unlock(&vsrf->mutex);
 }

-void
+enum pipe_error
 vmw_svga_winsys_surface_invalidate(struct svga_winsys_context *swc,
                                   struct svga_winsys_surface *surf)
 {
@@ -186,6 +186,7 @@ vmw_svga_winsys_surface_invalidate(struct svga_winsys_context *swc,
    * when guest-backed surface is enabled, that implies DMA is always enabled;
    * hence, surface invalidation is not needed.
    */
+   return PIPE_OK;
 }

 void
--- a/src/gallium/winsys/svga/drm/vmw_surface.h
+++ b/src/gallium/winsys/svga/drm/vmw_surface.h
@@ -94,7 +94,7 @@ void
 vmw_svga_winsys_surface_unmap(struct svga_winsys_context *swc,
                              struct svga_winsys_surface *srf,
                              boolean *rebind);
-void
+enum pipe_error
 vmw_svga_winsys_surface_invalidate(struct svga_winsys_context *swc,
                                   struct svga_winsys_surface *srf);

--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -235,6 +235,11 @@ dri3_bind_context(struct glx_context *context, struct glx_context *old,
   if (!(*psc->core->bindContext) (pcp->driContext, dri_draw, dri_read))
      return GLXBadContext;

+   if (dri_draw)
+      (*psc->f->invalidate)(dri_draw);
+   if (dri_read && dri_read != dri_draw)
+      (*psc->f->invalidate)(dri_read);
+
   return Success;
 }

@@ -493,6 +498,7 @@ dri3_flush_front_buffer(__DRIdrawable *driDrawable, void *loaderPrivate)

   loader_dri3_flush(draw, __DRI2_FLUSH_DRAWABLE, __DRI2_THROTTLE_FLUSHFRONT);

+   (*psc->f->invalidate)(driDrawable);
   loader_dri3_wait_gl(draw);
 }

--- a/src/intel/common/gen_device_info.c
+++ b/src/intel/common/gen_device_info.c
@@ -132,6 +132,7 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
 static const struct gen_device_info gen_device_info_ivb_gt1 = {
   GEN7_FEATURES, .is_ivybridge = true, .gt = 1,
   .num_slices = 1,
+   .l3_banks = 2,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
@@ -156,6 +157,7 @@ static const struct gen_device_info gen_device_info_ivb_gt1 = {
 static const struct gen_device_info gen_device_info_ivb_gt2 = {
   GEN7_FEATURES, .is_ivybridge = true, .gt = 2,
   .num_slices = 1,
+   .l3_banks = 4,
   .max_vs_threads = 128,
   .max_tcs_threads = 128,
   .max_tes_threads = 128,
@@ -180,6 +182,7 @@ static const struct gen_device_info gen_device_info_ivb_gt2 = {
 static const struct gen_device_info gen_device_info_byt = {
   GEN7_FEATURES, .is_baytrail = true, .gt = 1,
   .num_slices = 1,
+   .l3_banks = 1,
   .has_llc = false,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
@@ -211,6 +214,7 @@ static const struct gen_device_info gen_device_info_byt = {
 static const struct gen_device_info gen_device_info_hsw_gt1 = {
   HSW_FEATURES, .gt = 1,
   .num_slices = 1,
+   .l3_banks = 2,
   .max_vs_threads = 70,
   .max_tcs_threads = 70,
   .max_tes_threads = 70,
@@ -235,6 +239,7 @@ static const struct gen_device_info gen_device_info_hsw_gt1 = {
 static const struct gen_device_info gen_device_info_hsw_gt2 = {
   HSW_FEATURES, .gt = 2,
   .num_slices = 1,
+   .l3_banks = 4,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
@@ -259,6 +264,7 @@ static const struct gen_device_info gen_device_info_hsw_gt2 = {
 static const struct gen_device_info gen_device_info_hsw_gt3 = {
   HSW_FEATURES, .gt = 3,
   .num_slices = 2,
+   .l3_banks = 8,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
@@ -299,6 +305,7 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = {
 static const struct gen_device_info gen_device_info_bdw_gt1 = {
   GEN8_FEATURES, .gt = 1,
   .num_slices = 1,
+   .l3_banks = 2,
   .max_cs_threads = 42,
   .urb = {
      .size = 192,
@@ -318,6 +325,7 @@ static const struct gen_device_info gen_device_info_bdw_gt1 = {
 static const struct gen_device_info gen_device_info_bdw_gt2 = {
   GEN8_FEATURES, .gt = 2,
   .num_slices = 1,
+   .l3_banks = 4,
   .max_cs_threads = 56,
   .urb = {
      .size = 384,
@@ -337,6 +345,7 @@ static const struct gen_device_info gen_device_info_bdw_gt2 = {
 static const struct gen_device_info gen_device_info_bdw_gt3 = {
   GEN8_FEATURES, .gt = 3,
   .num_slices = 2,
+   .l3_banks = 8,
   .max_cs_threads = 56,
   .urb = {
      .size = 384,
@@ -357,6 +366,7 @@ static const struct gen_device_info gen_device_info_chv = {
   GEN8_FEATURES, .is_cherryview = 1, .gt = 1,
   .has_llc = false,
   .num_slices = 1,
+   .l3_banks = 2,
   .max_vs_threads = 80,
   .max_tcs_threads = 80,
   .max_tes_threads = 80,
@@ -457,22 +467,26 @@ static const struct gen_device_info gen_device_info_chv = {
 static const struct gen_device_info gen_device_info_skl_gt1 = {
   GEN9_FEATURES, .gt = 1,
   .num_slices = 1,
+   .l3_banks = 2,
   .urb.size = 192,
 };

 static const struct gen_device_info gen_device_info_skl_gt2 = {
   GEN9_FEATURES, .gt = 2,
   .num_slices = 1,
+   .l3_banks = 4,
 };

 static const struct gen_device_info gen_device_info_skl_gt3 = {
   GEN9_FEATURES, .gt = 3,
   .num_slices = 2,
+   .l3_banks = 8,
 };

 static const struct gen_device_info gen_device_info_skl_gt4 = {
   GEN9_FEATURES, .gt = 4,
   .num_slices = 3,
+   .l3_banks = 12,
   /* From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions.  This is not a
@@ -485,11 +499,13 @@ static const struct gen_device_info gen_device_info_skl_gt4 = {
 };

 static const struct gen_device_info gen_device_info_bxt = {
-   GEN9_LP_FEATURES
+   GEN9_LP_FEATURES,
+   .l3_banks = 2,
 };

 static const struct gen_device_info gen_device_info_bxt_2x6 = {
-   GEN9_LP_FEATURES_2X6
+   GEN9_LP_FEATURES_2X6,
+   .l3_banks = 1,
 };
 /*
 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
@@ -504,6 +520,7 @@ static const struct gen_device_info gen_device_info_kbl_gt1 = {
   .max_cs_threads = 7 * 6,
   .urb.size = 192,
   .num_slices = 1,
+   .l3_banks = 2,
 };

 static const struct gen_device_info gen_device_info_kbl_gt1_5 = {
@@ -513,6 +530,7 @@ static const struct gen_device_info gen_device_info_kbl_gt1_5 = {

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
+   .l3_banks = 4,
 };

 static const struct gen_device_info gen_device_info_kbl_gt2 = {
@@ -521,6 +539,7 @@ static const struct gen_device_info gen_device_info_kbl_gt2 = {
   .gt = 2,

   .num_slices = 1,
+   .l3_banks = 4,
 };

 static const struct gen_device_info gen_device_info_kbl_gt3 = {
@@ -529,6 +548,7 @@ static const struct gen_device_info gen_device_info_kbl_gt3 = {
   .gt = 3,

   .num_slices = 2,
+   .l3_banks = 8,
 };

 static const struct gen_device_info gen_device_info_kbl_gt4 = {
@@ -548,12 +568,15 @@ static const struct gen_device_info gen_device_info_kbl_gt4 = {
    */
   .urb.size = 1008 / 3,
   .num_slices = 3,
+   .l3_banks = 12,
 };

 static const struct gen_device_info gen_device_info_glk = {
-   GEN9_LP_FEATURES
+   GEN9_LP_FEATURES,
+   .l3_banks = 2,
 };

+/* TODO: Initialize l3_banks when we know the number. */
 static const struct gen_device_info gen_device_info_glk_2x6 = {
   GEN9_LP_FEATURES_2X6
 };
--- a/src/intel/common/gen_device_info.h
+++ b/src/intel/common/gen_device_info.h
@@ -96,6 +96,7 @@ struct gen_device_info
    * to change, so we program @max_cs_threads as the lower maximum.
    */
   unsigned num_slices;
+   unsigned l3_banks;
   unsigned max_vs_threads;   /**< Maximum Vertex Shader threads */
   unsigned max_tcs_threads;  /**< Maximum Hull Shader threads */
   unsigned max_tes_threads;  /**< Maximum Domain Shader threads */
--- a/src/intel/common/gen_l3_config.c
+++ b/src/intel/common/gen_l3_config.c
@@ -101,6 +101,20 @@ static const struct gen_l3_config chv_l3_configs[] = {
   {{ 0 }}
 };

+/**
+ * BXT 2x6 validated L3 configurations.  \sa ivb_l3_configs.
+ */
+static const struct gen_l3_config bxt_2x6_l3_configs[] = {
+   /* SLM URB ALL DC  RO  IS   C   T */
+   {{  0, 32, 48,  0,  0,  0,  0,  0 }},
+   {{  0, 32,  0,  8, 40,  0,  0,  0 }},
+   {{  0, 32,  0, 32, 16,  0,  0,  0 }},
+   {{ 16, 16, 48,  0,  0,  0,  0,  0 }},
+   {{ 16, 16,  0, 40,  8,  0,  0,  0 }},
+   {{ 16, 16,  0, 16, 32,  0,  0,  0 }},
+   {{ 0 }}
+};
+
 /**
 * Return a zero-terminated array of validated L3 configurations for the
 * specified device.
@@ -116,6 +130,8 @@ get_l3_configs(const struct gen_device_info *devinfo)
      return (devinfo->is_cherryview ? chv_l3_configs : bdw_l3_configs);

   case 9:
+      if (devinfo->l3_banks == 1)
+	 return bxt_2x6_l3_configs;
      return chv_l3_configs;

   default:
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -822,7 +822,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
   anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
                   .SLMEnable = has_slm,
                   .URBLowBandwidth = urb_low_bw,
-                   .URBAllocation = cfg->n[GEN_L3P_URB],
+                   .URBAllocation = cfg->n[GEN_L3P_URB] - n0_urb,
 #if !GEN_IS_HASWELL
                   .ALLAllocation = cfg->n[GEN_L3P_ALL],
 #endif
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -245,3 +245,6 @@ es2api/glapi_mapi_tmp.h: glapi/gen/gl_and_es_API.xml $(glapi_gen_mapi_deps)
 		$(srcdir)/glapi/gen/gl_and_es_API.xml > $@

 include $(top_srcdir)/install-lib-links.mk
+
+khrdir = $(includedir)/KHR
+khr_HEADERS = $(top_srcdir)/include/KHR/khrplatform.h
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -79,12 +79,13 @@
 #define I915_CTXREG_STATE4		0
 #define I915_CTXREG_LI			1
 #define I915_CTXREG_LIS2		2
-#define I915_CTXREG_LIS4		3
-#define I915_CTXREG_LIS5		4
-#define I915_CTXREG_LIS6		5
-#define I915_CTXREG_BF_STENCIL_OPS	6
-#define I915_CTXREG_BF_STENCIL_MASKS	7
-#define I915_CTX_SETUP_SIZE		8
+#define I915_CTXREG_LIS3		3
+#define I915_CTXREG_LIS4		4
+#define I915_CTXREG_LIS5		5
+#define I915_CTXREG_LIS6		6
+#define I915_CTXREG_BF_STENCIL_OPS	7
+#define I915_CTXREG_BF_STENCIL_MASKS	8
+#define I915_CTX_SETUP_SIZE		9

 #define I915_BLENDREG_IAB		0
 #define I915_BLENDREG_BLENDCOLOR0	1
@@ -116,6 +117,7 @@ enum {
 };

 #define I915_TEX_UNITS 8
+#define I915_WPOS_TEX_INVALID 0xff

 #define I915_MAX_CONSTANT      32
 #define I915_CONSTANT_SIZE     (2+(4*I915_MAX_CONSTANT))
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -1063,7 +1063,7 @@ check_wpos(struct i915_fragment_program *p)
   GLint i;
   unsigned unit = 0;

-   p->wpos_tex = -1;
+   p->wpos_tex = I915_WPOS_TEX_INVALID;

   if ((inputs & VARYING_BIT_POS) == 0)
      return;
@@ -1238,6 +1238,7 @@ i915ValidateFragmentProgram(struct i915_context *i915)
   const GLbitfield64 inputsRead = p->FragProg.info.inputs_read;
   GLuint s4 = i915->state.Ctx[I915_CTXREG_LIS4] & ~S4_VFMT_MASK;
   GLuint s2 = S2_TEXCOORD_NONE;
+   GLuint s3 = 0;
   int i, offset = 0;

   /* Important:
@@ -1252,12 +1253,10 @@ i915ValidateFragmentProgram(struct i915_context *i915)
   intel->coloroffset = 0;
   intel->specoffset = 0;

-   if (inputsRead & VARYING_BITS_TEX_ANY || p->wpos_tex != -1) {
-      EMIT_ATTR(_TNL_ATTRIB_POS, EMIT_4F_VIEWPORT, S4_VFMT_XYZW, 16);
-   }
-   else {
-      EMIT_ATTR(_TNL_ATTRIB_POS, EMIT_3F_VIEWPORT, S4_VFMT_XYZ, 12);
-   }
+   /* Always emit W to get consistent perspective
+    * correct interpolation of primary/secondary colors.
+    */
+   EMIT_ATTR(_TNL_ATTRIB_POS, EMIT_4F_VIEWPORT, S4_VFMT_XYZW, 16);

   /* Handle gl_PointSize builtin var here */
   if (ctx->Point._Attenuated || ctx->VertexProgram.PointSizeEnabled)
@@ -1303,6 +1302,7 @@ i915ValidateFragmentProgram(struct i915_context *i915)
          */
         s2 &= ~S2_TEXCOORD_FMT(i, S2_TEXCOORD_FMT0_MASK);
         s2 |= S2_TEXCOORD_FMT(i, SZ_TO_HW(wpos_size));
+         s3 |= S3_TEXCOORD_PERSPECTIVE_DISABLE(i);

         intel->wpos_offset = offset;
         EMIT_PAD(wpos_size);
@@ -1310,6 +1310,7 @@ i915ValidateFragmentProgram(struct i915_context *i915)
   }

   if (s2 != i915->state.Ctx[I915_CTXREG_LIS2] ||
+       s3 != i915->state.Ctx[I915_CTXREG_LIS3] ||
       s4 != i915->state.Ctx[I915_CTXREG_LIS4]) {
      I915_STATECHANGE(i915, I915_UPLOAD_CTX);

@@ -1328,6 +1329,7 @@ i915ValidateFragmentProgram(struct i915_context *i915)
      intel->vertex_size >>= 2;

      i915->state.Ctx[I915_CTXREG_LIS2] = s2;
+      i915->state.Ctx[I915_CTXREG_LIS3] = s3;
      i915->state.Ctx[I915_CTXREG_LIS4] = s4;

      assert(intel->vtbl.check_vertex_size(intel, intel->vertex_size));
--- a/src/mesa/drivers/dri/i915/i915_program.c
+++ b/src/mesa/drivers/dri/i915/i915_program.c
@@ -482,7 +482,7 @@ i915_init_program(struct i915_context *i915, struct i915_fragment_program *p)
   p->decl_t = 0;
   p->temp_flag = 0xffff000;
   p->utemp_flag = ~0x7;
-   p->wpos_tex = -1;
+   p->wpos_tex = I915_WPOS_TEX_INVALID;
   p->depth_written = 0;
   p->nr_params = 0;

--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -925,11 +925,12 @@ i915_init_packets(struct i915_context *i915)
       * piece changes.
       */
      i915->state.Ctx[I915_CTXREG_LI] = (_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
-                                         I1_LOAD_S(2) |
-                                         I1_LOAD_S(4) |
-                                         I1_LOAD_S(5) | I1_LOAD_S(6) | (3));
+                                         I1_LOAD_S(2) | I1_LOAD_S(3) |
+                                         I1_LOAD_S(4) | I1_LOAD_S(5) |
+                                         I1_LOAD_S(6) | (4));
      i915->state.Ctx[I915_CTXREG_LIS2] = 0;
      i915->state.Ctx[I915_CTXREG_LIS4] = 0;
+      i915->state.Ctx[I915_CTXREG_LIS3] = 0;
      i915->state.Ctx[I915_CTXREG_LIS5] = 0;

      if (i915->intel.ctx.Visual.rgbBits == 16)
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -176,7 +176,7 @@ i915_emit_invarient_state(struct intel_context *intel)
 {
   BATCH_LOCALS;

-   BEGIN_BATCH(17);
+   BEGIN_BATCH(15);

   OUT_BATCH(_3DSTATE_AA_CMD |
             AA_LINE_ECAAR_WIDTH_ENABLE |
@@ -200,11 +200,6 @@ i915_emit_invarient_state(struct intel_context *intel)
             CSB_TCB(3, 3) |
             CSB_TCB(4, 4) | CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));

-   /* Need to initialize this to zero.
-    */
-   OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0));
-   OUT_BATCH(0);
-
   OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
   OUT_BATCH(0);
   OUT_BATCH(0);
--- a/src/mesa/drivers/dri/i915/intel_reg.h
+++ b/src/mesa/drivers/dri/i915/intel_reg.h
@@ -93,7 +93,11 @@
 #define S2_TEX_COUNT_SHIFT_830		12
 #define S2_VERTEX_1_WIDTH_SHIFT_830	0
 #define S2_VERTEX_0_WIDTH_SHIFT_830	6
-/* S3 not interesting */
+
+#define S3_TEXCOORD_WRAP_SHORTEST_TCX(unit)	(1<<((unit)*4+3))
+#define S3_TEXCOORD_WRAP_SHORTEST_TCY(unit)	(1<<((unit)*4+2))
+#define S3_TEXCOORD_WRAP_SHORTEST_TCZ(unit)	(1<<((unit)*4+1))
+#define S3_TEXCOORD_PERSPECTIVE_DISABLE(unit)	(1<<((unit)*4+0))

 #define S4_POINT_WIDTH_SHIFT           23
 #define S4_POINT_WIDTH_MASK            (0x1ff<<23)
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -887,9 +887,7 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
       * and again afterwards to ensure that the resolve is complete before we
       * do any more regular drawing.
       */
-      brw_emit_pipe_control_flush(brw,
-                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                                  PIPE_CONTROL_CS_STALL);
+      brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH);

      struct blorp_batch batch;
      blorp_batch_init(&brw->blorp, &batch, brw, 0);
@@ -899,9 +897,7 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
                       x0, y0, x1, y1);
      blorp_batch_finish(&batch);

-      brw_emit_pipe_control_flush(brw,
-                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                                  PIPE_CONTROL_CS_STALL);
+      brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH);

      /* Now that the fast clear has occurred, put the buffer in
       * INTEL_FAST_CLEAR_STATE_CLEAR so that we won't waste time doing
@@ -1001,9 +997,7 @@ brw_blorp_resolve_color(struct brw_context *brw, struct intel_mipmap_tree *mt,
    * and again afterwards to ensure that the resolve is complete before we
    * do any more regular drawing.
    */
-   brw_emit_pipe_control_flush(brw,
-                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                               PIPE_CONTROL_CS_STALL);
+   brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH);


   struct blorp_batch batch;
@@ -1014,9 +1008,7 @@ brw_blorp_resolve_color(struct brw_context *brw, struct intel_mipmap_tree *mt,
   blorp_batch_finish(&batch);

   /* See comment above */
-   brw_emit_pipe_control_flush(brw,
-                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                               PIPE_CONTROL_CS_STALL);
+   brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH);
 }

 static void
@@ -1047,7 +1039,8 @@ gen6_blorp_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
 */
 void
 intel_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
-	       unsigned int level, unsigned int layer, enum blorp_hiz_op op)
+               unsigned int level, unsigned int start_layer,
+               unsigned int num_layers, enum blorp_hiz_op op)
 {
   const char *opname = NULL;

@@ -1066,12 +1059,85 @@ intel_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
      break;
   }

-   DBG("%s %s to mt %p level %d layer %d\n",
-       __func__, opname, mt, level, layer);
+   DBG("%s %s to mt %p level %d layers %d-%d\n",
+       __func__, opname, mt, level, start_layer, start_layer + num_layers - 1);
+
+   /* The following stalls and flushes are only documented to be required for
+    * HiZ clear operations.  However, they also seem to be required for the
+    * HiZ resolve operation which is basically the same as a fast clear only a
+    * different value is written into the HiZ surface.
+    */
+   if (op == BLORP_HIZ_OP_DEPTH_CLEAR || op == BLORP_HIZ_OP_HIZ_RESOLVE) {
+      if (brw->gen == 6) {
+         /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+          *
+          *   "If other rendering operations have preceded this clear, a
+          *   PIPE_CONTROL with write cache flush enabled and Z-inhibit
+          *   disabled must be issued before the rectangle primitive used for
+          *   the depth buffer clear operation."
+          */
+          brw_emit_pipe_control_flush(brw,
+                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                      PIPE_CONTROL_CS_STALL);
+      } else if (brw->gen >= 7) {
+         /*
+          * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+          *
+          *   If other rendering operations have preceded this clear, a
+          *   PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+          *   enabled must be issued before the rectangle primitive used for
+          *   the depth buffer clear operation.
+          *
+          * Same applies for Gen8 and Gen9.
+          *
+          * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1
+          * PIPE_CONTROL, Depth Cache Flush Enable:
+          *
+          *   This bit must not be set when Depth Stall Enable bit is set in
+          *   this packet.
+          *
+          * This is confirmed to hold for real, HSW gets immediate gpu hangs.
+          *
+          * Therefore issue two pipe control flushes, one for cache flush and
+          * another for depth stall.
+          */
+          brw_emit_pipe_control_flush(brw,
+                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                      PIPE_CONTROL_CS_STALL);
+
+          brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+      }
+   }

   if (brw->gen >= 8) {
-      gen8_hiz_exec(brw, mt, level, layer, op);
+      for (unsigned a = 0; a < num_layers; a++)
+         gen8_hiz_exec(brw, mt, level, start_layer + a, op);
   } else {
-      gen6_blorp_hiz_exec(brw, mt, level, layer, op);
+      for (unsigned a = 0; a < num_layers; a++)
+         gen6_blorp_hiz_exec(brw, mt, level, start_layer + a, op);
+   }
+
+
+   /* The following stalls and flushes are only documented to be required for
+    * HiZ clear operations.  However, they also seem to be required for the
+    * HiZ resolve operation which is basically the same as a fast clear only a
+    * different value is written into the HiZ surface.
+    */
+   if (op == BLORP_HIZ_OP_DEPTH_CLEAR || op == BLORP_HIZ_OP_HIZ_RESOLVE) {
+      if (brw->gen == 6) {
+         /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+          *
+          *     "[DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be
+          *     followed by a PIPE_CONTROL command with DEPTH_STALL bit set
+          *     and Then followed by Depth FLUSH"
+         */
+         brw_emit_pipe_control_flush(brw,
+                                     PIPE_CONTROL_DEPTH_STALL);
+
+         brw_emit_pipe_control_flush(brw,
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+      }
   }
 }
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -70,7 +70,8 @@ brw_blorp_resolve_color(struct brw_context *brw,

 void
 intel_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
-	       unsigned int level, unsigned int layer, enum blorp_hiz_op op);
+               unsigned int level, unsigned int start_layer,
+               unsigned int num_layers, enum blorp_hiz_op op);

 void gen6_blorp_exec(struct blorp_batch *batch,
                     const struct blorp_params *params);
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -175,71 +175,13 @@ brw_fast_clear_depth(struct gl_context *ctx)
      mt->depth_clear_value = depth_clear_value;
   }

-   if (brw->gen == 6) {
-      /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
-       *
-       *   "If other rendering operations have preceded this clear, a
-       *    PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
-       *    must be issued before the rectangle primitive used for the depth
-       *    buffer clear operation.
-       */
-       brw_emit_pipe_control_flush(brw,
-                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                                   PIPE_CONTROL_CS_STALL);
-   } else if (brw->gen >= 7) {
-      /*
-       * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
-       *
-       *   If other rendering operations have preceded this clear, a
-       *   PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
-       *   enabled must be issued before the rectangle primitive used for the
-       *   depth buffer clear operation.
-       *
-       * Same applies for Gen8 and Gen9.
-       *
-       * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1 PIPE_CONTROL,
-       * Depth Cache Flush Enable:
-       *
-       *   This bit must not be set when Depth Stall Enable bit is set in
-       *   this packet.
-       *
-       * This is confirmed to hold for real, HSW gets immediate gpu hangs.
-       *
-       * Therefore issue two pipe control flushes, one for cache flush and
-       * another for depth stall.
-       */
-       brw_emit_pipe_control_flush(brw,
-                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                                   PIPE_CONTROL_CS_STALL);
-
-       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
-   }
-
   if (fb->MaxNumLayers > 0) {
-      for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
-         intel_hiz_exec(brw, mt, depth_irb->mt_level,
-                        depth_irb->mt_layer + layer,
-                        BLORP_HIZ_OP_DEPTH_CLEAR);
-      }
-   } else {
-      intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
+      intel_hiz_exec(brw, mt, depth_irb->mt_level,
+                     depth_irb->mt_layer, depth_irb->layer_count,
+                     BLORP_HIZ_OP_DEPTH_CLEAR);
+   } else {
+      intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer, 1,
                     BLORP_HIZ_OP_DEPTH_CLEAR);
-   }
-
-   if (brw->gen == 6) {
-      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
-       *
-       *     "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be followed
-       *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
-       *      followed by Depth FLUSH'
-      */
-      brw_emit_pipe_control_flush(brw,
-                                  PIPE_CONTROL_DEPTH_STALL);
-
-      brw_emit_pipe_control_flush(brw,
-                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                                  PIPE_CONTROL_CS_STALL);
   }

   /* Now, the HiZ buffer contains data that needs to be resolved to the depth
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -555,7 +555,7 @@ brw_initialize_context_constants(struct brw_context *brw)
   ctx->Const.Max3DTextureLevels = 12; /* 2048 */
   ctx->Const.MaxArrayTextureLayers = brw->gen >= 7 ? 2048 : 512;
   ctx->Const.MaxTextureMbytes = 1536;
-   ctx->Const.MaxTextureRectSize = 1 << 12;
+   ctx->Const.MaxTextureRectSize = brw->gen >= 7 ? 16384 : 8192;
   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
   ctx->Const.MaxTextureLodBias = 15.0;
   ctx->Const.StripTextureBorder = true;
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1700,7 +1700,8 @@ void brw_fini_pipe_control(struct brw_context *brw);
 void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
 void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                                 struct brw_bo *bo, uint32_t offset,
-                                 uint32_t imm_lower, uint32_t imm_upper);
+                                 uint64_t imm);
+void brw_emit_end_of_pipe_sync(struct brw_context *brw, uint32_t flags);
 void brw_emit_mi_flush(struct brw_context *brw);
 void brw_emit_post_sync_nonzero_flush(struct brw_context *brw);
 void brw_emit_depth_stall_flushes(struct brw_context *brw);
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -664,15 +664,16 @@ brw_prepare_vertices(struct brw_context *brw)
 	    ptr = glarray->Ptr;
 	 }
 	 else if (interleaved != glarray->StrideB ||
+                  glarray->InstanceDivisor != 0 ||
                  glarray->Ptr < ptr ||
                  (uintptr_t)(glarray->Ptr - ptr) + glarray->_ElementSize > interleaved)
 	 {
            /* If our stride is different from the first attribute's stride,
-             * or if the first attribute's stride didn't cover our element,
-             * disable the interleaved upload optimization.  The second case
-             * can most commonly occur in cases where there is a single vertex
-             * and, for example, the data is stored on the application's
-             * stack.
+             * or if we are using an instance divisor or if the first
+             * attribute's stride didn't cover our element, disable the
+             * interleaved upload optimization.  The last case can most
+             * commonly occur in cases where there is a single vertex and, for
+             * example, the data is stored on the application's stack.
             *
             * NOTE: This will also disable the optimization in cases where
             * the data is in a different order than the array indices.
@@ -727,6 +728,7 @@ brw_prepare_vertices(struct brw_context *brw)
 				 buffer, interleaved);
 	 buffer->offset -= delta * interleaved;
         buffer->size += delta * interleaved;
+         buffer->step_rate = 0;

 	 for (i = 0; i < nr_uploads; i++) {
 	    /* Then, just point upload[i] at upload[0]'s buffer. */
--- a/src/mesa/drivers/dri/i965/brw_meta_util.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_util.c
@@ -357,6 +357,46 @@ brw_meta_convert_fast_clear_color(const struct brw_context *brw,
      break;
   }

+   switch (_mesa_get_format_datatype(mt->format)) {
+   case GL_UNSIGNED_NORMALIZED:
+      for (int i = 0; i < 4; i++)
+         override_color.f[i] = CLAMP(override_color.f[i], 0.0f, 1.0f);
+      break;
+
+   case GL_SIGNED_NORMALIZED:
+      for (int i = 0; i < 4; i++)
+         override_color.f[i] = CLAMP(override_color.f[i], -1.0f, 1.0f);
+      break;
+
+   case GL_UNSIGNED_INT:
+      for (int i = 0; i < 4; i++) {
+         unsigned bits = _mesa_get_format_bits(mt->format, GL_RED_BITS + i);
+         if (bits < 32) {
+            uint32_t max = (1u << bits) - 1;
+            override_color.ui[i] = MIN2(override_color.ui[i], max);
+         }
+      }
+      break;
+
+   case GL_INT:
+      for (int i = 0; i < 4; i++) {
+         unsigned bits = _mesa_get_format_bits(mt->format, GL_RED_BITS + i);
+         if (bits < 32) {
+            int32_t max = (1 << (bits - 1)) - 1;
+            int32_t min = -(1 << (bits - 1));
+            override_color.i[i] = CLAMP(override_color.i[i], min, max);
+         }
+      }
+      break;
+
+   case GL_FLOAT:
+      if (!_mesa_is_format_signed(mt->format)) {
+         for (int i = 0; i < 4; i++)
+            override_color.f[i] = MAX2(override_color.f[i], 0.0f);
+      }
+      break;
+   }
+
   if (!_mesa_format_has_color_component(mt->format, 3)) {
      if (_mesa_is_format_integer_color(mt->format))
         override_color.ui[3] = 1;
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -457,6 +457,12 @@ brw_workaround_depthstencil_alignment(struct brw_context *brw,
         brw->depthstencil.stencil_offset =
            (stencil_draw_y & ~tile_mask_y) * stencil_mt->pitch +
            (stencil_draw_x & ~tile_mask_x) * 64;
+      } else if (!depth_irb) {
+         brw->depthstencil.depth_offset =
+            intel_miptree_get_aligned_offset(
+               stencil_mt,
+               stencil_irb->draw_x & ~tile_mask_x,
+               stencil_irb->draw_y & ~tile_mask_y);
      }
   }
 }
@@ -993,6 +999,37 @@ brw_upload_state_base_address(struct brw_context *brw)
    * maybe this isn't required for us in particular.
    */

+   if (brw->gen >= 6) {
+      const unsigned dc_flush =
+         brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
+
+      /* Emit a render target cache flush.
+       *
+       * This isn't documented anywhere in the PRM.  However, it seems to be
+       * necessary prior to changing the surface state base address.  We've
+       * seen issues in Vulkan where we get GPU hangs when using multi-level
+       * command buffers which clear depth, reset state base address, and then
+       * go render stuff.
+       *
+       * Normally, in GL, we would trust the kernel to do sufficient stalls
+       * and flushes prior to executing our batch.  However, it doesn't seem
+       * as if the kernel's flushing is always sufficient and we don't want to
+       * rely on it.
+       *
+       * We make this an end-of-pipe sync instead of a normal flush because we
+       * do not know the current status of the GPU.  On Haswell at least,
+       * having a fast-clear operation in flight at the same time as a normal
+       * rendering operation can cause hangs.  Since the kernel's flushing is
+       * insufficient, we need to ensure that any rendering operations from
+       * other processes are definitely complete before we try to do our own
+       * rendering.  It's a bit of a big hammer but it appears to work.
+       */
+      brw_emit_end_of_pipe_sync(brw,
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                dc_flush);
+   }
+
   if (brw->gen >= 8) {
      uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
      int pkt_len = brw->gen >= 9 ? 19 : 16;
@@ -1096,6 +1133,13 @@ brw_upload_state_base_address(struct brw_context *brw)
       ADVANCE_BATCH();
   }

+   if (brw->gen >= 6) {
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+   }
+
   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -87,6 +87,83 @@ gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
   return 0;
 }

+static void
+brw_emit_pipe_control(struct brw_context *brw, uint32_t flags,
+                      struct brw_bo *bo, uint32_t offset, uint64_t imm)
+{
+   if (brw->gen >= 8) {
+      if (brw->gen == 8)
+         gen8_add_cs_stall_workaround_bits(&flags);
+
+      if (brw->gen == 9 &&
+          (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
+         /* Hardware workaround: SKL
+          *
+          * Emit Pipe Control with all bits set to zero before emitting
+          * a Pipe Control with VF Cache Invalidate set.
+          */
+         brw_emit_pipe_control_flush(brw, 0);
+      }
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      if (bo) {
+         OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION,
+                     I915_GEM_DOMAIN_INSTRUCTION, offset);
+      } else {
+         OUT_BATCH(0);
+         OUT_BATCH(0);
+      }
+      OUT_BATCH(imm);
+      OUT_BATCH(imm >> 32);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      if (brw->gen == 6 &&
+          (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
+         /* Hardware workaround: SNB B-Spec says:
+          *
+          *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
+          *   Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
+          *   required.
+          */
+         brw_emit_post_sync_nonzero_flush(brw);
+      }
+
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
+       * on later platforms.  We always use PPGTT on Gen7+.
+       */
+      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      if (bo) {
+         OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                   gen6_gtt | offset);
+      } else {
+         OUT_BATCH(0);
+      }
+      OUT_BATCH(imm);
+      OUT_BATCH(imm >> 32);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      if (bo) {
+         OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                   PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
+      } else {
+         OUT_BATCH(0);
+      }
+      OUT_BATCH(imm);
+      OUT_BATCH(imm >> 32);
+      ADVANCE_BATCH();
+   }
+}
+
 /**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
@@ -107,64 +184,14 @@ brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
       * caches are coherent with memory once the specified R/O caches are
       * invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
       * invalidation seems to happen at the bottom of the pipeline together
-       * with any write cache flush, so this shouldn't be a concern.
+       * with any write cache flush, so this shouldn't be a concern.  In order
+       * to ensure a full stall, we do an end-of-pipe sync.
       */
-      brw_emit_pipe_control_flush(brw, (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) |
-                                       PIPE_CONTROL_CS_STALL);
+      brw_emit_end_of_pipe_sync(brw, (flags & PIPE_CONTROL_CACHE_FLUSH_BITS));
      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
   }

-   if (brw->gen >= 8) {
-      if (brw->gen == 8)
-         gen8_add_cs_stall_workaround_bits(&flags);
-
-      if (brw->gen == 9 &&
-          (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
-         /* Hardware workaround: SKL
-          *
-          * Emit Pipe Control with all bits set to zero before emitting
-          * a Pipe Control with VF Cache Invalidate set.
-          */
-         brw_emit_pipe_control_flush(brw, 0);
-      }
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
-      OUT_BATCH(flags);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else if (brw->gen >= 6) {
-      if (brw->gen == 6 &&
-          (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
-         /* Hardware workaround: SNB B-Spec says:
-          *
-          *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
-          *   Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
-          *   required.
-          */
-         brw_emit_post_sync_nonzero_flush(brw);
-      }
-
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(flags);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
+   brw_emit_pipe_control(brw, flags, NULL, 0, 0);
 }

 /**
@@ -178,45 +205,9 @@ brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 void
 brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                            struct brw_bo *bo, uint32_t offset,
-                            uint32_t imm_lower, uint32_t imm_upper)
+                            uint64_t imm)
 {
-   if (brw->gen >= 8) {
-      if (brw->gen == 8)
-         gen8_add_cs_stall_workaround_bits(&flags);
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
-      OUT_BATCH(flags);
-      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                  offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   } else if (brw->gen >= 6) {
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
-
-      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
-       * on later platforms.  We always use PPGTT on Gen7+.
-       */
-      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(flags);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                gen6_gtt | offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
-      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   }
+   brw_emit_pipe_control(brw, flags, bo, offset, imm);
 }

 /**
@@ -264,8 +255,7 @@ gen7_emit_vs_workaround_flush(struct brw_context *brw)
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_WRITE_IMMEDIATE
                               | PIPE_CONTROL_DEPTH_STALL,
-                               brw->workaround_bo, 0,
-                               0, 0);
+                               brw->workaround_bo, 0, 0);
 }


@@ -278,11 +268,9 @@ gen7_emit_cs_stall_flush(struct brw_context *brw)
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_CS_STALL
                               | PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->workaround_bo, 0,
-                               0, 0);
+                               brw->workaround_bo, 0, 0);
 }

-
 /**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
@@ -328,7 +316,106 @@ brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);

   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->workaround_bo, 0, 0, 0);
+                               brw->workaround_bo, 0, 0);
+}
+
+/*
+ * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
+ *
+ *  Write synchronization is a special case of end-of-pipe
+ *  synchronization that requires that the render cache and/or depth
+ *  related caches are flushed to memory, where the data will become
+ *  globally visible. This type of synchronization is required prior to
+ *  SW (CPU) actually reading the result data from memory, or initiating
+ *  an operation that will use as a read surface (such as a texture
+ *  surface) a previous render target and/or depth/stencil buffer
+ *
+ *
+ * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+ *
+ *  Exercising the write cache flush bits (Render Target Cache Flush
+ *  Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
+ *  ensures the write caches are flushed and doesn't guarantee the data
+ *  is globally visible.
+ *
+ *  SW can track the completion of the end-of-pipe-synchronization by
+ *  using "Notify Enable" and "PostSync Operation - Write Immediate
+ *  Data" in the PIPE_CONTROL command. 
+ */
+void
+brw_emit_end_of_pipe_sync(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen >= 6) {
+      /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
+       *
+       *    "The most common action to perform upon reaching a synchronization
+       *    point is to write a value out to memory. An immediate value
+       *    (included with the synchronization command) may be written."
+       *
+       *
+       * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
+       *
+       *    "In case the data flushed out by the render engine is to be read
+       *    back in to the render engine in coherent manner, then the render
+       *    engine has to wait for the fence completion before accessing the
+       *    flushed data. This can be achieved by following means on various
+       *    products: PIPE_CONTROL command with CS Stall and the required
+       *    write caches flushed with Post-Sync-Operation as Write Immediate
+       *    Data.
+       *
+       *    Example:
+       *       - Workload-1 (3D/GPGPU/MEDIA)
+       *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
+       *         Data, Required Write Cache Flush bits set)
+       *       - Workload-2 (Can use the data produce or output by Workload-1)
+       */
+      brw_emit_pipe_control_write(brw,
+                                  flags | PIPE_CONTROL_CS_STALL |
+                                  PIPE_CONTROL_WRITE_IMMEDIATE,
+                                  brw->workaround_bo, 0, 0);
+
+      if (brw->is_haswell) {
+         /* Haswell needs additional work-arounds:
+          *
+          * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+          *
+          *    Option 1:
+          *    PIPE_CONTROL command with the CS Stall and the required write
+          *    caches flushed with Post-SyncOperation as Write Immediate Data
+          *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
+          *    space) commands.
+          *
+          *    Example:
+          *       - Workload-1
+          *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+          *         Immediate Data, Required Write Cache Flush bits set)
+          *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
+          *       - Workload-2 (Can use the data produce or output by
+          *         Workload-1)
+          *
+          * Unfortunately, both the PRMs and the internal docs are a bit
+          * out-of-date in this regard.  What the windows driver does (and
+          * this appears to actually work) is to emit a register read from the
+          * memory address written by the pipe control above.
+          *
+          * What register we load into doesn't matter.  We choose an indirect
+          * rendering register because we know it always exists and it's one
+          * of the first registers the command parser allows us to write.  If
+          * you don't have command parser support in your kernel (pre-4.2),
+          * this will get turned into MI_NOOP and you won't get the
+          * workaround.  Unfortunately, there's just not much we can do in
+          * that case.  This register is perfectly safe to write since we
+          * always re-load all of the indirect draw registers right before
+          * 3DPRIMITIVE when needed anyway.
+          */
+         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE,
+                               brw->workaround_bo,
+                               I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+      }
+   } else {
+      /* On gen4-5, a regular pipe control seems to suffice. */
+      brw_emit_pipe_control_flush(brw, flags);
+   }
 }

 /* Emit a pipelined flush to either flush render and texture cache for
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -97,7 +97,7 @@ brw_write_timestamp(struct brw_context *brw, struct brw_bo *query_bo, int idx)
      flags |= PIPE_CONTROL_CS_STALL;

   brw_emit_pipe_control_write(brw, flags,
-                               query_bo, idx * sizeof(uint64_t), 0, 0);
+                               query_bo, idx * sizeof(uint64_t), 0);
 }

 /**
@@ -112,8 +112,7 @@ brw_write_depth_count(struct brw_context *brw, struct brw_bo *query_bo, int idx)
      flags |= PIPE_CONTROL_CS_STALL;

   brw_emit_pipe_control_write(brw, flags,
-                               query_bo, idx * sizeof(uint64_t),
-                               0, 0);
+                               query_bo, idx * sizeof(uint64_t), 0);
 }

 /**
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -450,8 +450,10 @@ brw_update_sampler_state(struct brw_context *brw,
   /* Enable anisotropic filtering if desired. */
   unsigned max_anisotropy = BRW_ANISORATIO_2;
   if (sampler->MaxAnisotropy > 1.0f) {
-      min_filter = BRW_MAPFILTER_ANISOTROPIC;
-      mag_filter = BRW_MAPFILTER_ANISOTROPIC;
+      if (min_filter == BRW_MAPFILTER_LINEAR)
+         min_filter = BRW_MAPFILTER_ANISOTROPIC;
+      if (mag_filter == BRW_MAPFILTER_LINEAR)
+         mag_filter = BRW_MAPFILTER_ANISOTROPIC;

      if (sampler->MaxAnisotropy > 2.0f) {
 	 max_anisotropy =
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -63,7 +63,7 @@ set_query_availability(struct brw_context *brw, struct brw_query_object *query,
      brw_emit_pipe_control_write(brw,
                                  PIPE_CONTROL_WRITE_IMMEDIATE,
                                  query->bo, 2 * sizeof(uint64_t),
-                                  available, 0);
+                                  available);
   }
 }

--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -513,7 +513,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
    */
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->workaround_bo, 0, 0, 0);
+                               brw->workaround_bo, 0, 0);

   /* Emit 3DSTATE_WM_HZ_OP again to disable the state overrides. */
   BEGIN_BATCH(5);
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -180,7 +180,7 @@ genX(blorp_exec)(struct blorp_batch *batch,
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;
   struct gl_context *ctx = &brw->ctx;
-   const uint32_t estimated_max_batch_usage = GEN_GEN >= 8 ? 1920 : 1500;
+   const uint32_t estimated_max_batch_usage = GEN_GEN >= 8 ? 1920 : 1700;
   bool check_aperture_failed_once = false;

   /* Flush the sampler and render caches.  We definitely need to flush the
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -329,6 +329,7 @@ intel_miptree_blit(struct brw_context *brw,
   intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_slice);
   intel_miptree_resolve_color(brw, src_mt, src_level, src_slice, 1, 0);
   intel_miptree_resolve_color(brw, dst_mt, dst_level, dst_slice, 1, 0);
+   intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_slice);

   if (src_flip)
      src_y = minify(src_mt->physical_height0, src_level - src_mt->first_level) - src_y - height;
@@ -387,6 +388,7 @@ intel_miptree_copy(struct brw_context *brw,
   intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_slice);
   intel_miptree_resolve_color(brw, src_mt, src_level, src_slice, 1, 0);
   intel_miptree_resolve_color(brw, dst_mt, dst_level, dst_slice, 1, 0);
+   intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_slice);

   uint32_t src_image_x, src_image_y;
   intel_miptree_get_image_offset(src_mt, src_level, src_slice,
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -442,13 +442,9 @@ intel_create_renderbuffer(mesa_format format, unsigned num_samples)
   struct intel_renderbuffer *irb;
   struct gl_renderbuffer *rb;

-   GET_CURRENT_CONTEXT(ctx);
-
   irb = CALLOC_STRUCT(intel_renderbuffer);
-   if (!irb) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "creating renderbuffer");
+   if (!irb)
      return NULL;
-   }

   rb = &irb->Base.Base;
   irb->layer_count = 1;
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -1992,7 +1992,7 @@ intel_miptree_slice_resolve(struct brw_context *brw,
   if (!item || item->need != need)
      return false;

-   intel_hiz_exec(brw, mt, level, layer, need);
+   intel_hiz_exec(brw, mt, level, layer, 1, need);
   intel_resolve_map_remove(item);
   return true;
 }
@@ -2028,7 +2028,7 @@ intel_miptree_all_slices_resolve(struct brw_context *brw,
      if (map->need != need)
 	 continue;

-      intel_hiz_exec(brw, mt, map->level, map->layer, need);
+      intel_hiz_exec(brw, mt, map->level, map->layer, 1, need);
      intel_resolve_map_remove(map);
      did_resolve = true;
   }
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -1029,10 +1029,6 @@ intel_miptree_unmap(struct brw_context *brw,
 		    unsigned int level,
 		    unsigned int slice);

-void
-intel_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
-	       unsigned int level, unsigned int layer, enum blorp_hiz_op op);
-
 bool
 intel_miptree_sample_with_hiz(struct brw_context *brw,
                              struct intel_mipmap_tree *mt);
--- a/src/mesa/drivers/x11/Makefile.am
+++ b/src/mesa/drivers/x11/Makefile.am
@@ -76,6 +76,7 @@ lib@GL_LIB@_la_LIBADD = \
 lib@GL_LIB@_la_LDFLAGS = \
 	-no-undefined \
 	-version-number $(GL_MAJOR):$(GL_MINOR):$(GL_PATCH) \
+	$(BSYMBOLIC) \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)

--- a/src/mesa/drivers/x11/glxapi.c
+++ b/src/mesa/drivers/x11/glxapi.c
@@ -379,13 +379,13 @@ glXQueryServerString(Display *dpy, int screen, int name)

 /*** GLX_VERSION_1_2 ***/

+/* declare here to avoid including xmesa.h */
+extern Display *XMesaGetCurrentDisplay(void);
+
 Display PUBLIC *
 glXGetCurrentDisplay(void)
 {
-   /* Same code as in libGL's glxext.c */
-   __GLXcontext *gc = (__GLXcontext *) glXGetCurrentContext();
-   if (NULL == gc) return NULL;
-   return gc->currentDpy;
+   return XMesaGetCurrentDisplay();
 }


--- a/src/mesa/drivers/x11/glxapi.h
+++ b/src/mesa/drivers/x11/glxapi.h
@@ -37,7 +37,6 @@
 * work properly.
 */
 typedef struct __GLXcontextRec {
-   Display *currentDpy;
   GLboolean isDirect;
   GLXDrawable currentDrawable;
   GLXDrawable currentReadable;
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -1304,6 +1304,14 @@ XMesaBuffer XMesaGetCurrentReadBuffer( void )
 }


+Display *XMesaGetCurrentDisplay(void)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   XMesaContext xmctx = XMESA_CONTEXT(ctx);
+   return xmctx ? xmctx->display : NULL;
+}
+
+

 GLboolean XMesaSetFXmode( GLint mode )
 {
--- a/src/mesa/drivers/x11/xmesa.h
+++ b/src/mesa/drivers/x11/xmesa.h
@@ -240,6 +240,12 @@ extern XMesaBuffer XMesaGetCurrentBuffer( void );
 extern XMesaBuffer XMesaGetCurrentReadBuffer( void );


+/*
+ * Return display of current context.
+ */
+extern Display *XMesaGetCurrentDisplay( void );
+
+
 /*
 * Swap the front and back buffers for the given buffer.  No action is
 * taken if the buffer is not double buffered.
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -469,6 +469,8 @@ _mesa_bind_pipeline(struct gl_context *ctx,
    *     considered current."
    */
   if (&ctx->Shader != ctx->_Shader) {
+      FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+
      if (pipe != NULL) {
         /* Bound the pipeline to the current program and
          * restore the pipeline state
@@ -480,8 +482,6 @@ _mesa_bind_pipeline(struct gl_context *ctx,
                                         ctx->Pipeline.Default);
      }

-      FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
-
      for (i = 0; i < MESA_SHADER_STAGES; i++) {
         struct gl_program *prog = ctx->_Shader->CurrentProgram[i];
         if (prog) {
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -40,6 +40,8 @@ set_viewport_no_notify(struct gl_context *ctx, unsigned idx,
                       GLfloat x, GLfloat y,
                       GLfloat width, GLfloat height)
 {
+   FLUSH_VERTICES(ctx, _NEW_VIEWPORT);
+
   /* clamp width and height to the implementation dependent range */
   width  = MIN2(width, (GLfloat) ctx->Const.MaxViewportWidth);
   height = MIN2(height, (GLfloat) ctx->Const.MaxViewportHeight);
@@ -71,7 +73,6 @@ set_viewport_no_notify(struct gl_context *ctx, unsigned idx,
   ctx->ViewportArray[idx].Width = width;
   ctx->ViewportArray[idx].Y = y;
   ctx->ViewportArray[idx].Height = height;
-   ctx->NewState |= _NEW_VIEWPORT;
 }

 struct gl_viewport_inputs {
@@ -240,9 +241,10 @@ set_depth_range_no_notify(struct gl_context *ctx, unsigned idx,
       ctx->ViewportArray[idx].Far == farval)
      return;

+   FLUSH_VERTICES(ctx, _NEW_VIEWPORT);
+
   ctx->ViewportArray[idx].Near = CLAMP(nearval, 0.0, 1.0);
   ctx->ViewportArray[idx].Far = CLAMP(farval, 0.0, 1.0);
-   ctx->NewState |= _NEW_VIEWPORT;
 }

 void
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -261,7 +261,7 @@ static void update_raster_state( struct st_context *st )
         _mesa_geometric_samples(ctx->DrawBuffer) > 1;

   /* _NEW_SCISSOR */
-   raster->scissor = ctx->Scissor.EnableFlags;
+   raster->scissor = !!ctx->Scissor.EnableFlags;

   /* _NEW_FRAG_CLAMP */
   raster->clamp_fragment_color = !st->clamp_frag_color_in_shader &&
--- a/src/mesa/state_tracker/st_shader_cache.c
+++ b/src/mesa/state_tracker/st_shader_cache.c
@@ -22,7 +22,7 @@
 */

 #include <stdio.h>
-
+#include "st_debug.h"
 #include "st_program.h"
 #include "st_shader_cache.h"
 #include "compiler/glsl/program.h"
@@ -383,6 +383,11 @@ st_load_tgsi_from_disk_cache(struct gl_context *ctx,
         _mesa_associate_uniform_storage(ctx, prog, glprog->Parameters,
                                         false);

+         /* Create Gallium shaders now instead of on demand. */
+         if (ST_DEBUG & DEBUG_PRECOMPILE ||
+             st->shader_has_one_variant[glprog->info.stage])
+            st_precompile_shader_variant(st, glprog);
+
         free(buffer);
      } else {
         /* Failed to find a matching cached shader so fallback to recompile.
--- a/src/util/rand_xor.c
+++ b/src/util/rand_xor.c
@@ -25,6 +25,7 @@
 #if defined(__linux__)
 #include <sys/file.h>
 #include <unistd.h>
+#include <fcntl.h>
 #else
 #include <time.h>
 #endif
@@ -1 +1 @@
-17.1.2
+17.1.4