docs: use correct year for the 12.0.6 release notes

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
docs: add sha256 checksums for 12.0.6
2017-01-24 02:05:20 +00:00 · 2017-01-24 02:02:48 +00:00 · 2017-01-24 01:32:02 +00:00 · 2017-01-24 01:28:48 +00:00 · 2017-01-20 22:47:28 +00:00 · 2017-01-20 22:40:51 +00:00
87 changed files with 1802 additions and 616 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -40,7 +40,7 @@ AM_DISTCHECK_CONFIGURE_FLAGS = \
 	--enable-vdpau \
 	--enable-xa \
 	--enable-xvmc \
-	--disable-llvm-shared-libs \
+	--enable-llvm-shared-libs \
 	--with-egl-platforms=x11,wayland,drm,surfaceless \
 	--with-dri-drivers=i915,i965,nouveau,radeon,r200,swrast \
 	--with-gallium-drivers=i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4,virgl,swr \
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-12.0.4
+12.0.6
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -23,3 +23,6 @@ f2b9b0c730e345bcffa9eadabb25af3ab02642f2 i965: Add missing BRW_NEW_FS_PROG_DATA
 # Patches depend on the fence_finish() gallium API change and corresponding driver work
 f240ad98bc05281ea7013d91973cb5f932ae9434 st/mesa: unduplicate st_check_sync code
 b687f766fddb7b39479cd9ee0427984029ea3559 st/mesa: allow multiple concurrent waiters in ClientWaitSync
+
+# Commit was reverted shortly after it landed in master
+a39ad185932eab4f25a0cb2b112c10d8700ef242 configure.ac: honour LLVM_LIBDIR when linking against LLVM
--- a/bin/get-typod-pick-list.sh
+++ b/bin/get-typod-pick-list.sh
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+# Script for generating a list of pick candidates whose nomination line contains a typo
+#
+# Usage examples:
+#
+# $ bin/get-typod-pick-list.sh
+# $ bin/get-typod-pick-list.sh > picklist
+# $ bin/get-typod-pick-list.sh | tee picklist
+
+# NB:
+# This script intentionally _never_ checks for a specific version tag.
+# TODO: consider folding it into the original get-pick-list.sh.
+
+# Save the hashes named in "(cherry picked from commit ...)" lines of this branch to already_picked.
+git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
+	grep "cherry picked from commit" |\
+	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
+
+# Walk origin/master commits not in HEAD whose message has a "CC:" line naming mesa-dev (case-insensitive).
+git log --reverse --pretty=%H -i --grep='^CC:.*mesa-dev' HEAD..origin/master |\
+while read sha
+do
+	# Skip the commit if its hash is listed in the ignore file.
+	if [ -f bin/.cherry-ignore ] ; then
+		if grep -q ^$sha bin/.cherry-ignore ; then
+			continue
+		fi
+	fi
+
+	# Skip the commit if its hash is already recorded in already_picked.
+	if grep -q ^$sha already_picked ; then
+		continue
+	fi
+
+	git log -n1 --pretty=oneline $sha | cat
+done
+
+rm -f already_picked
--- a/docs/relnotes/12.0.4.html
+++ b/docs/relnotes/12.0.4.html
@@ -0,0 +1,321 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.0.4 Release Notes / November 10, 2016</h1>
+
+<p>
+Mesa 12.0.4 is a bug fix release which fixes bugs found since the 12.0.3 release.
+</p>
+<p>
+Mesa 12.0.4 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+22026ce4f1c6a7908b0d10ff057decec0a5633afe7f38a0cef5c08d0689f02a6  mesa-12.0.4.tar.gz
+5d6003da867d3f54e5000b4acdfc37e6cce5b6a4459274fdad73e24bd2f0065e  mesa-12.0.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71759">Bug 71759</a> - Intel driver fails with &quot;intel_do_flush_locked failed: No such file or directory&quot; if buffer imported with EGL_NATIVE_PIXMAP_KHR</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=94354">Bug 94354</a> - R9285 Unigine Valley perf regression since radeonsi: use re-Z</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96770">Bug 96770</a> - include/GL/mesa_glinterop.h:62: error: redefinition of typedef ‘GLXContext’</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97231">Bug 97231</a> - GL_DEPTH_CLAMP doesn't clamp to the far plane</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97233">Bug 97233</a> - vkQuake VkSpecializationMapEntry related bug</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97260">Bug 97260</a> - R9 290 low performance in Linux 4.7</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97549">Bug 97549</a> - [SNB, BXT] up to 40% perf drop from &quot;loader/dri3: Overhaul dri3_update_num_back&quot; commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97887">Bug 97887</a> - llvm segfault in janusvr -render vive</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98025">Bug 98025</a> - [radeonsi] incorrect primitive restart index used</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98134">Bug 98134</a> - dEQP-GLES31.functional.debug.negative_coverage.get_error.buffer.draw_buffers wants a different GL error code</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98326">Bug 98326</a> - [dEQP, EGL] pbuffer depth/stencil tests fail</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Axel Davy (4):</p>
+<ul>
+  <li>gallium/util: Really allow aliasing of dst for u_box_union_*</li>
+  <li>st/nine: Fix the calculation of the number of vs inputs</li>
+  <li>st/nine: Fix mistake in Volume9 UnlockBox</li>
+  <li>st/nine: Fix locking CubeTexture surfaces.</li>
+</ul>
+
+<p>Brendan King (1):</p>
+<ul>
+  <li>configure.ac: fix the name of the Wayland Scanner pc file</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>st/mesa: fix swizzle issue in st_create_sampler_view_from_stobj()</li>
+</ul>
+
+<p>Chad Versace (3):</p>
+<ul>
+  <li>egl: Fix truncation error in _eglParseSyncAttribList64</li>
+  <li>i965/sync: Fix uninitalized usage and leak of mutex</li>
+  <li>egl: Don't advertise unsupported platform extensions</li>
+</ul>
+
+<p>Chuanbo Weng (1):</p>
+<ul>
+  <li>gbm: fix potential NULL deref of mapImage/unmapImage.</li>
+</ul>
+
+<p>Chuck Atkins (1):</p>
+<ul>
+  <li>autoconf: Make header install distinct for various APIs (v2)</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>anv: initialise and increment send_sbc</li>
+  <li>anv/wsi: fix apps that acquire multiple images up front</li>
+  <li>Revert "st/vdpau: use linear layout for output surfaces"</li>
+</ul>
+
+<p>Emil Velikov (12):</p>
+<ul>
+  <li>docs: add sha256 checksums for 12.0.3</li>
+  <li>cherry-ignore: add non-applicable i965 commit</li>
+  <li>cherry-ignore: add vaapi encode fix</li>
+  <li>cherry-ignore: add EGL_KHR_debug fix</li>
+  <li>cherry-ignore: add update_renderbuffer_read_surfaces()</li>
+  <li>isl/gen6: correctly check msaa layout samples count</li>
+  <li>egl/x11: don't crash if dri2_dpy-&gt;conn is NULL</li>
+  <li>get-pick-list.sh: Require explicit "12.0" for nominating stable patches</li>
+  <li>automake: don't forget to pick wglext.h in the tarball</li>
+  <li>cherry-ignore: add N/A EGL revert</li>
+  <li>cherry-ignore: add ClientWaitSync fixes</li>
+  <li>Update version to 12.0.4</li>
+</ul>
+
+<p>Eric Anholt (5):</p>
+<ul>
+  <li>travis: Parse configure.ac to pick an updated LIBDRM_VERSION.</li>
+  <li>travis: Update to the Ubuntu Trusty image.</li>
+  <li>travis: Enable vc4 in libdrm to satisfy vc4 test build dependency.</li>
+  <li>travis: Upgrade LLVM dependency to 3.5 and enable LLVM drivers.</li>
+  <li>gallium: Fix install-gallium-links.mk on non-bash /bin/sh</li>
+</ul>
+
+<p>Hans de Goede (1):</p>
+<ul>
+  <li>pipe_loader_sw: Fix fd leak when instantiated via pipe_loader_sw_probe_kms</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>glsl: Fix cut-and-paste bug in hierarchical visitor ir_expression::accept</li>
+</ul>
+
+<p>Ilia Mirkin (16):</p>
+<ul>
+  <li>nv30: set usage to staging so that the buffer is allocated in GART</li>
+  <li>a3xx: make sure to actually clamp depth as requested</li>
+  <li>a3xx: make use of software clipping when hw can't handle it</li>
+  <li>a3xx: use window scissor to simulate viewport xy clip</li>
+  <li>main: GL_RGB10_A2UI does not come with GL 3.0/EXT_texture_integer</li>
+  <li>mesa/formatquery: limit ES target support, fix core context support</li>
+  <li>nir: fix definition of pack_uvec2_to_uint</li>
+  <li>gm107/ir: AL2P writes to a predicate register</li>
+  <li>st/mesa: fix is_scissor_enabled when X/Y are negative</li>
+  <li>nvc0/ir: fix overwriting of value backing non-constant gather offset</li>
+  <li>nv50/ir: copy over value's register id when resolving merge of a phi</li>
+  <li>nvc0/ir: fix textureGather with a single offset</li>
+  <li>gm107/ir: fix texturing with indirect samplers</li>
+  <li>gm107/ir: fix bit offset of tex lod setting for indirect texturing</li>
+  <li>nv50,nvc0: avoid reading out of bounds when getting bogus so info</li>
+  <li>nv50/ir: process texture offset sources as regular sources</li>
+</ul>
+
+<p>James Legg (1):</p>
+<ul>
+  <li>radeonsi: Fix primitive restart when index changes</li>
+</ul>
+
+<p>Jason Ekstrand (9):</p>
+<ul>
+  <li>nir/spirv: Swap the argument order for AtomicCompareExchange</li>
+  <li>nir/spirv: Use the correct sources for CompareExchange on images</li>
+  <li>nir/spirv: Break variable decoration handling into a helper</li>
+  <li>nir/spirv: Refactor variable deocration handling</li>
+  <li>nir/spirv/cfg: Handle switches whose break block is a loop continue</li>
+  <li>nir/spirv/cfg: Detect switch_break after loop_break/continue</li>
+  <li>nir: Add a nop intrinsic</li>
+  <li>nir/spirv/cfg: Use a nop intrinsic for tagging the ends of blocks</li>
+  <li>intel/blorp: Rework our usage of ralloc when compiling shaders</li>
+</ul>
+
+<p>Jonathan Gray (3):</p>
+<ul>
+  <li>genxml: add generated headers to EXTRA_DIST</li>
+  <li>mapi: automake: set VISIBILITY_CFLAGS for shared glapi</li>
+  <li>mesa: automake: include mesa_glinterop.h in distfile</li>
+</ul>
+
+<p>Julien Isorce (1):</p>
+<ul>
+  <li>st/va: also honors interlaced preference when providing a video format</li>
+</ul>
+
+<p>Kenneth Graunke (8):</p>
+<ul>
+  <li>nir: Call nir_metadata_preserve from nir_lower_alu_to_scalar().</li>
+  <li>mesa: Expose RESET_NOTIFICATION_STRATEGY with KHR_robustness.</li>
+  <li>i965: Fix missing _NEW_TRANSFORM in Gen8+ 3DSTATE_DS atom.</li>
+  <li>i965: Add missing BRW_NEW_VS_PROG_DATA to 3DSTATE_CLIP.</li>
+  <li>i965: Move BRW_NEW_FRAGMENT_PROGRAM from 3DSTATE_PS to PS_EXTRA.</li>
+  <li>i965: Add missing BRW_NEW_CS_PROG_DATA to compute constant atom.</li>
+  <li>i965: Add missing BRW_CS_PROG_DATA to CS work group surface atom.</li>
+  <li>i965: Fix gl_InvocationID in dual object GS where invocations == 1.</li>
+</ul>
+
+<p>Marek Olšák (12):</p>
+<ul>
+  <li>radeonsi: fix cubemaps viewed as 2D</li>
+  <li>radeonsi: take compute shader and dispatch indirect memory usage into account</li>
+  <li>radeonsi: fix FP64 UBO loads with indirect uniform block indexing</li>
+  <li>mesa: fix glGetFramebufferAttachmentParameteriv w/ on-demand FRONT_BACK alloc</li>
+  <li>radeonsi: fix interpolateAt opcodes for .zw components</li>
+  <li>radeonsi: fix texture border colors for compute shaders</li>
+  <li>radeonsi: disable ReZ</li>
+  <li>gallium/radeon: make sure the address of separate CMASK is aligned properly</li>
+  <li>winsys/amdgpu: fix radeon_surf::macro_tile_index for imported textures</li>
+  <li>egl: use util/macros.h</li>
+  <li>egl: make interop ABI visible again</li>
+  <li>glx: make interop ABI visible again</li>
+</ul>
+
+<p>Mario Kleiner (1):</p>
+<ul>
+  <li>glx: Perform check for valid fbconfig against proper X-Screen.</li>
+</ul>
+
+<p>Martin Peres (2):</p>
+<ul>
+  <li>loader/dri3: add get_dri_screen() to the vtable</li>
+  <li>loader/dri3: import prime buffers in the currently-bound screen</li>
+</ul>
+
+<p>Matt Whitlock (5):</p>
+<ul>
+  <li>egl/android: replace call to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>gallium/auxiliary: replace call to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>st/dri: replace calls to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>st/xa: replace call to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>gallium/winsys: replace calls to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+</ul>
+
+<p>Max Staudt (1):</p>
+<ul>
+  <li>r300g: Set R300_VAP_CNTL on RSxxx to avoid triangle flickering</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>loader/dri3: Overhaul dri3_update_num_back</li>
+</ul>
+
+<p>Nicholas Bishop (2):</p>
+<ul>
+  <li>gbm: return appropriate error when queryImage() fails</li>
+  <li>st/dri: check pipe_screen-&gt;resource_get_handle() return value</li>
+</ul>
+
+<p>Nicolai Hähnle (10):</p>
+<ul>
+  <li>gallium/radeon: cleanup and fix branch emits</li>
+  <li>st/glsl_to_tgsi: disable on-the-fly peephole for 64-bit operations</li>
+  <li>st/glsl_to_tgsi: simplify translate_tex_offset</li>
+  <li>st/glsl_to_tgsi: fix textureGatherOffset with indirectly loaded offsets</li>
+  <li>st/mesa: fix vertex elements setup for doubles</li>
+  <li>radeonsi: fix indirect loads of 64 bit constants</li>
+  <li>st/glsl_to_tgsi: fix atomic counter addressing</li>
+  <li>st/glsl_to_tgsi: fix block copies of arrays of doubles</li>
+  <li>st/mesa: only set primitive_restart when the restart index is in range</li>
+  <li>radeonsi: fix 64-bit loads from LDS</li>
+</ul>
+
+<p>Samuel Pitoiset (4):</p>
+<ul>
+  <li>nvc0/ir: fix subops for IMAD</li>
+  <li>gk110/ir: fix wrong emission of OP_NOT</li>
+  <li>nvc0: use correct bufctx when invalidating CP textures</li>
+  <li>nvc0/ir: fix emission of IMAD with NEG modifiers</li>
+</ul>
+
+<p>Stencel, Joanna (1):</p>
+<ul>
+  <li>egl/wayland: add missing destroy_window callback</li>
+</ul>
+
+<p>Tapani Pälli (5):</p>
+<ul>
+  <li>egl: stop claiming support for pbuffer + msaa</li>
+  <li>egl/dri2: set max values for pbuffer width and height</li>
+  <li>egl: add check that eglCreateContext gets a valid config</li>
+  <li>mesa: fix error handling in DrawBuffers</li>
+  <li>egl: set preserved behavior for surface only if config supports it</li>
+</ul>
+
+<p>Tim Rowley (1):</p>
+<ul>
+  <li>configure.ac: add llvm inteljitevents component if enabled</li>
+</ul>
+
+<p>Vedran Miletić (1):</p>
+<ul>
+  <li>clover: Fix build against clang SVN &gt;= r273191</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>Revert "mesa_glinterop: remove inclusion of GLX header"</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/12.0.5.html
+++ b/docs/relnotes/12.0.5.html
@@ -0,0 +1,138 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.0.5 Release Notes / December 5, 2016</h1>
+
+<p>
+Mesa 12.0.5 is a bug fix release which fixes bugs found since the 12.0.4 release.
+</p>
+<p>
+Mesa 12.0.5 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+44d08a27d98bfeacd864381189e434d98afbf451689d01f80380dc1d66450e5b  mesa-12.0.5.tar.gz
+2b0a972d8282860a11291c09c3ef01ac45171405951eb21a83c45ed2b4321924  mesa-12.0.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=77662">Bug 77662</a> - Fail to render to different faces of depth-stencil cube map</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97779">Bug 97779</a> - [regression, bisected][BDW, GPU hang] stuck on render ring, always reproducible</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98415">Bug 98415</a> - Vulkan Driver JSON file contains incorrect field</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Adam Jackson (2):</p>
+<ul>
+  <li>glx/glvnd: Don't modify the dummy slot in the dispatch table</li>
+  <li>glx/glvnd: Fix dispatch function names and indices</li>
+</ul>
+
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>i965: Fix GPU hang related to multiple render targets and alpha testing</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add release notes for 12.0.4</li>
+  <li>docs: add sha256 checksums for 12.0.4</li>
+  <li>cherry-ignore: add reverted LLVM_LIBDIR patch</li>
+  <li>Update version to 12.0.5</li>
+</ul>
+
+<p>Haixia Shi (1):</p>
+<ul>
+  <li>mesa: change state query return value for RGB565</li>
+</ul>
+
+<p>Jason Ekstrand (3):</p>
+<ul>
+  <li>i965/fs/generator: Don't use the address immediate for MOV_INDIRECT</li>
+  <li>anv/cmd_buffer: Take a command buffer instead of a batch in two helpers</li>
+  <li>anv/cmd_buffer: Enable a CS stall workaround for Sky Lake gt4</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>intel: Fix pixel shader scratch space allocation on Gen9+ platforms.</li>
+</ul>
+
+<p>Marek Olšák (13):</p>
+<ul>
+  <li>gallium/radeon: fix behavior of GLSL findLSB(0)</li>
+  <li>gallium/radeon: make sure HTILE address is aligned properly</li>
+  <li>radeonsi: fix an assertion failure in si_decompress_sampler_color_textures</li>
+  <li>gallium/radeon: unify viewport emission code</li>
+  <li>gallium/radeon: set VPORT_ZMIN/MAX registers correctly</li>
+  <li>radeonsi: fix gl_PatchVerticesIn for tessellation evaluation shader</li>
+  <li>radeonsi: fix a crash in imageSize for cubemap arrays</li>
+  <li>radeonsi: emit TA_CS_BC_BASE_ADDR on SI only if the kernel allows it</li>
+  <li>gallium/radeon: add support for sharing textures with DCC between processes</li>
+  <li>radeonsi: always set all blend registers</li>
+  <li>radeonsi: set CB_BLEND1_CONTROL.ENABLE for dual source blending</li>
+  <li>radeonsi: disable RB+ blend optimizations for dual source blending</li>
+  <li>radeonsi: silence runtime warnings with LLVM 3.9</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>anv: Replace "abi_versions" with correct "api_version".</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>mesa/fbobject: Update CubeMapFace when reusing textures</li>
+</ul>
+
+<p>Steinar H. Gunderson (1):</p>
+<ul>
+  <li>Fix races during _mesa_HashWalk().</li>
+</ul>
+
+<p>Tim Rowley (3):</p>
+<ul>
+  <li>swr: [rasterizer jitter] cleanup supporting different llvm versions</li>
+  <li>swr: [rasterizer jitter] fix llvm-3.7 compile</li>
+  <li>swr: [rasterizer] add support for llvm-3.9</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/12.0.6.html
+++ b/docs/relnotes/12.0.6.html
@@ -0,0 +1,148 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.0.6 Release Notes / January 23, 2017</h1>
+
+<p>
+Mesa 12.0.6 is a bug fix release which fixes bugs found since the 12.0.5 release.
+</p>
+<p>
+Mesa 12.0.6 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+65339ba5d76a45225b8b56f9a1da9db15c569e1d163760faa2921da0a8461741  mesa-12.0.6.tar.gz
+7d6da9744c1022a4c2ab6ad01a206984d00443fb691568011d01b3dd97e36448  mesa-12.0.6.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92234">Bug 92234</a> - [BDW] GPU hang in Shogun2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95130">Bug 95130</a> - Derivatives of gl_Color wrong when helper pixels used</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98329">Bug 98329</a> - [dEQP, EGL, SKL, BDW, BSW] dEQP-EGL.functional.image.render_multiple_contexts.gles2_renderbuffer_depth16_depth_buffer</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99030">Bug 99030</a> - [HSW, regression] transform feedback fails on Linux 4.8</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99354">Bug 99354</a> - [G71] &quot;Assertion `bkref' failed&quot; reproducible with glmark2</li>
+
+</ul>
+
+<h2>Changes</h2>
+
+<p>Chad Versace (3):</p>
+<ul>
+  <li>i965/mt: Disable aux surfaces after making miptree shareable</li>
+  <li>i965/mt: Disable HiZ when sharing depth buffer externally (v2)</li>
+  <li>anv: Handle vkGetPhysicalDeviceQueueFamilyProperties with count == 0</li>
+</ul>
+
+<p>Emil Velikov (5):</p>
+<ul>
+  <li>docs: add sha256 checksums for 12.0.5</li>
+  <li>get-typod-pick-list.sh: add new script</li>
+  <li>automake: use shared llvm libs for make distcheck</li>
+  <li>egl/wayland: use the destroy_window_callback for swrast</li>
+  <li>Update version to 12.0.6</li>
+</ul>
+
+<p>Fredrik Höglund (1):</p>
+<ul>
+  <li>dri3: Fix MakeCurrent without a default framebuffer</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nouveau: take extra push space into account for pushbuf_space calls</li>
+</ul>
+
+<p>Jason Ekstrand (19):</p>
+<ul>
+  <li>spirv/nir: Fix some texture opcode asserts</li>
+  <li>spirv/nir: Add support for shadow samplers that return vec4</li>
+  <li>spirv/nir: Properly handle gather components</li>
+  <li>anv/pipeline: Set binding_table.gather_texture_start</li>
+  <li>nir: Add a helper for determining the type of a texture source</li>
+  <li>nir/lower_tex: Add some helpers for working with tex sources</li>
+  <li>nir/lower_tex: Add support for lowering coordinate offsets</li>
+  <li>i965/nir: Enable NIR lowering of txf and rect offsets</li>
+  <li>i965: Get rid of the do_lower_unnormalized_offsets pass</li>
+  <li>spirv/nir: Don't increment coord_components for array lod queries</li>
+  <li>anv/image: Assert that the image format is actually supported</li>
+  <li>spirv/nir: Move opcode selection higher up in handle_texture</li>
+  <li>spirv/nir: Refactor type handling in handle_texture</li>
+  <li>nir/spirv: Refactor coordinate handling in handle_texture</li>
+  <li>spirv/nir: Handle texture projectors</li>
+  <li>spirv/nir: Add support for ImageQuerySamples</li>
+  <li>anv/device: Return the right error for failed maps</li>
+  <li>anv/device: Implicitly unmap memory objects in FreeMemory</li>
+  <li>anv/descriptor_set: Write the state offset in the surface state free list.</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>spirv: Move cursor before calling vtn_ssa_value() in phi 2nd pass.</li>
+  <li>i965: Properly flush in hsw_pause_transform_feedback().</li>
+</ul>
+
+<p>Marek Olšák (6):</p>
+<ul>
+  <li>cso: don't release sampler states that are bound</li>
+  <li>radeonsi: always restore sampler states when unbinding sampler views</li>
+  <li>radeonsi: fix incorrect FMASK checking in bind_sampler_states</li>
+  <li>radeonsi: disable CE on SI + AMDGPU</li>
+  <li>radeonsi: disable the constant engine (CE) on Carrizo and Stoney</li>
+  <li>gallium/radeon: fix the draw-calls HUD query</li>
+</ul>
+
+<p>Matt Turner (3):</p>
+<ul>
+  <li>i965/fs: Rename opt_copy_propagate -&gt; opt_copy_propagation.</li>
+  <li>i965/fs: Add unit tests for copy propagation pass.</li>
+  <li>i965/fs: Reject copy propagation into SEL if not min/max.</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>cso: Don't restore nr_samplers in cso_restore_fragment_samplers</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>radeonsi: enable WQM in PS prolog when needed</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1234,6 +1234,50 @@ nir_tex_instr_is_query(nir_tex_instr *instr)
   }
 }

+static inline nir_alu_type
+nir_tex_instr_src_type(nir_tex_instr *instr, unsigned src)
+{  /* Returns nir_type_int or nir_type_float for source number src of instr. */
+   switch (instr->src[src].src_type) {
+   case nir_tex_src_coord:  /* coordinate type depends on the texture op */
+      switch (instr->op) {
+      case nir_texop_txf:
+      case nir_texop_txf_ms:
+      case nir_texop_txf_ms_mcs:
+      case nir_texop_samples_identical:
+         return nir_type_int;  /* these ops take integer coordinates */
+
+      default:
+         return nir_type_float;
+      }
+
+   case nir_tex_src_lod:  /* LOD is an integer for txs and txf, a float otherwise */
+      switch (instr->op) {
+      case nir_texop_txs:
+      case nir_texop_txf:
+         return nir_type_int;
+
+      default:
+         return nir_type_float;
+      }
+
+   case nir_tex_src_projector:
+   case nir_tex_src_comparitor:
+   case nir_tex_src_bias:
+   case nir_tex_src_ddx:
+   case nir_tex_src_ddy:
+      return nir_type_float;  /* always float, whatever the op */
+
+   case nir_tex_src_offset:
+   case nir_tex_src_ms_index:
+   case nir_tex_src_texture_offset:
+   case nir_tex_src_sampler_offset:
+      return nir_type_int;  /* always integer, whatever the op */
+
+   default:
+      unreachable("Invalid texture source type");
+   }
+}
+
 static inline unsigned
 nir_tex_instr_src_size(nir_tex_instr *instr, unsigned src)
 {
@@ -2344,6 +2388,16 @@ typedef struct nir_lower_tex_options {
    */
   unsigned lower_txp;

+   /**
+    * If true, lower away nir_tex_src_offset for all texelfetch instructions.
+    */
+   bool lower_txf_offset;
+
+   /**
+    * If true, lower away nir_tex_src_offset for all rect textures.
+    */
+   bool lower_rect_offset;
+
   /**
    * If true, lower rect textures to 2D, using txs to fetch the
    * texture dimensions and dividing the texture coords by the
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -38,16 +38,39 @@
 #include "nir.h"
 #include "nir_builder.h"

+static int
+tex_instr_find_src(nir_tex_instr *tex, nir_tex_src_type src_type)
+{  /* Linear search of tex->src[] for a source whose src_type matches. */
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      if (tex->src[i].src_type == src_type)
+         return i;  /* index of the first match */
+   }
+
+   return -1;  /* tex has no source of this type */
+}
+
+static void
+tex_instr_remove_src(nir_tex_instr *tex, unsigned src_idx)
+{
+   assert(src_idx < tex->num_srcs);
+
+   /* Remove source src_idx from tex: first rewrite that slot to NIR_SRC_INIT */
+   nir_instr_rewrite_src(&tex->instr, &tex->src[src_idx].src, NIR_SRC_INIT);
+
+   /* Then shift every later source (type and value) down one slot to close the gap */
+   for (unsigned i = src_idx + 1; i < tex->num_srcs; i++) {
+      tex->src[i-1].src_type = tex->src[i].src_type;
+      nir_instr_move_src(&tex->instr, &tex->src[i-1].src, &tex->src[i].src);
+   }
+   tex->num_srcs--;  /* the instruction now has one fewer source */
+}
+
 static void
 project_src(nir_builder *b, nir_tex_instr *tex)
 {
   /* Find the projector in the srcs list, if present. */
-   unsigned proj_index;
-   for (proj_index = 0; proj_index < tex->num_srcs; proj_index++) {
-      if (tex->src[proj_index].src_type == nir_tex_src_projector)
-         break;
-   }
-   if (proj_index == tex->num_srcs)
+   int proj_index = tex_instr_find_src(tex, nir_tex_src_projector);
+   if (proj_index < 0)
      return;

   b->cursor = nir_before_instr(&tex->instr);
@@ -102,18 +125,57 @@ project_src(nir_builder *b, nir_tex_instr *tex)
                            nir_src_for_ssa(projected));
   }

-   /* Now move the later tex sources down the array so that the projector
-    * disappears.
-    */
-   nir_instr_rewrite_src(&tex->instr, &tex->src[proj_index].src,
-                         NIR_SRC_INIT);
-   for (unsigned i = proj_index + 1; i < tex->num_srcs; i++) {
-      tex->src[i-1].src_type = tex->src[i].src_type;
-      nir_instr_move_src(&tex->instr, &tex->src[i-1].src, &tex->src[i].src);
-   }
-   tex->num_srcs--;
+   tex_instr_remove_src(tex, proj_index);
 }

+static bool
+lower_offset(nir_builder *b, nir_tex_instr *tex)
+{  /* Folds the nir_tex_src_offset source of tex into its coordinate; returns true if tex changed. */
+   int offset_index = tex_instr_find_src(tex, nir_tex_src_offset);
+   if (offset_index < 0)
+      return false;  /* no offset source, nothing to lower */
+
+   int coord_index = tex_instr_find_src(tex, nir_tex_src_coord);
+   assert(coord_index >= 0);
+
+   assert(tex->src[offset_index].src.is_ssa);
+   assert(tex->src[coord_index].src.is_ssa);
+   nir_ssa_def *offset = tex->src[offset_index].src.ssa;
+   nir_ssa_def *coord = tex->src[coord_index].src.ssa;
+
+   b->cursor = nir_before_instr(&tex->instr);  /* builder cursor sits just before tex */
+
+   nir_ssa_def *offset_coord;
+   if (nir_tex_instr_src_type(tex, coord_index) == nir_type_float) {
+      assert(tex->sampler_dim == GLSL_SAMPLER_DIM_RECT);  /* float coords are only expected for rect textures here */
+      offset_coord = nir_fadd(b, coord, nir_i2f(b, offset));  /* offset goes through nir_i2f before the float add */
+   } else {
+      offset_coord = nir_iadd(b, coord, offset);
+   }
+
+   if (tex->is_array) {
+      /* The offset is not applied to the array index: the last channel is taken from the original coord */
+      if (tex->coord_components == 2) {
+         offset_coord = nir_vec2(b, nir_channel(b, offset_coord, 0),
+                                    nir_channel(b, coord, 1));
+      } else if (tex->coord_components == 3) {
+         offset_coord = nir_vec3(b, nir_channel(b, offset_coord, 0),
+                                    nir_channel(b, offset_coord, 1),
+                                    nir_channel(b, coord, 2));
+      } else {
+         unreachable("Invalid number of components");
+      }
+   }
+
+   nir_instr_rewrite_src(&tex->instr, &tex->src[coord_index].src,
+                         nir_src_for_ssa(offset_coord));  /* coordinate source now carries the offset */
+
+   tex_instr_remove_src(tex, offset_index);
+
+   return true;  /* tex was modified */
+}
+
+
 static nir_ssa_def *
 get_texture_size(nir_builder *b, nir_tex_instr *tex)
 {
@@ -444,6 +506,12 @@ nir_lower_tex_block(nir_block *block, nir_builder *b,
         progress = true;
      }

+      if ((tex->op == nir_texop_txf && options->lower_txf_offset) ||
+          (tex->sampler_dim == GLSL_SAMPLER_DIM_RECT &&
+           options->lower_rect_offset)) {
+         progress = lower_offset(b, tex) || progress;
+      }
+
      if ((tex->sampler_dim == GLSL_SAMPLER_DIM_RECT) && options->lower_rect) {
         lower_rect(b, tex);
         progress = true;
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -1335,54 +1335,9 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
   } else {
      image_type = sampled.sampler->var->var->interface_type;
   }
-
-   nir_tex_src srcs[8]; /* 8 should be enough */
-   nir_tex_src *p = srcs;
-
-   unsigned idx = 4;
-
-   bool has_coord = false;
-   switch (opcode) {
-   case SpvOpImageSampleImplicitLod:
-   case SpvOpImageSampleExplicitLod:
-   case SpvOpImageSampleDrefImplicitLod:
-   case SpvOpImageSampleDrefExplicitLod:
-   case SpvOpImageSampleProjImplicitLod:
-   case SpvOpImageSampleProjExplicitLod:
-   case SpvOpImageSampleProjDrefImplicitLod:
-   case SpvOpImageSampleProjDrefExplicitLod:
-   case SpvOpImageFetch:
-   case SpvOpImageGather:
-   case SpvOpImageDrefGather:
-   case SpvOpImageQueryLod: {
-      /* All these types have the coordinate as their first real argument */
-      struct vtn_ssa_value *coord = vtn_ssa_value(b, w[idx++]);
-      has_coord = true;
-      p->src = nir_src_for_ssa(coord->def);
-      p->src_type = nir_tex_src_coord;
-      p++;
-      break;
-   }
-
-   default:
-      break;
-   }
-
-   /* These all have an explicit depth value as their next source */
-   switch (opcode) {
-   case SpvOpImageSampleDrefImplicitLod:
-   case SpvOpImageSampleDrefExplicitLod:
-   case SpvOpImageSampleProjDrefImplicitLod:
-   case SpvOpImageSampleProjDrefExplicitLod:
-      (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_comparitor);
-      break;
-   default:
-      break;
-   }
-
-   /* For OpImageQuerySizeLod, we always have an LOD */
-   if (opcode == SpvOpImageQuerySizeLod)
-      (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_lod);
+   const enum glsl_sampler_dim sampler_dim = glsl_get_sampler_dim(image_type);
+   const bool is_array = glsl_sampler_type_is_array(image_type);
+   const bool is_shadow = glsl_sampler_type_is_shadow(image_type);

   /* Figure out the base texture operation */
   nir_texop texop;
@@ -1428,10 +1383,108 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
      break;

   case SpvOpImageQuerySamples:
+      texop = nir_texop_texture_samples;
+      break;
+
   default:
      unreachable("Unhandled opcode");
   }

+   nir_tex_src srcs[8]; /* 8 should be enough */
+   nir_tex_src *p = srcs;
+
+   unsigned idx = 4;
+
+   struct nir_ssa_def *coord;
+   unsigned coord_components;
+   switch (opcode) {
+   case SpvOpImageSampleImplicitLod:
+   case SpvOpImageSampleExplicitLod:
+   case SpvOpImageSampleDrefImplicitLod:
+   case SpvOpImageSampleDrefExplicitLod:
+   case SpvOpImageSampleProjImplicitLod:
+   case SpvOpImageSampleProjExplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+   case SpvOpImageFetch:
+   case SpvOpImageGather:
+   case SpvOpImageDrefGather:
+   case SpvOpImageQueryLod: {
+      /* All these types have the coordinate as their first real argument */
+      switch (sampler_dim) { /* coordinate count is implied by the sampler dim */
+      case GLSL_SAMPLER_DIM_1D:
+      case GLSL_SAMPLER_DIM_BUF:
+         coord_components = 1;
+         break;
+      case GLSL_SAMPLER_DIM_2D:
+      case GLSL_SAMPLER_DIM_RECT:
+      case GLSL_SAMPLER_DIM_MS:
+         coord_components = 2;
+         break;
+      case GLSL_SAMPLER_DIM_3D:
+      case GLSL_SAMPLER_DIM_CUBE:
+         coord_components = 3;
+         break;
+      default: /* assert() on a string literal never fires; trap explicitly */
+         unreachable("Invalid sampler type");
+      }
+
+      if (is_array && texop != nir_texop_lod)
+         coord_components++;
+
+      coord = vtn_ssa_value(b, w[idx++])->def;
+      p->src = nir_src_for_ssa(coord);
+      p->src_type = nir_tex_src_coord;
+      p++;
+      break;
+   }
+
+   default:
+      coord = NULL;
+      coord_components = 0;
+      break;
+   }
+
+   switch (opcode) {
+   case SpvOpImageSampleProjImplicitLod:
+   case SpvOpImageSampleProjExplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+      /* These have the projector as the last coordinate component */
+      p->src = nir_src_for_ssa(nir_channel(&b->nb, coord, coord_components));
+      p->src_type = nir_tex_src_projector;
+      p++;
+      break;
+
+   default:
+      break;
+   }
+
+   unsigned gather_component = 0;
+   switch (opcode) {
+   case SpvOpImageSampleDrefImplicitLod:
+   case SpvOpImageSampleDrefExplicitLod:
+   case SpvOpImageSampleProjDrefImplicitLod:
+   case SpvOpImageSampleProjDrefExplicitLod:
+   case SpvOpImageDrefGather:
+      /* These all have an explicit depth value as their next source */
+      (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_comparitor);
+      break;
+
+   case SpvOpImageGather:
+      /* This has a component as its next source */
+      gather_component =
+         vtn_value(b, w[idx++], vtn_value_type_constant)->constant->value.u[0];
+      break;
+
+   default:
+      break;
+   }
+
+   /* For OpImageQuerySizeLod, we always have an LOD */
+   if (opcode == SpvOpImageQuerySizeLod)
+      (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_lod);
+
   /* Now we need to handle some number of optional arguments */
   if (idx < count) {
      uint32_t operands = w[idx++];
@@ -1444,12 +1497,12 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,

      if (operands & SpvImageOperandsLodMask) {
         assert(texop == nir_texop_txl || texop == nir_texop_txf ||
-                texop == nir_texop_txf_ms || texop == nir_texop_txs);
+                texop == nir_texop_txs);
         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_lod);
      }

      if (operands & SpvImageOperandsGradMask) {
-         assert(texop == nir_texop_tex);
+         assert(texop == nir_texop_txl);
         texop = nir_texop_txd;
         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ddx);
         (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ddy);
@@ -1476,35 +1529,13 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,

   memcpy(instr->src, srcs, instr->num_srcs * sizeof(*instr->src));

-   instr->sampler_dim = glsl_get_sampler_dim(image_type);
-   instr->is_array = glsl_sampler_type_is_array(image_type);
-   instr->is_shadow = glsl_sampler_type_is_shadow(image_type);
-   instr->is_new_style_shadow = instr->is_shadow;
-
-   if (has_coord) {
-      switch (instr->sampler_dim) {
-      case GLSL_SAMPLER_DIM_1D:
-      case GLSL_SAMPLER_DIM_BUF:
-         instr->coord_components = 1;
-         break;
-      case GLSL_SAMPLER_DIM_2D:
-      case GLSL_SAMPLER_DIM_RECT:
-      case GLSL_SAMPLER_DIM_MS:
-         instr->coord_components = 2;
-         break;
-      case GLSL_SAMPLER_DIM_3D:
-      case GLSL_SAMPLER_DIM_CUBE:
-         instr->coord_components = 3;
-         break;
-      default:
-         assert("Invalid sampler type");
-      }
-
-      if (instr->is_array)
-         instr->coord_components++;
-   } else {
-      instr->coord_components = 0;
-   }
+   instr->coord_components = coord_components;
+   instr->sampler_dim = sampler_dim;
+   instr->is_array = is_array;
+   instr->is_shadow = is_shadow;
+   instr->is_new_style_shadow =
+      is_shadow && glsl_get_components(ret_type->type) == 1;
+   instr->component = gather_component;

   switch (glsl_get_sampler_result_type(image_type)) {
   case GLSL_TYPE_FLOAT:   instr->dest_type = nir_type_float;     break;
--- a/src/compiler/spirv/vtn_cfg.c
+++ b/src/compiler/spirv/vtn_cfg.c
@@ -527,12 +527,13 @@ vtn_handle_phi_second_pass(struct vtn_builder *b, SpvOp opcode,
   nir_variable *phi_var = phi_entry->data;

   for (unsigned i = 3; i < count; i += 2) {
-      struct vtn_ssa_value *src = vtn_ssa_value(b, w[i]);
      struct vtn_block *pred =
         vtn_value(b, w[i + 1], vtn_value_type_block)->block;

      b->nb.cursor = nir_after_instr(&pred->end_nop->instr);

+      struct vtn_ssa_value *src = vtn_ssa_value(b, w[i]);
+
      vtn_local_store(b, src, nir_deref_var_create(b, phi_var));
   }

--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -1706,6 +1706,8 @@ dri2_wl_swrast_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
      dri2_surf->format = WL_SHM_FORMAT_ARGB8888;

   dri2_surf->wl_win = window;
+   dri2_surf->wl_win->private = dri2_surf;
+   dri2_surf->wl_win->destroy_window_callback = destroy_window_callback;

   dri2_surf->base.Width = -1;
   dri2_surf->base.Height = -1;
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -188,7 +188,9 @@ cso_insert_state(struct cso_cache *sc,
                 void *state)
 {
   struct cso_hash *hash = _cso_hash_for_type(sc, type);
-   sanitize_hash(sc, hash, type, sc->max_size);
+
+   if (type != CSO_SAMPLER)
+      sanitize_hash(sc, hash, type, sc->max_size);

   return cso_hash_insert(hash, hash_key, state);
 }
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -1268,7 +1268,6 @@ cso_restore_fragment_samplers(struct cso_context *ctx)
 {
   struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];

-   info->nr_samplers = ctx->nr_fragment_samplers_saved;
   memcpy(info->samplers, ctx->fragment_samplers_saved,
          sizeof(info->samplers));
   cso_single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
--- a/src/gallium/drivers/nouveau/nouveau_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_video.c
@@ -73,7 +73,7 @@ nouveau_vpe_fini(struct nouveau_decoder *dec) {
   if (!dec->cmds)
      return;

-   nouveau_pushbuf_space(push, 8, 2, 0);
+   nouveau_pushbuf_space(push, 16, 2, 0);
   nouveau_bufctx_reset(dec->bufctx, NV31_VIDEO_BIND_CMD);

 #define BCTX_ARGS dec->bufctx, NV31_VIDEO_BIND_CMD, NOUVEAU_BO_RD
--- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -127,7 +127,7 @@ nv30_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps,

   refn.bo = mt->base.bo;
   refn.flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
-   if (nouveau_pushbuf_space(push, 16, 1, 0) ||
+   if (nouveau_pushbuf_space(push, 32, 1, 0) ||
       nouveau_pushbuf_refn (push, &refn, 1))
      return;

--- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
@@ -431,7 +431,7 @@ nv30_transfer_rect_sifm(XFER_ARGS)
      si_arg |= NV03_SIFM_FORMAT_FILTER_BILINEAR;
   }

-   if (nouveau_pushbuf_space(push, 32, 6, 0) ||
+   if (nouveau_pushbuf_space(push, 64, 6, 0) ||
       nouveau_pushbuf_refn (push, refs, 2))
      return;

@@ -516,7 +516,7 @@ nv30_transfer_rect_m2mf(XFER_ARGS)
   while (h) {
      unsigned lines = (h > 2047) ? 2047 : h;

-      if (nouveau_pushbuf_space(push, 13, 2, 0) ||
+      if (nouveau_pushbuf_space(push, 32, 2, 0) ||
          nouveau_pushbuf_refn (push, refs, 2))
         return;

@@ -709,7 +709,7 @@ nv30_transfer_copy_data(struct nouveau_context *nv,
      lines  = (pages > 2047) ? 2047 : pages;
      pages -= lines;

-      if (nouveau_pushbuf_space(push, 13, 2, 0) ||
+      if (nouveau_pushbuf_space(push, 32, 2, 0) ||
          nouveau_pushbuf_refn (push, refs, 2))
         return;

@@ -733,7 +733,7 @@ nv30_transfer_copy_data(struct nouveau_context *nv,
   }

   if (size) {
-      if (nouveau_pushbuf_space(push, 13, 2, 0) ||
+      if (nouveau_pushbuf_space(push, 32, 2, 0) ||
          nouveau_pushbuf_refn (push, refs, 2))
         return;

--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -294,7 +294,7 @@ nv50_clear_render_target(struct pipe_context *pipe,
   PUSH_DATAf(push, color->f[2]);
   PUSH_DATAf(push, color->f[3]);

-   if (nouveau_pushbuf_space(push, 32 + sf->depth, 1, 0))
+   if (nouveau_pushbuf_space(push, 64 + sf->depth, 1, 0))
      return;

   PUSH_REFN(push, bo, mt->base.domain | NOUVEAU_BO_WR);
@@ -388,7 +388,7 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
      mode |= NV50_3D_CLEAR_BUFFERS_S;
   }

-   if (nouveau_pushbuf_space(push, 32 + sf->depth, 1, 0))
+   if (nouveau_pushbuf_space(push, 64 + sf->depth, 1, 0))
      return;

   PUSH_REFN(push, bo, mt->base.domain | NOUVEAU_BO_WR);
@@ -742,7 +742,7 @@ nv50_clear_buffer(struct pipe_context *pipe,
   PUSH_DATAf(push, color.f[2]);
   PUSH_DATAf(push, color.f[3]);

-   if (nouveau_pushbuf_space(push, 32, 1, 0))
+   if (nouveau_pushbuf_space(push, 64, 1, 0))
      return;

   PUSH_REFN(push, buf->bo, buf->domain | NOUVEAU_BO_WR);
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
         BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
         PUSH_DATA (push, prim);

-         nouveau_pushbuf_space(push, 8, 0, 1);
+         nouveau_pushbuf_space(push, 16, 0, 1);
         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);

         switch (index_size) {
--- a/src/gallium/drivers/nouveau/nv50/nv98_video.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video.c
@@ -273,7 +273,7 @@ nv98_create_decoder(struct pipe_context *context,
   dec->comm = (struct comm *)(dec->fence_map + (COMM_OFFSET/sizeof(*dec->fence_map)));

   /* So lets test if the fence is working? */
-   nouveau_pushbuf_space(push[0], 6, 1, 0);
+   nouveau_pushbuf_space(push[0], 16, 1, 0);
   PUSH_REFN (push[0], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
   BEGIN_NV04(push[0], SUBC_BSP(0x240), 3);
   PUSH_DATAh(push[0], dec->fence_bo->offset);
@@ -284,7 +284,7 @@ nv98_create_decoder(struct pipe_context *context,
   PUSH_DATA (push[0], 0);
   PUSH_KICK (push[0]);

-   nouveau_pushbuf_space(push[1], 6, 1, 0);
+   nouveau_pushbuf_space(push[1], 16, 1, 0);
   PUSH_REFN (push[1], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
   BEGIN_NV04(push[1], SUBC_VP(0x240), 3);
   PUSH_DATAh(push[1], (dec->fence_bo->offset + 0x10));
@@ -295,7 +295,7 @@ nv98_create_decoder(struct pipe_context *context,
   PUSH_DATA (push[1], 0);
   PUSH_KICK (push[1]);

-   nouveau_pushbuf_space(push[2], 6, 1, 0);
+   nouveau_pushbuf_space(push[2], 16, 1, 0);
   PUSH_REFN (push[2], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
   BEGIN_NV04(push[2], SUBC_PPP(0x240), 3);
   PUSH_DATAh(push[2], (dec->fence_bo->offset + 0x20));
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -47,7 +47,6 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   int ret;
   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   unsigned fence_extra = 0;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
@@ -61,10 +60,6 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   if (!dec->bitplane_bo)
      num_refs--;

-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
   bsp_size = NOUVEAU_VP3_BSP_RESERVED_SIZE;
   for (i = 0; i < num_buffers; i++)
      bsp_size += num_bytes[i];
@@ -112,7 +107,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,

   nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);

-   nouveau_pushbuf_space(push, 6 + (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC ? 9 : 8) + fence_extra + 2, num_refs, 0);
+   nouveau_pushbuf_space(push, 32, num_refs, 0);
   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   bsp_addr = bsp_bo->offset >> 8;
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_ppp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_ppp.c
@@ -93,13 +93,8 @@ nv98_decoder_ppp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, struct n
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   struct nouveau_pushbuf *push = dec->pushbuf[2];
   unsigned ppp_caps = 0x10;
-   unsigned fence_extra = 0;

-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
-   nouveau_pushbuf_space(push, 11 + (codec == PIPE_VIDEO_FORMAT_VC1 ? 2 : 0) + 3 + fence_extra + 2, 4, 0);
+   nouveau_pushbuf_space(push, 32, 4, 0);

   switch (codec) {
   case PIPE_VIDEO_FORMAT_MPEG12: {
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
@@ -76,7 +76,7 @@ nv98_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   u32 fence_extra = 0, codec_extra = 0;
+   u32 codec_extra = 0;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
      { dec->ref_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
@@ -88,10 +88,6 @@ nv98_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   };
   int num_refs = ARRAY_SIZE(bo_refs) - !dec->fw_bo;

-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
   if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
      nouveau_vp3_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
      codec_extra += 2;
@@ -115,8 +111,7 @@ nv98_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   if (!is_ref && (dec->refs[target->valid_ref].decoded_top && dec->refs[target->valid_ref].decoded_bottom))
      nv98_decoder_kick_ref(dec, target);

-   nouveau_pushbuf_space(push, 8 + 3 * (codec != PIPE_VIDEO_FORMAT_MPEG12) +
-              6 + codec_extra + fence_extra + 2, num_refs, 0);
+   nouveau_pushbuf_space(push, 32 + codec_extra, num_refs, 0);

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -403,7 +403,7 @@ nvc0_hw_get_query_result_resource(struct nvc0_context *nvc0,
   if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
      nvc0_hw_query_fifo_wait(nvc0, q);

-   nouveau_pushbuf_space(push, 16, 2, 0);
+   nouveau_pushbuf_space(push, 32, 2, 0);
   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   PUSH_REFN (push, buf->bo, buf->domain | NOUVEAU_BO_WR);
   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 2);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -799,7 +799,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
   }

   while (num_instances--) {
-      nouveau_pushbuf_space(push, 9, 0, 1);
+      nouveau_pushbuf_space(push, 16, 0, 1);
      BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
      PUSH_DATA (push, mode);
      BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -297,7 +297,7 @@ nvc0_create_decoder(struct pipe_context *context,
   dec->comm = (struct comm *)(dec->fence_map + (COMM_OFFSET/sizeof(*dec->fence_map)));

   /* So lets test if the fence is working? */
-   nouveau_pushbuf_space(push[0], 6, 1, 0);
+   nouveau_pushbuf_space(push[0], 16, 1, 0);
   PUSH_REFN (push[0], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
   BEGIN_NVC0(push[0], SUBC_BSP(0x240), 3);
   PUSH_DATAh(push[0], dec->fence_bo->offset);
@@ -308,7 +308,7 @@ nvc0_create_decoder(struct pipe_context *context,
   PUSH_DATA (push[0], 0);
   PUSH_KICK (push[0]);

-   nouveau_pushbuf_space(push[1], 6, 1, 0);
+   nouveau_pushbuf_space(push[1], 16, 1, 0);
   PUSH_REFN (push[1], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
   BEGIN_NVC0(push[1], SUBC_VP(0x240), 3);
   PUSH_DATAh(push[1], (dec->fence_bo->offset + 0x10));
@@ -319,7 +319,7 @@ nvc0_create_decoder(struct pipe_context *context,
   PUSH_DATA (push[1], 0);
   PUSH_KICK (push[1]);

-   nouveau_pushbuf_space(push[2], 6, 1, 0);
+   nouveau_pushbuf_space(push[2], 16, 1, 0);
   PUSH_REFN (push[2], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
   BEGIN_NVC0(push[2], SUBC_PPP(0x240), 3);
   PUSH_DATAh(push[2], (dec->fence_bo->offset + 0x20));
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -143,7 +143,6 @@ nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   uint32_t caps;
   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   unsigned fence_extra = 0;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
@@ -157,15 +156,11 @@ nvc0_decoder_bsp_end(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   if (!dec->bitplane_bo)
      num_refs--;

-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
   caps = nouveau_vp3_bsp_end(dec, desc);

   nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);

-   nouveau_pushbuf_space(push, 6 + (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC ? 9 : 7) + fence_extra + 2, num_refs, 0);
+   nouveau_pushbuf_space(push, 32, num_refs, 0);
   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   bsp_addr = bsp_bo->offset >> 8;
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_ppp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_ppp.c
@@ -93,13 +93,8 @@ nvc0_decoder_ppp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, struct n
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   struct nouveau_pushbuf *push = dec->pushbuf[2];
   unsigned ppp_caps = 0x10;
-   unsigned fence_extra = 0;

-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
-   nouveau_pushbuf_space(push, 11 + (codec == PIPE_VIDEO_FORMAT_VC1 ? 2 : 0) + 3 + fence_extra + 2, 4, 0);
+   nouveau_pushbuf_space(push, 32, 4, 0);

   switch (codec) {
   case PIPE_VIDEO_FORMAT_MPEG12: {
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
@@ -76,7 +76,7 @@ nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
-   u32 fence_extra = 0, codec_extra = 0;
+   u32 codec_extra = 0;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
      { dec->ref_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
@@ -88,10 +88,6 @@ nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   };
   int num_refs = ARRAY_SIZE(bo_refs) - !dec->fw_bo;

-#if NOUVEAU_VP3_DEBUG_FENCE
-   fence_extra = 4;
-#endif
-
   if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
      nouveau_vp3_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
      codec_extra += 2;
@@ -115,8 +111,7 @@ nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   if (!is_ref && (dec->refs[target->valid_ref].decoded_top && dec->refs[target->valid_ref].decoded_bottom))
      nvc0_decoder_kick_ref(dec, target);

-   nouveau_pushbuf_space(push, 8 + 3 * (codec != PIPE_VIDEO_FORMAT_MPEG12) +
-              6 + codec_extra + fence_extra + 2, num_refs, 0);
+   nouveau_pushbuf_space(push, 32 + codec_extra, num_refs, 0);

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -473,6 +473,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	r600_init_command_buffer(&rs->buffer, 30);

 	rs->scissor_enable = state->scissor;
+	rs->clip_halfz = state->clip_halfz;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 	rs->two_side = state->light_twoside;
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -1862,8 +1862,8 @@
 #define R_0283F8_SQ_VTX_SEMANTIC_30                  0x000283F8
 #define R_0283FC_SQ_VTX_SEMANTIC_31                  0x000283FC
 #define R_0288F0_SQ_VTX_SEMANTIC_CLEAR               0x000288F0
-#define R_0282D0_PA_SC_VPORT_ZMIN_0                  0x000282D0
-#define R_0282D4_PA_SC_VPORT_ZMAX_0                  0x000282D4
+#define R_0282D0_PA_SC_VPORT_ZMIN_0                  0x0282D0
+#define R_0282D4_PA_SC_VPORT_ZMAX_0                  0x0282D4
 #define R_028400_VGT_MAX_VTX_INDX                    0x00028400
 #define R_028404_VGT_MIN_VTX_INDX                    0x00028404
 #define R_028408_VGT_INDX_OFFSET                     0x00028408
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -308,6 +308,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	r600_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
 	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+	ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	r600_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
 	if (ctx->b.chip_class <= EVERGREEN) {
 		r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -274,6 +274,7 @@ struct r600_rasterizer_state {
 	bool				offset_enable;
 	bool				scissor_enable;
 	bool				multisample_enable;
+	bool				clip_halfz;
 };

 struct r600_poly_offset_state {
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -459,6 +459,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	r600_init_command_buffer(&rs->buffer, 30);

 	rs->scissor_enable = state->scissor;
+	rs->clip_halfz = state->clip_halfz;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 	rs->two_side = state->light_twoside;
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -364,7 +364,7 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 		r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 	}

-	r600_set_scissor_enable(&rctx->b, rs->scissor_enable);
+	r600_viewport_set_rast_deps(&rctx->b, rs->scissor_enable, rs->clip_halfz);

 	/* Re-emit PA_SC_LINE_STIPPLE. */
 	rctx->last_primitive_type = -1;
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -366,6 +366,10 @@ struct r600_common_screen {
 	void (*query_opaque_metadata)(struct r600_common_screen *rscreen,
 				      struct r600_texture *rtex,
 				      struct radeon_bo_metadata *md);
+
+	void (*apply_opaque_metadata)(struct r600_common_screen *rscreen,
+				    struct r600_texture *rtex,
+				    struct radeon_bo_metadata *md);
 };

 /* This encapsulates a state or an operation which can emitted into the GPU
@@ -430,6 +434,7 @@ struct r600_scissors {
 struct r600_viewports {
 	struct r600_atom		atom;
 	unsigned			dirty_mask;
+	unsigned			depth_range_dirty_mask;
 	struct pipe_viewport_state	states[R600_MAX_VIEWPORTS];
 	struct r600_signed_scissor	as_scissor[R600_MAX_VIEWPORTS];
 };
@@ -469,6 +474,7 @@ struct r600_common_context {
 	struct r600_scissors		scissors;
 	struct r600_viewports		viewports;
 	bool				scissor_enabled;
+	bool				clip_halfz;
 	bool				vs_writes_viewport_index;
 	bool				vs_disables_clipping_viewport;

@@ -669,7 +675,8 @@ void r600_init_context_texture_functions(struct r600_common_context *rctx);
 /* r600_viewport.c */
 void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx,
 					    struct pipe_scissor_state *scissor);
-void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable);
+void r600_viewport_set_rast_deps(struct r600_common_context *rctx,
+				 bool scissor_enable, bool clip_halfz);
 void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx,
 					  struct tgsi_shader_info *info);
 void r600_init_viewport_functions(struct r600_common_context *rctx);
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -1139,7 +1139,7 @@ err:
 static struct pipe_driver_query_info r600_driver_query_list[] = {
 	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
 	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
-	X("draw-calls",			DRAW_CALLS,		UINT64, CUMULATIVE),
+	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
 	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
 	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
 	X("buffer-wait-time",		BUFFER_WAIT_TIME,	MICROSECONDS, CUMULATIVE),
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -723,10 +723,11 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
 }

 static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
-					    struct r600_texture *rtex)
+					    struct r600_texture *rtex,
+					    unsigned *base_align)
 {
 	unsigned cl_width, cl_height, width, height;
-	unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align;
+	unsigned slice_elements, slice_bytes, pipe_interleave_bytes;
 	unsigned num_pipes = rscreen->info.num_tile_pipes;

 	if (rscreen->chip_class <= EVERGREEN &&
@@ -788,7 +789,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	slice_bytes = slice_elements * 4;

 	pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
-	base_align = num_pipes * pipe_interleave_bytes;
+	*base_align = num_pipes * pipe_interleave_bytes;

 	rtex->htile.pitch = width;
 	rtex->htile.height = height;
@@ -796,20 +797,22 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	rtex->htile.yalign = cl_height * 8;

 	return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
-		align(slice_bytes, base_align);
+		align(slice_bytes, *base_align);
 }

 static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
 					struct r600_texture *rtex)
 {
-	unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
+	unsigned alignment = 0;
+	unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex,
+							  &alignment);

 	if (!htile_size)
 		return;

 	rtex->htile_buffer = (struct r600_resource*)
-			     pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
-						PIPE_USAGE_DEFAULT, htile_size);
+		r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   htile_size, alignment);
 	if (rtex->htile_buffer == NULL) {
 		/* this is not a fatal error as we can still keep rendering
 		 * without htile buffer */
@@ -965,8 +968,12 @@ r600_texture_create_object(struct pipe_screen *screen,
 			}
 		}

-		if (!buf && rtex->surface.dcc_size &&
-		    !(rscreen->debug_flags & DBG_NO_DCC)) {
+		/* Shared textures must always set up DCC here.
+		 * If it's not present, it will be disabled by
+		 * apply_opaque_metadata later.
+		 */
+		if (rtex->surface.dcc_size &&
+		    (buf || !(rscreen->debug_flags & DBG_NO_DCC))) {
 			/* Reserve space for the DCC buffer. */
 			rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment);
 			rtex->size = rtex->dcc_offset + rtex->surface.dcc_size;
@@ -993,7 +1000,9 @@ r600_texture_create_object(struct pipe_screen *screen,
 					 rtex->cmask.offset, rtex->cmask.size,
 					 0xCCCCCCCC, R600_COHERENCY_NONE);
 	}
-	if (rtex->dcc_offset) {
+
+	/* Initialize DCC only if the texture is not being imported. */
+	if (!buf && rtex->dcc_offset) {
 		r600_screen_clear_buffer(rscreen, &rtex->resource.b.b,
 					 rtex->dcc_offset,
 					 rtex->surface.dcc_size,
@@ -1159,6 +1168,10 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen

 	rtex->resource.is_shared = true;
 	rtex->resource.external_usage = usage;
+
+	if (rscreen->apply_opaque_metadata)
+		rscreen->apply_opaque_metadata(rscreen, rtex, &metadata);
+
 	return &rtex->resource.b.b;
 }

--- a/src/gallium/drivers/radeon/r600_viewport.c
+++ b/src/gallium/drivers/radeon/r600_viewport.c
@@ -22,6 +22,7 @@
 */

 #include "r600_cs.h"
+#include "util/u_viewport.h"
 #include "tgsi/tgsi_scan.h"

 #define GET_MAX_SCISSOR(rctx) (rctx->chip_class >= EVERGREEN ? 16384 : 8192)
@@ -260,6 +261,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx,
 				     const struct pipe_viewport_state *state)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	unsigned mask;
 	int i;

 	for (i = 0; i < num_viewports; i++) {
@@ -270,13 +272,28 @@ static void r600_set_viewport_states(struct pipe_context *ctx,
 					       &rctx->viewports.as_scissor[index]);
 	}

-	rctx->viewports.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
-	rctx->scissors.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
+	mask = ((1 << num_viewports) - 1) << start_slot;
+	rctx->viewports.dirty_mask |= mask;
+	rctx->viewports.depth_range_dirty_mask |= mask;
+	rctx->scissors.dirty_mask |= mask;
 	rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true);
 	rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
 }

-static void r600_emit_viewports(struct r600_common_context *rctx, struct r600_atom *atom)
+static void r600_emit_one_viewport(struct r600_common_context *rctx,
+				   struct pipe_viewport_state *state)
+{
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+
+	radeon_emit(cs, fui(state->scale[0]));
+	radeon_emit(cs, fui(state->translate[0]));
+	radeon_emit(cs, fui(state->scale[1]));
+	radeon_emit(cs, fui(state->translate[1]));
+	radeon_emit(cs, fui(state->scale[2]));
+	radeon_emit(cs, fui(state->translate[2]));
+}
+
+static void r600_emit_viewports(struct r600_common_context *rctx)
 {
 	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	struct pipe_viewport_state *states = rctx->viewports.states;
@@ -288,12 +305,7 @@ static void r600_emit_viewports(struct r600_common_context *rctx, struct r600_at
 			return;

 		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
-		radeon_emit(cs, fui(states[0].scale[0]));
-		radeon_emit(cs, fui(states[0].translate[0]));
-		radeon_emit(cs, fui(states[0].scale[1]));
-		radeon_emit(cs, fui(states[0].translate[1]));
-		radeon_emit(cs, fui(states[0].scale[2]));
-		radeon_emit(cs, fui(states[0].translate[2]));
+		r600_emit_one_viewport(rctx, &states[0]);
 		rctx->viewports.dirty_mask &= ~1; /* clear one bit */
 		return;
 	}
@@ -305,25 +317,70 @@ static void r600_emit_viewports(struct r600_common_context *rctx, struct r600_at

 		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
 					       start * 4 * 6, count * 6);
-		for (i = start; i < start+count; i++) {
-			radeon_emit(cs, fui(states[i].scale[0]));
-			radeon_emit(cs, fui(states[i].translate[0]));
-			radeon_emit(cs, fui(states[i].scale[1]));
-			radeon_emit(cs, fui(states[i].translate[1]));
-			radeon_emit(cs, fui(states[i].scale[2]));
-			radeon_emit(cs, fui(states[i].translate[2]));
-		}
+		for (i = start; i < start+count; i++)
+			r600_emit_one_viewport(rctx, &states[i]);
 	}
 	rctx->viewports.dirty_mask = 0;
 }

-void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable)
+static void r600_emit_depth_ranges(struct r600_common_context *rctx)
 {
-	if (rctx->scissor_enabled != enable) {
-		rctx->scissor_enabled = enable;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct pipe_viewport_state *states = rctx->viewports.states;
+	unsigned mask = rctx->viewports.depth_range_dirty_mask;
+	float zmin, zmax;
+
+	/* The simple case: Only 1 viewport is active. */
+	if (!rctx->vs_writes_viewport_index) {
+		if (!(mask & 1))
+			return;
+
+		util_viewport_zmin_zmax(&states[0], rctx->clip_halfz, &zmin, &zmax);
+
+		radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
+		radeon_emit(cs, fui(zmin));
+		radeon_emit(cs, fui(zmax));
+		rctx->viewports.depth_range_dirty_mask &= ~1; /* clear one bit */
+		return;
+	}
+
+	while (mask) {
+		int start, count, i;
+
+		u_bit_scan_consecutive_range(&mask, &start, &count);
+
+		radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
+					   start * 4 * 2, count * 2);
+		for (i = start; i < start+count; i++) {
+			util_viewport_zmin_zmax(&states[i], rctx->clip_halfz, &zmin, &zmax);
+			radeon_emit(cs, fui(zmin));
+			radeon_emit(cs, fui(zmax));
+		}
+	}
+	rctx->viewports.depth_range_dirty_mask = 0;
+}
+
+static void r600_emit_viewport_states(struct r600_common_context *rctx,
+				      struct r600_atom *atom)
+{
+	r600_emit_viewports(rctx);
+	r600_emit_depth_ranges(rctx);
+}
+
+/* Set viewport dependencies on pipe_rasterizer_state. */
+void r600_viewport_set_rast_deps(struct r600_common_context *rctx,
+				 bool scissor_enable, bool clip_halfz)
+{
+	if (rctx->scissor_enabled != scissor_enable) {
+		rctx->scissor_enabled = scissor_enable;
 		rctx->scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 		rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
 	}
+	if (rctx->clip_halfz != clip_halfz) {
+		rctx->clip_halfz = clip_halfz;
+		rctx->viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+		rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true);
+	}
 }

 /**
@@ -357,14 +414,16 @@ void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx,

 	if (rctx->scissors.dirty_mask)
 	    rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
-	if (rctx->viewports.dirty_mask)
+
+	if (rctx->viewports.dirty_mask ||
+	    rctx->viewports.depth_range_dirty_mask)
 	    rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true);
 }

 void r600_init_viewport_functions(struct r600_common_context *rctx)
 {
 	rctx->scissors.atom.emit = r600_emit_scissors;
-	rctx->viewports.atom.emit = r600_emit_viewports;
+	rctx->viewports.atom.emit = r600_emit_viewport_states;

 	rctx->scissors.atom.num_dw = (2 + 16 * 2) + 6;
 	rctx->viewports.atom.num_dw = 2 + 16 * 6;
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -241,5 +241,7 @@
 #define   S_028254_BR_Y(x)                                            (((unsigned)(x) & 0x7FFF) << 16)
 #define   G_028254_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
 #define   C_028254_BR_Y                                               0x8000FFFF
+#define R_0282D0_PA_SC_VPORT_ZMIN_0                                     0x0282D0
+#define R_0282D4_PA_SC_VPORT_ZMAX_0                                     0x0282D4

 #endif
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1303,23 +1303,32 @@ static void emit_lsb(const struct lp_build_tgsi_action * action,
 		     struct lp_build_emit_data * emit_data)
 {
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
 	LLVMValueRef args[2] = {
 		emit_data->args[0],

 		/* The value of 1 means that ffs(x=0) = undef, so LLVM won't
 		 * add special code to check for x=0. The reason is that
 		 * the LLVM behavior for x=0 is different from what we
-		 * need here.
-		 *
-		 * The hardware already implements the correct behavior.
+		 * need here. However, LLVM also assumes that ffs(x) is
+		 * in [0, 31], but GLSL expects that ffs(0) = -1, so
+		 * a conditional assignment to handle 0 is still required.
 		 */
-		lp_build_const_int32(gallivm, 1)
+		LLVMConstInt(LLVMInt1TypeInContext(gallivm->context), 1, 0)
 	};

-	emit_data->output[emit_data->chan] =
+	LLVMValueRef lsb =
 		lp_build_intrinsic(gallivm->builder, "llvm.cttz.i32",
 				emit_data->dst_type, args, ARRAY_SIZE(args),
 				LLVMReadNoneAttribute);
+
+	/* TODO: We need an intrinsic to skip this conditional. */
+	/* Check for zero: */
+	emit_data->output[emit_data->chan] =
+		LLVMBuildSelect(builder,
+				LLVMBuildICmp(builder, LLVMIntEQ, args[0],
+					      bld_base->uint_bld.zero, ""),
+				lp_build_const_int32(gallivm, -1), lsb, "");
 }

 /* Find the last bit set. */
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -376,7 +376,9 @@ si_decompress_sampler_color_textures(struct si_context *sctx,
 		assert(view);

 		tex = (struct r600_texture *)view->texture;
-		assert(tex->cmask.size || tex->fmask.size || tex->dcc_offset);
+		/* CMASK or DCC can be discarded and we can still end up here. */
+		if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
+			continue;

 		si_blit_decompress_color(&sctx->b.b, tex,
 					 view->u.tex.first_level, view->u.tex.last_level,
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -202,7 +202,12 @@ static void si_initialize_compute(struct si_context *sctx)
 		radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
 		radeon_emit(cs, bc_va >> 40); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
 	} else {
-		radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
+		if (sctx->screen->b.info.drm_major == 3 ||
+		    (sctx->screen->b.info.drm_major == 2 &&
+		     sctx->screen->b.info.drm_minor >= 48)) {
+			radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR,
+					      bc_va >> 8);
+		}
 	}

 	sctx->cs_shader_state.emitted_program = NULL;
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -311,6 +311,7 @@ static void si_set_sampler_view(struct si_context *sctx,
 				unsigned slot, struct pipe_sampler_view *view)
 {
 	struct si_sampler_view *rview = (struct si_sampler_view*)view;
+	uint32_t *desc = views->desc.list + slot * 16;

 	if (view && view->texture && view->texture->target != PIPE_BUFFER &&
 	    G_008F28_COMPRESSION_EN(rview->state[6]) &&
@@ -346,9 +347,14 @@ static void si_set_sampler_view(struct si_context *sctx,
 		views->desc.enabled_mask |= 1u << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		memcpy(views->desc.list + slot*16, null_texture_descriptor, 8*4);
+		memcpy(desc, null_texture_descriptor, 8*4);
 		/* Only clear the lower dwords of FMASK. */
-		memcpy(views->desc.list + slot*16 + 8, null_texture_descriptor, 4*4);
+		memcpy(desc + 8, null_texture_descriptor, 4*4);
+		/* Re-set the sampler state if we are transitioning from FMASK. */
+		if (views->sampler_states[slot])
+			memcpy(desc + 12,
+			       views->sampler_states[slot], 4*4);
+
 		views->desc.enabled_mask &= ~(1u << slot);
 	}

@@ -631,10 +637,10 @@ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
 		/* If FMASK is bound, don't overwrite it.
 		 * The sampler state will be set after FMASK is unbound.
 		 */
-		if (samplers->views.views[i] &&
-		    samplers->views.views[i]->texture &&
-		    samplers->views.views[i]->texture->target != PIPE_BUFFER &&
-		    ((struct r600_texture*)samplers->views.views[i]->texture)->fmask.size)
+		if (samplers->views.views[slot] &&
+		    samplers->views.views[slot]->texture &&
+		    samplers->views.views[slot]->texture->target != PIPE_BUFFER &&
+		    ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
 			continue;

 		memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -231,6 +231,7 @@ void si_begin_new_cs(struct si_context *ctx)

 	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+	ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	si_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
 	si_mark_atom_dirty(ctx, &ctx->b.viewports.atom);

--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -147,7 +147,12 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
 				       si_context_gfx_flush, sctx);

-	if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib) {
+	/* SI + AMDGPU + CE = GPU hang */
+	if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib &&
+	    sscreen->b.chip_class != SI &&
+	    /* These can't use CE due to a power gating bug in the kernel. */
+	    sscreen->b.family != CHIP_CARRIZO &&
+	    sscreen->b.family != CHIP_STONEY) {
 		sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
 		if (!sctx->ce_ib)
 			goto fail;
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1667,7 +1667,12 @@ static void declare_system_value(
 	}

 	case TGSI_SEMANTIC_VERTICESIN:
-		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
+		if (ctx->type == PIPE_SHADER_TESS_CTRL)
+			value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
+		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+			value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7);
+		else
+			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
 		break;

 	case TGSI_SEMANTIC_TESSINNER:
@@ -4028,7 +4033,7 @@ static void resq_fetch_args(
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	const struct tgsi_full_src_register *reg = &inst->Src[0];

-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+	emit_data->dst_type = ctx->v4i32;

 	if (reg->Register.File == TGSI_FILE_BUFFER) {
 		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
@@ -4079,9 +4084,7 @@ static void resq_emit(
 			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

 			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
-			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
 			z = LLVMBuildSDiv(builder, z, imm6, "");
-			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
 			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
 		}
 	}
@@ -5862,6 +5865,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 			conf->scratch_bytes_per_wave =
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
+		case 0x4:
+		case 0x8:
+			break; /* just spilling stats, not important */
 		default:
 			{
 				static bool printed;
@@ -7232,6 +7238,12 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
 						   linear_sample[i], base + 10 + i, "");
 	}

+	/* Tell LLVM to insert WQM instruction sequence when needed. */
+	if (key->ps_prolog.wqm) {
+		LLVMAddTargetDependentFunctionAttr(func,
+						   "amdgpu-ps-wqm-outputs", "");
+	}
+
 	/* Compile. */
 	LLVMBuildRet(gallivm->builder, ret);
 	radeon_llvm_finalize_module(&ctx.radeon_bld);
@@ -7382,6 +7394,9 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
 	prolog_key.ps_prolog.colors_read = info->colors_read;
 	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
 	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
+	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
+		(prolog_key.ps_prolog.colors_read ||
+		 prolog_key.ps_prolog.states.force_persample_interp);

 	if (info->colors_read) {
 		unsigned *color = shader->selector->color_attr_index;
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -355,6 +355,7 @@ union si_shader_part_key {
 		unsigned	colors_read:8; /* color input components read */
 		unsigned	num_interp_inputs:5; /* BCOLOR is at this location */
 		unsigned	face_vgpr_index:5;
+		unsigned	wqm:1;
 		char		color_attr_index[2];
 		char		color_interp_vgpr_index[2]; /* -1 == constant */
 	} ps_prolog;
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -461,16 +461,19 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 			S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);

 		/* Only set dual source blending for MRT0 to avoid a hang. */
-		if (i >= 1 && blend->dual_src_blend)
-			continue;
+		if (i >= 1 && blend->dual_src_blend) {
+			/* Vulkan does this for dual source blending. */
+			if (i == 1)
+				blend_cntl |= S_028780_ENABLE(1);

-		if (!state->rt[j].colormask)
+			si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
 			continue;
+		}

 		/* cb_render_state will disable unused ones */
 		blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);

-		if (!state->rt[j].blend_enable) {
+		if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
 			si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
 			continue;
 		}
@@ -551,6 +554,17 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 	}

 	if (sctx->b.family == CHIP_STONEY) {
+		/* Disable RB+ blend optimizations for dual source blending.
+		 * Vulkan does this.
+		 */
+		if (blend->dual_src_blend) {
+			for (int i = 0; i < 8; i++) {
+				sx_mrt_blend_opt[i] =
+					S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
+					S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
+			}
+		}
+
 		for (int i = 0; i < 8; i++)
 			si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
 				       sx_mrt_blend_opt[i]);
@@ -728,6 +742,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 	}

 	rs->scissor_enable = state->scissor;
+	rs->clip_halfz = state->clip_halfz;
 	rs->two_side = state->light_twoside;
 	rs->multisample_enable = state->multisample;
 	rs->force_persample_interp = state->force_persample_interp;
@@ -857,7 +872,7 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 			si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
 	}

-	r600_set_scissor_enable(&sctx->b, rs->scissor_enable);
+	r600_viewport_set_rast_deps(&sctx->b, rs->scissor_enable, rs->clip_halfz);

 	si_pm4_bind_state(sctx, rasterizer, rs);
 	si_update_poly_offset_state(sctx);
@@ -3427,6 +3442,11 @@ void si_init_state_functions(struct si_context *sctx)
 	si_init_config(sctx);
 }

+static uint32_t si_get_bo_metadata_word1(struct r600_common_screen *rscreen)
+{
+	return (ATI_VENDOR_ID << 16) | rscreen->info.pci_id;
+}
+
 static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
 				     struct r600_texture *rtex,
 			             struct radeon_bo_metadata *md)
@@ -3461,7 +3481,7 @@ static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
 	md->metadata[0] = 1; /* metadata image format version 1 */

 	/* TILE_MODE_INDEX is ambiguous without a PCI ID. */
-	md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id;
+	md->metadata[1] = si_get_bo_metadata_word1(rscreen);

 	si_make_texture_descriptor(sscreen, rtex, true,
 				   res->target, res->format,
@@ -3485,9 +3505,37 @@ static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
 	md->size_metadata = (11 + res->last_level) * 4;
 }

+static void si_apply_opaque_metadata(struct r600_common_screen *rscreen,
+				     struct r600_texture *rtex,
+			             struct radeon_bo_metadata *md)
+{
+	uint32_t *desc = &md->metadata[2];
+
+	if (rscreen->chip_class < VI)
+		return;
+
+	/* Return if DCC is enabled. The texture should be set up with it
+	 * already.
+	 */
+	if (md->size_metadata >= 11 * 4 &&
+	    md->metadata[0] != 0 &&
+	    md->metadata[1] == si_get_bo_metadata_word1(rscreen) &&
+	    G_008F28_COMPRESSION_EN(desc[6])) {
+		assert(rtex->dcc_offset == ((uint64_t)desc[7] << 8));
+		return;
+	}
+
+	/* Disable DCC. The DCC offset and enable bit are always set by
+	 * texture_from_handle and must be cleared here.
+	 */
+	rtex->dcc_offset = 0;
+	rtex->cb_color_info &= ~VI_S_028C70_DCC_ENABLE(1);
+}
+
 void si_init_screen_state_functions(struct si_screen *sscreen)
 {
 	sscreen->b.query_opaque_metadata = si_query_opaque_metadata;
+	sscreen->b.apply_opaque_metadata = si_apply_opaque_metadata;
 }

 static void
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -78,6 +78,7 @@ struct si_state_rasterizer {
 	bool			clamp_fragment_color;
 	bool			rasterizer_discard;
 	bool			scissor_enable;
+	bool			clip_halfz;
 };

 struct si_dsa_stencil_ref_part {
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -35,11 +35,13 @@
 #include "JitManager.h"
 #include "fetch_jit.h"

+#pragma push_macro("DEBUG")
+#undef DEBUG
+
 #if defined(_WIN32)
 #include "llvm/ADT/Triple.h"
 #endif
 #include "llvm/IR/Function.h"
-#include "llvm/Support/DynamicLibrary.h"

 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
@@ -53,6 +55,8 @@
 #include "llvm/ExecutionEngine/JITEventListener.h"
 #endif

+#pragma pop_macro("DEBUG")
+
 #include "core/state.h"

 #include "state_llvm.h"
@@ -237,6 +241,13 @@ bool JitManager::SetupModuleFromIR(const uint8_t *pIR)
        return false;
    }

+#if HAVE_LLVM == 0x307
+    // llvm-3.7 has mismatched setDataLayout/getDataLayout APIs
+    newModule->setDataLayout(*mpExec->getDataLayout());
+#else
+    newModule->setDataLayout(mpExec->getDataLayout());
+#endif
+
    mpCurrentModule = newModule.get();
 #if defined(_WIN32)
    // Needed for MCJIT on windows
@@ -251,7 +262,6 @@ bool JitManager::SetupModuleFromIR(const uint8_t *pIR)
    return true;
 }

-
 //////////////////////////////////////////////////////////////////////////
 /// @brief Dump function x86 assembly to file.
 /// @note This should only be called after the module has been jitted to x86 and the
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -54,7 +54,7 @@
 #endif

 #ifndef HAVE_LLVM
-#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR
+#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR)
 #endif

 #include "llvm/IR/Verifier.h"
@@ -66,8 +66,12 @@

 #if HAVE_LLVM == 0x306
 #include "llvm/PassManager.h"
+using FunctionPassManager = llvm::FunctionPassManager;
+using PassManager = llvm::PassManager;
 #else
 #include "llvm/IR/LegacyPassManager.h"
+using FunctionPassManager = llvm::legacy::FunctionPassManager;
+using PassManager = llvm::legacy::PassManager;
 #endif

 #include "llvm/CodeGen/Passes.h"
@@ -77,6 +81,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/DynamicLibrary.h"


 #pragma pop_macro("DEBUG")
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -31,7 +31,6 @@
 #include "blend_jit.h"
 #include "builder.h"
 #include "state_llvm.h"
-#include "llvm/IR/DataLayout.h"

 #include <sstream>

@@ -725,12 +724,7 @@ struct BlendJit : public Builder

        JitManager::DumpToFile(blendFunc, "");

-#if HAVE_LLVM == 0x306
-        FunctionPassManager
-#else
-        llvm::legacy::FunctionPassManager
-#endif
-            passes(JM()->mpCurrentModule);
+        ::FunctionPassManager passes(JM()->mpCurrentModule);

        passes.add(createBreakCriticalEdgesPass());
        passes.add(createCFGSimplificationPass());
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -30,8 +30,6 @@
 #include "builder.h"
 #include "common/rdtsc_buckets.h"

-#include "llvm/Support/DynamicLibrary.h"
-
 void __cdecl CallPrint(const char* fmt, ...);

 //////////////////////////////////////////////////////////////////////////
@@ -322,6 +320,32 @@ CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &args
    return CALLA(Callee, args);
 }

+#if HAVE_LLVM > 0x306
+CallInst *Builder::CALL(Value *Callee, Value* arg)
+{
+    std::vector<Value*> args;
+    args.push_back(arg);
+    return CALLA(Callee, args);
+}
+
+CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
+{
+    std::vector<Value*> args;
+    args.push_back(arg1);
+    args.push_back(arg2);
+    return CALLA(Callee, args);
+}
+
+CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
+{
+    std::vector<Value*> args;
+    args.push_back(arg1);
+    args.push_back(arg2);
+    args.push_back(arg3);
+    return CALLA(Callee, args);
+}
+#endif
+
 Value *Builder::VRCP(Value *va)
 {
    return FDIV(VIMMED1(1.0f), va);  // 1 / a
@@ -676,20 +700,22 @@ Value *Builder::PSHUFB(Value* a, Value* b)
 /// lower 8 values are used.
 Value *Builder::PMOVSXBD(Value* a)
 {
-    Value* res;
+    // llvm-3.9 removed the pmovsxbd intrinsic
+#if HAVE_LLVM < 0x309
    // use avx2 byte sign extend instruction if available
    if(JM()->mArch.AVX2())
    {
-        res = VPMOVSXBD(a);
+        Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
+        return CALL(pmovsxbd, std::initializer_list<Value*>{a});
    }
    else
+#endif
    {
        // VPMOVSXBD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from 128bit lane and sign extend
-        res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }
-    return res;
 }

 //////////////////////////////////////////////////////////////////////////
@@ -698,20 +724,22 @@ Value *Builder::PMOVSXBD(Value* a)
 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 Value *Builder::PMOVSXWD(Value* a)
 {
-    Value* res;
+    // llvm-3.9 removed the pmovsxwd intrinsic
+#if HAVE_LLVM < 0x309
    // use avx2 word sign extend if available
    if(JM()->mArch.AVX2())
    {
-        res = VPMOVSXWD(a);
+        Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
+        return CALL(pmovsxwd, std::initializer_list<Value*>{a});
    }
    else
+#endif
    {
        // VPMOVSXWD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from 128bit lane and sign extend
-        res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }
-    return res;
 }

 //////////////////////////////////////////////////////////////////////////
@@ -726,8 +754,7 @@ Value *Builder::PERMD(Value* a, Value* idx)
    // use avx2 permute instruction if available
    if(JM()->mArch.AVX2())
    {
-        // llvm 3.6.0 swapped the order of the args to vpermd
-        res = VPERMD(idx, a);
+        res = VPERMD(a, idx);
    }
    else
    {
@@ -852,9 +879,15 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding)

 Value *Builder::PMAXSD(Value* a, Value* b)
 {
+    // llvm-3.9 removed the pmax intrinsics
+#if HAVE_LLVM >= 0x309
+    Value* cmp = ICMP_SGT(a, b);
+    return SELECT(cmp, a, b);
+#else
    if (JM()->mArch.AVX2())
    {
-        return VPMAXSD(a, b);
+        Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
+        return CALL(pmaxsd, {a, b});
    }
    else
    {
@@ -877,13 +910,20 @@ Value *Builder::PMAXSD(Value* a, Value* b)

        return result;
    }
+#endif
 }

 Value *Builder::PMINSD(Value* a, Value* b)
 {
+    // llvm-3.9 removed the pmin intrinsics
+#if HAVE_LLVM >= 0x309
+    Value* cmp = ICMP_SLT(a, b);
+    return SELECT(cmp, a, b);
+#else
    if (JM()->mArch.AVX2())
    {
-        return VPMINSD(a, b);
+        Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
+        return CALL(pminsd, {a, b});
    }
    else
    {
@@ -906,6 +946,7 @@ Value *Builder::PMINSD(Value* a, Value* b)

        return result;
    }
+#endif
 }

 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, 
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -72,6 +72,12 @@ int32_t S_IMMED(Value* i);
 Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
 Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
 CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args);
+#if HAVE_LLVM > 0x306
+CallInst *CALL(Value *Callee) { return CALLA(Callee); }
+CallInst *CALL(Value *Callee, Value* arg);
+CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
+CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
+#endif

 LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
 LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -31,7 +31,6 @@
 #include "fetch_jit.h"
 #include "builder.h"
 #include "state_llvm.h"
-#include "llvm/IR/DataLayout.h"
 #include <sstream>
 #include <tuple>

@@ -181,12 +180,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)

    verifyFunction(*fetch);

-#if HAVE_LLVM == 0x306
-        FunctionPassManager
-#else
-        llvm::legacy::FunctionPassManager
-#endif
-            setupPasses(JM()->mpCurrentModule);
+    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
@@ -198,12 +192,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)

    JitManager::DumpToFile(fetch, "se");

-#if HAVE_LLVM == 0x306
-        FunctionPassManager
-#else
-        llvm::legacy::FunctionPassManager
-#endif
-            optPasses(JM()->mpCurrentModule);
+    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -91,8 +91,6 @@ intrinsics = [
        ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
        ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
        ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
-        ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
-        ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
        ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
        ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
        ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
@@ -100,9 +98,7 @@ intrinsics = [
        ["VMASKLOADD", "x86_avx2_maskload_d_256", ["src", "mask"]],
        ["VMASKMOVPS", "x86_avx_maskload_ps_256", ["src", "mask"]],
        ["VPSHUFB", "x86_avx2_pshuf_b", ["a", "b"]],
-        ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]],  # sign extend packed 8bit components
-        ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]],  # sign extend packed 16bit components
-        ["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+        ["VPERMD", "x86_avx2_permd", ["a", "idx"]],
        ["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
        ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
        ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
@@ -110,7 +106,6 @@ intrinsics = [
        ["VPTESTC", "x86_avx_ptestc_256", ["a", "b"]],
        ["VPTESTZ", "x86_avx_ptestz_256", ["a", "b"]],
        ["VFMADDPS", "x86_fma_vfmadd_ps_256", ["a", "b", "c"]],
-        ["VCVTTPS2DQ", "x86_avx_cvtt_ps2dq_256", ["a"]],
        ["VMOVMSKPS", "x86_avx_movmsk_ps_256", ["a"]],
        ["INTERRUPT", "x86_int", ["a"]],
    ]
@@ -352,7 +347,29 @@ def generate_x86_cpp(output_file):
            'Value *Builder::%s(%s)' % (inst[0], args),
            '{',
            '    Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1],
+        ]
+        if inst[0] == "VPERMD":
+            rev_args = ''
+            first = True
+            for arg in reversed(inst[2]):
+                if not first:
+                    rev_args += ', '
+                rev_args += arg
+                first = False
+
+            output_lines += [
+                '#if (HAVE_LLVM == 0x306) && (LLVM_VERSION_PATCH == 0)',
+                '    return CALL(func, std::initializer_list<Value*>{%s});' % rev_args,
+                '#else',
+            ]
+        output_lines += [
            '    return CALL(func, std::initializer_list<Value*>{%s});' % pass_args,
+        ]
+        if inst[0] == "VPERMD":
+            output_lines += [
+                '#endif',
+            ]
+        output_lines += [
            '}',
            '',
        ]
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -292,12 +292,7 @@ struct StreamOutJit : public Builder

        JitManager::DumpToFile(soFunc, "SoFunc");

-#if HAVE_LLVM == 0x306
-        FunctionPassManager
-#else
-        llvm::legacy::FunctionPassManager
-#endif
-            passes(JM()->mpCurrentModule);
+        ::FunctionPassManager passes(JM()->mpCurrentModule);

        passes.add(createBreakCriticalEdgesPass());
        passes.add(createCFGSimplificationPass());
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -209,18 +209,24 @@ dri3_bind_context(struct glx_context *context, struct glx_context *old,
   struct dri3_context *pcp = (struct dri3_context *) context;
   struct dri3_screen *psc = (struct dri3_screen *) pcp->base.psc;
   struct dri3_drawable *pdraw, *pread;
+   __DRIdrawable *dri_draw = NULL, *dri_read = NULL;

   pdraw = (struct dri3_drawable *) driFetchDrawable(context, draw);
   pread = (struct dri3_drawable *) driFetchDrawable(context, read);

   driReleaseDrawables(&pcp->base);

-   if (pdraw == NULL || pread == NULL)
+   if (pdraw)
+      dri_draw = pdraw->loader_drawable.dri_drawable;
+   else if (draw != None)
      return GLXBadDrawable;

-   if (!(*psc->core->bindContext) (pcp->driContext,
-                                   pdraw->loader_drawable.dri_drawable,
-                                   pread->loader_drawable.dri_drawable))
+   if (pread)
+      dri_read = pread->loader_drawable.dri_drawable;
+   else if (read != None)
+      return GLXBadDrawable;
+
+   if (!(*psc->core->bindContext) (pcp->driContext, dri_draw, dri_read))
      return GLXBadContext;

   return Success;
--- a/src/glx/g_glxglvnddispatchfuncs.c
+++ b/src/glx/g_glxglvnddispatchfuncs.c
@@ -17,16 +17,19 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = {
 #define __ATTRIB(field) \
    [DI_##field] = "glX"#field

+    __ATTRIB(BindSwapBarrierSGIX),
    __ATTRIB(BindTexImageEXT),
    // glXChooseFBConfig implemented by libglvnd
    __ATTRIB(ChooseFBConfigSGIX),
    // glXChooseVisual implemented by libglvnd
    // glXCopyContext implemented by libglvnd
+    __ATTRIB(CopySubBufferMESA),
    // glXCreateContext implemented by libglvnd
    __ATTRIB(CreateContextAttribsARB),
    __ATTRIB(CreateContextWithConfigSGIX),
    __ATTRIB(CreateGLXPbufferSGIX),
    // glXCreateGLXPixmap implemented by libglvnd
+    __ATTRIB(CreateGLXPixmapMESA),
    __ATTRIB(CreateGLXPixmapWithConfigSGIX),
    // glXCreateNewContext implemented by libglvnd
    // glXCreatePbuffer implemented by libglvnd
@@ -51,54 +54,50 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = {
    __ATTRIB(GetFBConfigAttribSGIX),
    __ATTRIB(GetFBConfigFromVisualSGIX),
    // glXGetFBConfigs implemented by libglvnd
+    __ATTRIB(GetMscRateOML),
    // glXGetProcAddress implemented by libglvnd
    // glXGetProcAddressARB implemented by libglvnd
+    __ATTRIB(GetScreenDriver),
    // glXGetSelectedEvent implemented by libglvnd
    __ATTRIB(GetSelectedEventSGIX),
+    __ATTRIB(GetSwapIntervalMESA),
+    __ATTRIB(GetSyncValuesOML),
    __ATTRIB(GetVideoSyncSGI),
    // glXGetVisualFromFBConfig implemented by libglvnd
    __ATTRIB(GetVisualFromFBConfigSGIX),
    // glXImportContextEXT implemented by libglvnd
    // glXIsDirect implemented by libglvnd
+    __ATTRIB(JoinSwapGroupSGIX),
    // glXMakeContextCurrent implemented by libglvnd
    // glXMakeCurrent implemented by libglvnd
    // glXQueryContext implemented by libglvnd
    __ATTRIB(QueryContextInfoEXT),
+    __ATTRIB(QueryCurrentRendererIntegerMESA),
+    __ATTRIB(QueryCurrentRendererStringMESA),
    // glXQueryDrawable implemented by libglvnd
    // glXQueryExtension implemented by libglvnd
    // glXQueryExtensionsString implemented by libglvnd
    __ATTRIB(QueryGLXPbufferSGIX),
+    __ATTRIB(QueryMaxSwapBarriersSGIX),
+    __ATTRIB(QueryRendererIntegerMESA),
+    __ATTRIB(QueryRendererStringMESA),
    // glXQueryServerString implemented by libglvnd
    // glXQueryVersion implemented by libglvnd
+    __ATTRIB(ReleaseBuffersMESA),
    __ATTRIB(ReleaseTexImageEXT),
    // glXSelectEvent implemented by libglvnd
    __ATTRIB(SelectEventSGIX),
    // glXSwapBuffers implemented by libglvnd
+    __ATTRIB(SwapBuffersMscOML),
+    __ATTRIB(SwapIntervalMESA),
    __ATTRIB(SwapIntervalSGI),
    // glXUseXFont implemented by libglvnd
+    __ATTRIB(WaitForMscOML),
+    __ATTRIB(WaitForSbcOML),
    // glXWaitGL implemented by libglvnd
    __ATTRIB(WaitVideoSyncSGI),
    // glXWaitX implemented by libglvnd

-    __ATTRIB(glXBindSwapBarrierSGIX),
-    __ATTRIB(glXCopySubBufferMESA),
-    __ATTRIB(glXCreateGLXPixmapMESA),
-    __ATTRIB(glXGetMscRateOML),
-    __ATTRIB(glXGetScreenDriver),
-    __ATTRIB(glXGetSwapIntervalMESA),
-    __ATTRIB(glXGetSyncValuesOML),
-    __ATTRIB(glXJoinSwapGroupSGIX),
-    __ATTRIB(glXQueryCurrentRendererIntegerMESA),
-    __ATTRIB(glXQueryCurrentRendererStringMESA),
-    __ATTRIB(glXQueryMaxSwapBarriersSGIX),
-    __ATTRIB(glXQueryRendererIntegerMESA),
-    __ATTRIB(glXQueryRendererStringMESA),
-    __ATTRIB(glXReleaseBuffersMESA),
-    __ATTRIB(glXSwapBuffersMscOML),
-    __ATTRIB(glXSwapIntervalMESA),
-    __ATTRIB(glXWaitForMscOML),
-    __ATTRIB(glXWaitForSbcOML),
-
 #undef __ATTRIB
 };

@@ -557,49 +556,49 @@ static int dispatch_WaitVideoSyncSGI(int divisor, int remainder,



-static void dispatch_glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable,
+static void dispatch_BindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable,
                                            int barrier)
 {
-    PFNGLXBINDSWAPBARRIERSGIXPROC pglXBindSwapBarrierSGIX;
+    PFNGLXBINDSWAPBARRIERSGIXPROC pBindSwapBarrierSGIX;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return;

-    __FETCH_FUNCTION_PTR(glXBindSwapBarrierSGIX);
-    if (pglXBindSwapBarrierSGIX == NULL)
+    __FETCH_FUNCTION_PTR(BindSwapBarrierSGIX);
+    if (pBindSwapBarrierSGIX == NULL)
        return;

-    (*pglXBindSwapBarrierSGIX)(dpy, drawable, barrier);
+    (*pBindSwapBarrierSGIX)(dpy, drawable, barrier);
 }



-static void dispatch_glXCopySubBufferMESA(Display *dpy, GLXDrawable drawable,
+static void dispatch_CopySubBufferMESA(Display *dpy, GLXDrawable drawable,
                                          int x, int y, int width, int height)
 {
-    PFNGLXCOPYSUBBUFFERMESAPROC pglXCopySubBufferMESA;
+    PFNGLXCOPYSUBBUFFERMESAPROC pCopySubBufferMESA;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return;

-    __FETCH_FUNCTION_PTR(glXCopySubBufferMESA);
-    if (pglXCopySubBufferMESA == NULL)
+    __FETCH_FUNCTION_PTR(CopySubBufferMESA);
+    if (pCopySubBufferMESA == NULL)
        return;

-    (*pglXCopySubBufferMESA)(dpy, drawable, x, y, width, height);
+    (*pCopySubBufferMESA)(dpy, drawable, x, y, width, height);
 }



-static GLXPixmap dispatch_glXCreateGLXPixmapMESA(Display *dpy,
+static GLXPixmap dispatch_CreateGLXPixmapMESA(Display *dpy,
                                                 XVisualInfo *visinfo,
                                                 Pixmap pixmap, Colormap cmap)
 {
-    PFNGLXCREATEGLXPIXMAPMESAPROC pglXCreateGLXPixmapMESA;
+    PFNGLXCREATEGLXPIXMAPMESAPROC pCreateGLXPixmapMESA;
    __GLXvendorInfo *dd;
    GLXPixmap ret;

@@ -607,11 +606,11 @@ static GLXPixmap dispatch_glXCreateGLXPixmapMESA(Display *dpy,
    if (dd == NULL)
        return None;

-    __FETCH_FUNCTION_PTR(glXCreateGLXPixmapMESA);
-    if (pglXCreateGLXPixmapMESA == NULL)
+    __FETCH_FUNCTION_PTR(CreateGLXPixmapMESA);
+    if (pCreateGLXPixmapMESA == NULL)
        return None;

-    ret = (*pglXCreateGLXPixmapMESA)(dpy, visinfo, pixmap, cmap);
+    ret = (*pCreateGLXPixmapMESA)(dpy, visinfo, pixmap, cmap);
    if (AddDrawableMapping(dpy, ret, dd)) {
        /* XXX: Call glXDestroyGLXPixmap which lives in libglvnd. If we're not
         * allowed to call it from here, should we extend __glXDispatchTableIndices ?
@@ -624,47 +623,47 @@ static GLXPixmap dispatch_glXCreateGLXPixmapMESA(Display *dpy,



-static GLboolean dispatch_glXGetMscRateOML(Display *dpy, GLXDrawable drawable,
+static GLboolean dispatch_GetMscRateOML(Display *dpy, GLXDrawable drawable,
                                           int32_t *numerator, int32_t *denominator)
 {
-    PFNGLXGETMSCRATEOMLPROC pglXGetMscRateOML;
+    PFNGLXGETMSCRATEOMLPROC pGetMscRateOML;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return GL_FALSE;

-    __FETCH_FUNCTION_PTR(glXGetMscRateOML);
-    if (pglXGetMscRateOML == NULL)
+    __FETCH_FUNCTION_PTR(GetMscRateOML);
+    if (pGetMscRateOML == NULL)
        return GL_FALSE;

-    return (*pglXGetMscRateOML)(dpy, drawable, numerator, denominator);
+    return (*pGetMscRateOML)(dpy, drawable, numerator, denominator);
 }



-static const char *dispatch_glXGetScreenDriver(Display *dpy, int scrNum)
+static const char *dispatch_GetScreenDriver(Display *dpy, int scrNum)
 {
    typedef const char *(*fn_glXGetScreenDriver_ptr)(Display *dpy, int scrNum);
-    fn_glXGetScreenDriver_ptr pglXGetScreenDriver;
+    fn_glXGetScreenDriver_ptr pGetScreenDriver;
    __GLXvendorInfo *dd;

    dd = __VND->getDynDispatch(dpy, scrNum);
    if (dd == NULL)
        return NULL;

-    __FETCH_FUNCTION_PTR(glXGetScreenDriver);
-    if (pglXGetScreenDriver == NULL)
+    __FETCH_FUNCTION_PTR(GetScreenDriver);
+    if (pGetScreenDriver == NULL)
        return NULL;

-    return (*pglXGetScreenDriver)(dpy, scrNum);
+    return (*pGetScreenDriver)(dpy, scrNum);
 }



-static int dispatch_glXGetSwapIntervalMESA(void)
+static int dispatch_GetSwapIntervalMESA(void)
 {
-    PFNGLXGETSWAPINTERVALMESAPROC pglXGetSwapIntervalMESA;
+    PFNGLXGETSWAPINTERVALMESAPROC pGetSwapIntervalMESA;
    __GLXvendorInfo *dd;

    if (!__VND->getCurrentContext())
@@ -674,57 +673,57 @@ static int dispatch_glXGetSwapIntervalMESA(void)
    if (dd == NULL)
        return 0;

-    __FETCH_FUNCTION_PTR(glXGetSwapIntervalMESA);
-    if (pglXGetSwapIntervalMESA == NULL)
+    __FETCH_FUNCTION_PTR(GetSwapIntervalMESA);
+    if (pGetSwapIntervalMESA == NULL)
        return 0;

-    return (*pglXGetSwapIntervalMESA)();
+    return (*pGetSwapIntervalMESA)();
 }



-static Bool dispatch_glXGetSyncValuesOML(Display *dpy, GLXDrawable drawable,
+static Bool dispatch_GetSyncValuesOML(Display *dpy, GLXDrawable drawable,
                                         int64_t *ust, int64_t *msc, int64_t *sbc)
 {
-    PFNGLXGETSYNCVALUESOMLPROC pglXGetSyncValuesOML;
+    PFNGLXGETSYNCVALUESOMLPROC pGetSyncValuesOML;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXGetSyncValuesOML);
-    if (pglXGetSyncValuesOML == NULL)
+    __FETCH_FUNCTION_PTR(GetSyncValuesOML);
+    if (pGetSyncValuesOML == NULL)
        return False;

-    return (*pglXGetSyncValuesOML)(dpy, drawable, ust, msc, sbc);
+    return (*pGetSyncValuesOML)(dpy, drawable, ust, msc, sbc);
 }



-static void dispatch_glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable,
+static void dispatch_JoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable,
                                          GLXDrawable member)
 {
-    PFNGLXJOINSWAPGROUPSGIXPROC pglXJoinSwapGroupSGIX;
+    PFNGLXJOINSWAPGROUPSGIXPROC pJoinSwapGroupSGIX;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return;

-    __FETCH_FUNCTION_PTR(glXJoinSwapGroupSGIX);
-    if (pglXJoinSwapGroupSGIX == NULL)
+    __FETCH_FUNCTION_PTR(JoinSwapGroupSGIX);
+    if (pJoinSwapGroupSGIX == NULL)
        return;

-    (*pglXJoinSwapGroupSGIX)(dpy, drawable, member);
+    (*pJoinSwapGroupSGIX)(dpy, drawable, member);
 }



-static Bool dispatch_glXQueryCurrentRendererIntegerMESA(int attribute,
+static Bool dispatch_QueryCurrentRendererIntegerMESA(int attribute,
                                                        unsigned int *value)
 {
-    PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC pglXQueryCurrentRendererIntegerMESA;
+    PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC pQueryCurrentRendererIntegerMESA;
    __GLXvendorInfo *dd;

    if (!__VND->getCurrentContext())
@@ -734,18 +733,18 @@ static Bool dispatch_glXQueryCurrentRendererIntegerMESA(int attribute,
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXQueryCurrentRendererIntegerMESA);
-    if (pglXQueryCurrentRendererIntegerMESA == NULL)
+    __FETCH_FUNCTION_PTR(QueryCurrentRendererIntegerMESA);
+    if (pQueryCurrentRendererIntegerMESA == NULL)
        return False;

-    return (*pglXQueryCurrentRendererIntegerMESA)(attribute, value);
+    return (*pQueryCurrentRendererIntegerMESA)(attribute, value);
 }



-static const char *dispatch_glXQueryCurrentRendererStringMESA(int attribute)
+static const char *dispatch_QueryCurrentRendererStringMESA(int attribute)
 {
-    PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC pglXQueryCurrentRendererStringMESA;
+    PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC pQueryCurrentRendererStringMESA;
    __GLXvendorInfo *dd;

    if (!__VND->getCurrentContext())
@@ -755,114 +754,114 @@ static const char *dispatch_glXQueryCurrentRendererStringMESA(int attribute)
    if (dd == NULL)
        return NULL;

-    __FETCH_FUNCTION_PTR(glXQueryCurrentRendererStringMESA);
-    if (pglXQueryCurrentRendererStringMESA == NULL)
+    __FETCH_FUNCTION_PTR(QueryCurrentRendererStringMESA);
+    if (pQueryCurrentRendererStringMESA == NULL)
        return NULL;

-    return (*pglXQueryCurrentRendererStringMESA)(attribute);
+    return (*pQueryCurrentRendererStringMESA)(attribute);
 }



-static Bool dispatch_glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen,
+static Bool dispatch_QueryMaxSwapBarriersSGIX(Display *dpy, int screen,
                                                 int *max)
 {
-    PFNGLXQUERYMAXSWAPBARRIERSSGIXPROC pglXQueryMaxSwapBarriersSGIX;
+    PFNGLXQUERYMAXSWAPBARRIERSSGIXPROC pQueryMaxSwapBarriersSGIX;
    __GLXvendorInfo *dd;

    dd = __VND->getDynDispatch(dpy, screen);
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXQueryMaxSwapBarriersSGIX);
-    if (pglXQueryMaxSwapBarriersSGIX == NULL)
+    __FETCH_FUNCTION_PTR(QueryMaxSwapBarriersSGIX);
+    if (pQueryMaxSwapBarriersSGIX == NULL)
        return False;

-    return (*pglXQueryMaxSwapBarriersSGIX)(dpy, screen, max);
+    return (*pQueryMaxSwapBarriersSGIX)(dpy, screen, max);
 }



-static Bool dispatch_glXQueryRendererIntegerMESA(Display *dpy, int screen,
+static Bool dispatch_QueryRendererIntegerMESA(Display *dpy, int screen,
                                                 int renderer, int attribute,
                                                 unsigned int *value)
 {
-    PFNGLXQUERYRENDERERINTEGERMESAPROC pglXQueryRendererIntegerMESA;
+    PFNGLXQUERYRENDERERINTEGERMESAPROC pQueryRendererIntegerMESA;
    __GLXvendorInfo *dd;

    dd = __VND->getDynDispatch(dpy, screen);
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXQueryRendererIntegerMESA);
-    if (pglXQueryRendererIntegerMESA == NULL)
+    __FETCH_FUNCTION_PTR(QueryRendererIntegerMESA);
+    if (pQueryRendererIntegerMESA == NULL)
        return False;

-    return (*pglXQueryRendererIntegerMESA)(dpy, screen, renderer, attribute, value);
+    return (*pQueryRendererIntegerMESA)(dpy, screen, renderer, attribute, value);
 }



-static const char *dispatch_glXQueryRendererStringMESA(Display *dpy, int screen,
+static const char *dispatch_QueryRendererStringMESA(Display *dpy, int screen,
                                                       int renderer, int attribute)
 {
-    PFNGLXQUERYRENDERERSTRINGMESAPROC pglXQueryRendererStringMESA;
+    PFNGLXQUERYRENDERERSTRINGMESAPROC pQueryRendererStringMESA;
    __GLXvendorInfo *dd = NULL;

    dd = __VND->getDynDispatch(dpy, screen);
    if (dd == NULL)
        return NULL;

-    __FETCH_FUNCTION_PTR(glXQueryRendererStringMESA);
-    if (pglXQueryRendererStringMESA == NULL)
+    __FETCH_FUNCTION_PTR(QueryRendererStringMESA);
+    if (pQueryRendererStringMESA == NULL)
        return NULL;

-    return (*pglXQueryRendererStringMESA)(dpy, screen, renderer, attribute);
+    return (*pQueryRendererStringMESA)(dpy, screen, renderer, attribute);
 }



-static Bool dispatch_glXReleaseBuffersMESA(Display *dpy, GLXDrawable d)
+static Bool dispatch_ReleaseBuffersMESA(Display *dpy, GLXDrawable d)
 {
-    PFNGLXRELEASEBUFFERSMESAPROC pglXReleaseBuffersMESA;
+    PFNGLXRELEASEBUFFERSMESAPROC pReleaseBuffersMESA;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, d);
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXReleaseBuffersMESA);
-    if (pglXReleaseBuffersMESA == NULL)
+    __FETCH_FUNCTION_PTR(ReleaseBuffersMESA);
+    if (pReleaseBuffersMESA == NULL)
        return False;

-    return (*pglXReleaseBuffersMESA)(dpy, d);
+    return (*pReleaseBuffersMESA)(dpy, d);
 }



-static int64_t dispatch_glXSwapBuffersMscOML(Display *dpy, GLXDrawable drawable,
+static int64_t dispatch_SwapBuffersMscOML(Display *dpy, GLXDrawable drawable,
                                             int64_t target_msc, int64_t divisor,
                                             int64_t remainder)
 {
-    PFNGLXSWAPBUFFERSMSCOMLPROC pglXSwapBuffersMscOML;
+    PFNGLXSWAPBUFFERSMSCOMLPROC pSwapBuffersMscOML;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return 0;

-    __FETCH_FUNCTION_PTR(glXSwapBuffersMscOML);
-    if (pglXSwapBuffersMscOML == NULL)
+    __FETCH_FUNCTION_PTR(SwapBuffersMscOML);
+    if (pSwapBuffersMscOML == NULL)
        return 0;

-    return (*pglXSwapBuffersMscOML)(dpy, drawable, target_msc, divisor, remainder);
+    return (*pSwapBuffersMscOML)(dpy, drawable, target_msc, divisor, remainder);
 }



-static int dispatch_glXSwapIntervalMESA(unsigned int interval)
+static int dispatch_SwapIntervalMESA(unsigned int interval)
 {
-    PFNGLXSWAPINTERVALMESAPROC pglXSwapIntervalMESA;
+    PFNGLXSWAPINTERVALMESAPROC pSwapIntervalMESA;
    __GLXvendorInfo *dd;

    if (!__VND->getCurrentContext())
@@ -872,52 +871,52 @@ static int dispatch_glXSwapIntervalMESA(unsigned int interval)
    if (dd == NULL)
        return 0;

-    __FETCH_FUNCTION_PTR(glXSwapIntervalMESA);
-    if (pglXSwapIntervalMESA == NULL)
+    __FETCH_FUNCTION_PTR(SwapIntervalMESA);
+    if (pSwapIntervalMESA == NULL)
        return 0;

-    return (*pglXSwapIntervalMESA)(interval);
+    return (*pSwapIntervalMESA)(interval);
 }



-static Bool dispatch_glXWaitForMscOML(Display *dpy, GLXDrawable drawable,
+static Bool dispatch_WaitForMscOML(Display *dpy, GLXDrawable drawable,
                                      int64_t target_msc, int64_t divisor,
                                      int64_t remainder, int64_t *ust,
                                      int64_t *msc, int64_t *sbc)
 {
-    PFNGLXWAITFORMSCOMLPROC pglXWaitForMscOML;
+    PFNGLXWAITFORMSCOMLPROC pWaitForMscOML;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXWaitForMscOML);
-    if (pglXWaitForMscOML == NULL)
+    __FETCH_FUNCTION_PTR(WaitForMscOML);
+    if (pWaitForMscOML == NULL)
        return False;

-    return (*pglXWaitForMscOML)(dpy, drawable, target_msc, divisor, remainder, ust, msc, sbc);
+    return (*pWaitForMscOML)(dpy, drawable, target_msc, divisor, remainder, ust, msc, sbc);
 }



-static Bool dispatch_glXWaitForSbcOML(Display *dpy, GLXDrawable drawable,
+static Bool dispatch_WaitForSbcOML(Display *dpy, GLXDrawable drawable,
                                      int64_t target_sbc, int64_t *ust,
                                      int64_t *msc, int64_t *sbc)
 {
-    PFNGLXWAITFORSBCOMLPROC pglXWaitForSbcOML;
+    PFNGLXWAITFORSBCOMLPROC pWaitForSbcOML;
    __GLXvendorInfo *dd;

    dd = GetDispatchFromDrawable(dpy, drawable);
    if (dd == NULL)
        return False;

-    __FETCH_FUNCTION_PTR(glXWaitForSbcOML);
-    if (pglXWaitForSbcOML == NULL)
+    __FETCH_FUNCTION_PTR(WaitForSbcOML);
+    if (pWaitForSbcOML == NULL)
        return False;

-    return (*pglXWaitForSbcOML)(dpy, drawable, target_sbc, ust, msc, sbc);
+    return (*pWaitForSbcOML)(dpy, drawable, target_sbc, ust, msc, sbc);
 }

 #undef __FETCH_FUNCTION_PTR
@@ -928,45 +927,44 @@ const void * const __glXDispatchFunctions[DI_LAST_INDEX + 1] = {
 #define __ATTRIB(field) \
    [DI_##field] = (void *)dispatch_##field

-    __ATTRIB(BindTexImageEXT),
+    __ATTRIB(BindSwapBarrierSGIX),
    __ATTRIB(BindTexImageEXT),
    __ATTRIB(ChooseFBConfigSGIX),
+    __ATTRIB(CopySubBufferMESA),
    __ATTRIB(CreateContextAttribsARB),
    __ATTRIB(CreateContextWithConfigSGIX),
    __ATTRIB(CreateGLXPbufferSGIX),
+    __ATTRIB(CreateGLXPixmapMESA),
    __ATTRIB(CreateGLXPixmapWithConfigSGIX),
    __ATTRIB(DestroyGLXPbufferSGIX),
    __ATTRIB(GetContextIDEXT),
    __ATTRIB(GetCurrentDisplayEXT),
    __ATTRIB(GetFBConfigAttribSGIX),
    __ATTRIB(GetFBConfigFromVisualSGIX),
+    __ATTRIB(GetMscRateOML),
+    __ATTRIB(GetScreenDriver),
    __ATTRIB(GetSelectedEventSGIX),
+    __ATTRIB(GetSwapIntervalMESA),
+    __ATTRIB(GetSyncValuesOML),
    __ATTRIB(GetVideoSyncSGI),
    __ATTRIB(GetVisualFromFBConfigSGIX),
+    __ATTRIB(JoinSwapGroupSGIX),
    __ATTRIB(QueryContextInfoEXT),
+    __ATTRIB(QueryCurrentRendererIntegerMESA),
+    __ATTRIB(QueryCurrentRendererStringMESA),
    __ATTRIB(QueryGLXPbufferSGIX),
+    __ATTRIB(QueryMaxSwapBarriersSGIX),
+    __ATTRIB(QueryRendererIntegerMESA),
+    __ATTRIB(QueryRendererStringMESA),
+    __ATTRIB(ReleaseBuffersMESA),
    __ATTRIB(ReleaseTexImageEXT),
    __ATTRIB(SelectEventSGIX),
+    __ATTRIB(SwapBuffersMscOML),
+    __ATTRIB(SwapIntervalMESA),
    __ATTRIB(SwapIntervalSGI),
+    __ATTRIB(WaitForMscOML),
+    __ATTRIB(WaitForSbcOML),
    __ATTRIB(WaitVideoSyncSGI),
-    __ATTRIB(glXBindSwapBarrierSGIX),
-    __ATTRIB(glXCopySubBufferMESA),
-    __ATTRIB(glXCreateGLXPixmapMESA),
-    __ATTRIB(glXGetMscRateOML),
-    __ATTRIB(glXGetScreenDriver),
-    __ATTRIB(glXGetSwapIntervalMESA),
-    __ATTRIB(glXGetSyncValuesOML),
-    __ATTRIB(glXJoinSwapGroupSGIX),
-    __ATTRIB(glXQueryCurrentRendererIntegerMESA),
-    __ATTRIB(glXQueryCurrentRendererStringMESA),
-    __ATTRIB(glXQueryMaxSwapBarriersSGIX),
-    __ATTRIB(glXQueryRendererIntegerMESA),
-    __ATTRIB(glXQueryRendererStringMESA),
-    __ATTRIB(glXReleaseBuffersMESA),
-    __ATTRIB(glXSwapBuffersMscOML),
-    __ATTRIB(glXSwapIntervalMESA),
-    __ATTRIB(glXWaitForMscOML),
-    __ATTRIB(glXWaitForSbcOML),

    [DI_LAST_INDEX] = NULL,
 #undef __ATTRIB
--- a/src/glx/g_glxglvnddispatchindices.h
+++ b/src/glx/g_glxglvnddispatchindices.h
@@ -6,16 +6,19 @@
 #define __glxlibglvnd_dispatchindex_h__

 typedef enum __GLXdispatchIndex {
+    DI_BindSwapBarrierSGIX,
    DI_BindTexImageEXT,
    // ChooseFBConfig implemented by libglvnd
    DI_ChooseFBConfigSGIX,
    // ChooseVisual implemented by libglvnd
    // CopyContext implemented by libglvnd
+    DI_CopySubBufferMESA,
    // CreateContext implemented by libglvnd
    DI_CreateContextAttribsARB,
    DI_CreateContextWithConfigSGIX,
    DI_CreateGLXPbufferSGIX,
    // CreateGLXPixmap implemented by libglvnd
+    DI_CreateGLXPixmapMESA,
    DI_CreateGLXPixmapWithConfigSGIX,
    // CreateNewContext implemented by libglvnd
    // CreatePbuffer implemented by libglvnd
@@ -40,6 +43,8 @@ typedef enum __GLXdispatchIndex {
    DI_GetFBConfigAttribSGIX,
    DI_GetFBConfigFromVisualSGIX,
    // GetFBConfigs implemented by libglvnd
+    DI_GetMscRateOML,
    // GetProcAddress implemented by libglvnd
    // GetProcAddressARB implemented by libglvnd
+    DI_GetScreenDriver,
    // GetSelectedEvent implemented by libglvnd
@@ -47,45 +52,41 @@ typedef enum __GLXdispatchIndex {
+    DI_GetSwapIntervalMESA,
+    DI_GetSyncValuesOML,
    DI_GetVideoSyncSGI,
    // GetVisualFromFBConfig implemented by libglvnd
    DI_GetVisualFromFBConfigSGIX,
    // ImportContextEXT implemented by libglvnd
    // IsDirect implemented by libglvnd
+    DI_JoinSwapGroupSGIX,
    // MakeContextCurrent implemented by libglvnd
    // MakeCurrent implemented by libglvnd
    // QueryContext implemented by libglvnd
    DI_QueryContextInfoEXT,
+    DI_QueryCurrentRendererIntegerMESA,
+    DI_QueryCurrentRendererStringMESA,
    // QueryDrawable implemented by libglvnd
    // QueryExtension implemented by libglvnd
    // QueryExtensionsString implemented by libglvnd
    DI_QueryGLXPbufferSGIX,
+    DI_QueryMaxSwapBarriersSGIX,
+    DI_QueryRendererIntegerMESA,
+    DI_QueryRendererStringMESA,
    // QueryServerString implemented by libglvnd
    // QueryVersion implemented by libglvnd
+    DI_ReleaseBuffersMESA,
    DI_ReleaseTexImageEXT,
    // SelectEvent implemented by libglvnd
    DI_SelectEventSGIX,
    // SwapBuffers implemented by libglvnd
+    DI_SwapBuffersMscOML,
+    DI_SwapIntervalMESA,
    DI_SwapIntervalSGI,
    // UseXFont implemented by libglvnd
    // WaitGL implemented by libglvnd
+    DI_WaitForMscOML,
+    DI_WaitForSbcOML,
    DI_WaitVideoSyncSGI,
    // WaitX implemented by libglvnd
-    DI_glXBindSwapBarrierSGIX,
-    DI_glXCopySubBufferMESA,
-    DI_glXCreateGLXPixmapMESA,
-    DI_glXGetMscRateOML,
-    DI_glXGetScreenDriver,
-    DI_glXGetSwapIntervalMESA,
-    DI_glXGetSyncValuesOML,
-    DI_glXJoinSwapGroupSGIX,
-    DI_glXQueryCurrentRendererIntegerMESA,
-    DI_glXQueryCurrentRendererStringMESA,
-    DI_glXQueryMaxSwapBarriersSGIX,
-    DI_glXQueryRendererIntegerMESA,
-    DI_glXQueryRendererStringMESA,
-    DI_glXReleaseBuffersMESA,
-    DI_glXSwapBuffersMscOML,
-    DI_glXSwapIntervalMESA,
-    DI_glXWaitForMscOML,
-    DI_glXWaitForSbcOML,
    DI_LAST_INDEX
 } __GLXdispatchIndex;

--- a/src/glx/glxglvnd.c
+++ b/src/glx/glxglvnd.c
@@ -50,6 +50,9 @@ static void __glXGLVNDSetDispatchIndex(const GLubyte *procName, int index)
 {
    unsigned internalIndex = FindGLXFunction(procName);

+    if (internalIndex == DI_FUNCTION_COUNT)
+        return; /* unknown or static dispatch */
+
    __glXDispatchTableIndices[internalIndex] = index;
 }

--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -489,6 +489,7 @@ anv_descriptor_set_destroy(struct anv_device *device,
      struct surface_state_free_list_entry *entry =
         set->buffer_views[b].surface_state.map;
      entry->next = pool->surface_state_free_list;
+      entry->offset = set->buffer_views[b].surface_state.offset;
      pool->surface_state_free_list = entry;
   }

--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -24,6 +24,7 @@
 #include <assert.h>
 #include <stdbool.h>
 #include <string.h>
+#include <sys/mman.h>
 #include <unistd.h>
 #include <fcntl.h>

@@ -582,7 +583,14 @@ void anv_GetPhysicalDeviceQueueFamilyProperties(
      return;
   }

-   assert(*pCount >= 1);
+   /* The spec implicitly allows the incoming count to be 0. From the Vulkan
+    * 1.0.38 spec, Section 4.1 Physical Devices:
+    *
+    *     If the value referenced by pQueueFamilyPropertyCount is not 0 [then
+    *     do stuff].
+    */
+   if (*pCount == 0)
+      return;

   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
      .queueFlags = VK_QUEUE_GRAPHICS_BIT |
@@ -1160,6 +1168,9 @@ VkResult anv_AllocateMemory(

   mem->type_index = pAllocateInfo->memoryTypeIndex;

+   mem->map = NULL;
+   mem->map_size = 0;
+
   *pMem = anv_device_memory_to_handle(mem);

   return VK_SUCCESS;
@@ -1181,6 +1192,9 @@ void anv_FreeMemory(
   if (mem == NULL)
      return;

+   if (mem->map)
+      anv_UnmapMemory(_device, _mem);
+
   if (mem->bo.map)
      anv_gem_munmap(mem->bo.map, mem->bo.size);

@@ -1227,8 +1241,12 @@ VkResult anv_MapMemory(
   /* Let's map whole pages */
   map_size = align_u64(map_size, 4096);

-   mem->map = anv_gem_mmap(device, mem->bo.gem_handle,
-                           map_offset, map_size, gem_flags);
+   void *map = anv_gem_mmap(device, mem->bo.gem_handle,
+                            map_offset, map_size, gem_flags);
+   if (map == MAP_FAILED)
+      return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+
+   mem->map = map;
   mem->map_size = map_size;

   *ppData = mem->map + (offset - map_offset);
@@ -1246,6 +1264,9 @@ void anv_UnmapMemory(
      return;

   anv_gem_munmap(mem->map, mem->map_size);
+
+   mem->map = NULL;
+   mem->map_size = 0;
 }

 static void
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -88,10 +88,8 @@ anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
   };

   int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap);
-   if (ret != 0) {
-      /* FIXME: Is NULL the right error return? Cf MAP_INVALID */
-      return NULL;
-   }
+   if (ret != 0)
+      return MAP_FAILED;

   VG(VALGRIND_MALLOCLIKE_BLOCK(gem_mmap.addr_ptr, gem_mmap.size, 0, 1));
   return (void *)(uintptr_t) gem_mmap.addr_ptr;
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -129,10 +129,13 @@ make_surface(const struct anv_device *dev,
   image->extent = anv_sanitize_image_extent(vk_info->imageType,
                                             vk_info->extent);

+   enum isl_format format = anv_get_isl_format(&dev->info, vk_info->format,
+                                               aspect, vk_info->tiling);
+   assert(format != ISL_FORMAT_UNSUPPORTED);
+
   ok = isl_surf_init(&dev->isl_dev, &anv_surf->isl,
      .dim = vk_to_isl_surf_dim[vk_info->imageType],
-      .format = anv_get_isl_format(&dev->info, vk_info->format,
-                                   aspect, vk_info->tiling),
+      .format = format,
      .width = image->extent.width,
      .height = image->extent.height,
      .depth = image->extent.depth,
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -392,6 +392,7 @@ anv_fill_binding_table(struct brw_stage_prog_data *prog_data, unsigned bias)
 {
   prog_data->binding_table.size_bytes = 0;
   prog_data->binding_table.texture_start = bias;
+   prog_data->binding_table.gather_texture_start = bias;
   prog_data->binding_table.ubo_start = bias;
   prog_data->binding_table.ssbo_start = bias;
   prog_data->binding_table.image_start = bias;
--- a/src/intel/vulkan/dev_icd.json.in
+++ b/src/intel/vulkan/dev_icd.json.in
@@ -2,6 +2,6 @@
    "file_format_version": "1.0.0",
    "ICD": {
        "library_path": "@build_libdir@/libvulkan_intel.so",
-        "abi_versions": "1.0.3"
+        "api_version": "1.0.3"
    }
 }
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -1194,22 +1194,25 @@ void genX(CmdEndRenderPass)(
 }

 static void
-emit_ps_depth_count(struct anv_batch *batch,
+emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = (struct anv_address) { bo, offset };
+
+      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
+         pc.CommandStreamerStallEnable = true;
   }
 }

 static void
-emit_query_availability(struct anv_batch *batch,
+emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = (struct anv_address) { bo, offset };
@@ -1242,7 +1245,7 @@ void genX(CmdBeginQuery)(

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
-      emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
+      emit_ps_depth_count(cmd_buffer, &pool->bo,
                          query * sizeof(struct anv_query_pool_slot));
      break;

@@ -1262,10 +1265,10 @@ void genX(CmdEndQuery)(

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
-      emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
+      emit_ps_depth_count(cmd_buffer, &pool->bo,
                          query * sizeof(struct anv_query_pool_slot) + 8);

-      emit_query_availability(&cmd_buffer->batch, &pool->bo,
+      emit_query_availability(cmd_buffer, &pool->bo,
                              query * sizeof(struct anv_query_pool_slot) + 16);
      break;

@@ -1307,11 +1310,14 @@ void genX(CmdWriteTimestamp)(
         pc.DestinationAddressType  = DAT_PPGTT,
         pc.PostSyncOperation       = WriteTimestamp,
         pc.Address = (struct anv_address) { &pool->bo, offset };
+
+         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
+            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

-   emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
+   emit_query_availability(cmd_buffer, &pool->bo, query + 16);
 }

 #if GEN_GEN > 7 || GEN_IS_HASWELL
--- a/src/intel/vulkan/intel_icd.json
+++ b/src/intel/vulkan/intel_icd.json
@@ -2,6 +2,6 @@
    "file_format_version": "1.0.0",
    "ICD": {
        "library_path": "libvulkan_intel.so",
-        "abi_versions": "1.0.3"
+        "api_version": "1.0.3"
    }
 }
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -74,6 +74,7 @@ TEST_LIBS = \

 TESTS = \
 	test_fs_cmod_propagation \
+	test_fs_copy_propagation \
 	test_fs_saturate_propagation \
        test_eu_compact \
 	test_vf_float_conversions \
@@ -89,6 +90,12 @@ test_fs_cmod_propagation_LDADD = \
 	$(top_builddir)/src/gtest/libgtest.la \
 	$(TEST_LIBS)

+test_fs_copy_propagation_SOURCES = \
+	test_fs_copy_propagation.cpp
+test_fs_copy_propagation_LDADD = \
+	$(top_builddir)/src/gtest/libgtest.la \
+	$(TEST_LIBS)
+
 test_fs_saturate_propagation_SOURCES = \
 	test_fs_saturate_propagation.cpp
 test_fs_saturate_propagation_LDADD = \
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -134,7 +134,6 @@ i965_FILES = \
 	brw_gs_surface_state.c \
 	brw_link.cpp \
 	brw_lower_texture_gradients.cpp \
-	brw_lower_unnormalized_offset.cpp \
 	brw_meta_util.c \
 	brw_meta_util.h \
 	brw_misc_state.c \
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1824,7 +1824,6 @@ brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
 bool brw_do_cubemap_normalize(struct exec_list *instructions);
 bool brw_lower_texture_gradients(struct brw_context *brw,
                                 struct exec_list *instructions);
-bool brw_do_lower_unnormalized_offset(struct exec_list *instructions);

 extern const char * const conditional_modifier[16];
 extern const char *const pred_ctrl_align16[16];
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -336,7 +336,7 @@ static const struct brw_device_info brw_device_info_chv = {
   .max_gs_threads = 336,                           \
   .max_hs_threads = 336,                           \
   .max_ds_threads = 336,                           \
-   .max_wm_threads = 64 * 9,                        \
+   .max_wm_threads = 64 * 12,                       \
   .max_cs_threads = 56,                            \
   .urb = {                                         \
      .size = 384,                                  \
@@ -389,7 +389,7 @@ static const struct brw_device_info brw_device_info_bxt = {
   .max_hs_threads = 112,
   .max_ds_threads = 112,
   .max_gs_threads = 112,
-   .max_wm_threads = 64 * 3,
+   .max_wm_threads = 64 * 4,
   .max_cs_threads = 6 * 6,
   .urb = {
      .size = 192,
@@ -412,7 +412,7 @@ static const struct brw_device_info brw_device_info_bxt_2x6 = {
   .max_hs_threads = 56, /* XXX: guess */
   .max_ds_threads = 56,
   .max_gs_threads = 56,
-   .max_wm_threads = 64 * 2,
+   .max_wm_threads = 64 * 4,
   .max_cs_threads = 6 * 6,
   .urb = {
      .size = 128,
@@ -439,7 +439,7 @@ static const struct brw_device_info brw_device_info_kbl_gt1 = {
   .gt = 1,

   .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 4,
   .urb.size = 192,
   .num_slices = 1,
 };
@@ -449,7 +449,7 @@ static const struct brw_device_info brw_device_info_kbl_gt1_5 = {
   .gt = 1,

   .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 4,
   .num_slices = 1,
 };

@@ -457,7 +457,7 @@ static const struct brw_device_info brw_device_info_kbl_gt2 = {
   GEN9_FEATURES,
   .gt = 2,

-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 4,
   .num_slices = 1,
 };

@@ -465,7 +465,7 @@ static const struct brw_device_info brw_device_info_kbl_gt3 = {
   GEN9_FEATURES,
   .gt = 3,

-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 8,
   .num_slices = 2,
 };

@@ -473,7 +473,7 @@ static const struct brw_device_info brw_device_info_kbl_gt4 = {
   GEN9_FEATURES,
   .gt = 4,

-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
+   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 12,
   /*
    * From the "L3 Allocation and Programming" documentation:
    *
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3885,6 +3885,12 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
       */
      setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
      length++;
+   } else if (key->replicate_alpha && inst->target != 0) {
+      /* Handle the case when fragment shader doesn't write to draw buffer
+       * zero. No need to call setup_color_payload() for src0_alpha because
+       * alpha value will be undefined.
+       */
+      length++;
   }

   setup_color_payload(bld, key, &sources[length], color0, components);
@@ -5823,7 +5829,7 @@ fs_visitor::optimize()

      OPT(opt_algebraic);
      OPT(opt_cse);
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
      OPT(opt_predicated_break, this);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
@@ -5849,12 +5855,12 @@ fs_visitor::optimize()
   OPT(lower_logical_sends);

   if (progress) {
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
      /* Only run after logical send lowering because it's easier to implement
       * in terms of physical sends.
       */
      if (OPT(opt_zero_samples))
-         OPT(opt_copy_propagate);
+         OPT(opt_copy_propagation);
      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
@@ -5883,7 +5889,7 @@ fs_visitor::optimize()
   }

   if (OPT(lower_d2x)) {
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
      OPT(dead_code_eliminate);
   }

@@ -5893,7 +5899,7 @@ fs_visitor::optimize()
   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
      OPT(dead_code_eliminate);
   }

--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -133,11 +133,11 @@ public:
   bool opt_redundant_discard_jumps();
   bool opt_cse();
   bool opt_cse_local(bblock_t *block);
-   bool opt_copy_propagate();
+   bool opt_copy_propagation();
   bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry);
   bool try_constant_propagate(fs_inst *inst, acp_entry *entry);
-   bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block,
-                                 exec_list *acp);
+   bool opt_copy_propagation_local(void *mem_ctx, bblock_t *block,
+                                   exec_list *acp);
   bool opt_drop_redundant_mov_to_flags();
   bool opt_register_renaming();
   bool register_coalesce();
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -129,7 +129,7 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
         foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
            acp[next_acp] = entry;

-            /* opt_copy_propagate_local populates out_acp with copies created
+            /* opt_copy_propagation_local populates out_acp with copies created
             * in a block which are still live at the end of the block.  This
             * is exactly what we want in the COPY set.
             */
@@ -445,7 +445,9 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
   if (entry->saturate) {
      switch(inst->opcode) {
      case BRW_OPCODE_SEL:
-         if (inst->src[1].file != IMM ||
+         if ((inst->conditional_mod != BRW_CONDITIONAL_GE &&
+              inst->conditional_mod != BRW_CONDITIONAL_L) ||
+             inst->src[1].file != IMM ||
             inst->src[1].f < 0.0 ||
             inst->src[1].f > 1.0) {
            return false;
@@ -759,8 +761,8 @@ can_propagate_from(fs_inst *inst)
 * list.
 */
 bool
-fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
-                                     exec_list *acp)
+fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
+                                       exec_list *acp)
 {
   bool progress = false;

@@ -844,7 +846,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
 }

 bool
-fs_visitor::opt_copy_propagate()
+fs_visitor::opt_copy_propagation()
 {
   bool progress = false;
   void *copy_prop_ctx = ralloc_context(NULL);
@@ -857,8 +859,8 @@ fs_visitor::opt_copy_propagate()
    * the set of copies available at the end of the block.
    */
   foreach_block (block, cfg) {
-      progress = opt_copy_propagate_local(copy_prop_ctx, block,
-                                          out_acp[block->num]) || progress;
+      progress = opt_copy_propagation_local(copy_prop_ctx, block,
+                                            out_acp[block->num]) || progress;
   }

   /* Do dataflow analysis for those available copies. */
@@ -877,7 +879,8 @@ fs_visitor::opt_copy_propagate()
         }
      }

-      progress = opt_copy_propagate_local(copy_prop_ctx, block, in_acp) || progress;
+      progress = opt_copy_propagation_local(copy_prop_ctx, block, in_acp) ||
+                 progress;
   }

   for (int i = 0; i < cfg->num_blocks; i++)
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -385,34 +385,33 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

-      struct brw_reg ind_src;
-      if (devinfo->gen < 8) {
-         /* From the Haswell PRM section "Register Region Restrictions":
-          *
-          *    "The lower bits of the AddressImmediate must not overflow to
-          *    change the register address.  The lower 5 bits of Address
-          *    Immediate when added to lower 5 bits of address register gives
-          *    the sub-register offset. The upper bits of Address Immediate
-          *    when added to upper bits of address register gives the register
-          *    address. Any overflow from sub-register offset is dropped."
-          *
-          * This restriction is only listed in the Haswell PRM but emperical
-          * testing indicates that it applies on all older generations and is
-          * lifted on Broadwell.
-          *
-          * Since the indirect may cause us to cross a register boundary, this
-          * makes the base offset almost useless.  We could try and do
-          * something clever where we use a actual base offset if
-          * base_offset % 32 == 0 but that would mean we were generating
-          * different code depending on the base offset.  Instead, for the
-          * sake of consistency, we'll just do the add ourselves.
-          */
-         brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
-         ind_src = brw_VxH_indirect(0, 0);
-      } else {
-         brw_MOV(p, addr, indirect_byte_offset);
-         ind_src = brw_VxH_indirect(0, imm_byte_offset);
-      }
+      /* There are a number of reasons why we don't use the base offset here.
+       * One reason is that the field is only 9 bits which means we can only
+       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
+       * section "Register Region Restrictions":
+       *
+       *    "The lower bits of the AddressImmediate must not overflow to
+       *    change the register address.  The lower 5 bits of Address
+       *    Immediate when added to lower 5 bits of address register gives
+       *    the sub-register offset. The upper bits of Address Immediate
+       *    when added to upper bits of address register gives the register
+       *    address. Any overflow from sub-register offset is dropped."
+       *
+       * Since the indirect may cause us to cross a register boundary, this
+       * makes the base offset almost useless.  We could try and do something
+       * clever where we use an actual base offset if base_offset % 32 == 0 but
+       * that would mean we were generating different code depending on the
+       * base offset.  Instead, for the sake of consistency, we'll just do the
+       * add ourselves.  This restriction is only listed in the Haswell PRM
+       * but empirical testing indicates that it applies on all older
+       * generations and is lifted on Broadwell.
+       *
+       * In the end, while base_offset is nice to look at in the generated
+       * code, using it saves us 0 instructions and would require quite a bit
+       * of case-by-case work.  It's just not worth it.
+       */
+      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+      struct brw_reg ind_src = brw_VxH_indirect(0, 0);

      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, dst.type));

--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -126,7 +126,6 @@ process_glsl_ir(gl_shader_stage stage,
   do_vec_index_to_cond_assign(shader->ir);
   lower_vector_insert(shader->ir, true);
   lower_offset_arrays(shader->ir);
-   brw_do_lower_unnormalized_offset(shader->ir);
   lower_noise(shader->ir);
   lower_quadop_vector(shader->ir, false);

--- a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp
@@ -1,106 +0,0 @@
-/*
- * Copyright © 2013 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file brw_lower_unnormalized_offset.cpp
- *
- * IR lower pass to convert a texture offset into an adjusted coordinate,
- * for use with unnormalized coordinates. At least the gather4* messages
- * on Ivybridge and Haswell make a mess with nonzero offsets.
- *
- * \author Chris Forbes <chrisf@ijw.co.nz>
- */
-
-#include "compiler/glsl_types.h"
-#include "compiler/glsl/ir.h"
-#include "compiler/glsl/ir_builder.h"
-
-using namespace ir_builder;
-
-class brw_lower_unnormalized_offset_visitor : public ir_hierarchical_visitor {
-public:
-   brw_lower_unnormalized_offset_visitor()
-   {
-      progress = false;
-   }
-
-   ir_visitor_status visit_leave(ir_texture *ir);
-
-   bool progress;
-};
-
-ir_visitor_status
-brw_lower_unnormalized_offset_visitor::visit_leave(ir_texture *ir)
-{
-   if (!ir->offset)
-      return visit_continue;
-
-   if (ir->op == ir_tg4 || ir->op == ir_tex) {
-      if (ir->sampler->type->sampler_dimensionality != GLSL_SAMPLER_DIM_RECT)
-         return visit_continue;
-   }
-   else if (ir->op != ir_txf) {
-      return visit_continue;
-   }
-
-   void *mem_ctx = ralloc_parent(ir);
-
-   if (ir->op == ir_txf) {
-      /* It appears that the ld instruction used for txf does its
-       * address bounds check before adding in the offset.  To work
-       * around this, just add the integer offset to the integer texel
-       * coordinate, and don't put the offset in the header.
-       */
-      ir_variable *var = new(mem_ctx) ir_variable(ir->coordinate->type,
-                                                  "coordinate",
-                                                  ir_var_temporary);
-      base_ir->insert_before(var);
-      base_ir->insert_before(assign(var, ir->coordinate));
-      base_ir->insert_before(assign(var,
-               add(swizzle_for_size(var, ir->offset->type->vector_elements), ir->offset),
-               (1 << ir->offset->type->vector_elements) - 1));
-
-      ir->coordinate = new(mem_ctx) ir_dereference_variable(var);
-   } else {
-      ir->coordinate = add(ir->coordinate, i2f(ir->offset));
-   }
-
-   ir->offset = NULL;
-
-   progress = true;
-   return visit_continue;
-}
-
-extern "C" {
-
-bool
-brw_do_lower_unnormalized_offset(exec_list *instructions)
-{
-   brw_lower_unnormalized_offset_visitor v;
-
-   visit_list_elements(&v, instructions);
-
-   return v.progress;
-}
-
-}
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -419,6 +419,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)

   static const nir_lower_tex_options tex_options = {
      .lower_txp = ~0,
+      .lower_txf_offset = true,
+      .lower_rect_offset = true,
   };

   OPT(nir_lower_tex, &tex_options);
--- a/src/mesa/drivers/dri/i965/hsw_sol.c
+++ b/src/mesa/drivers/dri/i965/hsw_sol.c
@@ -201,6 +201,9 @@ hsw_pause_transform_feedback(struct gl_context *ctx,
      (struct brw_transform_feedback_object *) obj;

   if (brw->is_haswell) {
+      /* Flush any drawing so that the counters have the right values. */
+      brw_emit_mi_flush(brw);
+
      /* Save the SOL buffer offset register values. */
      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
         BEGIN_BATCH(3);
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -984,6 +984,19 @@ intel_miptree_reference(struct intel_mipmap_tree **dst,
   *dst = src;
 }

+static void
+intel_miptree_hiz_buffer_free(struct intel_miptree_aux_buffer *hiz_buf)
+{
+   if (hiz_buf == NULL)
+      return;
+
+   if (hiz_buf->mt)
+      intel_miptree_release(&hiz_buf->mt);
+   else
+      drm_intel_bo_unreference(hiz_buf->bo);
+
+   free(hiz_buf);
+}

 void
 intel_miptree_release(struct intel_mipmap_tree **mt)
@@ -999,13 +1012,7 @@ intel_miptree_release(struct intel_mipmap_tree **mt)

      drm_intel_bo_unreference((*mt)->bo);
      intel_miptree_release(&(*mt)->stencil_mt);
-      if ((*mt)->hiz_buf) {
-         if ((*mt)->hiz_buf->mt)
-            intel_miptree_release(&(*mt)->hiz_buf->mt);
-         else
-            drm_intel_bo_unreference((*mt)->hiz_buf->bo);
-         free((*mt)->hiz_buf);
-      }
+      intel_miptree_hiz_buffer_free((*mt)->hiz_buf);
      intel_miptree_release(&(*mt)->mcs_mt);
      intel_resolve_map_clear(&(*mt)->hiz_map);

@@ -2184,6 +2191,8 @@ intel_miptree_resolve_color(struct brw_context *brw,
 * then discard the MCS buffer, if present.  We also set the fast_clear_state
 * to INTEL_FAST_CLEAR_STATE_NO_MCS to ensure that no MCS buffer gets
 * allocated in the future.
+ *
+ * HiZ is similarly unsafe with shared buffers.
 */
 void
 intel_miptree_make_shareable(struct brw_context *brw,
@@ -2201,6 +2210,14 @@ intel_miptree_make_shareable(struct brw_context *brw,
      intel_miptree_release(&mt->mcs_mt);
      mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS;
   }
+
+   if (mt->hiz_buf) {
+      intel_miptree_all_slices_resolve_depth(brw, mt);
+      intel_miptree_hiz_buffer_free(mt->hiz_buf);
+      mt->hiz_buf = NULL;
+   }
+
+   mt->disable_aux_buffers = true;
 }


--- a/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class copy_propagation_test : public ::testing::Test {
+   virtual void SetUp();
+
+public:
+   struct brw_compiler *compiler;
+   struct brw_device_info *devinfo;
+   struct gl_context *ctx;
+   struct brw_wm_prog_data *prog_data;
+   struct gl_shader_program *shader_prog;
+   fs_visitor *v;
+};
+
+class copy_propagation_fs_visitor : public fs_visitor
+{
+public:
+   copy_propagation_fs_visitor(struct brw_compiler *compiler,
+                               struct brw_wm_prog_data *prog_data,
+                               nir_shader *shader)
+      : fs_visitor(compiler, NULL, NULL, NULL,
+                   &prog_data->base, (struct gl_program *) NULL,
+                   shader, 8, -1) {}
+};
+
+
+void copy_propagation_test::SetUp()
+{
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
+
+   prog_data = ralloc(NULL, struct brw_wm_prog_data);
+   nir_shader *shader =
+      nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL);
+
+   v = new copy_propagation_fs_visitor(compiler, prog_data, shader);
+
+   devinfo->gen = 4;
+}
+
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+   fs_inst *inst = (fs_inst *)block->start();
+   for (int i = 0; i < num; i++) {
+      inst = (fs_inst *)inst->next;
+   }
+   return inst;
+}
+
+static bool
+copy_propagation(fs_visitor *v)
+{
+   const bool print = getenv("TEST_DEBUG");
+
+   if (print) {
+      fprintf(stderr, "= Before =\n");
+      v->cfg->dump(v);
+   }
+
+   bool ret = v->opt_copy_propagation();
+
+   if (print) {
+      fprintf(stderr, "\n= After =\n");
+      v->cfg->dump(v);
+   }
+
+   return ret;
+}
+
+TEST_F(copy_propagation_test, basic)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf2 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf3 = v->vgrf(glsl_type::float_type);
+   bld.MOV(vgrf0, vgrf2);
+   bld.ADD(vgrf1, vgrf0, vgrf3);
+
+   /* = Before =
+    *
+    * 0: mov(8)        vgrf0  vgrf2
+    * 1: add(8)        vgrf1  vgrf0  vgrf3
+    *
+    * = After =
+    * 0: mov(8)        vgrf0  vgrf2
+    * 1: add(8)        vgrf1  vgrf2  vgrf3
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(copy_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   fs_inst *mov = instruction(block0, 0);
+   EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode);
+   EXPECT_TRUE(mov->dst.equals(vgrf0));
+   EXPECT_TRUE(mov->src[0].equals(vgrf2));
+
+   fs_inst *add = instruction(block0, 1);
+   EXPECT_EQ(BRW_OPCODE_ADD, add->opcode);
+   EXPECT_TRUE(add->dst.equals(vgrf1));
+   EXPECT_TRUE(add->src[0].equals(vgrf2));
+   EXPECT_TRUE(add->src[1].equals(vgrf3));
+}
+
+TEST_F(copy_propagation_test, maxmax_sat_imm)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf2 = v->vgrf(glsl_type::float_type);
+
+   static const struct {
+      enum brw_conditional_mod conditional_mod;
+      float immediate;
+      bool expected_result;
+   } test[] = {
+      /*   conditional mod,     imm, expected_result */
+      { BRW_CONDITIONAL_GE  ,  0.1f, true },
+      { BRW_CONDITIONAL_L   ,  0.1f, true },
+      { BRW_CONDITIONAL_GE  ,  0.5f, true },
+      { BRW_CONDITIONAL_L   ,  0.5f, true },
+      { BRW_CONDITIONAL_GE  ,  0.9f, true },
+      { BRW_CONDITIONAL_L   ,  0.9f, true },
+      { BRW_CONDITIONAL_GE  , -1.5f, false },
+      { BRW_CONDITIONAL_L   , -1.5f, false },
+      { BRW_CONDITIONAL_GE  ,  1.5f, false },
+      { BRW_CONDITIONAL_L   ,  1.5f, false },
+
+      { BRW_CONDITIONAL_NONE, 0.5f, false },
+      { BRW_CONDITIONAL_Z   , 0.5f, false },
+      { BRW_CONDITIONAL_NZ  , 0.5f, false },
+      { BRW_CONDITIONAL_G   , 0.5f, false },
+      { BRW_CONDITIONAL_LE  , 0.5f, false },
+      { BRW_CONDITIONAL_R   , 0.5f, false },
+      { BRW_CONDITIONAL_O   , 0.5f, false },
+      { BRW_CONDITIONAL_U   , 0.5f, false },
+   };
+
+   for (unsigned i = 0; i < sizeof(test) / sizeof(test[0]); i++) {
+      fs_inst *mov = set_saturate(true, bld.MOV(vgrf0, vgrf1));
+      fs_inst *sel = set_condmod(test[i].conditional_mod,
+                                 bld.SEL(vgrf2, vgrf0,
+                                         brw_imm_f(test[i].immediate)));
+
+      v->calculate_cfg();
+
+      bblock_t *block0 = v->cfg->blocks[0];
+
+      EXPECT_EQ(0, block0->start_ip);
+      EXPECT_EQ(1, block0->end_ip);
+
+      EXPECT_EQ(test[i].expected_result, copy_propagation(v));
+      EXPECT_EQ(0, block0->start_ip);
+      EXPECT_EQ(1, block0->end_ip);
+
+      EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode);
+      EXPECT_TRUE(mov->saturate);
+      EXPECT_TRUE(mov->dst.equals(vgrf0));
+      EXPECT_TRUE(mov->src[0].equals(vgrf1));
+
+      EXPECT_EQ(BRW_OPCODE_SEL, sel->opcode);
+      EXPECT_EQ(test[i].conditional_mod, sel->conditional_mod);
+      EXPECT_EQ(test[i].expected_result, sel->saturate);
+      EXPECT_TRUE(sel->dst.equals(vgrf2));
+      if (test[i].expected_result) {
+         EXPECT_TRUE(sel->src[0].equals(vgrf1));
+      } else {
+         EXPECT_TRUE(sel->src[0].equals(vgrf0));
+      }
+      EXPECT_TRUE(sel->src[1].equals(brw_imm_f(test[i].immediate)));
+
+      delete v->cfg;
+      v->cfg = NULL;
+   }
+}
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2848,6 +2848,7 @@ reuse_framebuffer_texture_attachment(struct gl_framebuffer *fb,
   dst_att->Type = src_att->Type;
   dst_att->Complete = src_att->Complete;
   dst_att->TextureLevel = src_att->TextureLevel;
+   dst_att->CubeMapFace = src_att->CubeMapFace;
   dst_att->Zoffset = src_att->Zoffset;
   dst_att->Layered = src_att->Layered;
 }
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -857,7 +857,7 @@ _mesa_get_color_read_format(struct gl_context *ctx)
      if (format == MESA_FORMAT_B8G8R8A8_UNORM)
         return GL_BGRA;
      else if (format == MESA_FORMAT_B5G6R5_UNORM)
-         return GL_BGR;
+         return GL_RGB;
      else if (format == MESA_FORMAT_R_UNORM8)
         return GL_RED;

@@ -892,7 +892,7 @@ _mesa_get_color_read_type(struct gl_context *ctx)
      const GLenum data_type = _mesa_get_format_datatype(format);

      if (format == MESA_FORMAT_B5G6R5_UNORM)
-         return GL_UNSIGNED_SHORT_5_6_5_REV;
+         return GL_UNSIGNED_SHORT_5_6_5;

      switch (data_type) {
      case GL_SIGNED_NORMALIZED:
--- a/src/mesa/main/hash.c
+++ b/src/mesa/main/hash.c
@@ -59,7 +59,6 @@ struct _mesa_HashTable {
   struct hash_table *ht;
   GLuint MaxKey;                        /**< highest key inserted so far */
   mtx_t Mutex;                /**< mutual exclusion lock */
-   mtx_t WalkMutex;            /**< for _mesa_HashWalk() */
   GLboolean InDeleteAll;                /**< Debug check */
   /** Value that would be in the table for DELETED_KEY_VALUE. */
   void *deleted_key_data;
@@ -129,8 +128,11 @@ _mesa_NewHashTable(void)
      }

      _mesa_hash_table_set_deleted_key(table->ht, uint_key(DELETED_KEY_VALUE));
-      mtx_init(&table->Mutex, mtx_plain);
-      mtx_init(&table->WalkMutex, mtx_plain);
+      /*
+       * Needs to be recursive, since the callback in _mesa_HashWalk()
+       * is allowed to call _mesa_HashRemove().
+       */
+      mtx_init(&table->Mutex, mtx_recursive);
   }
   else {
      _mesa_error_no_memory(__func__);
@@ -161,7 +163,6 @@ _mesa_DeleteHashTable(struct _mesa_HashTable *table)
   _mesa_hash_table_destroy(table->ht, NULL);

   mtx_destroy(&table->Mutex);
-   mtx_destroy(&table->WalkMutex);
   free(table);
 }

@@ -401,11 +402,6 @@ _mesa_HashDeleteAll(struct _mesa_HashTable *table,

 /**
 * Walk over all entries in a hash table, calling callback function for each.
- * Note: we use a separate mutex in this function to avoid a recursive
- * locking deadlock (in case the callback calls _mesa_HashRemove()) and to
- * prevent multiple threads/contexts from getting tangled up.
- * A lock-less version of this function could be used when the table will
- * not be modified.
 * \param table  the hash table to walk
 * \param callback  the callback function
 * \param userData  arbitrary pointer to pass along to the callback
@@ -422,13 +418,13 @@ _mesa_HashWalk(const struct _mesa_HashTable *table,

   assert(table);
   assert(callback);
-   mtx_lock(&table2->WalkMutex);
+   mtx_lock(&table2->Mutex);
   hash_table_foreach(table->ht, entry) {
      callback((uintptr_t)entry->key, entry->data, userData);
   }
   if (table->deleted_key_data)
      callback(DELETED_KEY_VALUE, table->deleted_key_data, userData);
-   mtx_unlock(&table2->WalkMutex);
+   mtx_unlock(&table2->Mutex);
 }

 static void
@@ -1 +1 @@
-12.0.4
+12.0.6