docs: add release notes for 18.3.4

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Update version to 18.3.4
2019-02-18 18:23:55 +00:00 · 2019-02-18 18:19:54 +00:00 · 2019-02-16 13:46:37 +00:00 · 2019-02-15 11:40:11 +00:00 · 2019-02-15 11:40:11 +00:00 · 2019-02-15 11:40:11 +00:00
77 changed files with 1286 additions and 400 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-18.3.2
+18.3.4
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -2,3 +2,37 @@
 c02390f8fcd367c7350db568feabb2f062efca14 egl/wayland: rather obvious build fix
 # fixes: The commit addresses b4476138d5ad3f8d30c14ee61f2f375edfdbab2a
 ff6f1dd0d3c6b4c15ca51b478b2884d14f6a1e06 meson: libfreedreno depends upon libdrm (for fence support)
+
+# fixes: This commit requires commits aeaf8dbd097 and 7484bc894b9 which did not
+#        land in branch.
+f67dea5e19ef14187be0e8d0f61b1f764c7ccb4f radv: Fix multiview depth clears
+
+# stable The commits aren't suitable in their present form.
+bfe31c5e461a1330d6f606bf5310685eff1198dd nir/builder: Add nir_i2i and nir_u2u helpers which take a bit size
+abfe674c54bee6f8fdcae411b07db89c10b9d530 spirv: Handle arbitrary bit sizes for deref array indices
+
+# warn   The commits refer stale sha, yet don't fix anything in particular.
+98984b7cdd79c15cc7331c791f8be61e873b8bbd Revert "mapi/new: sort by slot number"
+9f86f1da7c68b5b900cd6f60925610ff1225a72d egl: add glvnd entrypoints for EGL_MESA_query_driver
+
+# stable Explicit 19.0 only nomination.
+38f542783faa360020b77fdd76b97f207a9e0068 v50,nvc0: add explicit settings for recent caps
+
+# stable Explicit 19.0 only nominations.
+399215eb7a0517463e5757c598d6cff6ae2301d0 nvc0: add support for handling indirect draws with attrib conversion
+4443b6ddf2e08d06f3d0457cf20a2e04244cde37 nvc0/ir: always use CG mode for loads from atomic-only buffers
+5de5beedf21306b01730085f8e03d8f424729016 nvc0/ir: fix second tex argument after levelZero optimization
+162352e6711b3ceab114686f7a3248074339e7f7 nvc0: fix 3d images on kepler
+e00799d3dc0595dc3998dbf199ceec8b1eece966 nv50,nvc0: use condition for occlusion queries when already complete
+6adb9b38bfb1f6ee4c94596bf0744225aa8e967a nvc0: stick zero values for the compute invocation counts
+04593d9a73ea257a36cc3b9fb5cd41427beaaea5 gk110/ir: Add rcp f64 implementation
+7937408052a1896f0b08b0110bb8a1790eeee351 gk110/ir: Add rsq f64 implementation
+656ad060518d067a3b311db8c2de2a396fb41898 gk110/ir: Use the new rcp/rsq in library
+12669d29705a26478aa691cb454149628be65f17 gk104/ir: Use the new rcp/rsq in library
+815a8e59c6d462a7008653ea9e3010d40b6ba589 gm107/ir: add fp64 rcp
+cce495572136a606dd2a35e79f45080c3796e2cc gm107/ir: add fp64 rsq
+6010d7b8e8bee1bcea2b329cf6d3b44c5fc3ca66 gallium: add PIPE_CAP_MAX_VARYINGS
+cbd1ad6165f0aea7fb7c6fd1b36ad5317dd65cb7 st/mesa: require RGBA2, RGB4, and RGBA4 to be renderable
+
+# stable The commit addresses functionality not present in branch
+1b8983c25be19073c02fe9630e949be55f8280fa radv: fix using LOAD_CONTEXT_REG with old GFX ME firmwares on GFX8
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -13,12 +13,12 @@

 is_stable_nomination()
 {
-	git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable"
+	git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-stable"
 }

 is_typod_nomination()
 {
-	git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev"
+	git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-dev"
 }

 fixes=
@@ -44,7 +44,7 @@ is_sha_nomination()
 		# Treat only the current line
 		id=`echo "$fixes" | tail -n $fixes_count | head -n 1 | cut -d : -f 2`
 		fixes_count=$(($fixes_count-1))
-		if ! git show $id &>/dev/null; then
+		if ! git show $id >/dev/null 2>&1; then
 			echo WARNING: Commit $1 lists invalid sha $id
 		fi
 	done
@@ -143,7 +143,7 @@ do
 	esac

 	printf "[ %8s ] " "$tag"
-	git --no-pager show --summary --oneline $sha
+	git --no-pager show --no-patch --oneline $sha
 done

 rm -f already_picked
--- a/configure.ac
+++ b/configure.ac
@@ -1864,6 +1864,7 @@ for plat in $platforms; do
        ;;

    drm)
+        test "x$enable_egl" = "xyes" &&
        test "x$enable_gbm" = "xno" &&
                AC_MSG_ERROR([EGL platform drm needs gbm])
        DEFINES="$DEFINES -DHAVE_DRM_PLATFORM"
--- a/docs/relnotes/18.3.2.html
+++ b/docs/relnotes/18.3.2.html
@@ -31,7 +31,8 @@ Compatibility contexts may report a lower version depending on each driver.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+1cde4fafd40cd1ad4ee3a13b364b7a0175a08b7afdd127fb46f918c1e1dfd4b0  mesa-18.3.2.tar.gz
+f7ce7181c07b6d8e0132da879af1729523a6c8aa87f79a9d59dfd064024cfb35  mesa-18.3.2.tar.xz
 </pre>


--- a/docs/relnotes/18.3.3.html
+++ b/docs/relnotes/18.3.3.html
@@ -0,0 +1,208 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.3.3 Release Notes / January 31, 2019</h1>
+
+<p>
+Mesa 18.3.3 is a bug fix release which fixes bugs found since the 18.3.2 release.
+</p>
+<p>
+Mesa 18.3.3 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+6b9893942fe8011c7736d51448deb6ef80ece2257e0fac27b02e997a6605d5e4  mesa-18.3.3.tar.gz
+2ab6886a6966c532ccbcc3b240925e681464b658244f0cbed752615af3936299  mesa-18.3.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108877">Bug 108877</a> - OpenGL CTS gl43 test cases were interrupted due to segment fault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109023">Bug 109023</a> - error: inlining failed in call to always_inline ‘__m512 _mm512_and_ps(__m512, __m512)’: target specific option mismatch</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109129">Bug 109129</a> - format_types.h:1220: undefined reference to `_mm256_cvtps_ph'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109229">Bug 109229</a> - glLinkProgram locks up for ~30 seconds</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109242">Bug 109242</a> - [RADV] The Witcher 3 system freeze</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109488">Bug 109488</a> - Mesa 18.3.2 crash on a specific fragment shader (assert triggered) / already fixed on the master branch.</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andres Gomez (2):</p>
+<ul>
+  <li>bin/get-pick-list.sh: fix the oneline printing</li>
+  <li>bin/get-pick-list.sh: fix redirection in sh</li>
+</ul>
+
+<p>Axel Davy (1):</p>
+<ul>
+  <li>st/nine: Immediately upload user provided textures</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (3):</p>
+<ul>
+  <li>radv: Only use 32 KiB per threadgroup on Stoney.</li>
+  <li>radv: Set partial_vs_wave for pipelines with just GS, not tess.</li>
+  <li>nir: Account for atomics in copy propagation.</li>
+</ul>
+
+<p>Bruce Cherniak (1):</p>
+<ul>
+  <li>gallium/swr: Fix multi-context sync fence deadlock.</li>
+</ul>
+
+<p>Carsten Haitzler (Rasterman) (2):</p>
+<ul>
+  <li>vc4: Use named parameters for the NEON inline asm.</li>
+  <li>vc4: Declare the cpu pointers as being modified in NEON asm.</li>
+</ul>
+
+<p>Danylo Piliaiev (1):</p>
+<ul>
+  <li>glsl: Fix copying function's out to temp if dereferenced by array</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>dri_interface: add put shm image2 (v2)</li>
+  <li>glx: add support for putimageshm2 path (v2)</li>
+  <li>gallium: use put image shm2 path (v2)</li>
+</ul>
+
+<p>Dylan Baker (4):</p>
+<ul>
+  <li>meson: allow building dri driver without window system if osmesa is classic</li>
+  <li>meson: fix swr KNL build</li>
+  <li>meson: Fix compiler checks for SWR with ICC</li>
+  <li>meson: Add warnings and errors when using ICC</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.3.2</li>
+  <li>cherry-ignore: radv: Fix multiview depth clears</li>
+  <li>cherry-ignore: spirv: Handle arbitrary bit sizes for deref array indices</li>
+  <li>cherry-ignore: WARNING: Commit XXX lists invalid sha</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>vc4: Don't leak the GPU fd for renderonly usage.</li>
+  <li>vc4: Enable NEON asm on meson cross-builds.</li>
+</ul>
+
+<p>Eric Engestrom (2):</p>
+<ul>
+  <li>configure: EGL requirements only apply if EGL is built</li>
+  <li>meson/vdpau: add missing soversion</li>
+</ul>
+
+<p>Iago Toral Quiroga (1):</p>
+<ul>
+  <li>anv/device: fix maximum number of images supported</li>
+</ul>
+
+<p>Jason Ekstrand (3):</p>
+<ul>
+  <li>anv/nir: Rework arguments to apply_pipeline_layout</li>
+  <li>anv: Only parse pImmutableSamplers if the descriptor has samplers</li>
+  <li>nir/xfb: Fix offset accounting for dvec3/4</li>
+</ul>
+
+<p>Karol Herbst (2):</p>
+<ul>
+  <li>nv50/ir: disable tryCollapseChainedMULs in ConstantFolding for precise instructions</li>
+  <li>glsl/lower_output_reads: set invariant and precise flags on temporaries</li>
+</ul>
+
+<p>Lionel Landwerlin (1):</p>
+<ul>
+  <li>anv: fix invalid binding table index computation</li>
+</ul>
+
+<p>Marek Olšák (4):</p>
+<ul>
+  <li>radeonsi: also apply the GS hang workaround to draws without tessellation</li>
+  <li>radeonsi: fix a u_blitter crash after a shader with FBFETCH</li>
+  <li>radeonsi: fix rendering to tiny viewports where the viewport center is &gt; 8K</li>
+  <li>st/mesa: purge framebuffers when unbinding a context</li>
+</ul>
+
+<p>Niklas Haas (1):</p>
+<ul>
+  <li>radv: correctly use vulkan 1.0 by default</li>
+</ul>
+
+<p>Pierre Moreau (1):</p>
+<ul>
+  <li>meson: Fix with_gallium_icd to with_opencl_icd</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>loader: fix the no-modifiers case</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radv: clean up setting partial_es_wave for distributed tess on VI</li>
+</ul>
+
+<p>Timothy Arceri (5):</p>
+<ul>
+  <li>ac/nir_to_llvm: fix interpolateAt* for arrays</li>
+  <li>ac/nir_to_llvm: fix clamp shadow reference for more hardware</li>
+  <li>radv/ac: fix some fp16 handling</li>
+  <li>glsl: use remap location when serialising uniform program resource data</li>
+  <li>glsl: Copy function out to temp if we don't directly ref a variable</li>
+</ul>
+
+<p>Tomeu Vizoso (1):</p>
+<ul>
+  <li>etnaviv: Consolidate buffer references from framebuffers</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>meson: Fix typo.</li>
+</ul>
+
+
+
+</div>
+</body>
+</html>
+
--- a/docs/relnotes/18.3.4.html
+++ b/docs/relnotes/18.3.4.html
@@ -0,0 +1,179 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.3.4 Release Notes / February 18, 2019</h1>
+
+<p>
+Mesa 18.3.4 is a bug fix release which fixes bugs found since the 18.3.3 release.
+</p>
+<p>
+Mesa 18.3.4 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109107">Bug 109107</a> - gallium/st/va: change va max_profiles when using Radeon VCN Hardware</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109401">Bug 109401</a> - [DXVK] Project Cars rendering problems</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109543">Bug 109543</a> - After upgrade mesa to 19.0.0~rc1 all vulkan based application stop working [&quot;vulkan-cube&quot; received SIGSEGV in radv_pipeline_init_blend_state at ../src/amd/vulkan/radv_pipeline.c:699]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=109603">Bug 109603</a> - nir_instr_as_deref: Assertion `parent &amp;&amp; parent-&gt;type == nir_instr_type_deref' failed.</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Bart Oldeman (1):</p>
+<ul>
+  <li>gallium-xlib: query MIT-SHM before using it.</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (2):</p>
+<ul>
+  <li>radv: Only look at pImmutableSamples if the descriptor has a sampler.</li>
+  <li>amd/common: Use correct writemask for shared memory stores.</li>
+</ul>
+
+<p>Dylan Baker (2):</p>
+<ul>
+  <li>get-pick-list: Add --pretty=medium to the arguments for Cc patches</li>
+  <li>meson: Add dependency on genxml to anvil</li>
+</ul>
+
+<p>Emil Velikov (5):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.3.3</li>
+  <li>cherry-ignore: nv50,nvc0: add explicit settings for recent caps</li>
+  <li>cherry-ignore: add more 19.0 only nominations from Ilia</li>
+  <li>cherry-ignore: radv: fix using LOAD_CONTEXT_REG with old GFX ME firmwares on GFX8</li>
+  <li>Update version to 18.3.4</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>vc4: Fix copy-and-paste fail in backport of NEON asm fixes.</li>
+</ul>
+
+<p>Eric Engestrom (2):</p>
+<ul>
+  <li>xvmc: fix string comparison</li>
+  <li>xvmc: fix string comparison</li>
+</ul>
+
+<p>Ernestas Kulik (2):</p>
+<ul>
+  <li>vc4: Fix leak in HW queries error path</li>
+  <li>v3d: Fix leak in resource setup error path</li>
+</ul>
+
+<p>Iago Toral Quiroga (1):</p>
+<ul>
+  <li>intel/compiler: do not copy-propagate strided regions to ddx/ddy arguments</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nvc0: we have 16k-sized framebuffers, fix default scissors</li>
+</ul>
+
+<p>Jason Ekstrand (3):</p>
+<ul>
+  <li>intel/fs: Handle IMAGE_SIZE in size_read() and is_send_from_grf()</li>
+  <li>intel/fs: Do the grf127 hack on SIMD8 instructions in SIMD16 mode</li>
+  <li>nir/deref: Rematerialize parents in rematerialize_derefs_in_use_blocks</li>
+</ul>
+
+<p>Juan A. Suarez Romero (1):</p>
+<ul>
+  <li>anv/cmd_buffer: check for NULL framebuffer</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>st/mesa: Limit GL_MAX_[NATIVE_]PROGRAM_PARAMETERS_ARB to 2048</li>
+</ul>
+
+<p>Kristian H. Kristensen (1):</p>
+<ul>
+  <li>freedreno/a6xx: Emit blitter dst with OUT_RELOCW</li>
+</ul>
+
+<p>Leo Liu (2):</p>
+<ul>
+  <li>st/va: fix the incorrect max profiles report</li>
+  <li>st/va/vp9: set max reference as default of VP9 reference number</li>
+</ul>
+
+<p>Marek Olšák (4):</p>
+<ul>
+  <li>meson: drop the xcb-xrandr version requirement</li>
+  <li>gallium/u_threaded: fix EXPLICIT_FLUSH for flush offsets &gt; 0</li>
+  <li>radeonsi: fix EXPLICIT_FLUSH for flush offsets &gt; 0</li>
+  <li>winsys/amdgpu: don't drop manually added fence dependencies</li>
+</ul>
+
+<p>Mario Kleiner (2):</p>
+<ul>
+  <li>egl/wayland: Allow client-&gt;server format conversion for PRIME offload. (v2)</li>
+  <li>egl/wayland-drm: Only announce formats via wl_drm which the driver supports.</li>
+</ul>
+
+<p>Oscar Blumberg (1):</p>
+<ul>
+  <li>radeonsi: Fix guardband computation for large render targets</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>freedreno: stop frob'ing pipe_resource::nr_samples</li>
+</ul>
+
+<p>Rodrigo Vivi (1):</p>
+<ul>
+  <li>intel: Add more PCI Device IDs for Coffee Lake and Ice Lake.</li>
+</ul>
+
+<p>Samuel Pitoiset (2):</p>
+<ul>
+  <li>radv: fix compiler issues with GCC 9</li>
+  <li>radv: always export gl_SampleMask when the fragment shader uses it</li>
+</ul>
+
+
+
+</div>
+</body>
+</html>
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -589,7 +589,7 @@ struct __DRIdamageExtensionRec {
 * SWRast Loader extension.
 */
 #define __DRI_SWRAST_LOADER "DRI_SWRastLoader"
-#define __DRI_SWRAST_LOADER_VERSION 4
+#define __DRI_SWRAST_LOADER_VERSION 5
 struct __DRIswrastLoaderExtensionRec {
    __DRIextension base;

@@ -649,6 +649,23 @@ struct __DRIswrastLoaderExtensionRec {
    void (*getImageShm)(__DRIdrawable *readable,
                        int x, int y, int width, int height,
                        int shmid, void *loaderPrivate);
+
+   /**
+     * Put shm image to drawable (v2)
+     *
+     * The original version fixes srcx/y to 0, and expected
+     * the offset to be adjusted. This version allows src x,y
+     * to not be included in the offset. This is needed to
+     * avoid certain overflow checks in the X server, that
+     * result in lost rendering.
+     *
+     * \since 5
+     */
+    void (*putImageShm2)(__DRIdrawable *drawable, int op,
+                         int x, int y,
+                         int width, int height, int stride,
+                         int shmid, char *shmaddr, unsigned offset,
+                         void *loaderPrivate);
 };

 /**
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -171,6 +171,7 @@ CHIPSET(0x3185, glk_2x6, "Intel(R) UHD Graphics 600 (Geminilake 2x6)")
 CHIPSET(0x3E90, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)")
 CHIPSET(0x3E93, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)")
 CHIPSET(0x3E99, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
+CHIPSET(0x3E9C, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
 CHIPSET(0x3E91, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E92, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
@@ -203,6 +204,10 @@ CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)")
 CHIPSET(0x8A50, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)")
 CHIPSET(0x8A51, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)")
 CHIPSET(0x8A52, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)")
+CHIPSET(0x8A56, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)")
+CHIPSET(0x8A57, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
+CHIPSET(0x8A58, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)")
+CHIPSET(0x8A59, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
 CHIPSET(0x8A5A, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
 CHIPSET(0x8A5B, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)")
 CHIPSET(0x8A5C, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
--- a/meson.build
+++ b/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2017-2018 Intel Corporation
+# Copyright © 2017-2019 Intel Corporation

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -165,6 +165,14 @@ with_gallium_svga = _drivers.contains('svga')
 with_gallium_virgl = _drivers.contains('virgl')
 with_gallium_swr = _drivers.contains('swr')

+if cc.get_id() == 'intel'
+  if meson.version().version_compare('< 0.49.0')
+    error('Meson does not have sufficient support of ICC before 0.49.0 to compile mesa')
+  elif with_gallium_swr and meson.version().version_compare('== 0.49.0')
+    warning('Meson as of 0.49.0 is sufficient for compiling mesa with ICC, but there are some caveats with SWR. 0.49.1 should resolve all of these')
+  endif
+endif
+
 with_gallium = _drivers.length() != 0 and _drivers != ['']

 if with_gallium and system_has_kms_drm
@@ -385,8 +393,8 @@ if with_any_vk and (with_platform_x11 and not with_dri3)
  error('Vulkan drivers require dri3 for X11 support')
 endif
 if with_dri
-  if with_glx == 'disabled' and not with_egl and not with_gbm
-    error('building dri drivers require at least one windowing system')
+  if with_glx == 'disabled' and not with_egl and not with_gbm and with_osmesa != 'classic'
+    error('building dri drivers require at least one windowing system or classic osmesa')
  endif
 endif

@@ -671,7 +679,7 @@ if _opencl != 'disabled'
 else
  dep_clc = null_dep
  with_gallium_opencl = false
-  with_gallium_icd = false
+  with_opencl_icd = false
 endif

 gl_pkgconfig_c_flags = []
@@ -1399,7 +1407,7 @@ if with_platform_x11
    dep_xcb_xfixes = dependency('xcb-xfixes')
  endif
  if with_xlib_lease
-    dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12')
+    dep_xcb_xrandr = dependency('xcb-randr')
    dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3')
  endif
 endif
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2072,7 +2072,7 @@ visit_store_var(struct ac_nir_context *ctx,
 		int writemask = instr->const_index[0];
 		LLVMValueRef address = get_src(ctx, instr->src[0]);
 		LLVMValueRef val = get_src(ctx, instr->src[1]);
-		if (util_is_power_of_two_nonzero(writemask)) {
+		if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) {
 			val = LLVMBuildBitCast(
 			   ctx->ac.builder, val,
 			   LLVMGetElementType(LLVMTypeOf(address)), "");
@@ -2802,15 +2802,16 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx,
 				 const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef result[4];
-	LLVMValueRef interp_param, attr_number;
+	LLVMValueRef interp_param;
 	unsigned location;
 	unsigned chan;
 	LLVMValueRef src_c0 = NULL;
 	LLVMValueRef src_c1 = NULL;
 	LLVMValueRef src0 = NULL;

-	nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
-	int input_index = ctx->abi->fs_input_attr_indices[var->data.location - VARYING_SLOT_VAR0];
+	nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+	nir_variable *var = nir_deref_instr_get_variable(deref_instr);
+	int input_base = ctx->abi->fs_input_attr_indices[var->data.location - VARYING_SLOT_VAR0];
 	switch (instr->intrinsic) {
 	case nir_intrinsic_interp_deref_at_centroid:
 		location = INTERP_CENTROID;
@@ -2840,7 +2841,6 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx,
 		src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, "");
 	}
 	interp_param = ctx->abi->lookup_interp_param(ctx->abi, var->data.interpolation, location);
-	attr_number = LLVMConstInt(ctx->ac.i32, input_index, false);

 	if (location == INTERP_CENTER) {
 		LLVMValueRef ij_out[2];
@@ -2878,26 +2878,65 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx,

 	}

+	LLVMValueRef array_idx = ctx->ac.i32_0;
+	while(deref_instr->deref_type != nir_deref_type_var) {
+		if (deref_instr->deref_type == nir_deref_type_array) {
+			unsigned array_size = glsl_get_aoa_size(deref_instr->type);
+			if (!array_size)
+				array_size = 1;
+
+			LLVMValueRef offset;
+			nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
+			if (const_value) {
+				offset = LLVMConstInt(ctx->ac.i32, array_size * const_value->u32[0], false);
+			} else {
+				LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index);
+
+				offset = LLVMBuildMul(ctx->ac.builder, indirect,
+						      LLVMConstInt(ctx->ac.i32, array_size, false), "");
+			}
+
+			array_idx = LLVMBuildAdd(ctx->ac.builder, array_idx, offset, "");
+			deref_instr = nir_src_as_deref(deref_instr->parent);
+		} else {
+			unreachable("Unsupported deref type");
+		}
+
+	}
+
+	unsigned input_array_size = glsl_get_aoa_size(var->type);
+	if (!input_array_size)
+		input_array_size = 1;
+
 	for (chan = 0; chan < 4; chan++) {
+		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->ac.f32, input_array_size));
 		LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);

-		if (interp_param) {
-			interp_param = LLVMBuildBitCast(ctx->ac.builder,
-							interp_param, ctx->ac.v2f32, "");
-			LLVMValueRef i = LLVMBuildExtractElement(
-				ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
-			LLVMValueRef j = LLVMBuildExtractElement(
-				ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+		for (unsigned idx = 0; idx < input_array_size; ++idx) {
+			LLVMValueRef v, attr_number;

-			result[chan] = ac_build_fs_interp(&ctx->ac,
-							  llvm_chan, attr_number,
-							  ctx->abi->prim_mask, i, j);
-		} else {
-			result[chan] = ac_build_fs_interp_mov(&ctx->ac,
-							      LLVMConstInt(ctx->ac.i32, 2, false),
-							      llvm_chan, attr_number,
-							      ctx->abi->prim_mask);
+			attr_number = LLVMConstInt(ctx->ac.i32, input_base + idx, false);
+			if (interp_param) {
+				interp_param = LLVMBuildBitCast(ctx->ac.builder,
+							interp_param, ctx->ac.v2f32, "");
+				LLVMValueRef i = LLVMBuildExtractElement(
+					ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
+				LLVMValueRef j = LLVMBuildExtractElement(
+					ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+
+				v = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number,
+						       ctx->abi->prim_mask, i, j);
+			} else {
+				v = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, false),
+							   llvm_chan, attr_number, ctx->abi->prim_mask);
+			}
+
+			gather = LLVMBuildInsertElement(ctx->ac.builder, gather, v,
+							LLVMConstInt(ctx->ac.i32, idx, false), "");
 		}
+
+		result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, array_idx, "");
+
 	}
 	return ac_build_varying_gather_values(&ctx->ac, result, instr->num_components,
 					      var->data.location_frac);
@@ -3460,7 +3499,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 	 * It's unnecessary if the original texture format was
 	 * Z32_FLOAT, but we don't know that here.
 	 */
-	if (args.compare && ctx->ac.chip_class == VI && ctx->abi->clamp_shadow_reference)
+	if (args.compare && ctx->ac.chip_class >= VI && ctx->abi->clamp_shadow_reference)
 		args.compare = ac_build_clamp(&ctx->ac, ac_to_float(&ctx->ac, args.compare));

 	/* pack derivatives */
@@ -3851,7 +3890,7 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
 		}
 	}

-	bool is_16bit = glsl_type_is_16bit(variable->type);
+	bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
 	LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
 	for (unsigned i = 0; i < attrib_count; ++i) {
 		for (unsigned chan = 0; chan < 4; chan++) {
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout(
 	uint32_t immutable_sampler_count = 0;
 	for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
 		max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
-		if (pCreateInfo->pBindings[j].pImmutableSamplers)
+		if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+		     pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
+		     pCreateInfo->pBindings[j].pImmutableSamplers)
 			immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
 	}

@@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout(
 			set_layout->has_variable_descriptors = true;
 		}

-		if (binding->pImmutableSamplers) {
+		if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+		     binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
+		    binding->pImmutableSamplers) {
 			set_layout->binding[b].immutable_samplers_offset = samplers_offset;
 			set_layout->binding[b].immutable_samplers_equal =
 				has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount);
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -525,7 +525,7 @@ VkResult radv_CreateInstance(
 	    pCreateInfo->pApplicationInfo->apiVersion != 0) {
 		client_version = pCreateInfo->pApplicationInfo->apiVersion;
 	} else {
-		radv_EnumerateInstanceVersion(&client_version);
+		client_version = VK_API_VERSION_1_0;
 	}

 	instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
--- a/src/amd/vulkan/radv_meta_blit.c
+++ b/src/amd/vulkan/radv_meta_blit.c
@@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device,
 		.subpass = 0,
 	};

-	switch(aspect) {
-	case VK_IMAGE_ASPECT_COLOR_BIT:
-		vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
-			.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
-			.attachmentCount = 1,
-			.pAttachments = (VkPipelineColorBlendAttachmentState []) {
-				{ .colorWriteMask =
-				VK_COLOR_COMPONENT_A_BIT |
-				VK_COLOR_COMPONENT_R_BIT |
-				VK_COLOR_COMPONENT_G_BIT |
-				VK_COLOR_COMPONENT_B_BIT },
+	VkPipelineColorBlendStateCreateInfo color_blend_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+		.attachmentCount = 1,
+		.pAttachments = (VkPipelineColorBlendAttachmentState []) {
+			{
+				.colorWriteMask = VK_COLOR_COMPONENT_A_BIT |
+						  VK_COLOR_COMPONENT_R_BIT |
+						  VK_COLOR_COMPONENT_G_BIT |
+						  VK_COLOR_COMPONENT_B_BIT },
 			}
 		};
+
+	VkPipelineDepthStencilStateCreateInfo depth_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+		.depthTestEnable = true,
+		.depthWriteEnable = true,
+		.depthCompareOp = VK_COMPARE_OP_ALWAYS,
+	};
+
+	VkPipelineDepthStencilStateCreateInfo stencil_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+		.depthTestEnable = false,
+		.depthWriteEnable = false,
+		.stencilTestEnable = true,
+		.front = {
+			.failOp = VK_STENCIL_OP_REPLACE,
+			.passOp = VK_STENCIL_OP_REPLACE,
+			.depthFailOp = VK_STENCIL_OP_REPLACE,
+			.compareOp = VK_COMPARE_OP_ALWAYS,
+			.compareMask = 0xff,
+			.writeMask = 0xff,
+			.reference = 0
+		},
+		.back = {
+			.failOp = VK_STENCIL_OP_REPLACE,
+			.passOp = VK_STENCIL_OP_REPLACE,
+			.depthFailOp = VK_STENCIL_OP_REPLACE,
+			.compareOp = VK_COMPARE_OP_ALWAYS,
+			.compareMask = 0xff,
+			.writeMask = 0xff,
+			.reference = 0
+		},
+		.depthCompareOp = VK_COMPARE_OP_ALWAYS,
+	};
+
+	switch(aspect) {
+	case VK_IMAGE_ASPECT_COLOR_BIT:
+		vk_pipeline_info.pColorBlendState = &color_blend_info;
 		break;
 	case VK_IMAGE_ASPECT_DEPTH_BIT:
-		vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
-			.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
-			.depthTestEnable = true,
-			.depthWriteEnable = true,
-			.depthCompareOp = VK_COMPARE_OP_ALWAYS,
-		};
+		vk_pipeline_info.pDepthStencilState = &depth_info;
 		break;
 	case VK_IMAGE_ASPECT_STENCIL_BIT:
-		vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
-			.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
-			.depthTestEnable = false,
-			.depthWriteEnable = false,
-			.stencilTestEnable = true,
-			.front = {
-				.failOp = VK_STENCIL_OP_REPLACE,
-				.passOp = VK_STENCIL_OP_REPLACE,
-				.depthFailOp = VK_STENCIL_OP_REPLACE,
-				.compareOp = VK_COMPARE_OP_ALWAYS,
-				.compareMask = 0xff,
-				.writeMask = 0xff,
-				.reference = 0
-			},
-			.back = {
-				.failOp = VK_STENCIL_OP_REPLACE,
-				.passOp = VK_STENCIL_OP_REPLACE,
-				.depthFailOp = VK_STENCIL_OP_REPLACE,
-				.compareOp = VK_COMPARE_OP_ALWAYS,
-				.compareMask = 0xff,
-				.writeMask = 0xff,
-				.reference = 0
-			},
-			.depthCompareOp = VK_COMPARE_OP_ALWAYS,
-		};
+		vk_pipeline_info.pDepthStencilState = &stencil_info;
 		break;
 	default:
 		unreachable("Unhandled aspect");
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -256,7 +256,16 @@ get_tcs_num_patches(struct radv_shader_context *ctx)
 	/* Make sure that the data fits in LDS. This assumes the shaders only
 	 * use LDS for the inputs and outputs.
 	 */
-	hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
+	hardware_lds_size = 32768;
+
+	/* Looks like STONEY hangs if we use more than 32 KiB LDS in a single
+	 * threadgroup, even though there is more than 32 KiB LDS.
+	 *
+	 * Test: dEQP-VK.tessellation.shader_input_output.barrier
+	 */
+	if (ctx->options->chip_class >= CIK && ctx->options->family != CHIP_STONEY)
+		hardware_lds_size = 65536;
+
 	num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
 	/* Make sure the output data fits in the offchip buffer */
 	num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
@@ -2160,7 +2169,7 @@ handle_fs_input_decl(struct radv_shader_context *ctx,

 		interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type);
 	}
-	bool is_16bit = glsl_type_is_16bit(variable->type);
+	bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
 	LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32;
 	if (interp == NULL)
 		interp = LLVMGetUndef(type);
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3179,11 +3179,11 @@ radv_compute_db_shader_control(const struct radv_device *device,
 	bool disable_rbplus = device->physical_device->has_rbplus &&
 	                      !device->physical_device->rbplus_allowed;

-	/* Do not enable the gl_SampleMask fragment shader output if MSAA is
-	 * disabled.
+	/* It shouldn't be needed to export gl_SampleMask when MSAA is disabled
+	 * but this appears to break Project Cars (DXVK). See
+	 * https://bugs.freedesktop.org/show_bug.cgi?id=109401
 	 */
-	bool mask_export_enable = ms->num_samples > 1 &&
-				  ps->info.info.ps.writes_sample_mask;
+	bool mask_export_enable = ps->info.info.ps.writes_sample_mask;

 	return  S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) |
 		S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) |
@@ -3371,14 +3371,8 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline,
 	else
 		ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */

-	ia_multi_vgt_param.partial_es_wave = false;
-	if (pipeline->device->has_distributed_tess) {
-		if (radv_pipeline_has_gs(pipeline)) {
-			if (device->physical_device->rad_info.chip_class <= VI)
-				ia_multi_vgt_param.partial_es_wave = true;
-		}
-	}
 	/* GS requirement. */
+	ia_multi_vgt_param.partial_es_wave = false;
 	if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= VI)
 		if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
 			ia_multi_vgt_param.partial_es_wave = true;
@@ -3424,13 +3418,8 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline,
 		/* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
 		if (device->has_distributed_tess) {
 			if (radv_pipeline_has_gs(pipeline)) {
-				if (device->physical_device->rad_info.family == CHIP_TONGA ||
-				    device->physical_device->rad_info.family == CHIP_FIJI ||
-				    device->physical_device->rad_info.family == CHIP_POLARIS10 ||
-				    device->physical_device->rad_info.family == CHIP_POLARIS11 ||
-				    device->physical_device->rad_info.family == CHIP_POLARIS12 ||
-				    device->physical_device->rad_info.family == CHIP_VEGAM)
-					ia_multi_vgt_param.partial_vs_wave = true;
+				if (device->physical_device->rad_info.chip_class <= VI)
+					ia_multi_vgt_param.partial_es_wave = true;
 			} else {
 				ia_multi_vgt_param.partial_vs_wave = true;
 			}
@@ -3448,6 +3437,26 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline,
 		ia_multi_vgt_param.partial_vs_wave = true;
 	}

+	if (radv_pipeline_has_gs(pipeline)) {
+		/* On these chips there is the possibility of a hang if the
+		 * pipeline uses a GS and partial_vs_wave is not set.
+		 *
+		 * This mostly does not hit 4-SE chips, as those typically set
+		 * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
+		 * with GS due to another workaround.
+		 *
+		 * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
+		 */
+		if (device->physical_device->rad_info.family == CHIP_TONGA ||
+		    device->physical_device->rad_info.family == CHIP_FIJI ||
+		    device->physical_device->rad_info.family == CHIP_POLARIS10 ||
+		    device->physical_device->rad_info.family == CHIP_POLARIS11 ||
+		    device->physical_device->rad_info.family == CHIP_POLARIS12 ||
+	            device->physical_device->rad_info.family == CHIP_VEGAM) {
+			ia_multi_vgt_param.partial_vs_wave = true;
+		}
+	}
+
 	ia_multi_vgt_param.base =
 		S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
 		/* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -363,31 +363,29 @@ copy_index_derefs_to_temps(ir_instruction *ir, void *data)
      ir = a->array->as_dereference();

      ir_rvalue *idx = a->array_index;
-      if (idx->as_dereference_variable()) {
-         ir_variable *var = idx->variable_referenced();
+      ir_variable *var = idx->variable_referenced();

-         /* If the index is read only it cannot change so there is no need
-          * to copy it.
-          */
-         if (var->data.read_only || var->data.memory_read_only)
-            return;
+      /* If the index is read only it cannot change so there is no need
+       * to copy it.
+       */
+      if (!var || var->data.read_only || var->data.memory_read_only)
+         return;

-         ir_variable *tmp = new(d->mem_ctx) ir_variable(idx->type, "idx_tmp",
-                                                        ir_var_temporary);
-         d->before_instructions->push_tail(tmp);
+      ir_variable *tmp = new(d->mem_ctx) ir_variable(idx->type, "idx_tmp",
+                                                      ir_var_temporary);
+      d->before_instructions->push_tail(tmp);

-         ir_dereference_variable *const deref_tmp_1 =
-            new(d->mem_ctx) ir_dereference_variable(tmp);
-         ir_assignment *const assignment =
-            new(d->mem_ctx) ir_assignment(deref_tmp_1,
-                                          idx->clone(d->mem_ctx, NULL));
-         d->before_instructions->push_tail(assignment);
+      ir_dereference_variable *const deref_tmp_1 =
+         new(d->mem_ctx) ir_dereference_variable(tmp);
+      ir_assignment *const assignment =
+         new(d->mem_ctx) ir_assignment(deref_tmp_1,
+                                       idx->clone(d->mem_ctx, NULL));
+      d->before_instructions->push_tail(assignment);

-         /* Replace the array index with a dereference of the new temporary */
-         ir_dereference_variable *const deref_tmp_2 =
-            new(d->mem_ctx) ir_dereference_variable(tmp);
-         a->array_index = deref_tmp_2;
-      }
+      /* Replace the array index with a dereference of the new temporary */
+      ir_dereference_variable *const deref_tmp_2 =
+         new(d->mem_ctx) ir_dereference_variable(tmp);
+      a->array_index = deref_tmp_2;
   }
 }

@@ -402,7 +400,8 @@ fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type,
    * nothing needs to be done to fix the parameter.
    */
   if (formal_type == actual->type
-       && (expr == NULL || expr->operation != ir_binop_vector_extract))
+       && (expr == NULL || expr->operation != ir_binop_vector_extract)
+       && actual->as_dereference_variable())
      return;

   /* An array index could also be an out variable so we need to make a copy
@@ -456,7 +455,7 @@ fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type,
      ir_dereference_variable *const deref_tmp_1 =
         new(mem_ctx) ir_dereference_variable(tmp);
      ir_assignment *const assignment =
-         new(mem_ctx) ir_assignment(deref_tmp_1, actual);
+         new(mem_ctx) ir_assignment(deref_tmp_1, actual->clone(mem_ctx, NULL));
      before_instructions->push_tail(assignment);
   }

--- a/src/compiler/glsl/lower_output_reads.cpp
+++ b/src/compiler/glsl/lower_output_reads.cpp
@@ -101,6 +101,10 @@ output_read_remover::visit(ir_dereference_variable *ir)
      void *var_ctx = ralloc_parent(ir->var);
      temp = new(var_ctx) ir_variable(ir->var->type, ir->var->name,
                                      ir_var_temporary);
+      /* copy flags which affect arithematical precision */
+      temp->data.invariant = ir->var->data.invariant;
+      temp->data.precise = ir->var->data.precise;
+      temp->data.precision = ir->var->data.precision;
      _mesa_hash_table_insert(replacements, ir->var, temp);
      ir->var->insert_after(temp);
   }
--- a/src/compiler/glsl/serialize.cpp
+++ b/src/compiler/glsl/serialize.cpp
@@ -764,6 +764,12 @@ get_shader_var_and_pointer_sizes(size_t *s_var_size, size_t *s_var_ptrs,
      sizeof(var->name);
 }

+enum uniform_type
+{
+   uniform_remapped,
+   uniform_not_remapped
+};
+
 static void
 write_program_resource_data(struct blob *metadata,
                            struct gl_shader_program *prog,
@@ -816,12 +822,19 @@ write_program_resource_data(struct blob *metadata,
   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
   case GL_UNIFORM:
-      for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
-         if (strcmp(((gl_uniform_storage *)res->Data)->name,
-                    prog->data->UniformStorage[i].name) == 0) {
-            blob_write_uint32(metadata, i);
-            break;
+      if (((gl_uniform_storage *)res->Data)->builtin ||
+          res->Type != GL_UNIFORM) {
+         blob_write_uint32(metadata, uniform_not_remapped);
+         for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
+            if (strcmp(((gl_uniform_storage *)res->Data)->name,
+                       prog->data->UniformStorage[i].name) == 0) {
+               blob_write_uint32(metadata, i);
+               break;
+            }
         }
+      } else {
+         blob_write_uint32(metadata, uniform_remapped);
+         blob_write_uint32(metadata, ((gl_uniform_storage *)res->Data)->remap_location);
      }
      break;
   case GL_ATOMIC_COUNTER_BUFFER:
@@ -906,9 +919,15 @@ read_program_resource_data(struct blob_reader *metadata,
   case GL_COMPUTE_SUBROUTINE_UNIFORM:
   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-   case GL_UNIFORM:
-      res->Data = &prog->data->UniformStorage[blob_read_uint32(metadata)];
+   case GL_UNIFORM: {
+      enum uniform_type type = (enum uniform_type) blob_read_uint32(metadata);
+      if (type == uniform_not_remapped) {
+         res->Data = &prog->data->UniformStorage[blob_read_uint32(metadata)];
+      } else {
+         res->Data = prog->UniformRemapTable[blob_read_uint32(metadata)];
+      }
      break;
+   }
   case GL_ATOMIC_COUNTER_BUFFER:
      res->Data = &prog->data->AtomicBuffers[blob_read_uint32(metadata)];
      break;
--- a/src/compiler/nir/nir_deref.c
+++ b/src/compiler/nir/nir_deref.c
@@ -490,10 +490,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl)
         _mesa_hash_table_clear(state.cache, NULL);

      nir_foreach_instr_safe(instr, block) {
-         if (instr->type == nir_instr_type_deref) {
-            nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr));
+         if (instr->type == nir_instr_type_deref &&
+             nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)))
            continue;
-         }

         state.builder.cursor = nir_before_instr(instr);
         nir_foreach_src(instr, rematerialize_deref_src, &state);
--- a/src/compiler/nir/nir_gather_xfb_info.c
+++ b/src/compiler/nir/nir_gather_xfb_info.c
@@ -76,13 +76,13 @@ add_var_xfb_outputs(nir_xfb_info *xfb,
         nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];

         output->buffer = var->data.xfb_buffer;
-         output->offset = *offset;
+         output->offset = *offset + s * 16;
         output->location = *location;
         output->component_mask = (comp_mask >> (s * 4)) & 0xf;

         (*location)++;
-         *offset += comp_slots * 4;
      }
+      *offset += comp_slots * 4;
   }
 }

--- a/src/compiler/nir/nir_opt_copy_prop_vars.c
+++ b/src/compiler/nir/nir_opt_copy_prop_vars.c
@@ -143,9 +143,19 @@ gather_vars_written(struct copy_prop_var_state *state,
            written->modes = nir_var_shader_out;
            break;

+         case nir_intrinsic_deref_atomic_add:
+         case nir_intrinsic_deref_atomic_imin:
+         case nir_intrinsic_deref_atomic_umin:
+         case nir_intrinsic_deref_atomic_imax:
+         case nir_intrinsic_deref_atomic_umax:
+         case nir_intrinsic_deref_atomic_and:
+         case nir_intrinsic_deref_atomic_or:
+         case nir_intrinsic_deref_atomic_xor:
+         case nir_intrinsic_deref_atomic_exchange:
+         case nir_intrinsic_deref_atomic_comp_swap:
         case nir_intrinsic_store_deref:
         case nir_intrinsic_copy_deref: {
-            /* Destination in _both_ store_deref and copy_deref is src[0]. */
+            /* Destination in all of store_deref, copy_deref and the atomics is src[0]. */
            nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);

            uintptr_t mask = intrin->intrinsic == nir_intrinsic_store_deref ?
@@ -750,6 +760,19 @@ copy_prop_vars_block(struct copy_prop_var_state *state,
         break;
      }

+      case nir_intrinsic_deref_atomic_add:
+      case nir_intrinsic_deref_atomic_imin:
+      case nir_intrinsic_deref_atomic_umin:
+      case nir_intrinsic_deref_atomic_imax:
+      case nir_intrinsic_deref_atomic_umax:
+      case nir_intrinsic_deref_atomic_and:
+      case nir_intrinsic_deref_atomic_or:
+      case nir_intrinsic_deref_atomic_xor:
+      case nir_intrinsic_deref_atomic_exchange:
+      case nir_intrinsic_deref_atomic_comp_swap:
+         kill_aliases(copies, nir_src_as_deref(intrin->src[0]), 0xf);
+         break;
+
      default:
         break;
      }
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -2819,7 +2819,8 @@ dri2_bind_wayland_display_wl(_EGLDriver *drv, _EGLDisplay *disp,
   const struct wayland_drm_callbacks wl_drm_callbacks = {
      .authenticate = (int(*)(void *, uint32_t)) dri2_dpy->vtbl->authenticate,
      .reference_buffer = dri2_wl_reference_buffer,
-      .release_buffer = dri2_wl_release_buffer
+      .release_buffer = dri2_wl_release_buffer,
+      .is_format_supported = dri2_wl_is_format_supported
   };
   int flags = 0;
   uint64_t cap;
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -457,6 +457,8 @@ EGLBoolean
 dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp);
 void
 dri2_teardown_wayland(struct dri2_egl_display *dri2_dpy);
+bool
+dri2_wl_is_format_supported(void* user_data, uint32_t format);
 #else
 static inline EGLBoolean
 dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -59,49 +59,57 @@ static const struct dri2_wl_visual {
   uint32_t wl_drm_format;
   uint32_t wl_shm_format;
   int dri_image_format;
+   /* alt_dri_image_format is a substitute wl_buffer format to use for a
+    * wl-server unsupported dri_image_format, ie. some other dri_image_format in
+    * the table, of the same precision but with different channel ordering, or
+    * __DRI_IMAGE_FORMAT_NONE if an alternate format is not needed or supported.
+    * The code checks if alt_dri_image_format can be used as a fallback for a
+    * dri_image_format for a given wl-server implementation.
+    */
+   int alt_dri_image_format;
   int bpp;
   unsigned int rgba_masks[4];
 } dri2_wl_visuals[] = {
   {
     "XRGB2101010",
     WL_DRM_FORMAT_XRGB2101010, WL_SHM_FORMAT_XRGB2101010,
-     __DRI_IMAGE_FORMAT_XRGB2101010, 32,
+     __DRI_IMAGE_FORMAT_XRGB2101010, __DRI_IMAGE_FORMAT_XBGR2101010, 32,
     { 0x3ff00000, 0x000ffc00, 0x000003ff, 0x00000000 }
   },
   {
     "ARGB2101010",
     WL_DRM_FORMAT_ARGB2101010, WL_SHM_FORMAT_ARGB2101010,
-     __DRI_IMAGE_FORMAT_ARGB2101010, 32,
+     __DRI_IMAGE_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ABGR2101010, 32,
     { 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 }
   },
   {
     "XBGR2101010",
     WL_DRM_FORMAT_XBGR2101010, WL_SHM_FORMAT_XBGR2101010,
-     __DRI_IMAGE_FORMAT_XBGR2101010, 32,
+     __DRI_IMAGE_FORMAT_XBGR2101010, __DRI_IMAGE_FORMAT_XRGB2101010, 32,
     { 0x000003ff, 0x000ffc00, 0x3ff00000, 0x00000000 }
   },
   {
     "ABGR2101010",
     WL_DRM_FORMAT_ABGR2101010, WL_SHM_FORMAT_ABGR2101010,
-     __DRI_IMAGE_FORMAT_ABGR2101010, 32,
+     __DRI_IMAGE_FORMAT_ABGR2101010, __DRI_IMAGE_FORMAT_ARGB2101010, 32,
     { 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 }
   },
   {
     "XRGB8888",
     WL_DRM_FORMAT_XRGB8888, WL_SHM_FORMAT_XRGB8888,
-     __DRI_IMAGE_FORMAT_XRGB8888, 32,
+     __DRI_IMAGE_FORMAT_XRGB8888, __DRI_IMAGE_FORMAT_NONE, 32,
     { 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 }
   },
   {
     "ARGB8888",
     WL_DRM_FORMAT_ARGB8888, WL_SHM_FORMAT_ARGB8888,
-     __DRI_IMAGE_FORMAT_ARGB8888, 32,
+     __DRI_IMAGE_FORMAT_ARGB8888, __DRI_IMAGE_FORMAT_NONE, 32,
     { 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 }
   },
   {
     "RGB565",
     WL_DRM_FORMAT_RGB565, WL_SHM_FORMAT_RGB565,
-     __DRI_IMAGE_FORMAT_RGB565, 16,
+     __DRI_IMAGE_FORMAT_RGB565, __DRI_IMAGE_FORMAT_NONE, 16,
     { 0xf800, 0x07e0, 0x001f, 0x0000 }
   },
 };
@@ -166,6 +174,24 @@ dri2_wl_visual_idx_from_shm_format(uint32_t shm_format)
   return -1;
 }

+bool
+dri2_wl_is_format_supported(void* user_data, uint32_t format)
+{
+   _EGLDisplay *disp = (_EGLDisplay *) user_data;
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   int j = dri2_wl_visual_idx_from_fourcc(format);
+
+   if (j == -1)
+      return false;
+
+   for (int i = 0; dri2_dpy->driver_configs[i]; i++)
+      if (j == dri2_wl_visual_idx_from_config(dri2_dpy,
+                                              dri2_dpy->driver_configs[i]))
+         return true;
+
+   return false;
+}
+
 static int
 roundtrip(struct dri2_egl_display *dri2_dpy)
 {
@@ -461,15 +487,29 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
   int use_flags;
   int visual_idx;
   unsigned int dri_image_format;
+   unsigned int linear_dri_image_format;
   uint64_t *modifiers;
   int num_modifiers;

   visual_idx = dri2_wl_visual_idx_from_fourcc(dri2_surf->format);
   assert(visual_idx != -1);
   dri_image_format = dri2_wl_visuals[visual_idx].dri_image_format;
+   linear_dri_image_format = dri_image_format;
   modifiers = u_vector_tail(&dri2_dpy->wl_modifiers[visual_idx]);
   num_modifiers = u_vector_length(&dri2_dpy->wl_modifiers[visual_idx]);

+   /* Substitute dri image format if server does not support original format */
+   if (!(dri2_dpy->formats & (1 << visual_idx)))
+      linear_dri_image_format = dri2_wl_visuals[visual_idx].alt_dri_image_format;
+
+   /* These asserts hold, as long as dri2_wl_visuals[] is self-consistent and
+    * the PRIME substitution logic in dri2_wl_add_configs_for_visuals() is free
+    * of bugs.
+    */
+   assert(linear_dri_image_format != __DRI_IMAGE_FORMAT_NONE);
+   assert(dri2_dpy->formats &
+          (1 << dri2_wl_visual_idx_from_dri_image_format(linear_dri_image_format)));
+
   /* There might be a buffer release already queued that wasn't processed */
   wl_display_dispatch_queue_pending(dri2_dpy->wl_dpy, dri2_surf->wl_queue);

@@ -516,7 +556,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
            dri2_dpy->image->createImageWithModifiers(dri2_dpy->dri_screen,
                                                      dri2_surf->base.Width,
                                                      dri2_surf->base.Height,
-                                                      dri_image_format,
+                                                      linear_dri_image_format,
                                                      &linear_mod,
                                                      1,
                                                      NULL);
@@ -525,7 +565,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
            dri2_dpy->image->createImage(dri2_dpy->dri_screen,
                                         dri2_surf->base.Width,
                                         dri2_surf->base.Height,
-                                         dri_image_format,
+                                         linear_dri_image_format,
                                         use_flags |
                                         __DRI_IMAGE_USE_LINEAR,
                                         NULL);
@@ -1298,8 +1338,11 @@ dri2_wl_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *disp)
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   unsigned int format_count[ARRAY_SIZE(dri2_wl_visuals)] = { 0 };
   unsigned int count = 0;
+   bool assigned;

   for (unsigned i = 0; dri2_dpy->driver_configs[i]; i++) {
+      assigned = false;
+
      for (unsigned j = 0; j < ARRAY_SIZE(dri2_wl_visuals); j++) {
         struct dri2_egl_config *dri2_conf;

@@ -1312,6 +1355,43 @@ dri2_wl_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *disp)
            if (dri2_conf->base.ConfigID == count + 1)
               count++;
            format_count[j]++;
+            assigned = true;
+         }
+      }
+
+      if (!assigned && dri2_dpy->is_different_gpu) {
+         struct dri2_egl_config *dri2_conf;
+         int alt_dri_image_format, c, s;
+
+         /* No match for config. Try if we can blitImage convert to a visual */
+         c = dri2_wl_visual_idx_from_config(dri2_dpy,
+                                            dri2_dpy->driver_configs[i]);
+
+         if (c == -1)
+            continue;
+
+         /* Find optimal target visual for blitImage conversion, if any. */
+         alt_dri_image_format = dri2_wl_visuals[c].alt_dri_image_format;
+         s = dri2_wl_visual_idx_from_dri_image_format(alt_dri_image_format);
+
+         if (s == -1 || !(dri2_dpy->formats & (1 << s)))
+            continue;
+
+         /* Visual s works for the Wayland server, and c can be converted into s
+          * by our client gpu during PRIME blitImage conversion to a linear
+          * wl_buffer, so add visual c as supported by the client renderer.
+          */
+         dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i],
+                                     count + 1, EGL_WINDOW_BIT, NULL,
+                                     dri2_wl_visuals[c].rgba_masks);
+         if (dri2_conf) {
+            if (dri2_conf->base.ConfigID == count + 1)
+               count++;
+            format_count[c]++;
+            if (format_count[c] == 1)
+               _eglLog(_EGL_DEBUG, "Client format %s to server format %s via "
+                       "PRIME blitImage.", dri2_wl_visuals[c].format_name,
+                       dri2_wl_visuals[s].format_name);
         }
      }
   }
--- a/src/egl/wayland/wayland-drm/wayland-drm.c
+++ b/src/egl/wayland/wayland-drm/wayland-drm.c
@@ -111,6 +111,8 @@ drm_create_buffer(struct wl_client *client, struct wl_resource *resource,
 		  uint32_t stride, uint32_t format)
 {
        switch (format) {
+        case WL_DRM_FORMAT_ABGR2101010:
+        case WL_DRM_FORMAT_XBGR2101010:
        case WL_DRM_FORMAT_ARGB2101010:
        case WL_DRM_FORMAT_XRGB2101010:
        case WL_DRM_FORMAT_ARGB8888:
@@ -210,10 +212,31 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id)
 	wl_resource_set_implementation(resource, &drm_interface, data, NULL);

 	wl_resource_post_event(resource, WL_DRM_DEVICE, drm->device_name);
-	wl_resource_post_event(resource, WL_DRM_FORMAT,
-			       WL_DRM_FORMAT_ARGB2101010);
-	wl_resource_post_event(resource, WL_DRM_FORMAT,
-			       WL_DRM_FORMAT_XRGB2101010);
+
+	if (drm->callbacks.is_format_supported(drm->user_data,
+					       WL_DRM_FORMAT_ARGB2101010)) {
+		wl_resource_post_event(resource, WL_DRM_FORMAT,
+				       WL_DRM_FORMAT_ARGB2101010);
+	}
+
+	if (drm->callbacks.is_format_supported(drm->user_data,
+					       WL_DRM_FORMAT_XRGB2101010)) {
+		wl_resource_post_event(resource, WL_DRM_FORMAT,
+				       WL_DRM_FORMAT_XRGB2101010);
+	}
+
+	if (drm->callbacks.is_format_supported(drm->user_data,
+					       WL_DRM_FORMAT_ABGR2101010)) {
+		wl_resource_post_event(resource, WL_DRM_FORMAT,
+				       WL_DRM_FORMAT_ABGR2101010);
+	}
+
+	if (drm->callbacks.is_format_supported(drm->user_data,
+					       WL_DRM_FORMAT_XBGR2101010)) {
+		wl_resource_post_event(resource, WL_DRM_FORMAT,
+				       WL_DRM_FORMAT_XBGR2101010);
+	}
+
 	wl_resource_post_event(resource, WL_DRM_FORMAT,
 			       WL_DRM_FORMAT_ARGB8888);
 	wl_resource_post_event(resource, WL_DRM_FORMAT,
--- a/src/egl/wayland/wayland-drm/wayland-drm.h
+++ b/src/egl/wayland/wayland-drm/wayland-drm.h
@@ -14,6 +14,8 @@ struct wayland_drm_callbacks {
                                 struct wl_drm_buffer *buffer);

 	void (*release_buffer)(void *user_data, struct wl_drm_buffer *buffer);
+
+	bool (*is_format_supported)(void *user_data, uint32_t format);
 };


--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1524,7 +1524,8 @@ tc_buffer_do_flush_region(struct threaded_context *tc,
   if (ttrans->staging) {
      struct pipe_box src_box;

-      u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
+      u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment +
+               (box->x - ttrans->b.box.x),
               box->width, &src_box);

      /* Copy the staging buffer into the original one. */
--- a/src/gallium/drivers/etnaviv/etnaviv_context.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_context.c
@@ -60,6 +60,8 @@ etna_context_destroy(struct pipe_context *pctx)
 {
   struct etna_context *ctx = etna_context(pctx);

+   util_copy_framebuffer_state(&ctx->framebuffer_s, NULL);
+
   if (ctx->primconvert)
      util_primconvert_destroy(ctx->primconvert);

@@ -296,10 +298,10 @@ etna_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
   if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL))
      pctx->flush(pctx, NULL, 0);

-   if (ctx->framebuffer.cbuf)
-      etna_resource(ctx->framebuffer.cbuf->texture)->seqno++;
-   if (ctx->framebuffer.zsbuf)
-      etna_resource(ctx->framebuffer.zsbuf->texture)->seqno++;
+   if (ctx->framebuffer_s.cbufs[0])
+      etna_resource(ctx->framebuffer_s.cbufs[0]->texture)->seqno++;
+   if (ctx->framebuffer_s.zsbuf)
+      etna_resource(ctx->framebuffer_s.zsbuf->texture)->seqno++;
   if (info->index_size && indexbuf != info->index.resource)
      pipe_resource_reference(&indexbuf, NULL);
 }
--- a/src/gallium/drivers/etnaviv/etnaviv_internal.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_internal.h
@@ -182,7 +182,6 @@ struct compiled_viewport_state {

 /* Compiled pipe_framebuffer_state */
 struct compiled_framebuffer_state {
-   struct pipe_surface *cbuf, *zsbuf; /* keep reference to surfaces */
   uint32_t GL_MULTI_SAMPLE_CONFIG;
   uint32_t PE_COLOR_FORMAT;
   uint32_t PE_DEPTH_CONFIG;
--- a/src/gallium/drivers/etnaviv/etnaviv_state.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_state.c
@@ -37,6 +37,7 @@
 #include "etnaviv_surface.h"
 #include "etnaviv_translate.h"
 #include "etnaviv_util.h"
+#include "util/u_framebuffer.h"
 #include "util/u_helpers.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
@@ -130,7 +131,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
      assert(res->layout & ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */
      etna_update_render_resource(pctx, cbuf->base.texture);

-      pipe_surface_reference(&cs->cbuf, &cbuf->base);
      cs->PE_COLOR_FORMAT =
         VIVS_PE_COLOR_FORMAT_FORMAT(translate_rs_format(cbuf->base.format)) |
         VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK |
@@ -182,7 +182,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,

      nr_samples_color = cbuf->base.texture->nr_samples;
   } else {
-      pipe_surface_reference(&cs->cbuf, NULL);
      /* Clearing VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK and
       * VIVS_PE_COLOR_FORMAT_OVERWRITE prevents us from overwriting the
       * color target */
@@ -201,7 +200,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,

      etna_update_render_resource(pctx, zsbuf->base.texture);

-      pipe_surface_reference(&cs->zsbuf, &zsbuf->base);
      assert(res->layout &ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */

      uint32_t depth_format = translate_depth_format(zsbuf->base.format);
@@ -252,7 +250,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,

      nr_samples_depth = zsbuf->base.texture->nr_samples;
   } else {
-      pipe_surface_reference(&cs->zsbuf, NULL);
      cs->PE_DEPTH_CONFIG = VIVS_PE_DEPTH_CONFIG_DEPTH_MODE_NONE;
      cs->PE_DEPTH_ADDR.bo = NULL;
      cs->PE_DEPTH_STRIDE = 0;
@@ -325,7 +322,8 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
    */
   cs->PE_LOGIC_OP = VIVS_PE_LOGIC_OP_SINGLE_BUFFER(ctx->specs.single_buffer ? 3 : 0);

-   ctx->framebuffer_s = *sv; /* keep copy of original structure */
+   /* keep copy of original structure */
+   util_copy_framebuffer_state(&ctx->framebuffer_s, sv);
   ctx->dirty |= ETNA_DIRTY_FRAMEBUFFER | ETNA_DIRTY_DERIVE_TS;
 }

--- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
@@ -430,7 +430,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info)
 		OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) |
 				 A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) |
 				 A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap));
-		OUT_RELOC(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
+		OUT_RELOCW(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
 		OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch));
 		OUT_RING(ring, 0x00000000);
 		OUT_RING(ring, 0x00000000);
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -839,8 +839,7 @@ fd_resource_create(struct pipe_screen *pscreen,

 	rsc->internal_format = format;
 	rsc->cpp = util_format_get_blocksize(format);
-	prsc->nr_samples = MAX2(1, prsc->nr_samples);
-	rsc->cpp *= prsc->nr_samples;
+	rsc->cpp *= fd_resource_nr_samples(prsc);

 	assert(rsc->cpp);

@@ -924,9 +923,9 @@ fd_resource_from_handle(struct pipe_screen *pscreen,
 	if (!rsc->bo)
 		goto fail;

-	prsc->nr_samples = MAX2(1, prsc->nr_samples);
 	rsc->internal_format = tmpl->format;
-	rsc->cpp = prsc->nr_samples * util_format_get_blocksize(tmpl->format);
+	rsc->cpp = util_format_get_blocksize(tmpl->format);
+	rsc->cpp *= fd_resource_nr_samples(prsc);
 	slice->pitch = handle->stride / rsc->cpp;
 	slice->offset = handle->offset;
 	slice->size0 = handle->stride * prsc->height0;
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -178,6 +178,15 @@ fd_resource_level_linear(struct pipe_resource *prsc, int level)
 	return false;
 }

+/* access # of samples, with 0 normalized to 1 (which is what we care about
+ * most of the time)
+ */
+static inline unsigned
+fd_resource_nr_samples(struct pipe_resource *prsc)
+{
+	return MAX2(1, prsc->nr_samples);
+}
+
 void fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, bool discard,
 		enum fd_render_stage stage);
 void fd_blitter_pipe_end(struct fd_context *ctx);
--- a/src/gallium/drivers/freedreno/freedreno_texture.c
+++ b/src/gallium/drivers/freedreno/freedreno_texture.c
@@ -31,6 +31,7 @@

 #include "freedreno_texture.h"
 #include "freedreno_context.h"
+#include "freedreno_resource.h"
 #include "freedreno_util.h"

 static void
@@ -83,7 +84,7 @@ static void set_sampler_views(struct fd_texture_stateobj *tex,
 	tex->num_textures = util_last_bit(tex->valid_textures);

 	for (i = 0; i < tex->num_textures; i++) {
-		uint nr_samples = tex->textures[i]->texture->nr_samples;
+		uint nr_samples = fd_resource_nr_samples(tex->textures[i]->texture);
 		samplers |= (nr_samples >> 1) << (i * 2);
 	}

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1044,7 +1044,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
      break;
   }
   case OP_MUL:
-      if (i->dType == TYPE_F32)
+      if (i->dType == TYPE_F32 && !i->precise)
         tryCollapseChainedMULs(i, s, imm0);

      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -1279,8 +1279,8 @@ nvc0_screen_create(struct nouveau_device *dev)
   for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) {
      BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3);
      PUSH_DATA (push, 1);
-      PUSH_DATA (push, 8192 << 16);
-      PUSH_DATA (push, 8192 << 16);
+      PUSH_DATA (push, 16384 << 16);
+      PUSH_DATA (push, 16384 << 16);
   }

 #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
 	struct r600_resource *rbuffer = r600_resource(transfer->resource);

 	if (stransfer->staging) {
+		unsigned src_offset = stransfer->offset +
+				      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
+				      (box->x - transfer->box.x);
+
 		/* Copy the staging buffer into the original one. */
 		si_copy_buffer((struct si_context*)ctx, transfer->resource,
-			       &stransfer->staging->b.b, box->x,
-			       stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
+			       &stransfer->staging->b.b, box->x, src_offset,
 			       box->width);
 	}

--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -348,20 +348,11 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
 		    key->u.uses_gs)
 			partial_vs_wave = true;

-		/* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
+		/* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= VI) */
 		if (sscreen->has_distributed_tess) {
 			if (key->u.uses_gs) {
-				if (sscreen->info.chip_class <= VI)
+				if (sscreen->info.chip_class == VI)
 					partial_es_wave = true;
-
-				/* GPU hang workaround. */
-				if (sscreen->info.family == CHIP_TONGA ||
-				    sscreen->info.family == CHIP_FIJI ||
-				    sscreen->info.family == CHIP_POLARIS10 ||
-				    sscreen->info.family == CHIP_POLARIS11 ||
-				    sscreen->info.family == CHIP_POLARIS12 ||
-				    sscreen->info.family == CHIP_VEGAM)
-					partial_vs_wave = true;
 			} else {
 				partial_vs_wave = true;
 			}
@@ -417,6 +408,18 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
 		if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
 			ia_switch_on_eoi = true;

+		/* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
+		 * to work around a GS hang.
+		 */
+		if (key->u.uses_gs &&
+		    (sscreen->info.family == CHIP_TONGA ||
+		     sscreen->info.family == CHIP_FIJI ||
+		     sscreen->info.family == CHIP_POLARIS10 ||
+		     sscreen->info.family == CHIP_POLARIS11 ||
+		     sscreen->info.family == CHIP_POLARIS12 ||
+		     sscreen->info.family == CHIP_VEGAM))
+			partial_vs_wave = true;
+
 		/* Required by Hawaii and, for some special cases, by VI. */
 		if (ia_switch_on_eoi &&
 		    (sscreen->info.family == CHIP_HAWAII ||
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1662,7 +1662,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);

 		/* ps_uses_fbfetch is true only if the color buffer is bound. */
-		if (sctx->ps_uses_fbfetch) {
+		if (sctx->ps_uses_fbfetch && !sctx->blitter->running) {
 			struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
 			struct pipe_resource *tex = cb0->texture;

--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -146,6 +146,8 @@ static void si_emit_one_scissor(struct si_context *ctx,
 			S_028254_BR_Y(final.maxy));
 }

+#define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176
+
 static void si_emit_guardband(struct si_context *ctx)
 {
 	const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
@@ -179,13 +181,22 @@ static void si_emit_guardband(struct si_context *ctx)
 	int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
 	int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;

-	const unsigned hw_screen_offset_max = 8176;
 	/* SI-CI need to align the offset to an ubertile consisting of all SEs. */
 	const unsigned hw_screen_offset_alignment =
 		ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);

-	hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, hw_screen_offset_max);
-	hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, hw_screen_offset_max);
+	/* Indexed by quantization modes */
+	static unsigned max_viewport_size[] = {65535, 16383, 4095};
+
+	/* Ensure that the whole viewport stays representable in
+	 * absolute coordinates.
+	 * See comment in si_set_viewport_states.
+	 */
+	assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
+	       vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
+
+	hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+	hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);

 	/* Align the screen offset by dropping the low bits. */
 	hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
@@ -218,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx)
 	 *
 	 * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
 	 */
-	static unsigned max_viewport_size[] = {65535, 16383, 4095};
 	assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
 	max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
 	left   = (-max_range - vp.translate[0]) / vp.scale[0];
@@ -332,6 +342,22 @@ static void si_set_viewport_states(struct pipe_context *pctx,
 		unsigned h = scissor->maxy - scissor->miny;
 		unsigned max_extent = MAX2(w, h);

+		int max_corner = MAX2(scissor->maxx, scissor->maxy);
+
+		unsigned center_x = (scissor->maxx + scissor->minx) / 2;
+		unsigned center_y = (scissor->maxy + scissor->miny) / 2;
+		unsigned max_center = MAX2(center_x, center_y);
+
+		/* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose
+		 * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET.
+		 * (for example, a 1x1 viewport in the lower right corner of
+		 * 16Kx16K) Such viewports need a greater guardband, so they
+		 * have to use a worse quantization mode.
+		 */
+		unsigned distance_off_center =
+			MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+		max_extent += distance_off_center;
+
 		/* Determine the best quantization mode (subpixel precision),
 		 * but also leave enough space for the guardband.
 		 *
@@ -343,7 +369,22 @@ static void si_set_viewport_states(struct pipe_context *pctx,
 		if (ctx->family == CHIP_RAVEN)
 			max_extent = 16384; /* Use QUANT_MODE == 16_8. */

-		if (max_extent <= 1024) /* 4K scanline area for guardband */
+		/* Another constraint is that all coordinates in the viewport
+		 * are representable in fixed point with respect to the
+		 * surface origin.
+		 *
+		 * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
+		 * an offset that would make the upper corner of the viewport
+		 * greater than the maximum representable number post
+		 * quantization, ie 2^quant_bits.
+		 *
+		 * This does not matter for 14.10 and 16.8 formats since the
+		 * offset is already limited at 8k, but it means we can't use
+		 * 12.12 if we are drawing to some pixels outside the lower
+		 * 4k x 4k of the render target.
+		 */
+
+		if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
 			scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
 		else if (max_extent <= 4096) /* 16K scanline area for guardband */
 			scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -190,11 +190,7 @@ swr_arch_libs = []
 swr_arch_defines = []

 swr_avx_args = cpp.first_supported_argument(
-  '-target-cpu=sandybridge', '-mavx', '-march=core-avx', '-tp=sandybridge',
-  prefix : '''
-    #if !defined(__AVX__)
-    # error
-    #endif ''',
+  '-mavx', '-target-cpu=sandybridge', '-march=core-avx', '-tp=sandybridge',
 )
 if swr_avx_args == []
  error('Cannot find AVX support for swr. (these are required for SWR an all architectures.)')
@@ -215,18 +211,10 @@ endif

 if with_swr_arches.contains('avx2')
  swr_avx2_args = cpp.first_supported_argument(
-    '-target-cpu=haswell', '-march=core-avx2', '-tp=haswell',
-    prefix : '''
-      #if !defined(__AVX2__)
-      # error
-      #endif ''',
+    '-march=core-avx2', '-target-cpu=haswell', '-tp=haswell',
  )
  if swr_avx2_args == []
-    if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c'],
-                        prefix : '''
-                          #if !defined(__AVX2__)
-                          # error
-                          #endif ''')
+    if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c'])
      swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c']
    else
      error('Cannot find AVX2 support for swr.')
@@ -248,11 +236,7 @@ endif

 if with_swr_arches.contains('knl')
  swr_knl_args = cpp.first_supported_argument(
-    '-target-cpu=mic-knl', '-march=knl', '-xMIC-AVX512',
-    prefix : '''
-      #if !defined(__AVX512F__) || !defined(__AVX512ER__)
-      # error
-      #endif ''',
+    '-march=knl', '-target-cpu=mic-knl', '-xMIC-AVX512',
  )
  if swr_knl_args == []
    error('Cannot find KNL support for swr.')
@@ -264,7 +248,7 @@ if with_swr_arches.contains('knl')
    [files_swr_common, files_swr_arch],
    cpp_args : [
      swr_cpp_args, swr_knl_args, '-DKNOB_ARCH=KNOB_ARCH_AVX512',
-      '-DKNOB_ARCH_KNIGHTS',
+      '-DSIMD_ARCH_KNIGHTS',
    ],
    link_args : [ld_args_gc_sections],
    include_directories : [swr_incs],
@@ -276,11 +260,7 @@ endif

 if with_swr_arches.contains('skx')
  swr_skx_args = cpp.first_supported_argument(
-    '-target-cpu=x86-skylake', '-march=skylake-avx512', '-xCORE-AVX512',
-    prefix : '''
-      #if !defined(__AVX512F__) || !defined(__AVX512BW__)
-      # error
-      #endif ''',
+    '-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512',
  )
  if swr_skx_args == []
    error('Cannot find SKX support for swr.')
--- a/src/gallium/drivers/swr/swr_fence.cpp
+++ b/src/gallium/drivers/swr/swr_fence.cpp
@@ -50,7 +50,9 @@ swr_fence_cb(uint64_t userData, uint64_t userData2, uint64_t userData3)
   swr_fence_do_work(fence);

   /* Correct value is in SwrSync data, and not the fence write field. */
-   fence->read = userData2;
+   /* Contexts may not finish in order, but fence value always increases */
+   if (fence->read < userData2)
+      fence->read = userData2;
 }

 /*
--- a/src/gallium/drivers/v3d/v3d_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -669,7 +669,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
                rsc->tiled = false;
        } else {
                fprintf(stderr, "Unsupported modifier requested\n");
-                return NULL;
+                goto fail;
        }

        rsc->internal_format = prsc->format;
--- a/src/gallium/drivers/vc4/meson.build
+++ b/src/gallium/drivers/vc4/meson.build
@@ -81,8 +81,10 @@ files_libvc4 = files(
  'vc4_uniforms.c',
 )

+vc4_c_args = []
+
 libvc4_neon = []
-if with_asm_arch == 'arm'
+if host_machine.cpu_family() == 'arm'
  libvc4_neon = static_library(
    'vc4_neon',
    'vc4_tiling_lt_neon.c',
@@ -91,12 +93,12 @@ if with_asm_arch == 'arm'
    ],
    c_args : '-mfpu=neon',
  )
+  vc4_c_args += '-DUSE_ARM_ASM'
 endif

-simpenrose_c_args = []
 dep_simpenrose = dependency('simpenrose', required : false)
 if dep_simpenrose.found()
-  simpenrose_c_args = '-DUSE_VC4_SIMULATOR'
+  vc4_c_args += '-DUSE_VC4_SIMULATOR'
 endif

 libvc4 = static_library(
@@ -107,7 +109,7 @@ libvc4 = static_library(
    inc_gallium_drivers, inc_drm_uapi,
  ],
  link_with: libvc4_neon,
-  c_args : [c_vis_args, simpenrose_c_args],
+  c_args : [c_vis_args, vc4_c_args],
  cpp_args : [cpp_vis_args],
  dependencies : [dep_simpenrose, dep_libdrm, dep_valgrind, idep_nir_headers],
  build_by_default : false,
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,

        /* We can't mix HW and non-HW queries. */
        if (nhwqueries && nhwqueries != num_queries)
-                return NULL;
+                goto err_free_query;

        if (!nhwqueries)
                return (struct pipe_query *)query;
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -73,42 +73,46 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
-                        "vldm %0, {q0, q1, q2, q3}\n"
+                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
-                        "vst1.8 d0, [%1], %2\n"
-                        "vst1.8 d1, [%1], %2\n"
-                        "vst1.8 d2, [%1], %2\n"
-                        "vst1.8 d3, [%1], %2\n"
-                        "vst1.8 d4, [%1], %2\n"
-                        "vst1.8 d5, [%1], %2\n"
-                        "vst1.8 d6, [%1], %2\n"
-                        "vst1.8 d7, [%1]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d7, [%[cpu]]\n"
+                        : [cpu]         "+r"(cpu)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
-                        "vldm %0, {q0, q1, q2, q3};\n"
+                        "vldm %[gpu], {q0, q1, q2, q3};\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vld1 can only store one d-register
                         * at a time).
                         */
-                        "vst1.8 d0, [%1], %3\n"
-                        "vst1.8 d1, [%2], %3\n"
-                        "vst1.8 d2, [%1], %3\n"
-                        "vst1.8 d3, [%2], %3\n"
-                        "vst1.8 d4, [%1], %3\n"
-                        "vst1.8 d5, [%2], %3\n"
-                        "vst1.8 d6, [%1]\n"
-                        "vst1.8 d7, [%2]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
+                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
+                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
+                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
+                        "vst1.8 d6, [%[cpu]]\n"
+                        "vst1.8 d7, [%[cpu2]]\n"
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
 #elif defined (PIPE_ARCH_AARCH64)
@@ -117,42 +121,46 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
-                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
-                        "st1 {v0.D}[0], [%1], %2\n"
-                        "st1 {v0.D}[1], [%1], %2\n"
-                        "st1 {v1.D}[0], [%1], %2\n"
-                        "st1 {v1.D}[1], [%1], %2\n"
-                        "st1 {v2.D}[0], [%1], %2\n"
-                        "st1 {v2.D}[1], [%1], %2\n"
-                        "st1 {v3.D}[0], [%1], %2\n"
-                        "st1 {v3.D}[1], [%1]\n"
-			:
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v3.D}[1], [%[cpu]]\n"
+                        : [cpu]         "+r"(cpu)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
-                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vld1 can only store one d-register
                         * at a time).
                         */
-                        "st1 {v0.D}[0], [%1], %3\n"
-                        "st1 {v0.D}[1], [%2], %3\n"
-                        "st1 {v1.D}[0], [%1], %3\n"
-                        "st1 {v1.D}[1], [%2], %3\n"
-                        "st1 {v2.D}[0], [%1], %3\n"
-                        "st1 {v2.D}[1], [%2], %3\n"
-                        "st1 {v3.D}[0], [%1]\n"
-                        "st1 {v3.D}[1], [%2]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
+                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
+                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
+                        "st1 {v3.D}[0], [%[cpu]]\n"
+                        "st1 {v3.D}[1], [%[cpu2]]\n"
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
 #else
@@ -174,40 +182,44 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
-                        "vld1.8 d0, [%1], %2\n"
-                        "vld1.8 d1, [%1], %2\n"
-                        "vld1.8 d2, [%1], %2\n"
-                        "vld1.8 d3, [%1], %2\n"
-                        "vld1.8 d4, [%1], %2\n"
-                        "vld1.8 d5, [%1], %2\n"
-                        "vld1.8 d6, [%1], %2\n"
-                        "vld1.8 d7, [%1]\n"
+                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d7, [%[cpu]]\n"
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
-                        "vstm %0, {q0, q1, q2, q3}\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        "vstm %[gpu], {q0, q1, q2, q3}\n"
+                        : [cpu]         "+r"(cpu)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * destination.  (vld1 can only store one d-register
                         * at a time).
                         */
-                        "vld1.8 d0, [%1], %3\n"
-                        "vld1.8 d1, [%2], %3\n"
-                        "vld1.8 d2, [%1], %3\n"
-                        "vld1.8 d3, [%2], %3\n"
-                        "vld1.8 d4, [%1], %3\n"
-                        "vld1.8 d5, [%2], %3\n"
-                        "vld1.8 d6, [%1]\n"
-                        "vld1.8 d7, [%2]\n"
+                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
+                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
+                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
+                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
+                        "vld1.8 d6, [%[cpu]]\n"
+                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
-                        "vstm %0, {q0, q1, q2, q3}\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+                        "vstm %[gpu], {q0, q1, q2, q3}\n"
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
 #elif defined (PIPE_ARCH_AARCH64)
@@ -216,38 +228,42 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
-                        "ld1 {v0.D}[0], [%1], %2\n"
-                        "ld1 {v0.D}[1], [%1], %2\n"
-                        "ld1 {v1.D}[0], [%1], %2\n"
-                        "ld1 {v1.D}[1], [%1], %2\n"
-                        "ld1 {v2.D}[0], [%1], %2\n"
-                        "ld1 {v2.D}[1], [%1], %2\n"
-                        "ld1 {v3.D}[0], [%1], %2\n"
-                        "ld1 {v3.D}[1], [%1]\n"
+                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
-                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
+                        : [cpu]         "+r"(cpu)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * destination.  (vld1 can only store one d-register
                         * at a time).
                         */
-                        "ld1 {v0.D}[0], [%1], %3\n"
-                        "ld1 {v0.D}[1], [%2], %3\n"
-                        "ld1 {v1.D}[0], [%1], %3\n"
-                        "ld1 {v1.D}[1], [%2], %3\n"
-                        "ld1 {v2.D}[0], [%1], %3\n"
-                        "ld1 {v2.D}[1], [%2], %3\n"
-                        "ld1 {v3.D}[0], [%1]\n"
-                        "ld1 {v3.D}[1], [%2]\n"
+                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
+                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
+                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
+                        "ld1 {v3.D}[0], [%[cpu]]\n"
+                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
-                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
+                        : [gpu]         "r"(gpu),
+                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
 #else
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -70,7 +70,8 @@ enum pipe_video_profile
   PIPE_VIDEO_PROFILE_HEVC_MAIN_444,
   PIPE_VIDEO_PROFILE_JPEG_BASELINE,
   PIPE_VIDEO_PROFILE_VP9_PROFILE0,
-   PIPE_VIDEO_PROFILE_VP9_PROFILE2
+   PIPE_VIDEO_PROFILE_VP9_PROFILE2,
+   PIPE_VIDEO_PROFILE_MAX
 };

 /* Video caps, can be different for each codec/profile */
--- a/src/gallium/include/state_tracker/drisw_api.h
+++ b/src/gallium/include/state_tracker/drisw_api.h
@@ -20,7 +20,7 @@ struct drisw_loader_funcs
   void (*put_image2) (struct dri_drawable *dri_drawable,
                       void *data, int x, int y, unsigned width, unsigned height, unsigned stride);
   void (*put_image_shm) (struct dri_drawable *dri_drawable,
-                          int shmid, char *shmaddr, unsigned offset,
+                          int shmid, char *shmaddr, unsigned offset, unsigned offset_x,
                          int x, int y, unsigned width, unsigned height, unsigned stride);
 };

--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -79,15 +79,21 @@ put_image2(__DRIdrawable *dPriv, void *data, int x, int y,

 static inline void
 put_image_shm(__DRIdrawable *dPriv, int shmid, char *shmaddr,
-              unsigned offset, int x, int y,
+              unsigned offset, unsigned offset_x, int x, int y,
              unsigned width, unsigned height, unsigned stride)
 {
   __DRIscreen *sPriv = dPriv->driScreenPriv;
   const __DRIswrastLoaderExtension *loader = sPriv->swrast_loader;

-   loader->putImageShm(dPriv, __DRI_SWRAST_IMAGE_OP_SWAP,
-                       x, y, width, height, stride,
-                       shmid, shmaddr, offset, dPriv->loaderPrivate);
+   /* if we have the newer interface, don't have to add the offset_x here. */
+   if (loader->base.version > 4 && loader->putImageShm2)
+     loader->putImageShm2(dPriv, __DRI_SWRAST_IMAGE_OP_SWAP,
+                          x, y, width, height, stride,
+                          shmid, shmaddr, offset, dPriv->loaderPrivate);
+   else
+     loader->putImageShm(dPriv, __DRI_SWRAST_IMAGE_OP_SWAP,
+                         x, y, width, height, stride,
+                         shmid, shmaddr, offset + offset_x, dPriv->loaderPrivate);
 }

 static inline void
@@ -179,12 +185,13 @@ drisw_put_image2(struct dri_drawable *drawable,
 static inline void
 drisw_put_image_shm(struct dri_drawable *drawable,
                    int shmid, char *shmaddr, unsigned offset,
+                    unsigned offset_x,
                    int x, int y, unsigned width, unsigned height,
                    unsigned stride)
 {
   __DRIdrawable *dPriv = drawable->dPriv;

-   put_image_shm(dPriv, shmid, shmaddr, offset, x, y, width, height, stride);
+   put_image_shm(dPriv, shmid, shmaddr, offset, offset_x, x, y, width, height, stride);
 }

 static inline void
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -668,6 +668,19 @@ NineSurface9_CopyMemToDefault( struct NineSurface9 *This,
                            From->data, From->stride,
                            0, /* depth = 1 */
                            &src_box);
+    if (From->texture == D3DRTYPE_TEXTURE) {
+        struct NineTexture9 *tex =
+            NineTexture9(From->base.base.container);
+        /* D3DPOOL_SYSTEMMEM with buffer content passed
+         * from the user: execute the upload right now.
+         * It is possible it is enough to delay upload
+         * until the surface refcount is 0, but the
+         * bind refcount may not be 0, and thus the dtor
+         * is not executed (and doesn't trigger the
+         * pending_uploads_counter check). */
+        if (!tex->managed_buffer)
+            nine_csmt_process(This->base.base.device);
+    }

    if (This->data_conversion)
        (void) util_format_translate(This->format_conversion,
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
   ctx->version_minor = 1;
   *ctx->vtable = vtable;
   *ctx->vtable_vpp = vtable_vpp;
-   ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN;
+   ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1;
   ctx->max_entrypoints = 2;
   ctx->max_attributes = 1;
   ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS;
--- a/src/gallium/state_trackers/va/picture_vp9.c
+++ b/src/gallium/state_trackers/va/picture_vp9.c
@@ -28,6 +28,8 @@
 #include "vl/vl_vlc.h"
 #include "va_private.h"

+#define NUM_VP9_REFS 8
+
 void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
   VADecPictureParameterBufferVP9 *vp9 = buf->data;
@@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context,

   context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth;

-   for (i = 0 ; i < 8 ; i++)
+   for (i = 0 ; i < NUM_VP9_REFS ; i++)
      vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]);
+
+   if (!context->decoder && !context->templat.max_references)
+      context->templat.max_references = NUM_VP9_REFS;
 }

 void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf)
--- a/src/gallium/state_trackers/vdpau/meson.build
+++ b/src/gallium/state_trackers/vdpau/meson.build
@@ -18,13 +18,20 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+VDPAU_MAJOR = 1
+VDPAU_MINOR = 0
+
 libvdpau_st = static_library(
  'vdpau_st',
  files(
    'bitmap.c', 'decode.c', 'device.c', 'ftab.c', 'htab.c', 'mixer.c',
    'output.c', 'preemption.c', 'presentation.c', 'query.c', 'surface.c',
  ),
-  c_args : [c_vis_args, '-DVER_MAJOR=1', '-DVER_MINOR=0'],
+  c_args : [
+    c_vis_args,
+    '-DVER_MAJOR=@0@'.format(VDPAU_MAJOR),
+    '-DVER_MINOR=@0@'.format(VDPAU_MINOR),
+  ],
  include_directories : [
    inc_include, inc_src, inc_util, inc_gallium, inc_gallium_aux,
  ],
--- a/src/gallium/state_trackers/xvmc/attributes.c
+++ b/src/gallium/state_trackers/xvmc/attributes.c
@@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
   if (!attr)
      return XvMCBadContext;

-   if (strcmp(attr, XV_BRIGHTNESS))
+   if (strcmp(attr, XV_BRIGHTNESS) == 0)
      context_priv->procamp.brightness = value / 1000.0f;
-   else if (strcmp(attr, XV_CONTRAST))
+   else if (strcmp(attr, XV_CONTRAST) == 0)
      context_priv->procamp.contrast = value / 1000.0f + 1.0f;
-   else if (strcmp(attr, XV_SATURATION))
+   else if (strcmp(attr, XV_SATURATION) == 0)
      context_priv->procamp.saturation = value / 1000.0f + 1.0f;
-   else if (strcmp(attr, XV_HUE))
+   else if (strcmp(attr, XV_HUE) == 0)
      context_priv->procamp.hue = value / 1000.0f;
-   else if (strcmp(attr, XV_COLORSPACE))
+   else if (strcmp(attr, XV_COLORSPACE) == 0)
      context_priv->color_standard = value ?
         VL_CSC_COLOR_STANDARD_BT_601 :
         VL_CSC_COLOR_STANDARD_BT_709;
@@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
   if (!attr)
      return XvMCBadContext;

-   if (strcmp(attr, XV_BRIGHTNESS))
+   if (strcmp(attr, XV_BRIGHTNESS) == 0)
      *value = context_priv->procamp.brightness * 1000;
-   else if (strcmp(attr, XV_CONTRAST))
+   else if (strcmp(attr, XV_CONTRAST) == 0)
      *value = context_priv->procamp.contrast * 1000 - 1000;
-   else if (strcmp(attr, XV_SATURATION))
+   else if (strcmp(attr, XV_SATURATION) == 0)
      *value = context_priv->procamp.saturation * 1000 + 1000;
-   else if (strcmp(attr, XV_HUE))
+   else if (strcmp(attr, XV_HUE) == 0)
      *value = context_priv->procamp.hue * 1000;
-   else if (strcmp(attr, XV_COLORSPACE))
+   else if (strcmp(attr, XV_COLORSPACE) == 0)
      *value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709;
   else
      return BadName;
--- a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
+++ b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
@@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config)

 			while (token && !fail)
 			{
-				if (strcmp(token, "i"))
+				if (strcmp(token, "i") == 0)
 					config->mb_types |= MB_TYPE_I;
-				else if (strcmp(token, "p"))
+				else if (strcmp(token, "p") == 0)
 					config->mb_types |= MB_TYPE_P;
-				else if (strcmp(token, "b"))
+				else if (strcmp(token, "b") == 0)
 					config->mb_types |= MB_TYPE_B;
 				else
 					fail = 1;
--- a/src/gallium/targets/vdpau/meson.build
+++ b/src/gallium/targets/vdpau/meson.build
@@ -54,13 +54,14 @@ libvdpau_gallium = shared_library(
    dep_thread, driver_r300, driver_r600, driver_radeonsi, driver_nouveau,
  ],
  link_depends : vdpau_link_depends,
+  soversion : '@0@.@1@.0'.format(VDPAU_MAJOR, VDPAU_MINOR),
 )
 foreach d : [[with_gallium_r300, 'r300'],
             [with_gallium_r600, 'r600'],
             [with_gallium_radeonsi, 'radeonsi'],
             [with_gallium_nouveau, 'nouveau']]
  if d[0]
-    vdpau_drivers += 'libvdpau_@0@.so.1.0.0'.format(d[1])
+    vdpau_drivers += 'libvdpau_@0@.so.@1@.@2@.0'.format(d[1], VDPAU_MAJOR, VDPAU_MINOR)
  endif
 endforeach

--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1217,8 +1217,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs)
 {
   struct amdgpu_cs_context *cs = acs->csc;

-   cs->num_fence_dependencies = 0;
-
   amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
   amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
   amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -244,15 +244,20 @@ dri_sw_displaytarget_display(struct sw_winsys *ws,
   unsigned width, height, x = 0, y = 0;
   unsigned blsize = util_format_get_blocksize(dri_sw_dt->format);
   unsigned offset = 0;
+   unsigned offset_x = 0;
   char *data = dri_sw_dt->data;
-
+   bool is_shm = dri_sw_dt->shmid != -1;
   /* Set the width to 'stride / cpp'.
    *
    * PutImage correctly clips to the width of the dst drawable.
    */
   if (box) {
-      offset = (dri_sw_dt->stride * box->y) + box->x * blsize;
+      offset = dri_sw_dt->stride * box->y;
+      offset_x = box->x * blsize;
      data += offset;
+      /* don't add x offset for shm, the put_image_shm will deal with it */
+      if (!is_shm)
+         data += offset_x;
      x = box->x;
      y = box->y;
      width = box->width;
@@ -262,8 +267,8 @@ dri_sw_displaytarget_display(struct sw_winsys *ws,
      height = dri_sw_dt->height;
   }

-   if (dri_sw_dt->shmid != -1) {
-      dri_sw_ws->lf->put_image_shm(dri_drawable, dri_sw_dt->shmid, dri_sw_dt->data, offset,
+   if (is_shm) {
+      dri_sw_ws->lf->put_image_shm(dri_drawable, dri_sw_dt->shmid, dri_sw_dt->data, offset, offset_x,
                                   x, y, width, height, dri_sw_dt->stride);
      return;
   }
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -396,6 +396,7 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
 {
   struct xlib_displaytarget *xlib_dt;
   unsigned nblocksy, size;
+   int ignore;

   xlib_dt = CALLOC_STRUCT(xlib_displaytarget);
   if (!xlib_dt)
@@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
   xlib_dt->stride = align(util_format_get_stride(format, width), alignment);
   size = xlib_dt->stride * nblocksy;

-   if (!debug_get_option_xlib_no_shm()) {
+   if (!debug_get_option_xlib_no_shm() &&
+       XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) {
      xlib_dt->data = alloc_shm(xlib_dt, size);
      if (xlib_dt->data) {
         xlib_dt->shm = True;
--- a/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
+++ b/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
@@ -37,5 +37,5 @@ vc4_drm_screen_create(int fd)
 struct pipe_screen *
 vc4_drm_screen_create_renderonly(struct renderonly *ro)
 {
-   return vc4_screen_create(fcntl(ro->gpu_fd, F_DUPFD_CLOEXEC, 3), ro);
+   return vc4_screen_create(ro->gpu_fd, ro);
 }
--- a/src/glx/drisw_glx.c
+++ b/src/glx/drisw_glx.c
@@ -201,7 +201,8 @@ bytes_per_line(unsigned pitch_bits, unsigned mul)

 static void
 swrastXPutImage(__DRIdrawable * draw, int op,
-                int x, int y, int w, int h, int stride,
+                int srcx, int srcy, int x, int y,
+                int w, int h, int stride,
                int shmid, char *data, void *loaderPrivate)
 {
   struct drisw_drawable *pdp = loaderPrivate;
@@ -235,12 +236,12 @@ swrastXPutImage(__DRIdrawable * draw, int op,
   if (pdp->shminfo.shmid >= 0) {
      ximage->width = ximage->bytes_per_line / ((ximage->bits_per_pixel + 7)/ 8);
      ximage->height = h;
-      XShmPutImage(dpy, drawable, gc, ximage, 0, 0, x, y, w, h, False);
+      XShmPutImage(dpy, drawable, gc, ximage, srcx, srcy, x, y, w, h, False);
      XSync(dpy, False);
   } else {
      ximage->width = w;
      ximage->height = h;
-      XPutImage(dpy, drawable, gc, ximage, 0, 0, x, y, w, h);
+      XPutImage(dpy, drawable, gc, ximage, srcx, srcy, x, y, w, h);
   }
   ximage->data = NULL;
 }
@@ -254,7 +255,21 @@ swrastPutImageShm(__DRIdrawable * draw, int op,
   struct drisw_drawable *pdp = loaderPrivate;

   pdp->shminfo.shmaddr = shmaddr;
-   swrastXPutImage(draw, op, x, y, w, h, stride, shmid,
+   swrastXPutImage(draw, op, 0, 0, x, y, w, h, stride, shmid,
+                   shmaddr + offset, loaderPrivate);
+}
+
+static void
+swrastPutImageShm2(__DRIdrawable * draw, int op,
+                   int x, int y,
+                   int w, int h, int stride,
+		   int shmid, char *shmaddr, unsigned offset,
+		   void *loaderPrivate)
+{
+   struct drisw_drawable *pdp = loaderPrivate;
+
+   pdp->shminfo.shmaddr = shmaddr;
+   swrastXPutImage(draw, op, x, 0, x, y, w, h, stride, shmid,
                   shmaddr + offset, loaderPrivate);
 }

@@ -263,7 +278,7 @@ swrastPutImage2(__DRIdrawable * draw, int op,
                int x, int y, int w, int h, int stride,
                char *data, void *loaderPrivate)
 {
-   swrastXPutImage(draw, op, x, y, w, h, stride, -1,
+   swrastXPutImage(draw, op, 0, 0, x, y, w, h, stride, -1,
                   data, loaderPrivate);
 }

@@ -272,7 +287,7 @@ swrastPutImage(__DRIdrawable * draw, int op,
               int x, int y, int w, int h,
               char *data, void *loaderPrivate)
 {
-   swrastXPutImage(draw, op, x, y, w, h, 0, -1,
+   swrastXPutImage(draw, op, 0, 0, x, y, w, h, 0, -1,
                   data, loaderPrivate);
 }

@@ -340,7 +355,7 @@ swrastGetImageShm(__DRIdrawable * read,
 }

 static const __DRIswrastLoaderExtension swrastLoaderExtension_shm = {
-   .base = {__DRI_SWRAST_LOADER, 4 },
+   .base = {__DRI_SWRAST_LOADER, 5 },

   .getDrawableInfo     = swrastGetDrawableInfo,
   .putImage            = swrastPutImage,
@@ -349,6 +364,7 @@ static const __DRIswrastLoaderExtension swrastLoaderExtension_shm = {
   .getImage2           = swrastGetImage2,
   .putImageShm         = swrastPutImageShm,
   .getImageShm         = swrastGetImageShm,
+   .putImageShm2        = swrastPutImageShm2,
 };

 static const __DRIextension *loader_extensions_shm[] = {
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -251,6 +251,7 @@ fs_inst::is_send_from_grf() const
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_IMAGE_SIZE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
@@ -892,6 +893,7 @@ fs_inst::size_read(int arg) const
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_IMAGE_SIZE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -371,6 +371,20 @@ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
   return true;
 }

+static bool
+instruction_requires_packed_data(fs_inst *inst)
+{
+   switch (inst->opcode) {
+   case FS_OPCODE_DDX_FINE:
+   case FS_OPCODE_DDX_COARSE:
+   case FS_OPCODE_DDY_FINE:
+   case FS_OPCODE_DDY_COARSE:
+      return true;
+   default:
+      return false;
+   }
+}
+
 bool
 fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 {
@@ -417,6 +431,13 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
       inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
      return false;

+   /* Some instructions implemented in the generator backend, such as
+    * derivatives, assume that their operands are packed so we can't
+    * generally propagate strided regions to them.
+    */
+   if (instruction_requires_packed_data(inst) && entry->src.stride > 1)
+      return false;
+
   /* Bail if the result of composing both strides would exceed the
    * hardware limit.
    */
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
       * messages adding a node interference to the grf127_send_hack_node.
       * This node has a fixed asignment to grf127.
       *
-       * We don't apply it to SIMD16 because previous code avoids any register
-       * overlap between sources and destination.
+       * We don't apply it to SIMD16 instructions because previous code avoids
+       * any register overlap between sources and destination.
       */
      ra_set_node_reg(g, grf127_send_hack_node, 127);
-      if (dispatch_width == 8) {
-         foreach_block_and_inst(block, fs_inst, inst, cfg) {
-            if (inst->is_send_from_grf() && inst->dst.file == VGRF)
-               ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
-         }
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->exec_size < 16 && inst->is_send_from_grf() &&
+             inst->dst.file == VGRF)
+            ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
      }

      if (spilled_any_registers) {
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -94,7 +94,22 @@ VkResult anv_CreateDescriptorSetLayout(
   uint32_t immutable_sampler_count = 0;
   for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
      max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
-      if (pCreateInfo->pBindings[j].pImmutableSamplers)
+
+      /* From the Vulkan 1.1.97 spec for VkDescriptorSetLayoutBinding:
+       *
+       *    "If descriptorType specifies a VK_DESCRIPTOR_TYPE_SAMPLER or
+       *    VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER type descriptor, then
+       *    pImmutableSamplers can be used to initialize a set of immutable
+       *    samplers. [...]  If descriptorType is not one of these descriptor
+       *    types, then pImmutableSamplers is ignored.
+       *
+       * We need to be careful here and only parse pImmutableSamplers if we
+       * have one of the right descriptor types.
+       */
+      VkDescriptorType desc_type = pCreateInfo->pBindings[j].descriptorType;
+      if ((desc_type == VK_DESCRIPTOR_TYPE_SAMPLER ||
+           desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) &&
+          pCreateInfo->pBindings[j].pImmutableSamplers)
         immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
   }

@@ -153,6 +168,12 @@ VkResult anv_CreateDescriptorSetLayout(
      if (binding == NULL)
         continue;

+      /* We temporarily stashed the pointer to the binding in the
+       * immutable_samplers pointer.  Now that we've pulled it back out
+       * again, we reset immutable_samplers to NULL.
+       */
+      set_layout->binding[b].immutable_samplers = NULL;
+
      if (binding->descriptorCount == 0)
         continue;

@@ -170,6 +191,15 @@ VkResult anv_CreateDescriptorSetLayout(
            set_layout->binding[b].stage[s].sampler_index = sampler_count[s];
            sampler_count[s] += binding->descriptorCount;
         }
+
+         if (binding->pImmutableSamplers) {
+            set_layout->binding[b].immutable_samplers = samplers;
+            samplers += binding->descriptorCount;
+
+            for (uint32_t i = 0; i < binding->descriptorCount; i++)
+               set_layout->binding[b].immutable_samplers[i] =
+                  anv_sampler_from_handle(binding->pImmutableSamplers[i]);
+         }
         break;
      default:
         break;
@@ -221,17 +251,6 @@ VkResult anv_CreateDescriptorSetLayout(
         break;
      }

-      if (binding->pImmutableSamplers) {
-         set_layout->binding[b].immutable_samplers = samplers;
-         samplers += binding->descriptorCount;
-
-         for (uint32_t i = 0; i < binding->descriptorCount; i++)
-            set_layout->binding[b].immutable_samplers[i] =
-               anv_sampler_from_handle(binding->pImmutableSamplers[i]);
-      } else {
-         set_layout->binding[b].immutable_samplers = NULL;
-      }
-
      set_layout->shader_stages |= binding->stageFlags;
   }

--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -980,9 +980,12 @@ void anv_GetPhysicalDeviceProperties(
   const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ?
                                 128 : 16;

+   const uint32_t max_images = devinfo->gen < 9 ? MAX_GEN8_IMAGES : MAX_IMAGES;
+
   VkSampleCountFlags sample_counts =
      isl_device_get_sample_counts(&pdevice->isl_dev);

+
   VkPhysicalDeviceLimits limits = {
      .maxImageDimension1D                      = (1 << 14),
      .maxImageDimension2D                      = (1 << 14),
@@ -1002,7 +1005,7 @@ void anv_GetPhysicalDeviceProperties(
      .maxPerStageDescriptorUniformBuffers      = 64,
      .maxPerStageDescriptorStorageBuffers      = 64,
      .maxPerStageDescriptorSampledImages       = max_samplers,
-      .maxPerStageDescriptorStorageImages       = 64,
+      .maxPerStageDescriptorStorageImages       = max_images,
      .maxPerStageDescriptorInputAttachments    = 64,
      .maxPerStageResources                     = 250,
      .maxDescriptorSetSamplers                 = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
@@ -1011,7 +1014,7 @@ void anv_GetPhysicalDeviceProperties(
      .maxDescriptorSetStorageBuffers           = 6 * 64,           /* number of stages * maxPerStageDescriptorStorageBuffers */
      .maxDescriptorSetStorageBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
      .maxDescriptorSetSampledImages            = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSampledImages */
-      .maxDescriptorSetStorageImages            = 6 * 64,           /* number of stages * maxPerStageDescriptorStorageImages */
+      .maxDescriptorSetStorageImages            = 6 * max_images,   /* number of stages * maxPerStageDescriptorStorageImages */
      .maxDescriptorSetInputAttachments         = 256,
      .maxVertexInputAttributes                 = MAX_VBS,
      .maxVertexInputBindings                   = MAX_VBS,
--- a/src/intel/vulkan/anv_nir.h
+++ b/src/intel/vulkan/anv_nir.h
@@ -40,7 +40,8 @@ bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask);
 bool anv_nir_lower_ycbcr_textures(nir_shader *shader,
                                  struct anv_pipeline_layout *layout);

-void anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
+void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
+                                   bool robust_buffer_access,
                                   struct anv_pipeline_layout *layout,
                                   nir_shader *shader,
                                   struct brw_stage_prog_data *prog_data,
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -428,7 +428,8 @@ setup_vec4_uniform_value(uint32_t *params, uint32_t offset, unsigned n)
 }

 void
-anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
+anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
+                              bool robust_buffer_access,
                              struct anv_pipeline_layout *layout,
                              nir_shader *shader,
                              struct brw_stage_prog_data *prog_data,
@@ -439,7 +440,7 @@ anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
   struct apply_pipeline_layout_state state = {
      .shader = shader,
      .layout = layout,
-      .add_bounds_checks = pipeline->device->robust_buffer_access,
+      .add_bounds_checks = robust_buffer_access,
   };

   void *mem_ctx = ralloc_context(NULL);
@@ -518,8 +519,8 @@ anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
      }
   }

-   if (map->image_count > 0) {
-      assert(map->image_count <= MAX_IMAGES);
+   if (map->image_count > 0 && pdevice->compiler->devinfo->gen < 9) {
+      assert(map->image_count <= MAX_GEN8_IMAGES);
      assert(shader->num_uniforms == prog_data->nr_params * 4);
      state.first_image_uniform = shader->num_uniforms;
      uint32_t *param = brw_stage_prog_data_add_params(prog_data,
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -532,7 +532,9 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,

   /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
   if (layout) {
-      anv_nir_apply_pipeline_layout(pipeline, layout, nir, prog_data,
+      anv_nir_apply_pipeline_layout(&pipeline->device->instance->physicalDevice,
+                                    pipeline->device->robust_buffer_access,
+                                    layout, nir, prog_data,
                                    &stage->bind_map);
   }

--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -157,7 +157,8 @@ struct gen_l3_config;
 #define MAX_SCISSORS    16
 #define MAX_PUSH_CONSTANTS_SIZE 128
 #define MAX_DYNAMIC_BUFFERS 16
-#define MAX_IMAGES 8
+#define MAX_IMAGES 64
+#define MAX_GEN8_IMAGES 8
 #define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */

 /* The kernel relocation API has a limitation of a 32-bit delta value
@@ -1874,7 +1875,7 @@ struct anv_push_constants {
   uint32_t base_work_group_id[3];

   /* Image data for image_load_store on pre-SKL */
-   struct brw_image_param images[MAX_IMAGES];
+   struct brw_image_param images[MAX_GEN8_IMAGES];
 };

 struct anv_dynamic_state {
--- a/src/intel/vulkan/gen7_cmd_buffer.c
+++ b/src/intel/vulkan/gen7_cmd_buffer.c
@@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
      };

      const int max = 0xffff;
+
+      uint32_t y_min = s->offset.y;
+      uint32_t x_min = s->offset.x;
+      uint32_t y_max = s->offset.y + s->extent.height - 1;
+      uint32_t x_max = s->offset.x + s->extent.width - 1;
+
+      /* Do this math using int64_t so overflow gets clamped correctly. */
+      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+         y_min = clamp_int64((uint64_t) y_min,
+                             cmd_buffer->state.render_area.offset.y, max);
+         x_min = clamp_int64((uint64_t) x_min,
+                             cmd_buffer->state.render_area.offset.x, max);
+         y_max = clamp_int64((uint64_t) y_max, 0,
+                             cmd_buffer->state.render_area.offset.y +
+                             cmd_buffer->state.render_area.extent.height - 1);
+         x_max = clamp_int64((uint64_t) x_max, 0,
+                             cmd_buffer->state.render_area.offset.x +
+                             cmd_buffer->state.render_area.extent.width - 1);
+      } else if (fb) {
+         y_min = clamp_int64((uint64_t) y_min, 0, max);
+         x_min = clamp_int64((uint64_t) x_min, 0, max);
+         y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1);
+         x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1);
+      }
+
      struct GEN7_SCISSOR_RECT scissor = {
-         /* Do this math using int64_t so overflow gets clamped correctly. */
-         .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
-         .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
-         .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1),
-         .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1)
+         .ScissorRectangleYMin = y_min,
+         .ScissorRectangleXMin = x_min,
+         .ScissorRectangleYMax = y_max,
+         .ScissorRectangleXMax = x_max
      };

      if (s->extent.width <= 0 || s->extent.height <= 0) {
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -1998,6 +1998,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                   gl_shader_stage stage,
                   struct anv_state *bt_state)
 {
+   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
   struct anv_subpass *subpass = cmd_buffer->state.subpass;
   struct anv_cmd_pipeline_state *pipe_state;
   struct anv_pipeline *pipeline;
@@ -2055,7 +2056,8 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
   if (map->surface_count == 0)
      goto out;

-   if (map->image_count > 0) {
+   /* We only use push constant space for images before gen9 */
+   if (map->image_count > 0 && devinfo->gen < 9) {
      VkResult result =
         anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
      if (result != VK_SUCCESS)
@@ -2168,11 +2170,15 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
         surface_state = sstate.state;
         assert(surface_state.alloc_size);
         add_surface_state_relocs(cmd_buffer, sstate);
+         if (devinfo->gen < 9) {
+            assert(image < MAX_GEN8_IMAGES);
+            struct brw_image_param *image_param =
+               &cmd_buffer->state.push_constants[stage]->images[image];

-         struct brw_image_param *image_param =
-            &cmd_buffer->state.push_constants[stage]->images[image++];
-
-         *image_param = desc->image_view->planes[binding->plane].storage_image_param;
+            *image_param =
+               desc->image_view->planes[binding->plane].storage_image_param;
+         }
+         image++;
         break;
      }

@@ -2217,11 +2223,14 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
         assert(surface_state.alloc_size);
         add_surface_reloc(cmd_buffer, surface_state,
                           desc->buffer_view->address);
+         if (devinfo->gen < 9) {
+            assert(image < MAX_GEN8_IMAGES);
+            struct brw_image_param *image_param =
+               &cmd_buffer->state.push_constants[stage]->images[image];

-         struct brw_image_param *image_param =
-            &cmd_buffer->state.push_constants[stage]->images[image++];
-
-         *image_param = desc->buffer_view->storage_image_param;
+            *image_param = desc->buffer_view->storage_image_param;
+         }
+         image++;
         break;

      default:
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2017-2018 Intel Corporation
+# Copyright © 2017-2019 Intel Corporation

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -176,7 +176,10 @@ endif

 libanv_common = static_library(
  'anv_common',
-  [libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h],
+  [
+    libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h,
+    gen_xml_pack,
+  ],
  include_directories : [
    inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util,
    inc_vulkan_wsi,
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -1273,12 +1273,20 @@ dri3_alloc_render_buffer(struct loader_dri3_drawable *draw, unsigned int format,

         free(mod_reply);

-         buffer->image = draw->ext->image->createImageWithModifiers(draw->dri_screen,
-                                                                    width, height,
-                                                                    format,
-                                                                    modifiers,
-                                                                    count,
-                                                                    buffer);
+         /* don't use createImageWithModifiers() if we have no
+          * modifiers, other things depend on the use flags when
+          * there are no modifiers to know that a buffer can be
+          * shared.
+          */
+         if (modifiers) {
+            buffer->image = draw->ext->image->createImageWithModifiers(draw->dri_screen,
+                                                                       width, height,
+                                                                       format,
+                                                                       modifiers,
+                                                                       count,
+                                                                       buffer);
+         }
+
         free(modifiers);
      }
 #endif
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -222,8 +222,13 @@ void st_init_limits(struct pipe_screen *screen,
      pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents,
                                      MAX_UNIFORMS * 4);

+      /* For ARB programs, prog_src_register::Index is a signed 13-bit number.
+       * This gives us a limit of 4096 values - but we may need to generate
+       * internal values in addition to what the source program uses.  So, we
+       * drop the limit one step lower, to 2048, to be safe.
+       */
      pc->MaxParameters =
-      pc->MaxNativeParameters = pc->MaxUniformComponents / 4;
+      pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048);
      pc->MaxInputComponents =
         screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4;
      pc->MaxOutputComponents =
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -1071,7 +1071,12 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi,
      st_framebuffers_purge(st);
   }
   else {
+      GET_CURRENT_CONTEXT(ctx);
+
      ret = _mesa_make_current(NULL, NULL, NULL);
+
+      if (ctx)
+         st_framebuffers_purge(ctx->st);
   }

   return ret;
@@ -1 +1 @@
 .3.2
 .3.4