docs: Update 11.1.0 release notes

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Update version to 11.1.0(final)
2015-12-15 14:49:25 +00:00 · 2015-12-14 12:20:18 +00:00 · 2015-12-12 19:39:03 +00:00 · 2015-12-12 19:39:03 +00:00 · 2015-12-12 19:39:03 +00:00 · 2015-12-12 19:39:03 +00:00
148 changed files with 4032 additions and 1027 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-11.1.0-devel
+11.1.0
--- a/configure.ac
+++ b/configure.ac
@@ -767,6 +767,11 @@ linux*)
    dri3_default=no
    ;;
 esac
+
+if test "x$enable_dri" = xno; then
+    dri3_default=no
+fi
+
 AC_ARG_ENABLE([dri3],
    [AS_HELP_STRING([--enable-dri3],
        [enable DRI3 @<:@default=auto@:>@])],
@@ -2173,7 +2178,9 @@ if test -n "$with_gallium_drivers"; then
            gallium_require_drm_loader

            PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
-                              [USE_VC4_SIMULATOR=yes], [USE_VC4_SIMULATOR=no])
+                              [USE_VC4_SIMULATOR=yes;
+                               DEFINES="$DEFINES -DUSE_VC4_SIMULATOR"],
+                              [USE_VC4_SIMULATOR=no])
            ;;
        xvirgl)
            HAVE_GALLIUM_VIRGL=yes
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -238,6 +238,12 @@ for details.
 </ul>


+<h3>VA-API state tracker environment variables</h3>
+<ul>
+<li>VAAPI_MPEG4_ENABLED - enable MPEG4 for VA-API, disabled by default.
+</ul>
+
+
 <p>
 Other Gallium drivers have their own environment variables.  These may change
 frequently so the source code should be consulted for details.
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">

-<h1>Mesa 11.1.0 Release Notes / TBD</h1>
+<h1>Mesa 11.1.0 Release Notes / 15 December 2015</h1>

 <p>
 Mesa 11.1.0 is a new development release.
@@ -51,14 +51,20 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_arrays_of_arrays on i965</li>
 <li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
 <li>GL_ARB_clear_texture on nv50, nvc0</li>
+<li>GL_ARB_clip_control on freedreno/a4xx</li>
 <li>GL_ARB_copy_image on nv50, nvc0, radeonsi</li>
+<li>GL_ARB_depth_clamp on freedreno/a4xx</li>
+<li>GL_ARB_fragment_layer_viewport on i965 (gen6+)</li>
 <li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
 <li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
+<li>GL_ARB_seamless_cubemap_per_texture on freedreno/a4xx</li>
 <li>GL_ARB_shader_clock on i965 (gen7+)</li>
 <li>GL_ARB_shader_stencil_export on i965 (gen9+)</li>
 <li>GL_ARB_shader_storage_buffer_object on i965</li>
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
+<li>GL_ARB_texture_buffer_range on freedreno/a3xx</li>
+<li>GL_ARB_texture_compression_bptc on freedreno/a4xx</li>
 <li>GL_ARB_texture_query_lod on softpipe</li>
 <li>GL_ARB_texture_view on radeonsi and r600 (for evergeen and newer)</li>
 <li>GL_ARB_vertex_type_2_10_10_10_rev on freedreno (a3xx, a4xx)</li>
@@ -78,11 +84,196 @@ Note: some of the new features are only available with certain drivers.

 <h2>Bug fixes</h2>

-TBD.
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28130">Bug 28130</a> - vbo: premature flushing breaks GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=38109">Bug 38109</a> - i915 driver crashes if too few vertices are submitted (Mesa 7.10.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=49779">Bug 49779</a> - Extra line segments in GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55552">Bug 55552</a> - Compile errors with --enable-mangling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71789">Bug 71789</a> - [r300g] Visuals not found in (default) depth = 24</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=79783">Bug 79783</a> - Distorted output in obs-studio where other vendors &quot;work&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=80821">Bug 80821</a> - When LIBGL_ALWAYS_SOFTWARE is set, KHR_create_context is not supported</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=81174">Bug 81174</a> - Gallium: GL_LINE_LOOP broken with more than 512 points</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=83508">Bug 83508</a> - [UBO] Assertion for array of blocks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84677">Bug 84677</a> - Triangle disappears with glPolygonMode GL_LINE</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86469">Bug 86469</a> - Unreal Engine demo doesn't run</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89014">Bug 89014</a> - PIPE_QUERY_GPU_FINISHED is not acting as expected on SI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90175">Bug 90175</a> - [hsw bisected][PATCH] atomic counters doesn't work for a binding point different to zero</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90348">Bug 90348</a> - Spilling failure of b96 merged value</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90631">Bug 90631</a> - Compilation failure for fragment shader with many branches on Sandy Bridge</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90734">Bug 90734</a> - glBufferSubData is corrupting data when buffer is &gt; 32k</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90887">Bug 90887</a> - PhiMovesPass in register allocator broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91044">Bug 91044</a> - piglit spec/egl_khr_create_context/valid debug flag gles* fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91114">Bug 91114</a> - ES3-CTS.gtf.GL3Tests.shadow.shadow_execution_vert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91254">Bug 91254</a> - (regresion) video using VA-API on Intel slow and freeze system with mesa 10.6 or 10.6.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91292">Bug 91292</a> - [BDW+] glVertexAttribDivisor not working in combination with glPolygonMode</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91342">Bug 91342</a> - Very dark textures on some objects in indoors environments in Postal 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91526">Bug 91526</a> - World of Warcraft (on Wine) has UI corruption with nouveau</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91551">Bug 91551</a> - DXTn compressed normal maps produce severe artifacts on all NV5x and NVDx chipsets</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91716">Bug 91716</a> - [bisected] piglit.shaders.glsl-vs-int-attrib regresses on 32 bit BYT, HSW, IVB, SNB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91718">Bug 91718</a> - piglit.spec.arb_shader_image_load_store.invalid causes intermittent GPU HANG</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91719">Bug 91719</a> - [SNB,HSW,BYT] dEQP regressions associated with using NIR for vertex shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91726">Bug 91726</a> - R600 asserts in tgsi_cmp/make_src_for_op3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91780">Bug 91780</a> - Rendering issues with geometry shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91785">Bug 91785</a> - make check DispatchSanity_test.GLES31 regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91847">Bug 91847</a> - glGenerateTextureMipmap not working (no errors) unless glActiveTexture(GL_TEXTURE1) is called before</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91857">Bug 91857</a> - Mesa 10.6.3 linker is slow</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91881">Bug 91881</a> - regression: GPU lockups since mesa-11.0.0_rc1 on RV620 (r600) driver</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91890">Bug 91890</a> - [nve7] witcher2: blurry image &amp; DATA_ERRORs (class 0xa097 mthd 0x2380/0x238c)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91898">Bug 91898</a> - src/util/mesa-sha1.c:250:25: fatal error: openssl/sha.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91927">Bug 91927</a> - [SKL] [regression] piglit compressed textures tests fail  with kernel upgrade</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91930">Bug 91930</a> - Program with GtkGLArea widget does not redraw</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91970">Bug 91970</a> - [BSW regression] dEQP-GLES3.functional.shaders.precision.int.highp_mul_vertex</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91985">Bug 91985</a> - [regression, bisected] FTBFS with commit f9caabe8f1: R600_UCP_CONST_BUFFER is undefined</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92009">Bug 92009</a> - ES3-CTS.gtf.GL3Tests.packed_pixels.packed_pixels fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92033">Bug 92033</a> - [SNB,regression,dEQP,bisected] functional.shaders.random tests regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92052">Bug 92052</a> - nir/nir_builder.h:79: error: expected primary-expression before ‘.’ token</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92054">Bug 92054</a> - make check gbm-symbols-check regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92066">Bug 92066</a> - [ILK,G45,regression] New assertion on BRW_MAX_MRF breaks ilk and g45</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92072">Bug 92072</a> - Wine breakage since d082c5324 (st/mesa: don't call st_validate_state in BlitFramebuffer)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92095">Bug 92095</a> - [Regression, bisected] arb_shader_atomic_counters.compiler.builtins.frag</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92122">Bug 92122</a> - [bisected, cts] Regression with Assault Android Cactus</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92124">Bug 92124</a> - shader_query.cpp:841:34: error: ‘strndup’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92183">Bug 92183</a> - linker.cpp:3187:46: error: ‘strtok_r’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92193">Bug 92193</a> - [SKL] ES2-CTS.gtf.GL2ExtensionTests.compressed_astc_texture.compressed_astc_texture fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92221">Bug 92221</a> - Unintended code changes in _mesa_base_tex_format commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92265">Bug 92265</a> - Black windows in weston after update mesa to 11.0.2-1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92363">Bug 92363</a> - [BSW/BDW] ogles1conform Gets test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92438">Bug 92438</a> - Segfault in pushbuf_kref when running the android emulator (qemu) on nv50</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92588">Bug 92588</a> - [HSW,BDW,BSW,SKL-Y][GLES 3.1 CTS] ES31-CTS.arrays_of_arrays.InteractionFunctionCalls2 - assert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92621">Bug 92621</a> - [G965 ILK G45] Regression: 24 piglit regressions in glsl-1.10</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92634">Bug 92634</a> - gallium's vl_mpeg12_decoder does not work with st/va</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92639">Bug 92639</a> - [Regression bisected] Ogles1conform mustpass.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92641">Bug 92641</a> - [SKL BSW] [Regression] Ogles1conform userclip.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92645">Bug 92645</a> - kodi vdpau interop fails since  mesa,meta: move gl_texture_object::TargetIndex initializations</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92705">Bug 92705</a> - [clover] fail to build with llvm-svn/clang-svn 3.8</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92709">Bug 92709</a> - &quot;LLVM triggered Diagnostic Handler: unsupported call to function ldexpf in main&quot; when starting race in stuntrally</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92738">Bug 92738</a> - Randon R7 240 doesn't work on 16KiB page size platform</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92744">Bug 92744</a> - [g965 Regression bisected] Performance regression and piglit assertions due to liveness analysis</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92770">Bug 92770</a> - [SNB, regression, dEQP] deqp-gles3.functional.shaders.discard.dynamic_loop_texture</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92824">Bug 92824</a> - [regression, bisected] `make check` dispatch-sanity broken by GL_EXT_buffer_storage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92859">Bug 92859</a> - [regression, bisected] validate_intrinsic_instr: Assertion triggered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92860">Bug 92860</a> - [radeonsi][bisected] st/mesa: implement ARB_copy_image - Corruption in ARK Survival Evolved</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92900">Bug 92900</a> - [regression bisected] About 700 piglit regressions is what could go wrong</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92985">Bug 92985</a> - Mac OS X build error &quot;ar: no archive members specified&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93015">Bug 93015</a> - Tonga Elemental segfault + VM faults since  radeon: implement r600_query_hw_get_result via function pointers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93048">Bug 93048</a> - [CTS regression] mesa af2723 breaks GL Conformance for debug extension</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93063">Bug 93063</a> - drm_helper.h:227:1: error: static declaration of ‘pipe_virgl_create_screen’ follows non-static declaration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93091">Bug 93091</a> - [opencl] segfault when running any opencl programs (like clinfo)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93126">Bug 93126</a> - wrongly claim supporting GL_EXT_texture_rg</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93180">Bug 93180</a> - [regression] arb_separate_shader_objects.active sampler conflict fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93235">Bug 93235</a> - [regression] dispatch sanity broken by GetPointerv</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+

 <h2>Changes</h2>

-TBD.
+<li>MPEG4 decoding has been disabled by default in the VAAPI driver</li>

 </div>
 </body>
--- a/include/GLES2/gl2ext.h
+++ b/include/GLES2/gl2ext.h
--- a/src/gallium/auxiliary/pipe-loader/Makefile.am
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -37,12 +37,12 @@ libpipe_loader_static_la_SOURCES += \
 libpipe_loader_dynamic_la_SOURCES += \
 	$(DRM_SOURCES)

+endif
+
 libpipe_loader_static_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la

 libpipe_loader_dynamic_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la

-endif
-
 EXTRA_DIST = SConscript
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -94,6 +94,18 @@ static const struct drm_driver_descriptor driver_descriptors[] = {
        .create_screen = pipe_i915_create_screen,
        .configuration = configuration_query,
    },
+#ifdef USE_VC4_SIMULATOR
+    /* VC4 simulator and ILO (i965) are mutually exclusive (error at
+     * configure). As the latter is unconditionally added, keep this one above
+     * it.
+     */
+    {
+        .name = "i965",
+        .driver_name = "vc4",
+        .create_screen = pipe_vc4_create_screen,
+        .configuration = configuration_query,
+    },
+#endif
    {
        .name = "i965",
        .driver_name = "i915",
@@ -154,14 +166,6 @@ static const struct drm_driver_descriptor driver_descriptors[] = {
        .create_screen = pipe_vc4_create_screen,
        .configuration = configuration_query,
    },
-#ifdef USE_VC4_SIMULATOR
-    {
-        .name = "i965",
-        .driver_name = "vc4",
-        .create_screen = pipe_vc4_create_screen,
-        .configuration = configuration_query,
-    },
-#endif
 };
 #endif

--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -33,9 +33,10 @@
 #include "sw/kms-dri/kms_dri_sw_winsys.h"
 #include "sw/null/null_sw_winsys.h"
 #include "sw/wrapper/wrapper_sw_winsys.h"
-#include "target-helpers/inline_sw_helper.h"
+#include "target-helpers/sw_helper_public.h"
 #include "state_tracker/drisw_api.h"
 #include "state_tracker/sw_driver.h"
+#include "state_tracker/sw_winsys.h"

 struct pipe_loader_sw_device {
   struct pipe_loader_device base;
@@ -136,7 +137,7 @@ pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, struct drisw_loader_f
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "dri") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys(drisw_lf);
         break;
@@ -168,7 +169,7 @@ pipe_loader_sw_probe_kms(struct pipe_loader_device **devs, int fd)
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "kms_dri") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys(fd);
         break;
@@ -199,7 +200,7 @@ pipe_loader_sw_probe_null(struct pipe_loader_device **devs)
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "null") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys();
         break;
@@ -222,7 +223,7 @@ pipe_loader_sw_probe(struct pipe_loader_device **devs, int ndev)
 {
   int i = 1;

-   if (i < ndev) {
+   if (i <= ndev) {
      if (!pipe_loader_sw_probe_null(devs)) {
         i--;
      }
@@ -244,7 +245,7 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "wrapped") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys(screen);
         break;
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -223,7 +223,7 @@ pipe_freedreno_create_screen(int fd)
 #include "virgl/drm/virgl_drm_public.h"
 #include "virgl/virgl_public.h"

-static struct pipe_screen *
+struct pipe_screen *
 pipe_virgl_create_screen(int fd)
 {
   struct virgl_winsys *vws;
--- a/src/gallium/auxiliary/target-helpers/sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/sw_helper.h
@@ -0,0 +1,73 @@
+
+#ifndef SW_HELPER_H
+#define SW_HELPER_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "target-helpers/sw_helper_public.h"
+#include "state_tracker/sw_winsys.h"
+
+
+/* Helper function to choose and instantiate one of the software rasterizers:
+ * llvmpipe, softpipe.
+ */
+
+#ifdef GALLIUM_SOFTPIPE
+#include "softpipe/sp_public.h"
+#endif
+
+#ifdef GALLIUM_LLVMPIPE
+#include "llvmpipe/lp_public.h"
+#endif
+
+#ifdef GALLIUM_VIRGL
+#include "virgl/virgl_public.h"
+#include "virgl/vtest/virgl_vtest_public.h"
+#endif
+
+static inline struct pipe_screen *
+sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
+{
+   struct pipe_screen *screen = NULL;
+
+#if defined(GALLIUM_LLVMPIPE)
+   if (screen == NULL && strcmp(driver, "llvmpipe") == 0)
+      screen = llvmpipe_create_screen(winsys);
+#endif
+
+#if defined(GALLIUM_VIRGL)
+   if (screen == NULL && strcmp(driver, "virpipe") == 0) {
+      struct virgl_winsys *vws;
+      vws = virgl_vtest_winsys_wrap(winsys);
+      screen = virgl_create_screen(vws);
+   }
+#endif
+
+#if defined(GALLIUM_SOFTPIPE)
+   if (screen == NULL)
+      screen = softpipe_create_screen(winsys);
+#endif
+
+   return screen;
+}
+
+
+struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   const char *default_driver;
+   const char *driver;
+
+#if defined(GALLIUM_LLVMPIPE)
+   default_driver = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   default_driver = "softpipe";
+#else
+   default_driver = "";
+#endif
+
+   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
+   return sw_screen_create_named(winsys, driver);
+}
+
+#endif
--- a/src/gallium/auxiliary/target-helpers/sw_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/sw_helper_public.h
@@ -0,0 +1,10 @@
+#ifndef _SW_HELPER_PUBLIC_H
+#define _SW_HELPER_PUBLIC_H
+
+struct pipe_screen;
+struct sw_winsys;
+
+struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys);
+
+#endif /* _SW_HELPER_PUBLIC_H */
--- a/src/gallium/auxiliary/vl/vl_video_buffer.c
+++ b/src/gallium/auxiliary/vl/vl_video_buffer.c
@@ -115,7 +115,7 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format)
      return const_resource_formats_VUYA;

   case PIPE_FORMAT_R8G8B8X8_UNORM:
-      return const_resource_formats_VUYX;
+      return const_resource_formats_YUVX;

   case PIPE_FORMAT_B8G8R8X8_UNORM:
      return const_resource_formats_VUYX;
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -392,7 +392,7 @@ vl_dri2_screen_create(Display *display, int screen)
      goto free_connect;

   if (drmGetMagic(fd, &magic))
-      goto free_connect;
+      goto close_fd;

   authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn,
                                                         get_xcb_screen(s, screen)->root,
@@ -402,7 +402,7 @@ vl_dri2_screen_create(Display *display, int screen)
   if (authenticate == NULL || !authenticate->authenticated)
      goto free_authenticate;

-   if (pipe_loader_drm_probe_fd(&scrn->base.dev, dup(fd)))
+   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
      scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev);

   if (!scrn->base.pscreen)
@@ -428,8 +428,11 @@ vl_dri2_screen_create(Display *display, int screen)
 release_pipe:
   if (scrn->base.dev)
      pipe_loader_release(&scrn->base.dev, 1);
+   fd = -1;
 free_authenticate:
   free(authenticate);
+close_fd:
+   close(fd);
 free_connect:
   free(connect);
 free_query:
--- a/src/gallium/auxiliary/vl/vl_winsys_drm.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -41,12 +41,16 @@ struct vl_screen *
 vl_drm_screen_create(int fd)
 {
   struct vl_screen *vscreen;
+   int new_fd = -1;

   vscreen = CALLOC_STRUCT(vl_screen);
   if (!vscreen)
      return NULL;

-   if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd)))
+   if (fd < 0 || (new_fd = dup(fd)) < 0)
+      goto error;
+
+   if (pipe_loader_drm_probe_fd(&vscreen->dev, new_fd))
      vscreen->pscreen = pipe_loader_create_screen(vscreen->dev);

   if (!vscreen->pscreen)
@@ -63,6 +67,8 @@ vl_drm_screen_create(int fd)
 error:
   if (vscreen->dev)
      pipe_loader_release(&vscreen->dev, 1);
+   else
+      close(new_fd);

   FREE(vscreen);
   return NULL;
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -627,7 +627,7 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_ENABLE_BLEND(uint32_t val)
 {
 	return ((val) << A4XX_RB_FS_OUTPUT_ENABLE_BLEND__SHIFT) & A4XX_RB_FS_OUTPUT_ENABLE_BLEND__MASK;
 }
-#define A4XX_RB_FS_OUTPUT_FAST_CLEAR				0x00000100
+#define A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND			0x00000100
 #define A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK			0xffff0000
 #define A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT			16
 static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
@@ -137,7 +137,8 @@ fd4_blend_state_create(struct pipe_context *pctx,
 			so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS);
 	}

-	so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend);
+	so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend) |
+		COND(cso->independent_blend_enable, A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND);

 	return so;
 }
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -194,7 +194,7 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			if (view->base.texture) {
 				struct fd_resource *rsc = fd_resource(view->base.texture);
 				uint32_t offset = fd_resource_offset(rsc, start, 0);
-				OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+				OUT_RELOC(ring, rsc->bo, offset, view->texconst4, 0);
 			} else {
 				OUT_RING(ring, 0x00000000);
 			}
@@ -497,11 +497,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RINGP(ring, val, &fd4_context(ctx)->rbrc_patches);
 	}

-	if (dirty & FD_DIRTY_ZSA) {
+	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) {
 		struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa);
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		uint32_t rb_alpha_control = zsa->rb_alpha_control;
+
+		if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])))
+			rb_alpha_control &= ~A4XX_RB_ALPHA_CONTROL_ALPHA_TEST;

 		OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
-		OUT_RING(ring, zsa->rb_alpha_control);
+		OUT_RING(ring, rb_alpha_control);

 		OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2);
 		OUT_RING(ring, zsa->rb_stencil_control);
@@ -628,10 +633,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 			enum pipe_format format = pipe_surface_format(
 					ctx->framebuffer.cbufs[i]);
+			bool is_int = util_format_is_pure_integer(format);
 			bool has_alpha = util_format_has_alpha(format);
 			uint32_t control = blend->rb_mrt[i].control;
 			uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha;

+			if (is_int) {
+				control &= A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK;
+				control |= A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
+			}
+
 			if (has_alpha) {
 				blend_control |= blend->rb_mrt[i].blend_control_rgb;
 			} else {
@@ -651,19 +662,48 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff));
 	}

-	if (dirty & FD_DIRTY_BLEND_COLOR) {
+	if (dirty & (FD_DIRTY_BLEND_COLOR | FD_DIRTY_FRAMEBUFFER)) {
 		struct pipe_blend_color *bcolor = &ctx->blend_color;
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		float factor = 65535.0;
+		int i;
+
+		for (i = 0; i < pfb->nr_cbufs; i++) {
+			enum pipe_format format = pipe_surface_format(pfb->cbufs[i]);
+			const struct util_format_description *desc =
+				util_format_description(format);
+			int j;
+
+			if (desc->is_mixed)
+				continue;
+
+			j = util_format_get_first_non_void_channel(format);
+			if (j == -1)
+				continue;
+
+			if (desc->channel[j].size > 8 || !desc->channel[j].normalized ||
+				desc->channel[j].pure_integer)
+				continue;
+
+			/* Just use the first unorm8/snorm8 render buffer. Can't keep
+			 * everyone happy.
+			 */
+			if (desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED)
+				factor = 32767.0;
+			break;
+		}
+
 		OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
-		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * factor) |
 				A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
 		OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
-		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * factor) |
 				A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
 		OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
-		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * factor) |
 				A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
 		OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
-		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * factor) |
 				A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
 		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
 	}
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -214,6 +214,7 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	struct fd_resource *rsc = fd_resource(prsc);
 	unsigned lvl = fd_sampler_first_level(cso);
 	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
+	uint32_t sz2 = 0;

 	if (!so)
 		return NULL;
@@ -259,7 +260,10 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	case PIPE_TEXTURE_3D:
 		so->texconst3 =
 			A4XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) |
-			A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[0].size0);
+			A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[lvl].size0);
+		while (lvl < cso->u.tex.last_level && sz2 != rsc->slices[lvl+1].size0)
+			sz2 = rsc->slices[++lvl].size0;
+		so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ(sz2);
 		break;
 	default:
 		so->texconst3 = 0x00000000;
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
@@ -51,7 +51,7 @@ fd4_sampler_stateobj(struct pipe_sampler_state *samp)

 struct fd4_pipe_sampler_view {
 	struct pipe_sampler_view base;
-	uint32_t texconst0, texconst1, texconst2, texconst3, textconst4;
+	uint32_t texconst0, texconst1, texconst2, texconst3, texconst4;
 };

 static inline struct fd4_pipe_sampler_view *
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -551,7 +551,7 @@ fd_resource_create(struct pipe_screen *pscreen,
 	struct fd_resource *rsc = CALLOC_STRUCT(fd_resource);
 	struct pipe_resource *prsc = &rsc->base.b;
 	enum pipe_format format = tmpl->format;
-	uint32_t size;
+	uint32_t size, alignment;

 	DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, "
 			"nr_samples=%u, usage=%u, bind=%x, flags=%x",
@@ -583,6 +583,7 @@ fd_resource_create(struct pipe_screen *pscreen,

 	assert(rsc->cpp);

+	alignment = slice_alignment(pscreen, tmpl);
 	if (is_a4xx(fd_screen(pscreen))) {
 		switch (tmpl->target) {
 		case PIPE_TEXTURE_3D:
@@ -590,11 +591,12 @@ fd_resource_create(struct pipe_screen *pscreen,
 			break;
 		default:
 			rsc->layer_first = true;
+			alignment = 1;
 			break;
 		}
 	}

-	size = setup_slices(rsc, slice_alignment(pscreen, tmpl), format);
+	size = setup_slices(rsc, alignment, format);

 	if (rsc->layer_first) {
 		rsc->layer_size = align(size, 4096);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)

   if (b->prev)
      b->prev->next = b;
-   if (a->prev)
+   if (a->next)
      a->next->prev = a;
 }

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -575,8 +575,8 @@ CodeEmitterGK110::emitIMUL(const Instruction *i)
   if (isLIMM(i->src(1), TYPE_S32)) {
      emitForm_L(i, 0x280, 2, Modifier(0));

-      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
-
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 24;
      if (i->sType == TYPE_S32)
         code[1] |= 3 << 25;
   } else {
@@ -695,14 +695,9 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
   if (i->sType == TYPE_S32)
      code[1] |= (1 << 19) | (1 << 24);

-   if (code[0] & 0x1) {
-      assert(!i->subOp);
-      SAT_(39);
-   } else {
-      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-         code[1] |= 1 << 25;
-      SAT_(35);
-   }
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[1] |= 1 << 25;
+   SAT_(35);
 }

 void
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2893,6 +2893,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
      }
      setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+
+      // If the loop never breaks (e.g. only has RET's inside), then there
+      // will be no way to get to the break bb. However BGNLOOP will have
+      // already made a PREBREAK to it, so it must be in the CFG.
+      if (getBB()->cfg.incidentCount() == 0)
+         loopBB->cfg.attach(&getBB()->cfg, Graph::Edge::TREE);
   }
      break;
   case TGSI_OPCODE_BRK:
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -202,7 +202,8 @@ NV50LegalizePostRA::visit(Function *fn)
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
-   if (prog->maxGPR < 63)
+   // GPR units on nv50 are in half-regs
+   if (prog->maxGPR < 126)
      r63->reg.data.id = 63;
   else
      r63->reg.data.id = 127;
@@ -832,7 +833,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
-   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -686,7 +686,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
-      } else if (i->tex.r == i->tex.s) {
+      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -858,6 +858,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
+      } else
+      if (i->postFactor && i->sType == TYPE_F32) {
+         /* Can't emit a postfactor with an immediate, have to fold it in */
+         i->setSrc(s, new_ImmediateValue(
+                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+         i->postFactor = 0;
      }
      break;
   case OP_MAD:
@@ -2654,7 +2660,7 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
            break;

         def = i->getSrc(1)->getInsn();
-         if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
            vtmp = i->getSrc(1);
            i->setSrc(1, def->getSrc(0));

@@ -2956,6 +2962,16 @@ DeadCodeElim::visit(BasicBlock *bb)
   return true;
 }

+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
 void
 DeadCodeElim::checkSplitLoad(Instruction *ld1)
 {
@@ -2976,6 +2992,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
+
+   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
@@ -2989,16 +3007,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
         break;
      }
   }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
      }
   }

+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1573,10 +1573,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)

   Instruction *st;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
-      st = new_Instruction(func, OP_STORE, ty);
-      st->setSrc(0, slot);
-      st->setSrc(1, lval);
      lval->noSpill = 1;
+      if (ty != TYPE_B96) {
+         st = new_Instruction(func, OP_STORE, ty);
+         st->setSrc(0, slot);
+         st->setSrc(1, lval);
+      } else {
+         st = new_Instruction(func, OP_SPLIT, ty);
+         st->setSrc(0, lval);
+         for (int d = 0; d < lval->reg.size / 4; ++d)
+            st->setDef(d, new_LValue(func, FILE_GPR));
+
+         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+            Value *tmp = cloneShallow(func, slot);
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+            s->setSrc(0, tmp);
+            s->setSrc(1, st->getDef(d));
+            defi->bb->insertAfter(defi, s);
+         }
+      }
   } else {
      st = new_Instruction(func, OP_CVT, ty);
      st->setDef(0, slot);
@@ -1596,7 +1614,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
   Instruction *ld;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      lval->noSpill = 1;
-      ld = new_Instruction(func, OP_LOAD, ty);
+      if (ty != TYPE_B96) {
+         ld = new_Instruction(func, OP_LOAD, ty);
+      } else {
+         ld = new_Instruction(func, OP_MERGE, ty);
+         for (int d = 0; d < lval->reg.size / 4; ++d) {
+            Value *tmp = cloneShallow(func, slot);
+            LValue *val;
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+            l->setSrc(0, tmp);
+            usei->bb->insertBefore(usei, l);
+            ld->setSrc(d, val);
+            val->noSpill = 1;
+         }
+         ld->setDef(0, lval);
+         usei->bb->insertBefore(usei, ld);
+         return lval;
+      }
   } else {
      ld = new_Instruction(func, OP_CVT, ty);
   }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -454,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -439,7 +439,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -657,8 +657,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
   if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
                             PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
      buffer->domain = NOUVEAU_BO_GART;
-   } else if (buffer->base.bind &
-              (screen->vidmem_bindings & screen->sysmem_bindings)) {
+   } else if (buffer->base.bind == 0 || (buffer->base.bind &
+              (screen->vidmem_bindings & screen->sysmem_bindings))) {
      switch (buffer->base.usage) {
      case PIPE_USAGE_DEFAULT:
      case PIPE_USAGE_IMMUTABLE:
@@ -685,6 +685,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
      if (buffer->base.bind & screen->sysmem_bindings)
         buffer->domain = NOUVEAU_BO_GART;
   }
+   /* There can be very special situations where we want non-gpu-mapped
+    * buffers, but never through this interface.
+    */
+   assert(buffer->domain);
   ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);

   if (ret == false)
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -168,9 +168,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
                                 int ref)
 {
   struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
   unsigned s, i;

-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
      assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
      for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
         if (nv50->framebuffer.cbufs[i] &&
@@ -182,7 +183,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
         }
      }
   }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nv50->framebuffer.zsbuf &&
          nv50->framebuffer.zsbuf->texture == res) {
         nv50->dirty |= NV50_NEW_FRAMEBUFFER;
@@ -192,11 +193,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
      }
   }

-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_SAMPLER_VIEW)) {

      assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
      for (i = 0; i < nv50->num_vtxbufs; ++i) {
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
                                 int ref)
 {
   struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
   unsigned s, i;

-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
      for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
         if (nvc0->framebuffer.cbufs[i] &&
             nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
         }
      }
   }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nvc0->framebuffer.zsbuf &&
          nvc0->framebuffer.zsbuf->texture == res) {
         nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
      }
   }

-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_COMMAND_ARGS_BUFFER |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_COMMAND_ARGS_BUFFER |
+               PIPE_BIND_SAMPLER_VIEW)) {
      for (i = 0; i < nvc0->num_vtxbufs; ++i) {
         if (nvc0->vtxbuf[i].buffer == res) {
            nvc0->dirty |= NVC0_NEW_ARRAYS;
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -59,7 +59,7 @@

 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
-#define R600_MAX_DRAW_CS_DWORDS		47
+#define R600_MAX_DRAW_CS_DWORDS		52
 #define R600_TRACE_CS_DWORDS		7

 #define R600_MAX_USER_CONST_BUFFERS 13
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -598,6 +598,106 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back
 	return 0;
 }

+/* execute a single slot ALU calculation */
+static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = op;
+			alu.src[0].sel = src0_sel;
+			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[0].value = src0_chan_val;
+			else
+				alu.src[0].chan = src0_chan_val;
+			alu.src[1].sel = src1_sel;
+			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[1].value = src1_chan_val;
+			else
+				alu.src[1].chan = src1_chan_val;
+			alu.dst.sel = dst_sel;
+			alu.dst.chan = i;
+			alu.dst.write = i == dst_chan;
+			alu.last = (i == 3);
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		return 0;
+	}
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+/* execute a single slot ALU calculation */
+static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val,
+			  int src2_sel, unsigned src2_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	/* validate this for other ops */
+	assert(op == ALU_OP3_MULADD_UINT24);
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.src[2].sel = src2_sel;
+	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[2].value = src2_chan_val;
+	else
+		alu.src[2].chan = src2_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.is_op3 = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
+{
+	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
+}
+
 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
 {
 	int i;
@@ -1129,6 +1229,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	unsigned vtx_id = src->Dimension.Index;
 	int offset_reg = vtx_id / 3;
 	int offset_chan = vtx_id % 3;
+	int t2 = 0;

 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
@@ -1136,13 +1237,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	if (offset_reg == 0 && offset_chan == 2)
 		offset_chan = 3;

+	if (src->Dimension.Indirect || src->Register.Indirect)
+		t2 = r600_get_temp(ctx);
+
 	if (src->Dimension.Indirect) {
 		int treg[3];
-		int t2;
 		struct r600_bytecode_alu alu;
 		int r, i;
-
-		/* you have got to be shitting me -
+		unsigned addr_reg;
+		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
+		if (src->DimIndirect.Index > 0) {
+			r = single_alu_op2(ctx, ALU_OP1_MOV,
+					   ctx->bc->ar_reg, 0,
+					   addr_reg, 0,
+					   0, 0);
+			if (r)
+				return r;
+		}
+		/*
 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
 		   at least this is what fglrx seems to do. */
 		for (i = 0; i < 3; i++) {
@@ -1150,7 +1262,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		}
 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

-		t2 = r600_get_temp(ctx);
 		for (i = 0; i < 3; i++) {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
@@ -1175,8 +1286,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		if (r)
 			return r;
 		offset_reg = t2;
+		offset_chan = 0;
 	}

+	if (src->Register.Indirect) {
+		int addr_reg;
+		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
+
+		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
+
+		/* pull the value from index_reg */
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   t2, 1,
+				   addr_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, first);
+		if (r)
+			return r;
+		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+				   t2, 0,
+				   t2, 1,
+				   V_SQ_ALU_SRC_LITERAL, 4,
+				   offset_reg, offset_chan);
+		if (r)
+			return r;
+		offset_reg = t2;
+		offset_chan = 0;
+		index = src->Register.Index - first;
+	}

 	memset(&vtx, 0, sizeof(vtx));
 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
@@ -1222,6 +1358,7 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)

 			fetch_gs_input(ctx, src, treg);
 			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
 		}
 	}
 	return 0;
@@ -1498,7 +1635,7 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 		*last_exp_pos = NULL, *last_exp_param = NULL;
 	int i, j, next_clip_pos = 61, next_param = 0;
 	int ring;
-
+	bool only_ring_0 = true;
 	cshader = calloc(1, sizeof(struct r600_pipe_shader));
 	if (!cshader)
 		return 0;
@@ -1570,6 +1707,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 		for (i = 0; i < so->num_outputs; i++) {
 			if (so->output[i].stream == ring) {
 				enabled = true;
+				if (ring > 0)
+					only_ring_0 = false;
 				break;
 			}
 		}
@@ -1604,7 +1743,7 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 		cf_jump = ctx.bc->cf_last;

 		if (enabled)
-			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
+			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
 		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
 	}

@@ -2206,6 +2345,11 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 			struct r600_bytecode_alu alu;
 			int r;
+
+			/* GS thread with no output workaround - emit a cut at start of GS */
+			if (ctx.bc->chip_class == R600)
+				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
+
 			for (j = 0; j < 4; j++) {
 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 				alu.op = ALU_OP1_MOV;
@@ -7180,7 +7324,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_alu alu;
 	int r;
 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
-	unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;
+	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

 	assert(inst->Dst[0].Register.Index < 3);
 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2213,10 +2213,11 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 		num_temp_gprs = 4;
 		num_gs_gprs = 0;
 		num_es_gprs = 0;
-		num_ps_threads = 136;
-		num_vs_threads = 48;
-		num_gs_threads = 4;
-		num_es_threads = 4;
+		/* use limits 40 VS and at least 16 ES/GS */
+		num_ps_threads = 120;
+		num_vs_threads = 40;
+		num_gs_threads = 16;
+		num_es_threads = 16;
 		num_ps_stack_entries = 40;
 		num_vs_stack_entries = 40;
 		num_gs_stack_entries = 32;
@@ -2675,6 +2676,9 @@ void r600_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 		S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport);
 }

+#define RV610_GSVS_ALIGN 32
+#define R600_GSVS_ALIGN 16
+
 void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *shader)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
@@ -2684,6 +2688,23 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 	unsigned gsvs_itemsize =
 			(cp_shader->ring_item_sizes[0] * shader->selector->gs_max_out_vertices) >> 2;

+	/* some r600s needs gsvs itemsize aligned to cacheline size
+	   this was fixed in rs780 and above. */
+	switch (rctx->b.family) {
+	case CHIP_RV610:
+		gsvs_itemsize = align(gsvs_itemsize, RV610_GSVS_ALIGN);
+		break;
+	case CHIP_R600:
+	case CHIP_RV630:
+	case CHIP_RV670:
+	case CHIP_RV620:
+	case CHIP_RV635:
+		gsvs_itemsize = align(gsvs_itemsize, R600_GSVS_ALIGN);
+		break;
+	default:
+		break;
+	}
+
 	r600_init_command_buffer(cb, 64);

 	/* VGT_GS_MODE is written by r600_emit_shader_stages */
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1770,6 +1770,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					(info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
 	}

+	/* SMX returns CONTEXT_DONE too early workaround */
+	if (rctx->b.family == CHIP_R600 ||
+	    rctx->b.family == CHIP_RV610 ||
+	    rctx->b.family == CHIP_RV630 ||
+	    rctx->b.family == CHIP_RV635) {
+		/* if we have gs shader or streamout
+		   we need to do a wait idle after every draw */
+		if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+			radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
+		}
+	}
+
+	/* ES ring rolling over at EOP - workaround */
+	if (rctx->b.chip_class == R600) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
+	}
+
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -130,6 +130,7 @@
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
 #define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
 #define EVENT_TYPE_VGT_FLUSH                   0x24
+#define EVENT_TYPE_SQ_NON_EVENT                0x26
 #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -136,8 +136,12 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
 	/* suspend queries */
-	if (!LIST_IS_EMPTY(&ctx->active_nontimer_queries))
+	if (ctx->num_cs_dw_nontimer_queries_suspend) {
+		/* Since non-timer queries are suspended during blits,
+		 * we have to guard against double-suspends. */
 		r600_suspend_nontimer_queries(ctx);
+		ctx->nontimer_queries_suspended_by_flush = true;
+	}
 	if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
 		r600_suspend_timer_queries(ctx);

@@ -158,8 +162,10 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 	/* resume queries */
 	if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
 		r600_resume_timer_queries(ctx);
-	if (!LIST_IS_EMPTY(&ctx->active_nontimer_queries))
+	if (ctx->nontimer_queries_suspended_by_flush) {
+		ctx->nontimer_queries_suspended_by_flush = false;
 		r600_resume_nontimer_queries(ctx);
+	}
 }

 static void r600_flush_from_st(struct pipe_context *ctx,
@@ -233,8 +239,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->family = rscreen->family;
 	rctx->chip_class = rscreen->chip_class;

-	if (rscreen->family == CHIP_HAWAII)
-		rctx->max_db = 16;
+	if (rscreen->chip_class >= CIK)
+		rctx->max_db = MAX2(8, rscreen->info.r600_num_backends);
 	else if (rscreen->chip_class >= EVERGREEN)
 		rctx->max_db = 8;
 	else
@@ -550,10 +556,11 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
-	case CHIP_FIJI: return "fiji";
 #if HAVE_LLVM <= 0x0307
+	case CHIP_FIJI: return "tonga";
 	case CHIP_STONEY: return "carrizo";
 #else
+	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
 #endif
 	default: return "";
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -392,6 +392,7 @@ struct r600_common_context {
 	struct list_head		active_nontimer_queries;
 	struct list_head		active_timer_queries;
 	unsigned			num_cs_dw_nontimer_queries_suspend;
+	bool				nontimer_queries_suspended_by_flush;
 	unsigned			num_cs_dw_timer_queries_suspend;
 	/* Additional hardware info. */
 	unsigned			backend_mask;
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -489,6 +489,10 @@ static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen,
 	if (rscreen->debug_flags & DBG_NO_DCC)
 		return;

+	/* TODO: DCC is broken on Stoney */
+	if (rscreen->family == CHIP_STONEY)
+		return;
+
 	rtex->dcc_buffer = (struct r600_resource *)
 		r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
 				   PIPE_USAGE_DEFAULT, rtex->surface.dcc_size, rtex->surface.dcc_alignment);
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1539,7 +1539,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
 	bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.exp2.f32";
+	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp.";
 	bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32";
 	bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem;
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -958,6 +958,8 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.db_pitch = dec->base.width;

 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
+	if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
+		dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2;

 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -394,7 +394,10 @@ struct ruvd_msg {
 			uint32_t	dt_chroma_top_offset;
 			uint32_t	dt_chroma_bottom_offset;
 			uint32_t	dt_surf_tile_config;
-			uint32_t	dt_reserved[3];
+			uint32_t	dt_uv_surf_tile_config;
+			// re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney
+			uint32_t	dt_wa_chroma_top_offset;
+			uint32_t	dt_wa_chroma_bottom_offset;

 			uint32_t	reserved[16];

--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -389,6 +389,11 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	struct radeon_surf *tmp_surf;
 	unsigned cpb_size;

+	if (rscreen->info.family == CHIP_STONEY) {
+		RVID_ERR("Stoney VCE is not supported!\n");
+		return NULL;
+	}
+
 	if (!rscreen->info.vce_fw_version) {
 		RVID_ERR("Kernel doesn't supports VCE!\n");
 		return NULL;
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -34,11 +34,6 @@

 #define MAX_GLOBAL_BUFFERS 20

-/* XXX: Even though we don't pass the scratch buffer via user sgprs any more
- * LLVM still expects that we specify 4 USER_SGPRS so it can remain compatible
- * with older mesa. */
-#define NUM_USER_SGPRS 4
-
 struct si_compute {
 	struct si_context *ctx;

@@ -238,7 +233,6 @@ static void si_launch_grid(
 	uint64_t kernel_args_va;
 	uint64_t scratch_buffer_va = 0;
 	uint64_t shader_va;
-	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
 	unsigned i;
 	struct si_shader *shader = &program->shader;
 	unsigned lds_blocks;
@@ -366,20 +360,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);

-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
-		/* We always use at least 3 VGPRS, these come from
-		 * TIDIG_COMP_CNT.
-		 * XXX: The compiler should account for this.
-		 */
-		S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4)
-		/* We always use at least 4 + arg_user_sgpr_count.  The 4 extra
-		 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN
-		 * XXX: The compiler should account for this.
-		 */
-		|  S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count,
-		                        shader->num_sgprs)) - 1) / 8)
-		|  S_00B028_FLOAT_MODE(shader->float_mode))
-		;
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);

 	lds_blocks = shader->lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
@@ -395,17 +376,10 @@ static void si_launch_grid(

 	assert(lds_blocks <= 0xFF);

-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2,
-		S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)
-		| S_00B84C_USER_SGPR(arg_user_sgpr_count)
-		| S_00B84C_TGID_X_EN(1)
-		| S_00B84C_TGID_Y_EN(1)
-		| S_00B84C_TGID_Z_EN(1)
-		| S_00B84C_TG_SIZE_EN(1)
-		| S_00B84C_TIDIG_COMP_CNT(2)
-		| S_00B84C_LDS_SIZE(lds_blocks)
-		| S_00B84C_EXCP_EN(0))
-		;
+	shader->rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);

 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -632,7 +632,7 @@ void si_check_vm_faults(struct si_context *sctx)
 	/* Use conservative timeout 800ms, after which we won't wait any
 	 * longer and assume the GPU is hung.
 	 */
-	screen->fence_finish(screen, sctx->last_gfx_fence, 800*1000*1000);
+	sctx->b.ws->fence_wait(sctx->b.ws, sctx->last_gfx_fence, 800*1000*1000);

 	if (!si_vm_fault_occured(sctx, &addr))
 		return;
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -594,6 +594,14 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 			    lp_build_const_int32(gallivm, swizzle));

 	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+				       lp_build_const_int32(gallivm, swizzle + 1));
+		value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
+	}
+
 	return LLVMBuildBitCast(gallivm->builder, value,
 				tgsi2llvmtype(bld_base, type), "");
 }
@@ -733,6 +741,7 @@ static LLVMValueRef fetch_input_gs(
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
 	unsigned param;
+	LLVMValueRef value;

 	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
 		return get_primitive_id(bld_base, swizzle);
@@ -774,11 +783,22 @@ static LLVMValueRef fetch_input_gs(
 	args[7] = uint->zero; /* SLC */
 	args[8] = uint->zero; /* TFE */

+	value = lp_build_intrinsic(gallivm->builder,
+				   "llvm.SI.buffer.load.dword.i32.i32",
+				   i32, args, 9,
+				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
+		value2 = lp_build_intrinsic(gallivm->builder,
+					    "llvm.SI.buffer.load.dword.i32.i32",
+					    i32, args, 9,
+					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+		return radeon_llvm_emit_fetch_double(bld_base,
+						     value, value2);
+	}
 	return LLVMBuildBitCast(gallivm->builder,
-				lp_build_intrinsic(gallivm->builder,
-						"llvm.SI.buffer.load.dword.i32.i32",
-						i32, args, 9,
-						LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
+				value,
 				tgsi2llvmtype(bld_base, type), "");
 }

@@ -3745,12 +3765,14 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
 			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 			shader->float_mode =  G_00B028_FLOAT_MODE(value);
+			shader->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
 			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
 			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
+			shader->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -290,8 +290,8 @@ struct si_shader {
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */

-	unsigned		ls_rsrc1;
-	unsigned		ls_rsrc2;
+	unsigned		rsrc1;
+	unsigned		rsrc2;
 };

 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;

 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->ls_rsrc2;
+	ls_rsrc2 = ls->current->rsrc2;

 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->ls_rsrc1);
+	radeon_emit(cs, ls->current->rsrc1);
 	radeon_emit(cs, ls_rsrc2);

 	/* Compute userdata SGPRs. */
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -121,11 +121,11 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);

-	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
 			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
-	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
 }

--- a/src/gallium/drivers/vc4/Automake.inc
+++ b/src/gallium/drivers/vc4/Automake.inc
@@ -6,8 +6,4 @@ TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/winsys/vc4/drm/libvc4drm.la \
 	$(top_builddir)/src/gallium/drivers/vc4/libvc4.la

-if USE_VC4_SIMULATOR
-TARGET_CPPFLAGS += -DUSE_VC4_SIMULATOR
-endif
-
 endif
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -23,7 +23,6 @@ include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc

 if USE_VC4_SIMULATOR
-SIM_CFLAGS = -DUSE_VC4_SIMULATOR=1
 SIM_LDFLAGS = -lsimpenrose
 endif

--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -21,6 +21,7 @@ C_SOURCES := \
 	vc4_job.c \
 	vc4_nir_lower_blend.c \
 	vc4_nir_lower_io.c \
+	vc4_nir_lower_txf_ms.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
 	vc4_opt_copy_propagation.c \
--- a/src/gallium/drivers/vc4/kernel/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -121,6 +121,11 @@ enum vc4_packet {
 #define VC4_PACKET_TILE_COORDINATES_SIZE				3
 #define VC4_PACKET_GEM_HANDLES_SIZE					9

+/* Number of multisamples supported. */
+#define VC4_MAX_SAMPLES							4
+/* Size of a full resolution color or Z tile buffer load/store. */
+#define VC4_TILE_BUFFER_SIZE			(64 * 64 * 4)
+
 #define VC4_MASK(high, low) (((1 << ((high) - (low) + 1)) - 1) << (low))
 /* Using the GNU statement expression extension */
 #define VC4_SET_FIELD(value, field)                                       \
@@ -151,6 +156,16 @@ enum vc4_packet {
 #define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              (1 << 1)
 #define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           (1 << 0)

+/** @{
+ *
+ * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and
+ * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER.
+ */
+#define VC4_LOADSTORE_FULL_RES_EOF                     (1 << 3)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL       (1 << 2)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              (1 << 1)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           (1 << 0)
+
 /** @{
 *
 * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -36,9 +36,11 @@

 struct vc4_rcl_setup {
 	struct drm_gem_cma_object *color_read;
-	struct drm_gem_cma_object *color_ms_write;
+	struct drm_gem_cma_object *color_write;
 	struct drm_gem_cma_object *zs_read;
 	struct drm_gem_cma_object *zs_write;
+	struct drm_gem_cma_object *msaa_color_write;
+	struct drm_gem_cma_object *msaa_zs_write;

 	struct drm_gem_cma_object *rcl;
 	u32 next_offset;
@@ -62,7 +64,6 @@ static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val)
 	setup->next_offset += 4;
 }

-
 /*
 * Emits a no-op STORE_TILE_BUFFER_GENERAL.
 *
@@ -81,6 +82,22 @@ static void vc4_store_before_load(struct vc4_rcl_setup *setup)
 	rcl_u32(setup, 0); /* no address, since we're in None mode */
 }

+/*
+ * Calculates the physical address of the start of a tile in a RCL surface.
+ *
+ * Unlike the other load/store packets,
+ * VC4_PACKET_LOAD/STORE_FULL_RES_TILE_BUFFER don't look at the tile
+ * coordinates packet, and instead just store to the address given.
+ */
+static uint32_t vc4_full_res_offset(struct vc4_exec_info *exec,
+				    struct drm_gem_cma_object *bo,
+				    struct drm_vc4_submit_rcl_surface *surf,
+				    uint8_t x, uint8_t y)
+{
+	return bo->paddr + surf->offset + VC4_TILE_BUFFER_SIZE *
+		(DIV_ROUND_UP(exec->args->width, 32) * y + x);
+}
+
 /*
 * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
 *
@@ -108,22 +125,41 @@ static void emit_tile(struct vc4_exec_info *exec,
 	 * may be outstanding at a time.
 	 */
 	if (setup->color_read) {
-		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, args->color_read.bits);
-		rcl_u32(setup,
-			setup->color_read->paddr + args->color_read.offset);
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->color_read,
+						    &args->color_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_ZS);
+		} else {
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->color_read.bits);
+			rcl_u32(setup, setup->color_read->paddr +
+				args->color_read.offset);
+		}
 	}

 	if (setup->zs_read) {
-		if (setup->color_read) {
-			/* Exec previous load. */
-			vc4_tile_coordinates(setup, x, y);
-			vc4_store_before_load(setup);
-		}
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->zs_read,
+						    &args->zs_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_COLOR);
+		} else {
+			if (setup->color_read) {
+				/* Exec previous load. */
+				vc4_tile_coordinates(setup, x, y);
+				vc4_store_before_load(setup);
+			}

-		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, args->zs_read.bits);
-		rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset);
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->zs_read.bits);
+			rcl_u32(setup, setup->zs_read->paddr +
+				args->zs_read.offset);
+		}
 	}

 	/* Clipping depends on tile coordinates having been
@@ -144,20 +180,60 @@ static void emit_tile(struct vc4_exec_info *exec,
 				(y * exec->bin_tiles_x + x) * 32));
 	}

+	if (setup->msaa_color_write) {
+		bool last_tile_write = (!setup->msaa_zs_write &&
+					!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_ZS;
+
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_color_write,
+					    &args->msaa_color_write, x, y) |
+			bits);
+	}
+
+	if (setup->msaa_zs_write) {
+		bool last_tile_write = (!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_COLOR;
+
+		if (setup->msaa_color_write)
+			vc4_tile_coordinates(setup, x, y);
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_zs_write,
+					    &args->msaa_zs_write, x, y) |
+			bits);
+	}
+
 	if (setup->zs_write) {
+		bool last_tile_write = !setup->color_write;
+
+		if (setup->msaa_color_write || setup->msaa_zs_write)
+			vc4_tile_coordinates(setup, x, y);
+
 		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
 		rcl_u16(setup, args->zs_write.bits |
-			(setup->color_ms_write ?
-			 VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0));
+			(last_tile_write ?
+			 0 : VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR));
 		rcl_u32(setup,
 			(setup->zs_write->paddr + args->zs_write.offset) |
-			((last && !setup->color_ms_write) ?
+			((last && last_tile_write) ?
 			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
 	}

-	if (setup->color_ms_write) {
-		if (setup->zs_write) {
-			/* Reset after previous store */
+	if (setup->color_write) {
+		if (setup->msaa_color_write || setup->msaa_zs_write ||
+		    setup->zs_write) {
 			vc4_tile_coordinates(setup, x, y);
 		}

@@ -192,14 +268,26 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	}

 	if (setup->color_read) {
-		loop_body_size += (VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE);
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		}
 	}
 	if (setup->zs_read) {
-		if (setup->color_read) {
-			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
-			loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			if (setup->color_read &&
+			    !(args->color_read.flags &
+			      VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES)) {
+				loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+				loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+			}
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
 		}
-		loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
 	}

 	if (has_bin) {
@@ -207,13 +295,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 		loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE;
 	}

+	if (setup->msaa_color_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+	if (setup->msaa_zs_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+
 	if (setup->zs_write)
 		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
-	if (setup->color_ms_write) {
-		if (setup->zs_write)
-			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+	if (setup->color_write)
 		loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE;
-	}
+
+	/* We need a VC4_PACKET_TILE_COORDINATES in between each store. */
+	loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE *
+		((setup->msaa_color_write != NULL) +
+		 (setup->msaa_zs_write != NULL) +
+		 (setup->color_write != NULL) +
+		 (setup->zs_write != NULL) - 1);
+
 	size += xtiles * ytiles * loop_body_size;

 	setup->rcl = drm_gem_cma_create(dev, size);
@@ -224,13 +322,12 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,

 	rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
 	rcl_u32(setup,
-		(setup->color_ms_write ?
-		 (setup->color_ms_write->paddr +
-		  args->color_ms_write.offset) :
+		(setup->color_write ? (setup->color_write->paddr +
+				       args->color_write.offset) :
 		 0));
 	rcl_u16(setup, args->width);
 	rcl_u16(setup, args->height);
-	rcl_u16(setup, args->color_ms_write.bits);
+	rcl_u16(setup, args->color_write.bits);

 	/* The tile buffer gets cleared when the previous tile is stored.  If
 	 * the clear values changed between frames, then the tile buffer has
@@ -255,6 +352,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 		for (x = min_x_tile; x <= max_x_tile; x++) {
 			bool first = (x == min_x_tile && y == min_y_tile);
 			bool last = (x == max_x_tile && y == max_y_tile);
+
 			emit_tile(exec, setup, x, y, first, last);
 		}
 	}
@@ -266,6 +364,56 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	return 0;
 }

+static int vc4_full_res_bounds_check(struct vc4_exec_info *exec,
+				     struct drm_gem_cma_object *obj,
+				     struct drm_vc4_submit_rcl_surface *surf)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	u32 render_tiles_stride = DIV_ROUND_UP(exec->args->width, 32);
+
+	if (surf->offset > obj->base.size) {
+		DRM_ERROR("surface offset %d > BO size %zd\n",
+			  surf->offset, obj->base.size);
+		return -EINVAL;
+	}
+
+	if ((obj->base.size - surf->offset) / VC4_TILE_BUFFER_SIZE <
+	    render_tiles_stride * args->max_y_tile + args->max_x_tile) {
+		DRM_ERROR("MSAA tile %d, %d out of bounds "
+			  "(bo size %zd, offset %d).\n",
+			  args->max_x_tile, args->max_y_tile,
+			  obj->base.size,
+			  surf->offset);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int vc4_rcl_msaa_surface_setup(struct vc4_exec_info *exec,
+				      struct drm_gem_cma_object **obj,
+				      struct drm_vc4_submit_rcl_surface *surf)
+{
+	if (surf->flags != 0 || surf->bits != 0) {
+		DRM_ERROR("MSAA surface had nonzero flags/bits\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	*obj = vc4_use_bo(exec, surf->hindex);
+	if (!*obj)
+		return -EINVAL;
+
+	if (surf->offset & 0xf) {
+		DRM_ERROR("MSAA write must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	return vc4_full_res_bounds_check(exec, *obj, surf);
+}
+
 static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 				 struct drm_gem_cma_object **obj,
 				 struct drm_vc4_submit_rcl_surface *surf)
@@ -277,9 +425,10 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	uint8_t format = VC4_GET_FIELD(surf->bits,
 				       VC4_LOADSTORE_TILE_BUFFER_FORMAT);
 	int cpp;
+	int ret;

-	if (surf->pad != 0) {
-		DRM_ERROR("Padding unset\n");
+	if (surf->flags & ~VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		DRM_ERROR("Extra flags set\n");
 		return -EINVAL;
 	}

@@ -290,6 +439,25 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	if (!*obj)
 		return -EINVAL;

+	if (surf->flags & VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		if (surf == &exec->args->zs_write) {
+			DRM_ERROR("general zs write may not be a full-res.\n");
+			return -EINVAL;
+		}
+
+		if (surf->bits != 0) {
+			DRM_ERROR("load/store general bits set with "
+				  "full res load/store.\n");
+			return -EINVAL;
+		}
+
+		ret = vc4_full_res_bounds_check(exec, *obj, surf);
+		if (!ret)
+			return ret;
+
+		return 0;
+	}
+
 	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
 			   VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK |
 			   VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) {
@@ -341,9 +509,10 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 }

 static int
-vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
-			 struct drm_gem_cma_object **obj,
-			 struct drm_vc4_submit_rcl_surface *surf)
+vc4_rcl_render_config_surface_setup(struct vc4_exec_info *exec,
+				    struct vc4_rcl_setup *setup,
+				    struct drm_gem_cma_object **obj,
+				    struct drm_vc4_submit_rcl_surface *surf)
 {
 	uint8_t tiling = VC4_GET_FIELD(surf->bits,
 				       VC4_RENDER_CONFIG_MEMORY_FORMAT);
@@ -351,13 +520,15 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
 				       VC4_RENDER_CONFIG_FORMAT);
 	int cpp;

-	if (surf->pad != 0) {
-		DRM_ERROR("Padding unset\n");
+	if (surf->flags != 0) {
+		DRM_ERROR("No flags supported on render config.\n");
 		return -EINVAL;
 	}

 	if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK |
-			   VC4_RENDER_CONFIG_FORMAT_MASK)) {
+			   VC4_RENDER_CONFIG_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_MS_MODE_4X |
+			   VC4_RENDER_CONFIG_DECIMATE_MODE_4X)) {
 		DRM_ERROR("Unknown bits in render config: 0x%04x\n",
 			  surf->bits);
 		return -EINVAL;
@@ -414,18 +585,20 @@ int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
 	if (has_bin &&
 	    (args->max_x_tile > exec->bin_tiles_x ||
 	     args->max_y_tile > exec->bin_tiles_y)) {
-		DRM_ERROR("Render tiles (%d,%d) outside of bin config (%d,%d)\n",
+		DRM_ERROR("Render tiles (%d,%d) outside of bin config "
+			  "(%d,%d)\n",
 			  args->max_x_tile, args->max_y_tile,
 			  exec->bin_tiles_x, exec->bin_tiles_y);
 		return -EINVAL;
 	}

-	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
+	ret = vc4_rcl_render_config_surface_setup(exec, &setup,
+						  &setup.color_write,
+						  &args->color_write);
 	if (ret)
 		return ret;

-	ret = vc4_rcl_ms_surface_setup(exec, &setup.color_ms_write,
-				       &args->color_ms_write);
+	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
 	if (ret)
 		return ret;

@@ -437,10 +610,21 @@ int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
 	if (ret)
 		return ret;

+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_color_write,
+					 &args->msaa_color_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_zs_write,
+					 &args->msaa_zs_write);
+	if (ret)
+		return ret;
+
 	/* We shouldn't even have the job submitted to us if there's no
 	 * surface to write out.
 	 */
-	if (!setup.color_ms_write && !setup.zs_write) {
+	if (!setup.color_write && !setup.zs_write &&
+	    !setup.msaa_color_write && !setup.msaa_zs_write) {
 		DRM_ERROR("RCL requires color or Z/S write\n");
 		return -EINVAL;
 	}
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -47,7 +47,6 @@
 	void *validated,				\
 	void *untrusted

-
 /** Return the width in pixels of a 64-byte microtile. */
 static uint32_t
 utile_width(int cpp)
@@ -191,7 +190,7 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,

 	if (size + offset < size ||
 	    size + offset > fbo->base.size) {
-		DRM_ERROR("Overflow in %dx%d (%dx%d) fbo size (%d + %d > %d)\n",
+		DRM_ERROR("Overflow in %dx%d (%dx%d) fbo size (%d + %d > %zd)\n",
 			  width, height,
 			  aligned_width, aligned_height,
 			  size, offset, fbo->base.size);
@@ -201,7 +200,6 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
 	return true;
 }

-
 static int
 validate_flush(VALIDATE_ARGS)
 {
@@ -270,7 +268,7 @@ validate_indexed_prim_list(VALIDATE_ARGS)

 	if (offset > ib->base.size ||
 	    (ib->base.size - offset) / index_size < length) {
-		DRM_ERROR("IB access overflow (%d + %d*%d > %d)\n",
+		DRM_ERROR("IB access overflow (%d + %d*%d > %zd)\n",
 			  offset, length, index_size, ib->base.size);
 		return -EINVAL;
 	}
@@ -361,9 +359,8 @@ validate_tile_binning_config(VALIDATE_ARGS)
 	}

 	if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
-		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT |
-		     VC4_BIN_CONFIG_MS_MODE_4X)) {
-		DRM_ERROR("unsupported bining config flags 0x%02x\n", flags);
+		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT)) {
+		DRM_ERROR("unsupported binning config flags 0x%02x\n", flags);
 		return -EINVAL;
 	}

@@ -424,8 +421,8 @@ validate_gem_handles(VALIDATE_ARGS)
 	return 0;
 }

-#define VC4_DEFINE_PACKET(packet, name, func) \
-	[packet] = { packet ## _SIZE, name, func }
+#define VC4_DEFINE_PACKET(packet, func) \
+	[packet] = { packet ## _SIZE, #packet, func }

 static const struct cmd_info {
 	uint16_t len;
@@ -433,42 +430,42 @@ static const struct cmd_info {
 	int (*func)(struct vc4_exec_info *exec, void *validated,
 		    void *untrusted);
 } cmd_info[] = {
-	VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning),
-	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore),
+	VC4_DEFINE_PACKET(VC4_PACKET_HALT, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_NOP, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, validate_flush),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING,
+			  validate_start_tile_binning),
+	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE,
+			  validate_increment_semaphore),

-	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, "Indexed Primitive List", validate_indexed_prim_list),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE,
+			  validate_indexed_prim_list),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE,
+			  validate_gl_array_primitive),

-	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, "Vertex Array Primitives", validate_gl_array_primitive),
+	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, NULL),

-	/* This is only used by clipped primitives (packets 48 and 49), which
-	 * we don't support parsing yet.
-	 */
-	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, validate_gl_shader_state),

-	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state),
-	/* We don't support validating NV shader states. */
-
-	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, "point size", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, "line width", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, "RHT X boundary", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, "Depth Offset", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, "Clip Window", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, "Viewport Offset", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, "Clipper XY Scaling", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, NULL),
 	/* Note: The docs say this was also 105, but it was 106 in the
 	 * initial userland code drop.
 	 */
-	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, "Clipper Z Scale and Offset", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, NULL),

-	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, "tile binning configuration", validate_tile_binning_config),
+	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG,
+			  validate_tile_binning_config),

-	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, "GEM handles", validate_gem_handles),
+	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, validate_gem_handles),
 };

 int
@@ -500,11 +497,6 @@ vc4_validate_bin_cl(struct drm_device *dev,
 			return -EINVAL;
 		}

-#if 0
-		DRM_INFO("0x%08x: packet %d (%s) size %d processing...\n",
-			 src_offset, cmd, info->name, info->len);
-#endif
-
 		if (src_offset + info->len > len) {
 			DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x "
 				  "exceeds bounds (0x%08x)\n",
@@ -519,8 +511,7 @@ vc4_validate_bin_cl(struct drm_device *dev,
 		if (info->func && info->func(exec,
 					     dst_pkt + 1,
 					     src_pkt + 1)) {
-			DRM_ERROR("0x%08x: packet %d (%s) failed to "
-				  "validate\n",
+			DRM_ERROR("0x%08x: packet %d (%s) failed to validate\n",
 				  src_offset, cmd, info->name);
 			return -EINVAL;
 		}
@@ -588,12 +579,14 @@ reloc_tex(struct vc4_exec_info *exec,

 	if (sample->is_direct) {
 		uint32_t remaining_size = tex->base.size - p0;
+
 		if (p0 > tex->base.size - 4) {
 			DRM_ERROR("UBO offset greater than UBO size\n");
 			goto fail;
 		}
 		if (p1 > remaining_size - 4) {
-			DRM_ERROR("UBO clamp would allow reads outside of UBO\n");
+			DRM_ERROR("UBO clamp would allow reads "
+				  "outside of UBO\n");
 			goto fail;
 		}
 		*validated_p0 = tex->paddr + p0;
@@ -866,7 +859,7 @@ validate_gl_shader_rec(struct drm_device *dev,

 		if (vbo->base.size < offset ||
 		    vbo->base.size - offset < attr_size) {
-			DRM_ERROR("BO offset overflow (%d + %d > %d)\n",
+			DRM_ERROR("BO offset overflow (%d + %d > %zd)\n",
 				  offset, attr_size, vbo->base.size);
 			return -EINVAL;
 		}
@@ -875,7 +868,8 @@ validate_gl_shader_rec(struct drm_device *dev,
 			max_index = ((vbo->base.size - offset - attr_size) /
 				     stride);
 			if (state->max_index > max_index) {
-				DRM_ERROR("primitives use index %d out of supplied %d\n",
+				DRM_ERROR("primitives use index %d out of "
+					  "supplied %d\n",
 					  state->max_index, max_index);
 				return -EINVAL;
 			}
--- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -24,24 +24,16 @@
 /**
 * DOC: Shader validator for VC4.
 *
- * The VC4 has no IOMMU between it and system memory.  So, a user with access
- * to execute shaders could escalate privilege by overwriting system memory
- * (using the VPM write address register in the general-purpose DMA mode) or
- * reading system memory it shouldn't (reading it as a texture, or uniform
- * data, or vertex data).
+ * The VC4 has no IOMMU between it and system memory, so a user with
+ * access to execute shaders could escalate privilege by overwriting
+ * system memory (using the VPM write address register in the
+ * general-purpose DMA mode) or reading system memory it shouldn't
+ * (reading it as a texture, or uniform data, or vertex data).
 *
- * This walks over a shader starting from some offset within a BO, ensuring
- * that its accesses are appropriately bounded, and recording how many texture
- * accesses are made and where so that we can do relocations for them in the
+ * This walks over a shader BO, ensuring that its accesses are
+ * appropriately bounded, and recording how many texture accesses are
+ * made and where so that we can do relocations for them in the
 * uniform stream.
- *
- * The kernel API has shaders stored in user-mapped BOs.  The BOs will be
- * forcibly unmapped from the process before validation, and any cache of
- * validated state will be flushed if the mapping is faulted back in.
- *
- * Storing the shaders in BOs means that the validation process will be slow
- * due to uncached reads, but since shaders are long-lived and shader BOs are
- * never actually modified, this shouldn't be a problem.
 */

 #include "vc4_drv.h"
@@ -71,7 +63,6 @@ waddr_to_live_reg_index(uint32_t waddr, bool is_b)
 		else
 			return waddr;
 	} else if (waddr <= QPU_W_ACC3) {
-
 		return 64 + waddr - QPU_W_ACC0;
 	} else {
 		return ~0;
@@ -86,15 +77,14 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

-	if (add_a == QPU_MUX_A) {
+	if (add_a == QPU_MUX_A)
 		return raddr_a;
-	} else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM) {
+	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
 		return 32 + raddr_b;
-	} else if (add_a <= QPU_MUX_R3) {
+	else if (add_a <= QPU_MUX_R3)
 		return 64 + add_a;
-	} else {
+	else
 		return ~0;
-	}
 }

 static bool
@@ -112,9 +102,9 @@ is_tmu_write(uint32_t waddr)
 }

 static bool
-record_validated_texture_sample(struct vc4_validated_shader_info *validated_shader,
-				struct vc4_shader_validation_state *validation_state,
-				int tmu)
+record_texture_sample(struct vc4_validated_shader_info *validated_shader,
+		      struct vc4_shader_validation_state *validation_state,
+		      int tmu)
 {
 	uint32_t s = validated_shader->num_texture_samples;
 	int i;
@@ -227,8 +217,8 @@ check_tmu_write(uint64_t inst,
 		validated_shader->uniforms_size += 4;

 	if (submit) {
-		if (!record_validated_texture_sample(validated_shader,
-						     validation_state, tmu)) {
+		if (!record_texture_sample(validated_shader,
+					   validation_state, tmu)) {
 			return false;
 		}

@@ -239,10 +229,10 @@ check_tmu_write(uint64_t inst,
 }

 static bool
-check_register_write(uint64_t inst,
-		     struct vc4_validated_shader_info *validated_shader,
-		     struct vc4_shader_validation_state *validation_state,
-		     bool is_mul)
+check_reg_write(uint64_t inst,
+		struct vc4_validated_shader_info *validated_shader,
+		struct vc4_shader_validation_state *validation_state,
+		bool is_mul)
 {
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
@@ -298,7 +288,7 @@ check_register_write(uint64_t inst,
 		return true;

 	case QPU_W_TLB_STENCIL_SETUP:
-                return true;
+		return true;
 	}

 	return true;
@@ -361,7 +351,7 @@ track_live_clamps(uint64_t inst,
 		}

 		validation_state->live_max_clamp_regs[lri_add] = true;
-	} if (op_add == QPU_A_MIN) {
+	} else if (op_add == QPU_A_MIN) {
 		/* Track live clamps of a value clamped to a minimum of 0 and
 		 * a maximum of some uniform's offset.
 		 */
@@ -393,8 +383,10 @@ check_instruction_writes(uint64_t inst,
 		return false;
 	}

-	ok = (check_register_write(inst, validated_shader, validation_state, false) &&
-	      check_register_write(inst, validated_shader, validation_state, true));
+	ok = (check_reg_write(inst, validated_shader, validation_state,
+			      false) &&
+	      check_reg_write(inst, validated_shader, validation_state,
+			      true));

 	track_live_clamps(inst, validated_shader, validation_state);

@@ -442,7 +434,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	shader = shader_obj->vaddr;
 	max_ip = shader_obj->base.size / sizeof(uint64_t);

-	validated_shader = kcalloc(sizeof(*validated_shader), 1, GFP_KERNEL);
+	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
 	if (!validated_shader)
 		return NULL;

@@ -498,7 +490,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)

 	if (ip == max_ip) {
 		DRM_ERROR("shader failed to terminate before "
-			  "shader BO end at %d\n",
+			  "shader BO end at %zd\n",
 			  shader_obj->base.size);
 		goto fail;
 	}
@@ -514,6 +506,9 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	return validated_shader;

 fail:
-	kfree(validated_shader);
+	if (validated_shader) {
+		kfree(validated_shader->texture_samples);
+		kfree(validated_shader);
+	}
 	return NULL;
 }
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -41,24 +41,53 @@ vc4_get_blit_surface(struct pipe_context *pctx,
        return pctx->create_surface(pctx, prsc, &tmpl);
 }

+static bool
+is_tile_unaligned(unsigned size, unsigned tile_size)
+{
+        return size & (tile_size - 1);
+}
+
 static bool
 vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
 {
        struct vc4_context *vc4 = vc4_context(pctx);
+        bool old_msaa = vc4->msaa;
+        int old_tile_width = vc4->tile_width;
+        int old_tile_height = vc4->tile_height;
+        bool msaa = (info->src.resource->nr_samples ||
+                     info->dst.resource->nr_samples);
+        int tile_width = msaa ? 32 : 64;
+        int tile_height = msaa ? 32 : 64;

        if (util_format_is_depth_or_stencil(info->dst.resource->format))
                return false;

+        if (info->scissor_enable)
+                return false;
+
        if ((info->mask & PIPE_MASK_RGBA) == 0)
                return false;

-        if (info->dst.box.x != 0 || info->dst.box.y != 0 ||
-            info->src.box.x != 0 || info->src.box.y != 0 ||
+        if (info->dst.box.x != info->src.box.x ||
+            info->dst.box.y != info->src.box.y ||
            info->dst.box.width != info->src.box.width ||
            info->dst.box.height != info->src.box.height) {
                return false;
        }

+        int dst_surface_width = u_minify(info->dst.resource->width0,
+                                         info->dst.level);
+        int dst_surface_height = u_minify(info->dst.resource->height0,
+                                         info->dst.level);
+        if (is_tile_unaligned(info->dst.box.x, tile_width) ||
+            is_tile_unaligned(info->dst.box.y, tile_height) ||
+            (is_tile_unaligned(info->dst.box.width, tile_width) &&
+             info->dst.box.x + info->dst.box.width != dst_surface_width) ||
+            (is_tile_unaligned(info->dst.box.height, tile_height) &&
+             info->dst.box.y + info->dst.box.height != dst_surface_height)) {
+                return false;
+        }
+
        if (info->dst.resource->format != info->src.resource->format)
                return false;

@@ -70,18 +99,32 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
                vc4_get_blit_surface(pctx, info->src.resource, info->src.level);

        pipe_surface_reference(&vc4->color_read, src_surf);
-        pipe_surface_reference(&vc4->color_write, dst_surf);
+        pipe_surface_reference(&vc4->color_write,
+                               dst_surf->texture->nr_samples ? NULL : dst_surf);
+        pipe_surface_reference(&vc4->msaa_color_write,
+                               dst_surf->texture->nr_samples ? dst_surf : NULL);
        pipe_surface_reference(&vc4->zs_read, NULL);
        pipe_surface_reference(&vc4->zs_write, NULL);
-        vc4->draw_min_x = 0;
-        vc4->draw_min_y = 0;
-        vc4->draw_max_x = dst_surf->width;
-        vc4->draw_max_y = dst_surf->height;
+        pipe_surface_reference(&vc4->msaa_zs_write, NULL);
+
+        vc4->draw_min_x = info->dst.box.x;
+        vc4->draw_min_y = info->dst.box.y;
+        vc4->draw_max_x = info->dst.box.x + info->dst.box.width;
+        vc4->draw_max_y = info->dst.box.y + info->dst.box.height;
        vc4->draw_width = dst_surf->width;
        vc4->draw_height = dst_surf->height;
+
+        vc4->tile_width = tile_width;
+        vc4->tile_height = tile_height;
+        vc4->msaa = msaa;
        vc4->needs_flush = true;
+
        vc4_job_submit(vc4);

+        vc4->msaa = old_msaa;
+        vc4->tile_width = old_tile_width;
+        vc4->tile_height = old_tile_height;
+
        pipe_surface_reference(&dst_surf, NULL);
        pipe_surface_reference(&src_surf, NULL);

@@ -131,14 +174,6 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
 {
        struct pipe_blit_info info = *blit_info;

-        if (info.src.resource->nr_samples > 1 &&
-            info.dst.resource->nr_samples <= 1 &&
-            !util_format_is_depth_or_stencil(info.src.resource->format) &&
-            !util_format_is_pure_integer(info.src.resource->format)) {
-                fprintf(stderr, "color resolve unimplemented\n");
-                return;
-        }
-
        if (vc4_tile_blit(pctx, blit_info))
                return;

--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -67,8 +67,16 @@ vc4_flush(struct pipe_context *pctx)
        cl_u8(&bcl, VC4_PACKET_FLUSH);
        cl_end(&vc4->bcl, bcl);

+        vc4->msaa = false;
        if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
-                pipe_surface_reference(&vc4->color_write, cbuf);
+                pipe_surface_reference(&vc4->color_write,
+                                       cbuf->texture->nr_samples ? NULL : cbuf);
+                pipe_surface_reference(&vc4->msaa_color_write,
+                                       cbuf->texture->nr_samples ? cbuf : NULL);
+
+                if (cbuf->texture->nr_samples)
+                        vc4->msaa = true;
+
                if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
                        pipe_surface_reference(&vc4->color_read, cbuf);
                } else {
@@ -78,11 +86,21 @@ vc4_flush(struct pipe_context *pctx)
        } else {
                pipe_surface_reference(&vc4->color_write, NULL);
                pipe_surface_reference(&vc4->color_read, NULL);
+                pipe_surface_reference(&vc4->msaa_color_write, NULL);
        }

        if (vc4->framebuffer.zsbuf &&
            (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
-                pipe_surface_reference(&vc4->zs_write, zsbuf);
+                pipe_surface_reference(&vc4->zs_write,
+                                       zsbuf->texture->nr_samples ?
+                                       NULL : zsbuf);
+                pipe_surface_reference(&vc4->msaa_zs_write,
+                                       zsbuf->texture->nr_samples ?
+                                       zsbuf : NULL);
+
+                if (zsbuf->texture->nr_samples)
+                        vc4->msaa = true;
+
                if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                        pipe_surface_reference(&vc4->zs_read, zsbuf);
                } else {
@@ -91,6 +109,7 @@ vc4_flush(struct pipe_context *pctx)
        } else {
                pipe_surface_reference(&vc4->zs_write, NULL);
                pipe_surface_reference(&vc4->zs_read, NULL);
+                pipe_surface_reference(&vc4->msaa_zs_write, NULL);
        }

        vc4_job_submit(vc4);
@@ -245,6 +264,8 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)

        vc4_debug |= saved_shaderdb_flag;

+        vc4->sample_mask = (1 << VC4_MAX_SAMPLES) - 1;
+
        return &vc4->base;

 fail:
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -206,6 +206,8 @@ struct vc4_context {
        struct pipe_surface *color_write;
        struct pipe_surface *zs_read;
        struct pipe_surface *zs_write;
+        struct pipe_surface *msaa_color_write;
+        struct pipe_surface *msaa_zs_write;
        /** @} */
        /** @{
         * Bounding box of the scissor across all queued drawing.
@@ -224,6 +226,15 @@ struct vc4_context {
        uint32_t draw_width;
        uint32_t draw_height;
        /** @} */
+        /** @{ Tile information, depending on MSAA and float color buffer. */
+        uint32_t draw_tiles_x; /** @< Number of tiles wide for framebuffer. */
+        uint32_t draw_tiles_y; /** @< Number of tiles high for framebuffer. */
+
+        uint32_t tile_width; /** @< Width of a tile. */
+        uint32_t tile_height; /** @< Height of a tile. */
+        /** Whether the current rendering is in a 4X MSAA tile buffer. */
+        bool msaa;
+	/** @} */

        struct util_slab_mempool transfer_pool;
        struct blitter_context *blitter;
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -68,21 +68,17 @@ vc4_start_draw(struct vc4_context *vc4)

        vc4_get_draw_cl_space(vc4);

-        uint32_t width = vc4->framebuffer.width;
-        uint32_t height = vc4->framebuffer.height;
-        uint32_t tilew = align(width, 64) / 64;
-        uint32_t tileh = align(height, 64) / 64;
        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
-
        //   Tile state data is 48 bytes per tile, I think it can be thrown away
        //   as soon as binning is finished.
        cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
        cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */
        cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */
        cl_u32(&bcl, 0); /* tile state addr, filled by kernel */
-        cl_u8(&bcl, tilew);
-        cl_u8(&bcl, tileh);
-        cl_u8(&bcl, 0); /* flags, filled by kernel. */
+        cl_u8(&bcl, vc4->draw_tiles_x);
+        cl_u8(&bcl, vc4->draw_tiles_y);
+        /* Other flags are filled by kernel. */
+        cl_u8(&bcl, vc4->msaa ? VC4_BIN_CONFIG_MS_MODE_4X : 0);

        /* START_TILE_BINNING resets the statechange counters in the hardware,
         * which are what is used when a primitive is binned to a tile to
@@ -102,8 +98,8 @@ vc4_start_draw(struct vc4_context *vc4)

        vc4->needs_flush = true;
        vc4->draw_calls_queued++;
-        vc4->draw_width = width;
-        vc4->draw_height = height;
+        vc4->draw_width = vc4->framebuffer.width;
+        vc4->draw_height = vc4->framebuffer.height;

        cl_end(&vc4->bcl, bcl);
 }
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -44,10 +44,13 @@ struct drm_vc4_submit_rcl_surface {
 	uint32_t hindex; /* Handle index, or ~0 if not present. */
 	uint32_t offset; /* Offset to start of buffer. */
 	/*
-         * Bits for either render config (color_ms_write) or load/store packet.
+         * Bits for either render config (color_write) or load/store packet.
+         * Bits should all be 0 for MSAA load/stores.
 	 */
 	uint16_t bits;
-	uint16_t pad;
+
+#define VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES		(1 << 0)
+	uint16_t flags;
 };

 /**
@@ -126,9 +129,11 @@ struct drm_vc4_submit_cl {
 	uint8_t max_x_tile;
 	uint8_t max_y_tile;
 	struct drm_vc4_submit_rcl_surface color_read;
-	struct drm_vc4_submit_rcl_surface color_ms_write;
+	struct drm_vc4_submit_rcl_surface color_write;
 	struct drm_vc4_submit_rcl_surface zs_read;
 	struct drm_vc4_submit_rcl_surface zs_write;
+	struct drm_vc4_submit_rcl_surface msaa_color_write;
+	struct drm_vc4_submit_rcl_surface msaa_zs_write;
 	uint32_t clear_color[2];
 	uint32_t clear_z;
 	uint8_t clear_s;
--- a/src/gallium/drivers/vc4/vc4_emit.c
+++ b/src/gallium/drivers/vc4/vc4_emit.c
@@ -29,17 +29,35 @@ vc4_emit_state(struct pipe_context *pctx)
        struct vc4_context *vc4 = vc4_context(pctx);

        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
-        if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) {
+        if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT |
+                          VC4_DIRTY_RASTERIZER)) {
                float *vpscale = vc4->viewport.scale;
                float *vptranslate = vc4->viewport.translate;
                float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
                float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
                float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
                float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
-                uint32_t minx = MAX2(vc4->scissor.minx, vp_minx);
-                uint32_t miny = MAX2(vc4->scissor.miny, vp_miny);
-                uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx);
-                uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy);
+
+                /* Clip to the scissor if it's enabled, but still clip to the
+                 * drawable regardless since that controls where the binner
+                 * tries to put things.
+                 *
+                 * Additionally, always clip the rendering to the viewport,
+                 * since the hardware does guardband clipping, meaning
+                 * primitives would rasterize outside of the view volume.
+                 */
+                uint32_t minx, miny, maxx, maxy;
+                if (!vc4->rasterizer->base.scissor) {
+                        minx = MAX2(vp_minx, 0);
+                        miny = MAX2(vp_miny, 0);
+                        maxx = MIN2(vp_maxx, vc4->draw_width);
+                        maxy = MIN2(vp_maxy, vc4->draw_height);
+                } else {
+                        minx = MAX2(vp_minx, vc4->scissor.minx);
+                        miny = MAX2(vp_miny, vc4->scissor.miny);
+                        maxx = MIN2(vp_maxx, vc4->scissor.maxx);
+                        maxy = MIN2(vp_maxy, vc4->scissor.maxy);
+                }

                cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW);
                cl_u16(&bcl, minx);
@@ -54,6 +72,20 @@ vc4_emit_state(struct pipe_context *pctx)
        }

        if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) {
+                uint8_t ez_enable_mask_out = ~0;
+
+                /* HW-2905: If the RCL ends up doing a full-res load when
+                 * multisampling, then early Z tracking may end up with values
+                 * from the previous tile due to a HW bug.  Disable it to
+                 * avoid that.
+                 *
+                 * We should be able to skip this when the Z is cleared, but I
+                 * was seeing bad rendering on glxgears -samples 4 even in
+                 * that case.
+                 */
+                if (vc4->msaa)
+                        ez_enable_mask_out &= ~VC4_CONFIG_BITS_EARLY_Z;
+
                cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS);
                cl_u8(&bcl,
                      vc4->rasterizer->config_bits[0] |
@@ -62,8 +94,8 @@ vc4_emit_state(struct pipe_context *pctx)
                      vc4->rasterizer->config_bits[1] |
                      vc4->zsa->config_bits[1]);
                cl_u8(&bcl,
-                      vc4->rasterizer->config_bits[2] |
-                      vc4->zsa->config_bits[2]);
+                      (vc4->rasterizer->config_bits[2] |
+                       vc4->zsa->config_bits[2]) & ez_enable_mask_out);
        }

        if (vc4->dirty & VC4_DIRTY_RASTERIZER) {
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -89,31 +89,37 @@ vc4_submit_setup_rcl_surface(struct vc4_context *vc4,
        submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
        submit_surf->offset = surf->offset;

-        if (is_depth) {
-                submit_surf->bits =
-                        VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
-                                      VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+        if (psurf->texture->nr_samples == 0) {
+                if (is_depth) {
+                        submit_surf->bits =
+                                VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
+                                              VC4_LOADSTORE_TILE_BUFFER_BUFFER);

+                } else {
+                        submit_surf->bits =
+                                VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_COLOR,
+                                              VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+                                VC4_SET_FIELD(vc4_rt_format_is_565(psurf->format) ?
+                                              VC4_LOADSTORE_TILE_BUFFER_BGR565 :
+                                              VC4_LOADSTORE_TILE_BUFFER_RGBA8888,
+                                              VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+                }
+                submit_surf->bits |=
+                        VC4_SET_FIELD(surf->tiling,
+                                      VC4_LOADSTORE_TILE_BUFFER_TILING);
        } else {
-                submit_surf->bits =
-                        VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_COLOR,
-                                      VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
-                        VC4_SET_FIELD(vc4_rt_format_is_565(psurf->format) ?
-                                      VC4_LOADSTORE_TILE_BUFFER_BGR565 :
-                                      VC4_LOADSTORE_TILE_BUFFER_RGBA8888,
-                                      VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+                assert(!is_write);
+                submit_surf->flags |= VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES;
        }
-        submit_surf->bits |=
-                VC4_SET_FIELD(surf->tiling, VC4_LOADSTORE_TILE_BUFFER_TILING);

        if (is_write)
                rsc->writes++;
 }

 static void
-vc4_submit_setup_ms_rcl_surface(struct vc4_context *vc4,
-                                struct drm_vc4_submit_rcl_surface *submit_surf,
-                                struct pipe_surface *psurf)
+vc4_submit_setup_rcl_render_config_surface(struct vc4_context *vc4,
+                                           struct drm_vc4_submit_rcl_surface *submit_surf,
+                                           struct pipe_surface *psurf)
 {
        struct vc4_surface *surf = vc4_surface(psurf);

@@ -126,16 +132,38 @@ vc4_submit_setup_ms_rcl_surface(struct vc4_context *vc4,
        submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
        submit_surf->offset = surf->offset;

-        submit_surf->bits =
-                VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
-                              VC4_RENDER_CONFIG_FORMAT_BGR565 :
-                              VC4_RENDER_CONFIG_FORMAT_RGBA8888,
-                              VC4_RENDER_CONFIG_FORMAT) |
-                VC4_SET_FIELD(surf->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
+        if (psurf->texture->nr_samples == 0) {
+                submit_surf->bits =
+                        VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
+                                      VC4_RENDER_CONFIG_FORMAT_BGR565 :
+                                      VC4_RENDER_CONFIG_FORMAT_RGBA8888,
+                                      VC4_RENDER_CONFIG_FORMAT) |
+                        VC4_SET_FIELD(surf->tiling,
+                                      VC4_RENDER_CONFIG_MEMORY_FORMAT);
+        }

        rsc->writes++;
 }

+static void
+vc4_submit_setup_rcl_msaa_surface(struct vc4_context *vc4,
+                                  struct drm_vc4_submit_rcl_surface *submit_surf,
+                                  struct pipe_surface *psurf)
+{
+        struct vc4_surface *surf = vc4_surface(psurf);
+
+        if (!surf) {
+                submit_surf->hindex = ~0;
+                return;
+        }
+
+        struct vc4_resource *rsc = vc4_resource(psurf->texture);
+        submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
+        submit_surf->offset = surf->offset;
+        submit_surf->bits = 0;
+        rsc->writes++;
+}
+
 /**
 * Submits the job to the kernel and then reinitializes it.
 */
@@ -150,18 +178,35 @@ vc4_job_submit(struct vc4_context *vc4)
        struct drm_vc4_submit_cl submit;
        memset(&submit, 0, sizeof(submit));

-        cl_ensure_space(&vc4->bo_handles, 4 * sizeof(uint32_t));
-        cl_ensure_space(&vc4->bo_pointers, 4 * sizeof(struct vc4_bo *));
+        cl_ensure_space(&vc4->bo_handles, 6 * sizeof(uint32_t));
+        cl_ensure_space(&vc4->bo_pointers, 6 * sizeof(struct vc4_bo *));

        vc4_submit_setup_rcl_surface(vc4, &submit.color_read,
                                     vc4->color_read, false, false);
-        vc4_submit_setup_ms_rcl_surface(vc4, &submit.color_ms_write,
-                                        vc4->color_write);
+        vc4_submit_setup_rcl_render_config_surface(vc4, &submit.color_write,
+                                                   vc4->color_write);
        vc4_submit_setup_rcl_surface(vc4, &submit.zs_read,
                                     vc4->zs_read, true, false);
        vc4_submit_setup_rcl_surface(vc4, &submit.zs_write,
                                     vc4->zs_write, true, true);

+        vc4_submit_setup_rcl_msaa_surface(vc4, &submit.msaa_color_write,
+                                          vc4->msaa_color_write);
+        vc4_submit_setup_rcl_msaa_surface(vc4, &submit.msaa_zs_write,
+                                          vc4->msaa_zs_write);
+
+        if (vc4->msaa) {
+                /* This bit controls how many pixels the general
+                 * (i.e. subsampled) loads/stores are iterating over
+                 * (multisample loads replicate out to the other samples).
+                 */
+                submit.color_write.bits |= VC4_RENDER_CONFIG_MS_MODE_4X;
+                /* Controls whether color_write's
+                 * VC4_PACKET_STORE_MS_TILE_BUFFER does 4x decimation
+                 */
+                submit.color_write.bits |= VC4_RENDER_CONFIG_DECIMATE_MODE_4X;
+        }
+
        submit.bo_handles = (uintptr_t)vc4->bo_handles.base;
        submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4;
        submit.bin_cl = (uintptr_t)vc4->bcl.base;
@@ -173,10 +218,10 @@ vc4_job_submit(struct vc4_context *vc4)
        submit.uniforms_size = cl_offset(&vc4->uniforms);

        assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
-        submit.min_x_tile = vc4->draw_min_x / 64;
-        submit.min_y_tile = vc4->draw_min_y / 64;
-        submit.max_x_tile = (vc4->draw_max_x - 1) / 64;
-        submit.max_y_tile = (vc4->draw_max_y - 1) / 64;
+        submit.min_x_tile = vc4->draw_min_x / vc4->tile_width;
+        submit.min_y_tile = vc4->draw_min_y / vc4->tile_height;
+        submit.max_x_tile = (vc4->draw_max_x - 1) / vc4->tile_width;
+        submit.max_y_tile = (vc4->draw_max_y - 1) / vc4->tile_height;
        submit.width = vc4->draw_width;
        submit.height = vc4->draw_height;
        if (vc4->cleared) {
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -29,6 +29,10 @@
 * from the tile buffer after having waited for the scoreboard (which is
 * handled by vc4_qpu_emit.c), then do math using your output color and that
 * destination value, and update the output color appropriately.
+ *
+ * Once this pass is done, the color write will either have one component (for
+ * single sample) with packed argb8888, or 4 components with the per-sample
+ * argb8888 result.
 */

 /**
@@ -40,15 +44,23 @@
 #include "glsl/nir/nir_builder.h"
 #include "vc4_context.h"

+static bool
+blend_depends_on_dst_color(struct vc4_compile *c)
+{
+        return (c->fs_key->blend.blend_enable ||
+                c->fs_key->blend.colormask != 0xf ||
+                c->fs_key->logicop_func != PIPE_LOGICOP_COPY);
+}
+
 /** Emits a load of the previous fragment color from the tile buffer. */
 static nir_ssa_def *
-vc4_nir_get_dst_color(nir_builder *b)
+vc4_nir_get_dst_color(nir_builder *b, int sample)
 {
        nir_intrinsic_instr *load =
                nir_intrinsic_instr_create(b->shader,
                                           nir_intrinsic_load_input);
        load->num_components = 1;
-        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT;
+        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT + sample;
        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
        nir_builder_instr_insert(b, &load->instr);
        return &load->dest.ssa;
@@ -496,23 +508,26 @@ vc4_nir_swizzle_and_pack(struct vc4_compile *c, nir_builder *b,

 }

-static void
-vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
-                          nir_intrinsic_instr *intr)
+static nir_ssa_def *
+vc4_nir_blend_pipeline(struct vc4_compile *c, nir_builder *b, nir_ssa_def *src,
+                       int sample)
 {
        enum pipe_format color_format = c->fs_key->color_format;
        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
        bool srgb = util_format_is_srgb(color_format);

        /* Pull out the float src/dst color components. */
-        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
+        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b, sample);
        nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color);
        nir_ssa_def *src_color[4], *unpacked_dst_color[4];
        for (unsigned i = 0; i < 4; i++) {
-                src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false);
-                unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
+                src_color[i] = nir_channel(b, src, i);
+                unpacked_dst_color[i] = nir_channel(b, dst_vec4, i);
        }

+        if (c->fs_key->sample_alpha_to_one && c->fs_key->msaa)
+                src_color[3] = nir_imm_float(b, 1.0);
+
        vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);

        nir_ssa_def *packed_color;
@@ -560,16 +575,100 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                        colormask &= ~(0xff << (i * 8));
                }
        }
-        packed_color = nir_ior(b,
-                               nir_iand(b, packed_color,
-                                        nir_imm_int(b, colormask)),
-                               nir_iand(b, packed_dst_color,
-                                        nir_imm_int(b, ~colormask)));

-        /* Turn the old vec4 output into a store of the packed color. */
-        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
-                              nir_src_for_ssa(packed_color));
+        return nir_ior(b,
+                       nir_iand(b, packed_color,
+                                nir_imm_int(b, colormask)),
+                       nir_iand(b, packed_dst_color,
+                                nir_imm_int(b, ~colormask)));
+}
+
+static int
+vc4_nir_next_output_driver_location(nir_shader *s)
+{
+        int maxloc = -1;
+
+        nir_foreach_variable(var, &s->outputs)
+                maxloc = MAX2(maxloc, (int)var->data.driver_location);
+
+        return maxloc + 1;
+}
+
+static void
+vc4_nir_store_sample_mask(struct vc4_compile *c, nir_builder *b,
+                          nir_ssa_def *val)
+{
+        nir_variable *sample_mask = nir_variable_create(c->s, nir_var_shader_out,
+                                                        glsl_uint_type(),
+                                                        "sample_mask");
+        sample_mask->data.driver_location =
+                vc4_nir_next_output_driver_location(c->s);
+        sample_mask->data.location = FRAG_RESULT_SAMPLE_MASK;
+
+        nir_intrinsic_instr *intr =
+                nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output);
        intr->num_components = 1;
+        intr->const_index[0] = sample_mask->data.driver_location;
+
+        intr->src[0] = nir_src_for_ssa(val);
+        nir_builder_instr_insert(b, &intr->instr);
+}
+
+static void
+vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        nir_ssa_def *frag_color = intr->src[0].ssa;
+
+        if (c->fs_key->sample_coverage) {
+                nir_intrinsic_instr *load =
+                        nir_intrinsic_instr_create(b->shader,
+                                                   nir_intrinsic_load_sample_mask_in);
+                load->num_components = 1;
+                nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+                nir_builder_instr_insert(b, &load->instr);
+
+                nir_ssa_def *bitmask = &load->dest.ssa;
+
+                vc4_nir_store_sample_mask(c, b, bitmask);
+        } else if (c->fs_key->sample_alpha_to_coverage) {
+                nir_ssa_def *a = nir_channel(b, frag_color, 3);
+
+                /* XXX: We should do a nice dither based on the fragment
+                 * coordinate, instead.
+                 */
+                nir_ssa_def *num_samples = nir_imm_float(b, VC4_MAX_SAMPLES);
+                nir_ssa_def *num_bits = nir_f2i(b, nir_fmul(b, a, num_samples));
+                nir_ssa_def *bitmask = nir_isub(b,
+                                                nir_ishl(b,
+                                                         nir_imm_int(b, 1),
+                                                         num_bits),
+                                                nir_imm_int(b, 1));
+                vc4_nir_store_sample_mask(c, b, bitmask);
+        }
+
+        /* The TLB color read returns each sample in turn, so if our blending
+         * depends on the destination color, we're going to have to run the
+         * blending function separately for each destination sample value, and
+         * then output the per-sample color using TLB_COLOR_MS.
+         */
+        nir_ssa_def *blend_output;
+        if (c->fs_key->msaa && blend_depends_on_dst_color(c)) {
+                c->msaa_per_sample_output = true;
+
+                nir_ssa_def *samples[4];
+                for (int i = 0; i < VC4_MAX_SAMPLES; i++)
+                        samples[i] = vc4_nir_blend_pipeline(c, b, frag_color, i);
+                blend_output = nir_vec4(b,
+                                        samples[0], samples[1],
+                                        samples[2], samples[3]);
+        } else {
+                blend_output = vc4_nir_blend_pipeline(c, b, frag_color, 0);
+        }
+
+        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+                              nir_src_for_ssa(blend_output));
+        intr->num_components = blend_output->num_components;
 }

 static bool
@@ -577,7 +676,7 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
 {
        struct vc4_compile *c = state;

-        nir_foreach_instr(block, instr) {
+        nir_foreach_instr_safe(block, instr) {
                if (instr->type != nir_instr_type_intrinsic)
                        continue;
                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -84,7 +84,7 @@ vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
 static nir_ssa_def *
 vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
 {
-        return nir_swizzle(b, nir_unpack_unorm_4x8(b, src), &chan, 1, false);
+        return nir_channel(b, nir_unpack_unorm_4x8(b, src), chan);
 }

 static nir_ssa_def *
@@ -226,7 +226,9 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
 {
        b->cursor = nir_before_instr(&intr->instr);

-        if (intr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
+        if (intr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT &&
+            intr->const_index[0] < (VC4_NIR_TLB_COLOR_READ_INPUT +
+                                    VC4_MAX_SAMPLES)) {
                /* This doesn't need any lowering. */
                return;
        }
@@ -309,7 +311,8 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
        /* Color output is lowered by vc4_nir_lower_blend(). */
        if (c->stage == QSTAGE_FRAG &&
            (output_var->data.location == FRAG_RESULT_COLOR ||
-             output_var->data.location == FRAG_RESULT_DATA0)) {
+             output_var->data.location == FRAG_RESULT_DATA0 ||
+             output_var->data.location == FRAG_RESULT_SAMPLE_MASK)) {
                intr->const_index[0] *= 4;
                return;
        }
@@ -326,9 +329,8 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;

                assert(intr->src[0].is_ssa);
-                intr_comp->src[0] = nir_src_for_ssa(nir_swizzle(b,
-                                                                intr->src[0].ssa,
-                                                                &i, 1, false));
+                intr_comp->src[0] =
+                        nir_src_for_ssa(nir_channel(b, intr->src[0].ssa, i));
                nir_builder_instr_insert(b, &intr_comp->instr);
        }

--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vc4_qir.h"
+#include "kernel/vc4_packet.h"
+#include "tgsi/tgsi_info.h"
+#include "glsl/nir/nir_builder.h"
+
+/** @file vc4_nir_lower_txf_ms.c
+ * Walks the NIR generated by TGSI-to-NIR to lower its nir_texop_txf_ms
+ * coordinates to do the math necessary and use a plain nir_texop_txf instead.
+ *
+ * MSAA textures are laid out as 32x32-aligned blocks of RGBA8888 or Z24S8.
+ * We can't load them through the normal sampler path because of the lack of
+ * linear support in the hardware.  So, we treat MSAA textures as a giant UBO
+ * and do the math in the shader.
+ */
+
+static void
+vc4_nir_lower_txf_ms_instr(struct vc4_compile *c, nir_builder *b,
+                           nir_tex_instr *txf_ms)
+{
+        if (txf_ms->op != nir_texop_txf_ms)
+                return;
+
+        b->cursor = nir_before_instr(&txf_ms->instr);
+
+        nir_tex_instr *txf = nir_tex_instr_create(c->s, 1);
+        txf->op = nir_texop_txf;
+        txf->sampler = txf_ms->sampler;
+        txf->sampler_index = txf_ms->sampler_index;
+        txf->coord_components = txf_ms->coord_components;
+        txf->is_shadow = txf_ms->is_shadow;
+        txf->is_new_style_shadow = txf_ms->is_new_style_shadow;
+
+        nir_ssa_def *coord = NULL, *sample_index = NULL;
+        for (int i = 0; i < txf_ms->num_srcs; i++) {
+                assert(txf_ms->src[i].src.is_ssa);
+
+                switch (txf_ms->src[i].src_type) {
+                case nir_tex_src_coord:
+                        coord = txf_ms->src[i].src.ssa;
+                        break;
+                case nir_tex_src_ms_index:
+                        sample_index = txf_ms->src[i].src.ssa;
+                        break;
+                default:
+                        unreachable("Unknown txf_ms src\n");
+                }
+        }
+        assert(coord);
+        assert(sample_index);
+
+        nir_ssa_def *x = nir_channel(b, coord, 0);
+        nir_ssa_def *y = nir_channel(b, coord, 1);
+
+        uint32_t tile_w = 32;
+        uint32_t tile_h = 32;
+        uint32_t tile_w_shift = 5;
+        uint32_t tile_h_shift = 5;
+        uint32_t tile_size = (tile_h * tile_w *
+                              VC4_MAX_SAMPLES * sizeof(uint32_t));
+        unsigned unit = txf_ms->sampler_index;
+        uint32_t w = align(c->key->tex[unit].msaa_width, tile_w);
+        uint32_t w_tiles = w / tile_w;
+
+        nir_ssa_def *x_tile = nir_ushr(b, x, nir_imm_int(b, tile_w_shift));
+        nir_ssa_def *y_tile = nir_ushr(b, y, nir_imm_int(b, tile_h_shift));
+        nir_ssa_def *tile_addr = nir_iadd(b,
+                                          nir_imul(b, x_tile,
+                                                   nir_imm_int(b, tile_size)),
+                                          nir_imul(b, y_tile,
+                                                   nir_imm_int(b, (w_tiles *
+                                                                   tile_size))));
+        nir_ssa_def *x_subspan = nir_iand(b, x,
+                                          nir_imm_int(b, (tile_w - 1) & ~1));
+        nir_ssa_def *y_subspan = nir_iand(b, y,
+                                          nir_imm_int(b, (tile_h - 1) & ~1));
+        nir_ssa_def *subspan_addr = nir_iadd(b,
+                                             nir_imul(b, x_subspan,
+                                                      nir_imm_int(b, 2 * VC4_MAX_SAMPLES * sizeof(uint32_t))),
+                                             nir_imul(b, y_subspan,
+                                                      nir_imm_int(b,
+                                                                  tile_w *
+                                                                  VC4_MAX_SAMPLES *
+                                                                  sizeof(uint32_t))));
+
+        nir_ssa_def *pixel_addr = nir_ior(b,
+                                          nir_iand(b,
+                                                   nir_ishl(b, x,
+                                                            nir_imm_int(b, 2)),
+                                                   nir_imm_int(b, (1 << 2))),
+                                          nir_iand(b,
+                                                   nir_ishl(b, y,
+                                                            nir_imm_int(b, 3)),
+                                                   nir_imm_int(b, (1 << 3))));
+
+        nir_ssa_def *sample_addr = nir_ishl(b, sample_index, nir_imm_int(b, 4));
+
+        nir_ssa_def *addr = nir_iadd(b,
+                                     nir_ior(b, sample_addr, pixel_addr),
+                                     nir_iadd(b, subspan_addr, tile_addr));
+
+        txf->src[0].src_type = nir_tex_src_coord;
+        txf->src[0].src = nir_src_for_ssa(nir_vec2(b, addr, nir_imm_int(b, 0)));
+        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, NULL);
+        nir_builder_instr_insert(b, &txf->instr);
+        nir_ssa_def_rewrite_uses(&txf_ms->dest.ssa,
+                                 nir_src_for_ssa(&txf->dest.ssa));
+        nir_instr_remove(&txf_ms->instr);
+}
+
+static bool
+vc4_nir_lower_txf_ms_block(nir_block *block, void *arg)
+{
+        struct vc4_compile *c = arg;
+        nir_function_impl *impl =
+                nir_cf_node_get_function(&block->cf_node);
+
+        nir_builder b;
+        nir_builder_init(&b, impl);
+
+        nir_foreach_instr_safe(block, instr) {
+                if (instr->type == nir_instr_type_tex) {
+                        vc4_nir_lower_txf_ms_instr(c, &b,
+                                                   nir_instr_as_tex(instr));
+                }
+        }
+
+        return true;
+}
+
+static bool
+vc4_nir_lower_txf_ms_impl(struct vc4_compile *c, nir_function_impl *impl)
+{
+        nir_foreach_block(impl, vc4_nir_lower_txf_ms_block, c);
+
+        nir_metadata_preserve(impl,
+                              nir_metadata_block_index |
+                              nir_metadata_dominance);
+
+        return true;
+}
+
+void
+vc4_nir_lower_txf_ms(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload) {
+                if (overload->impl)
+                        vc4_nir_lower_txf_ms_impl(c, overload->impl);
+        }
+}
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -94,7 +94,12 @@ static void
 replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg)
 {
        dump_from(c, inst);
-        inst->op = QOP_MOV;
+        if (qir_is_mul(inst))
+                inst->op = QOP_MMOV;
+        else if (qir_is_float_input(inst))
+                inst->op = QOP_FMOV;
+        else
+                inst->op = QOP_MOV;
        inst->src[0] = arg;
        inst->src[1] = c->undef;
        dump_to(c, inst);
@@ -181,6 +186,7 @@ qir_opt_algebraic(struct vc4_compile *c)
                case QOP_SUB:
                        if (is_zero(c, inst->src[1])) {
                                replace_with_mov(c, inst, inst->src[0]);
+                                progress = true;
                        }
                        break;

--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -294,6 +294,76 @@ ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
                                        qir_uniform_ui(c, 24)));
 }

+static struct qreg
+ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
+{
+        struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
+                                                 qir_uniform_ui(c, 8)));
+        return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
+}
+
+/**
+ * Emits a lowered TXF_MS from an MSAA texture.
+ *
+ * The addressing math has been lowered in NIR, and now we just need to read
+ * it like a UBO.
+ */
+static void
+ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
+{
+        uint32_t tile_width = 32;
+        uint32_t tile_height = 32;
+        uint32_t tile_size = (tile_height * tile_width *
+                              VC4_MAX_SAMPLES * sizeof(uint32_t));
+
+        unsigned unit = instr->sampler_index;
+        uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
+        uint32_t w_tiles = w / tile_width;
+        uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
+        uint32_t h_tiles = h / tile_height;
+        uint32_t size = w_tiles * h_tiles * tile_size;
+
+        struct qreg addr;
+        assert(instr->num_srcs == 1);
+        assert(instr->src[0].src_type == nir_tex_src_coord);
+        addr = ntq_get_src(c, instr->src[0].src, 0);
+
+        /* Perform the clamping required by kernel validation. */
+        addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
+        addr = qir_MIN(c, addr,  qir_uniform_ui(c, size - 4));
+
+        qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
+
+        struct qreg tex = qir_TEX_RESULT(c);
+        c->num_texture_samples++;
+
+        struct qreg texture_output[4];
+        enum pipe_format format = c->key->tex[unit].format;
+        if (util_format_is_depth_or_stencil(format)) {
+                struct qreg scaled = ntq_scale_depth_texture(c, tex);
+                for (int i = 0; i < 4; i++)
+                        texture_output[i] = scaled;
+        } else {
+                struct qreg tex_result_unpacked[4];
+                for (int i = 0; i < 4; i++)
+                        tex_result_unpacked[i] = qir_UNPACK_8_F(c, tex, i);
+
+                const uint8_t *format_swiz =
+                        vc4_get_format_swizzle(c->key->tex[unit].format);
+                for (int i = 0; i < 4; i++) {
+                        texture_output[i] =
+                                get_swizzled_channel(c, tex_result_unpacked,
+                                                     format_swiz[i]);
+                }
+        }
+
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
+        for (int i = 0; i < 4; i++) {
+                dest[i] = get_swizzled_channel(c, texture_output,
+                                               c->key->tex[unit].swizzle[i]);
+        }
+}
+
 static void
 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
 {
@@ -301,6 +371,11 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
        bool is_txb = false, is_txl = false, has_proj = false;
        unsigned unit = instr->sampler_index;

+        if (instr->op == nir_texop_txf) {
+                ntq_emit_txf(c, instr);
+                return;
+        }
+
        for (unsigned i = 0; i < instr->num_srcs; i++) {
                switch (instr->src[i].src_type) {
                case nir_tex_src_coord:
@@ -396,11 +471,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)

        struct qreg unpacked[4];
        if (util_format_is_depth_or_stencil(format)) {
-                struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
-                                                         qir_uniform_ui(c, 8)));
-                struct qreg normalized = qir_FMUL(c, depthf,
-                                                  qir_uniform_f(c, 1.0f/0xffffff));
-
+                struct qreg normalized = ntq_scale_depth_texture(c, tex);
                struct qreg depth_output;

                struct qreg one = qir_uniform_f(c, 1.0f);
@@ -1109,6 +1180,10 @@ emit_frag_end(struct vc4_compile *c)
                }
        }

+        if (c->output_sample_mask_index != -1) {
+                qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
+        }
+
        if (c->fs_key->depth_enabled) {
                struct qreg z;
                if (c->output_position_index != -1) {
@@ -1120,7 +1195,12 @@ emit_frag_end(struct vc4_compile *c)
                qir_TLB_Z_WRITE(c, z);
        }

-        qir_TLB_COLOR_WRITE(c, color);
+        if (!c->msaa_per_sample_output) {
+                qir_TLB_COLOR_WRITE(c, color);
+        } else {
+                for (int i = 0; i < VC4_MAX_SAMPLES; i++)
+                        qir_TLB_COLOR_WRITE_MS(c, c->sample_colors[i]);
+        }
 }

 static void
@@ -1171,7 +1251,7 @@ emit_point_size_write(struct vc4_compile *c)
        struct qreg point_size;

        if (c->output_point_size_index != -1)
-                point_size = c->outputs[c->output_point_size_index + 3];
+                point_size = c->outputs[c->output_point_size_index];
        else
                point_size = qir_uniform_f(c, 1.0);

@@ -1359,6 +1439,9 @@ ntq_setup_outputs(struct vc4_compile *c)
                        case FRAG_RESULT_DEPTH:
                                c->output_position_index = loc;
                                break;
+                        case FRAG_RESULT_SAMPLE_MASK:
+                                c->output_sample_mask_index = loc;
+                                break;
                        }
                } else {
                        switch (var->data.location) {
@@ -1462,20 +1545,48 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                                    instr->const_index[0]);
                break;

+        case nir_intrinsic_load_sample_mask_in:
+                *dest = qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0);
+                break;
+
        case nir_intrinsic_load_input:
                assert(instr->num_components == 1);
-                if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
-                        *dest = qir_TLB_COLOR_READ(c);
+                if (instr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT) {
+                        /* Reads of the per-sample color need to be done in
+                         * order.
+                         */
+                        int sample_index = (instr->const_index[0] -
+                                           VC4_NIR_TLB_COLOR_READ_INPUT);
+                        for (int i = 0; i <= sample_index; i++) {
+                                if (c->color_reads[i].file == QFILE_NULL) {
+                                        c->color_reads[i] =
+                                                qir_TLB_COLOR_READ(c);
+                                }
+                        }
+                        *dest = c->color_reads[sample_index];
                } else {
                        *dest = c->inputs[instr->const_index[0]];
                }
                break;

        case nir_intrinsic_store_output:
-                assert(instr->num_components == 1);
-                c->outputs[instr->const_index[0]] =
-                        qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
-                c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1);
+                /* MSAA color outputs are the only case where we have an
+                 * output that's not lowered to being a store of a single 32
+                 * bit value.
+                 */
+                if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
+                        assert(instr->const_index[0] == c->output_color_index);
+                        for (int i = 0; i < 4; i++) {
+                                c->sample_colors[i] =
+                                        qir_MOV(c, ntq_get_src(c, instr->src[0],
+                                                               i));
+                        }
+                } else {
+                        assert(instr->num_components == 1);
+                        c->outputs[instr->const_index[0]] =
+                                qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
+                        c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1);
+                }
                break;

        case nir_intrinsic_discard:
@@ -1672,6 +1783,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
                nir_lower_clip_vs(c->s, c->key->ucp_enables);

        vc4_nir_lower_io(c);
+        vc4_nir_lower_txf_ms(c);
        nir_lower_idiv(c->s);
        nir_lower_load_const_to_scalar(c->s);

@@ -1907,12 +2019,19 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                struct pipe_sampler_state *sampler_state =
                        texstate->samplers[i];

-                if (sampler) {
-                        key->tex[i].format = sampler->format;
-                        key->tex[i].swizzle[0] = sampler->swizzle_r;
-                        key->tex[i].swizzle[1] = sampler->swizzle_g;
-                        key->tex[i].swizzle[2] = sampler->swizzle_b;
-                        key->tex[i].swizzle[3] = sampler->swizzle_a;
+                if (!sampler)
+                        continue;
+
+                key->tex[i].format = sampler->format;
+                key->tex[i].swizzle[0] = sampler->swizzle_r;
+                key->tex[i].swizzle[1] = sampler->swizzle_g;
+                key->tex[i].swizzle[2] = sampler->swizzle_b;
+                key->tex[i].swizzle[3] = sampler->swizzle_a;
+
+                if (sampler->texture->nr_samples) {
+                        key->tex[i].msaa_width = sampler->texture->width0;
+                        key->tex[i].msaa_height = sampler->texture->height0;
+                } else if (sampler){
                        key->tex[i].compare_mode = sampler_state->compare_mode;
                        key->tex[i].compare_func = sampler_state->compare_func;
                        key->tex[i].wrap_s = sampler_state->wrap_s;
@@ -1952,6 +2071,11 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
        } else {
                key->logicop_func = PIPE_LOGICOP_COPY;
        }
+        key->msaa = vc4->rasterizer->base.multisample;
+        key->sample_coverage = (vc4->rasterizer->base.multisample &&
+                                vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
+        key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
+        key->sample_alpha_to_one = vc4->blend->alpha_to_one;
        if (vc4->framebuffer.cbufs[0])
                key->color_format = vc4->framebuffer.cbufs[0]->format;

--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -86,7 +86,9 @@ static const struct qir_op_info qir_op_info[] = {
        [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
        [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
        [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1, true },
+        [QOP_TLB_COLOR_WRITE_MS] = { "tlb_color_ms", 0, 1, true },
        [QOP_TLB_COLOR_READ] = { "tlb_color_read", 1, 0 },
+        [QOP_MS_MASK] = { "ms_mask", 0, 1, true },
        [QOP_VARY_ADD_C] = { "vary_add_c", 1, 1 },

        [QOP_FRAG_X] = { "frag_x", 1, 0 },
@@ -399,6 +401,7 @@ qir_compile_init(void)
        c->output_position_index = -1;
        c->output_color_index = -1;
        c->output_point_size_index = -1;
+        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);
@@ -420,13 +423,19 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
 struct qreg
 qir_follow_movs(struct vc4_compile *c, struct qreg reg)
 {
+        int pack = reg.pack;
+
        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
-               c->defs[reg.index]->op == QOP_MOV &&
-               !c->defs[reg.index]->dst.pack) {
+               (c->defs[reg.index]->op == QOP_MOV ||
+                c->defs[reg.index]->op == QOP_FMOV ||
+                c->defs[reg.index]->op == QOP_MMOV)&&
+               !c->defs[reg.index]->dst.pack &&
+               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

+        reg.pack = pack;
        return reg;
 }

--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -38,6 +38,7 @@

 #include "vc4_screen.h"
 #include "vc4_qpu_defines.h"
+#include "kernel/vc4_packet.h"
 #include "pipe/p_state.h"

 struct nir_builder;
@@ -121,7 +122,9 @@ enum qop {
        QOP_TLB_STENCIL_SETUP,
        QOP_TLB_Z_WRITE,
        QOP_TLB_COLOR_WRITE,
+        QOP_TLB_COLOR_WRITE_MS,
        QOP_TLB_COLOR_READ,
+        QOP_MS_MASK,
        QOP_VARY_ADD_C,

        QOP_FRAG_X,
@@ -230,6 +233,8 @@ enum quniform_contents {
        /** A reference to a texture config parameter 2 cubemap stride uniform */
        QUNIFORM_TEXTURE_CONFIG_P2,

+        QUNIFORM_TEXTURE_MSAA_ADDR,
+
        QUNIFORM_UBO_ADDR,

        QUNIFORM_TEXRECT_SCALE_X,
@@ -247,6 +252,7 @@ enum quniform_contents {
        QUNIFORM_STENCIL,

        QUNIFORM_ALPHA_REF,
+        QUNIFORM_SAMPLE_MASK,
 };

 struct vc4_varying_slot {
@@ -283,11 +289,18 @@ struct vc4_key {
        struct vc4_uncompiled_shader *shader_state;
        struct {
                enum pipe_format format;
-                unsigned compare_mode:1;
-                unsigned compare_func:3;
-                unsigned wrap_s:3;
-                unsigned wrap_t:3;
                uint8_t swizzle[4];
+                union {
+                        struct {
+                                unsigned compare_mode:1;
+                                unsigned compare_func:3;
+                                unsigned wrap_s:3;
+                                unsigned wrap_t:3;
+                        };
+                        struct {
+                                uint16_t msaa_width, msaa_height;
+                        };
+                };
        } tex[VC4_MAX_TEXTURE_SAMPLERS];
        uint8_t ucp_enables;
 };
@@ -304,6 +317,10 @@ struct vc4_fs_key {
        bool alpha_test;
        bool point_coord_upper_left;
        bool light_twoside;
+        bool msaa;
+        bool sample_coverage;
+        bool sample_alpha_to_coverage;
+        bool sample_alpha_to_one;
        uint8_t alpha_test_func;
        uint8_t logicop_func;
        uint32_t point_sprite_mask;
@@ -348,6 +365,9 @@ struct vc4_compile {
         */
        struct qreg *inputs;
        struct qreg *outputs;
+        bool msaa_per_sample_output;
+        struct qreg color_reads[VC4_MAX_SAMPLES];
+        struct qreg sample_colors[VC4_MAX_SAMPLES];
        uint32_t inputs_array_size;
        uint32_t outputs_array_size;
        uint32_t uniforms_array_size;
@@ -396,6 +416,7 @@ struct vc4_compile {
        uint32_t output_position_index;
        uint32_t output_color_index;
        uint32_t output_point_size_index;
+        uint32_t output_sample_mask_index;

        struct qreg undef;
        enum qstage stage;
@@ -418,6 +439,8 @@ struct vc4_compile {
 */
 #define VC4_NIR_TLB_COLOR_READ_INPUT		2000000000

+#define VC4_NIR_MS_MASK_OUTPUT			2000000000
+
 /* Special offset for nir_load_uniform values to get a QUNIFORM_*
 * state-dependent value.
 */
@@ -476,6 +499,7 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
                                       enum quniform_contents contents);
 nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
                                          nir_ssa_def **srcs, int swiz);
+void vc4_nir_lower_txf_ms(struct vc4_compile *c);
 void qir_lower_uniforms(struct vc4_compile *c);

 void qpu_schedule_instructions(struct vc4_compile *c);
@@ -616,9 +640,11 @@ QIR_ALU0(FRAG_REV_FLAG)
 QIR_ALU0(TEX_RESULT)
 QIR_ALU0(TLB_COLOR_READ)
 QIR_NODST_1(TLB_COLOR_WRITE)
+QIR_NODST_1(TLB_COLOR_WRITE_MS)
 QIR_NODST_1(TLB_Z_WRITE)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)
+QIR_NODST_1(MS_MASK)

 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -116,6 +116,17 @@ qpu_tlbc()
        return r;
 }

+static inline struct qpu_reg
+qpu_tlbc_ms()
+{
+        struct qpu_reg r = {
+                QPU_MUX_A,
+                QPU_W_TLB_COLOR_MS,
+        };
+
+        return r;
+}
+
 static inline struct qpu_reg qpu_r0(void) { return qpu_rn(0); }
 static inline struct qpu_reg qpu_r1(void) { return qpu_rn(1); }
 static inline struct qpu_reg qpu_r2(void) { return qpu_rn(2); }
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -387,6 +387,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

+                case QOP_MS_MASK:
+                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
+                                             qinst, &unpack);
+                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
+                                           src[0], src[1]) | unpack);
+                        break;
+
                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
@@ -430,6 +438,13 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                        }
                        break;

+                case QOP_TLB_COLOR_WRITE_MS:
+                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
+                        if (discard) {
+                                set_last_cond_add(c, QPU_COND_ZS);
+                        }
+                        break;
+
                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -295,6 +295,10 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                        add_write_dep(state, &state->last_tlb, n);
                        break;

+                case QPU_W_MS_FLAGS:
+                        add_write_dep(state, &state->last_tlb, n);
+                        break;
+
                case QPU_W_NOP:
                        break;

--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -22,6 +22,7 @@
 * IN THE SOFTWARE.
 */

+#include "util/u_blit.h"
 #include "util/u_memory.h"
 #include "util/u_format.h"
 #include "util/u_inlines.h"
@@ -72,11 +73,18 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx,
 {
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_transfer *trans = vc4_transfer(ptrans);
-        struct pipe_resource *prsc = ptrans->resource;
-        struct vc4_resource *rsc = vc4_resource(prsc);
-        struct vc4_resource_slice *slice = &rsc->slices[ptrans->level];

        if (trans->map) {
+                struct vc4_resource *rsc;
+                struct vc4_resource_slice *slice;
+                if (trans->ss_resource) {
+                        rsc = vc4_resource(trans->ss_resource);
+                        slice = &rsc->slices[0];
+                } else {
+                        rsc = vc4_resource(ptrans->resource);
+                        slice = &rsc->slices[ptrans->level];
+                }
+
                if (ptrans->usage & PIPE_TRANSFER_WRITE) {
                        vc4_store_tiled_image(rsc->bo->map + slice->offset +
                                              ptrans->box.z * rsc->cube_map_stride,
@@ -88,10 +96,52 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx,
                free(trans->map);
        }

+        if (trans->ss_resource && (ptrans->usage & PIPE_TRANSFER_WRITE)) {
+                struct pipe_blit_info blit;
+                memset(&blit, 0, sizeof(blit));
+
+                blit.src.resource = trans->ss_resource;
+                blit.src.format = trans->ss_resource->format;
+                blit.src.box.width = trans->ss_box.width;
+                blit.src.box.height = trans->ss_box.height;
+                blit.src.box.depth = 1;
+
+                blit.dst.resource = ptrans->resource;
+                blit.dst.format = ptrans->resource->format;
+                blit.dst.level = ptrans->level;
+                blit.dst.box = trans->ss_box;
+
+                blit.mask = util_format_get_mask(ptrans->resource->format);
+                blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+                pctx->blit(pctx, &blit);
+                vc4_flush(pctx);
+
+                pipe_resource_reference(&trans->ss_resource, NULL);
+        }
+
        pipe_resource_reference(&ptrans->resource, NULL);
        util_slab_free(&vc4->transfer_pool, ptrans);
 }

+static struct pipe_resource *
+vc4_get_temp_resource(struct pipe_context *pctx,
+                      struct pipe_resource *prsc,
+                      const struct pipe_box *box)
+{
+        struct pipe_resource temp_setup;
+
+        memset(&temp_setup, 0, sizeof(temp_setup));
+        temp_setup.target = prsc->target;
+        temp_setup.format = prsc->format;
+        temp_setup.width0 = box->width;
+        temp_setup.height0 = box->height;
+        temp_setup.depth0 = 1;
+        temp_setup.array_size = 1;
+
+        return pctx->screen->resource_create(pctx->screen, &temp_setup);
+}
+
 static void *
 vc4_resource_transfer_map(struct pipe_context *pctx,
                          struct pipe_resource *prsc,
@@ -101,7 +151,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
 {
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_resource *rsc = vc4_resource(prsc);
-        struct vc4_resource_slice *slice = &rsc->slices[level];
        struct vc4_transfer *trans;
        struct pipe_transfer *ptrans;
        enum pipe_format format = prsc->format;
@@ -155,6 +204,50 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
        ptrans->usage = usage;
        ptrans->box = *box;

+        /* If the resource is multisampled, we need to resolve to single
+         * sample.  This seems like it should be handled at a higher layer.
+         */
+        if (prsc->nr_samples) {
+                trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box);
+                if (!trans->ss_resource)
+                        goto fail;
+                assert(!trans->ss_resource->nr_samples);
+
+                /* The ptrans->box gets modified for tile alignment, so save
+                 * the original box for unmap time.
+                 */
+                trans->ss_box = *box;
+
+                if (usage & PIPE_TRANSFER_READ) {
+                        struct pipe_blit_info blit;
+                        memset(&blit, 0, sizeof(blit));
+
+                        blit.src.resource = ptrans->resource;
+                        blit.src.format = ptrans->resource->format;
+                        blit.src.level = ptrans->level;
+                        blit.src.box = trans->ss_box;
+
+                        blit.dst.resource = trans->ss_resource;
+                        blit.dst.format = trans->ss_resource->format;
+                        blit.dst.box.width = trans->ss_box.width;
+                        blit.dst.box.height = trans->ss_box.height;
+                        blit.dst.box.depth = 1;
+
+                        blit.mask = util_format_get_mask(prsc->format);
+                        blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+                        pctx->blit(pctx, &blit);
+                        vc4_flush(pctx);
+                }
+
+                /* The rest of the mapping process should use our temporary. */
+                prsc = trans->ss_resource;
+                rsc = vc4_resource(prsc);
+                ptrans->box.x = 0;
+                ptrans->box.y = 0;
+                ptrans->box.z = 0;
+        }
+
        /* Note that the current kernel implementation is synchronous, so no
         * need to do syncing stuff here yet.
         */
@@ -170,6 +263,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,

        *pptrans = ptrans;

+        struct vc4_resource_slice *slice = &rsc->slices[level];
        if (rsc->tiled) {
                uint32_t utile_w = vc4_utile_width(rsc->cpp);
                uint32_t utile_h = vc4_utile_height(rsc->cpp);
@@ -203,7 +297,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                    ptrans->box.height != orig_height) {
                        vc4_load_tiled_image(trans->map, ptrans->stride,
                                             buf + slice->offset +
-                                             box->z * rsc->cube_map_stride,
+                                             ptrans->box.z * rsc->cube_map_stride,
                                             slice->stride,
                                             slice->tiling, rsc->cpp,
                                             &ptrans->box);
@@ -216,9 +310,9 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                ptrans->layer_stride = ptrans->stride;

                return buf + slice->offset +
-                        box->y / util_format_get_blockheight(format) * ptrans->stride +
-                        box->x / util_format_get_blockwidth(format) * rsc->cpp +
-                        box->z * rsc->cube_map_stride;
+                        ptrans->box.y / util_format_get_blockheight(format) * ptrans->stride +
+                        ptrans->box.x / util_format_get_blockwidth(format) * rsc->cpp +
+                        ptrans->box.z * rsc->cube_map_stride;
        }


@@ -283,7 +377,13 @@ vc4_setup_slices(struct vc4_resource *rsc)

                if (!rsc->tiled) {
                        slice->tiling = VC4_TILING_FORMAT_LINEAR;
-                        level_width = align(level_width, utile_w);
+                        if (prsc->nr_samples) {
+                                /* MSAA (4x) surfaces are stored as raw tile buffer contents. */
+                                level_width = align(level_width, 32);
+                                level_height = align(level_height, 32);
+                        } else {
+                                level_width = align(level_width, utile_w);
+                        }
                } else {
                        if (vc4_size_is_lt(level_width, level_height,
                                           rsc->cpp)) {
@@ -300,7 +400,8 @@ vc4_setup_slices(struct vc4_resource *rsc)
                }

                slice->offset = offset;
-                slice->stride = level_width * rsc->cpp;
+                slice->stride = (level_width * rsc->cpp *
+                                 MAX2(prsc->nr_samples, 1));
                slice->size = level_height * slice->stride;

                offset += slice->size;
@@ -357,7 +458,10 @@ vc4_resource_setup(struct pipe_screen *pscreen,
        prsc->screen = pscreen;

        rsc->base.vtbl = &vc4_resource_vtbl;
-        rsc->cpp = util_format_get_blocksize(tmpl->format);
+        if (prsc->nr_samples == 0)
+                rsc->cpp = util_format_get_blocksize(tmpl->format);
+        else
+                rsc->cpp = sizeof(uint32_t);

        assert(rsc->cpp);

@@ -371,8 +475,12 @@ get_resource_texture_format(struct pipe_resource *prsc)
        uint8_t format = vc4_get_tex_format(prsc->format);

        if (!rsc->tiled) {
-                assert(format == VC4_TEXTURE_TYPE_RGBA8888);
-                return VC4_TEXTURE_TYPE_RGBA32R;
+                if (prsc->nr_samples) {
+                        return ~0;
+                } else {
+                        assert(format == VC4_TEXTURE_TYPE_RGBA8888);
+                        return VC4_TEXTURE_TYPE_RGBA32R;
+                }
        }

        return format;
@@ -389,6 +497,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
         * communicate metadata about tiling currently.
         */
        if (tmpl->target == PIPE_BUFFER ||
+            tmpl->nr_samples ||
            (tmpl->bind & (PIPE_BIND_SCANOUT |
                           PIPE_BIND_LINEAR |
                           PIPE_BIND_SHARED |
@@ -492,13 +601,9 @@ vc4_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf)
        FREE(psurf);
 }

-/** Debug routine to dump the contents of an 8888 surface to the console */
-void
-vc4_dump_surface(struct pipe_surface *psurf)
+static void
+vc4_dump_surface_non_msaa(struct pipe_surface *psurf)
 {
-        if (!psurf)
-                return;
-
        struct pipe_resource *prsc = psurf->texture;
        struct vc4_resource *rsc = vc4_resource(prsc);
        uint32_t *map = vc4_bo_map(rsc->bo);
@@ -592,6 +697,147 @@ vc4_dump_surface(struct pipe_surface *psurf)
        }
 }

+static uint32_t
+vc4_surface_msaa_get_sample(struct pipe_surface *psurf,
+                            uint32_t x, uint32_t y, uint32_t sample)
+{
+        struct pipe_resource *prsc = psurf->texture;
+        struct vc4_resource *rsc = vc4_resource(prsc);
+        uint32_t tile_w = 32, tile_h = 32;
+        uint32_t tiles_w = DIV_ROUND_UP(psurf->width, 32);
+
+        uint32_t tile_x = x / tile_w;
+        uint32_t tile_y = y / tile_h;
+        uint32_t *tile = (vc4_bo_map(rsc->bo) +
+                          VC4_TILE_BUFFER_SIZE * (tile_y * tiles_w + tile_x));
+        uint32_t subtile_x = x % tile_w;
+        uint32_t subtile_y = y % tile_h;
+
+        uint32_t quad_samples = VC4_MAX_SAMPLES * 4;
+        uint32_t tile_stride = quad_samples * tile_w / 2;
+
+        return *((uint32_t *)tile +
+                 (subtile_y >> 1) * tile_stride +
+                 (subtile_x >> 1) * quad_samples +
+                 ((subtile_y & 1) << 1) +
+                 (subtile_x & 1) +
+                 sample);
+}
+
+static void
+vc4_dump_surface_msaa_char(struct pipe_surface *psurf,
+                           uint32_t start_x, uint32_t start_y,
+                           uint32_t w, uint32_t h)
+{
+        bool all_same_color = true;
+        uint32_t all_pix = 0;
+
+        for (int y = start_y; y < start_y + h; y++) {
+                for (int x = start_x; x < start_x + w; x++) {
+                        for (int s = 0; s < VC4_MAX_SAMPLES; s++) {
+                                uint32_t pix = vc4_surface_msaa_get_sample(psurf,
+                                                                           x, y,
+                                                                           s);
+                                if (x == start_x && y == start_y)
+                                        all_pix = pix;
+                                else if (all_pix != pix)
+                                        all_same_color = false;
+                        }
+                }
+        }
+        if (all_same_color) {
+                static const struct {
+                        uint32_t val;
+                        const char *c;
+                } named_colors[] = {
+                        { 0xff000000, "█" },
+                        { 0x00000000, "█" },
+                        { 0xffff0000, "r" },
+                        { 0xff00ff00, "g" },
+                        { 0xff0000ff, "b" },
+                        { 0xffffffff, "w" },
+                };
+                int i;
+                for (i = 0; i < ARRAY_SIZE(named_colors); i++) {
+                        if (named_colors[i].val == all_pix) {
+                                fprintf(stderr, "%s",
+                                        named_colors[i].c);
+                                return;
+                        }
+                }
+                fprintf(stderr, "x");
+        } else {
+                fprintf(stderr, ".");
+        }
+}
+
+static void
+vc4_dump_surface_msaa(struct pipe_surface *psurf)
+{
+        uint32_t tile_w = 32, tile_h = 32;
+        uint32_t tiles_w = DIV_ROUND_UP(psurf->width, tile_w);
+        uint32_t tiles_h = DIV_ROUND_UP(psurf->height, tile_h);
+        uint32_t char_w = 140, char_h = 60;
+        uint32_t char_w_per_tile = char_w / tiles_w - 1;
+        uint32_t char_h_per_tile = char_h / tiles_h - 1;
+        uint32_t found_colors[10];
+        uint32_t num_found_colors = 0;
+
+        fprintf(stderr, "Surface: %dx%d (%dx MSAA)\n",
+                psurf->width, psurf->height, psurf->texture->nr_samples);
+
+        for (int x = 0; x < (char_w_per_tile + 1) * tiles_w; x++)
+                fprintf(stderr, "-");
+        fprintf(stderr, "\n");
+
+        for (int ty = 0; ty < psurf->height; ty += tile_h) {
+                for (int y = 0; y < char_h_per_tile; y++) {
+
+                        for (int tx = 0; tx < psurf->width; tx += tile_w) {
+                                for (int x = 0; x < char_w_per_tile; x++) {
+                                        uint32_t bx1 = (x * tile_w /
+                                                        char_w_per_tile);
+                                        uint32_t bx2 = ((x + 1) * tile_w /
+                                                        char_w_per_tile);
+                                        uint32_t by1 = (y * tile_h /
+                                                        char_h_per_tile);
+                                        uint32_t by2 = ((y + 1) * tile_h /
+                                                        char_h_per_tile);
+
+                                        vc4_dump_surface_msaa_char(psurf,
+                                                                   tx + bx1,
+                                                                   ty + by1,
+                                                                   bx2 - bx1,
+                                                                   by2 - by1);
+                                }
+                                fprintf(stderr, "|");
+                        }
+                        fprintf(stderr, "\n");
+                }
+
+                for (int x = 0; x < (char_w_per_tile + 1) * tiles_w; x++)
+                        fprintf(stderr, "-");
+                fprintf(stderr, "\n");
+        }
+
+        for (int i = 0; i < num_found_colors; i++) {
+                fprintf(stderr, "color %d: 0x%08x\n", i, found_colors[i]);
+        }
+}
+
+/** Debug routine to dump the contents of an 8888 surface to the console */
+void
+vc4_dump_surface(struct pipe_surface *psurf)
+{
+        if (!psurf)
+                return;
+
+        if (psurf->texture->nr_samples)
+                vc4_dump_surface_msaa(psurf);
+        else
+                vc4_dump_surface_non_msaa(psurf);
+}
+
 static void
 vc4_flush_resource(struct pipe_context *pctx, struct pipe_resource *resource)
 {
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -32,6 +32,9 @@
 struct vc4_transfer {
        struct pipe_transfer base;
        void *map;
+
+        struct pipe_resource *ss_resource;
+        struct pipe_box ss_box;
 };

 struct vc4_resource_slice {
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -95,6 +95,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_BLEND_EQUATION_SEPARATE:
        case PIPE_CAP_TWO_SIDED_STENCIL:
        case PIPE_CAP_USER_INDEX_BUFFERS:
+        case PIPE_CAP_TEXTURE_MULTISAMPLE:
                return 1;

                /* lying for GL 2.0 */
@@ -140,7 +141,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
        case PIPE_CAP_CONDITIONAL_RENDER:
        case PIPE_CAP_PRIMITIVE_RESTART:
-        case PIPE_CAP_TEXTURE_MULTISAMPLE:
        case PIPE_CAP_TEXTURE_BARRIER:
        case PIPE_CAP_SM3:
        case PIPE_CAP_INDEP_BLEND_ENABLE:
@@ -358,7 +358,6 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen,
        unsigned retval = 0;

        if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-            (sample_count > 1) ||
            !util_format_is_supported(format, usage)) {
                return FALSE;
        }
@@ -417,11 +416,13 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen,
        }

        if ((usage & PIPE_BIND_RENDER_TARGET) &&
+            (sample_count == 0 || sample_count == VC4_MAX_SAMPLES) &&
            vc4_rt_format_supported(format)) {
                retval |= PIPE_BIND_RENDER_TARGET;
        }

        if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
+            (sample_count == 0 || sample_count == VC4_MAX_SAMPLES) &&
            (vc4_tex_format_supported(format))) {
                retval |= PIPE_BIND_SAMPLER_VIEW;
        }
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -65,7 +65,7 @@ struct drm_device {
 };

 struct drm_gem_object {
-        uint32_t size;
+        size_t size;
        struct drm_device *dev;
 };

--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -79,7 +79,7 @@ static void
 vc4_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask)
 {
        struct vc4_context *vc4 = vc4_context(pctx);
-        vc4->sample_mask = (uint16_t)sample_mask;
+        vc4->sample_mask = sample_mask & ((1 << VC4_MAX_SAMPLES) - 1);
        vc4->dirty |= VC4_DIRTY_SAMPLE_MASK;
 }

@@ -121,6 +121,9 @@ vc4_create_rasterizer_state(struct pipe_context *pctx,
                so->offset_factor = float_to_187_half(cso->offset_scale);
        }

+        if (cso->multisample)
+                so->config_bits[0] |= VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X;
+
        return so;
 }

@@ -457,6 +460,22 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
                         rsc->cpp);
        }

+        vc4->msaa = false;
+        if (cso->cbufs[0])
+                vc4->msaa = cso->cbufs[0]->texture->nr_samples != 0;
+        else if (cso->zsbuf)
+                vc4->msaa = cso->zsbuf->texture->nr_samples != 0;
+
+        if (vc4->msaa) {
+                vc4->tile_width = 32;
+                vc4->tile_height = 32;
+        } else {
+                vc4->tile_width = 64;
+                vc4->tile_height = 64;
+        }
+        vc4->draw_tiles_x = DIV_ROUND_UP(cso->width, vc4->tile_width);
+        vc4->draw_tiles_y = DIV_ROUND_UP(cso->height, vc4->tile_height);
+
        vc4->dirty |= VC4_DIRTY_FRAMEBUFFER;
 }

--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -71,6 +71,18 @@ write_texture_p2(struct vc4_context *vc4,
               VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
 }

+static void
+write_texture_msaa_addr(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                        struct vc4_texture_stateobj *texstate,
+                        uint32_t unit)
+{
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+
+        cl_aligned_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, 0);
+}
+

 #define SWIZ(x,y,z,w) {          \
        UTIL_FORMAT_SWIZZLE_##x, \
@@ -244,6 +256,11 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
                        break;

+                case QUNIFORM_TEXTURE_MSAA_ADDR:
+                        write_texture_msaa_addr(vc4, &uniforms,
+                                                texstate, uinfo->data[i]);
+                        break;
+
                case QUNIFORM_TEXTURE_BORDER_COLOR:
                        write_texture_border_color(vc4, &uniforms,
                                                   texstate, uinfo->data[i]);
@@ -303,6 +320,10 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                        cl_aligned_f(&uniforms,
                                     vc4->zsa->base.alpha.ref_value);
                        break;
+
+                case QUNIFORM_SAMPLE_MASK:
+                        cl_aligned_u32(&uniforms, vc4->sample_mask);
+                        break;
                }
 #if 0
                uint32_t written_val = *((uint32_t *)uniforms - 1);
@@ -345,6 +366,7 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                case QUNIFORM_TEXTURE_CONFIG_P1:
                case QUNIFORM_TEXTURE_CONFIG_P2:
                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                case QUNIFORM_TEXTURE_MSAA_ADDR:
                case QUNIFORM_TEXRECT_SCALE_X:
                case QUNIFORM_TEXRECT_SCALE_Y:
                        dirty |= VC4_DIRTY_TEXSTATE;
@@ -363,6 +385,10 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                case QUNIFORM_ALPHA_REF:
                        dirty |= VC4_DIRTY_ZSA;
                        break;
+
+                case QUNIFORM_SAMPLE_MASK:
+                        dirty |= VC4_DIRTY_SAMPLE_MASK;
+                        break;
                }
        }

--- a/src/gallium/state_trackers/clover/core/platform.cpp
+++ b/src/gallium/state_trackers/clover/core/platform.cpp
@@ -32,7 +32,8 @@ platform::platform() : adaptor_range(evals(), devs) {

   for (pipe_loader_device *ldev : ldevs) {
      try {
-         devs.push_back(create<device>(*this, ldev));
+         if (ldev)
+            devs.push_back(create<device>(*this, ldev));
      } catch (error &) {
         pipe_loader_release(&ldev, 1);
      }
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -1446,6 +1446,7 @@ dri2_init_screen(__DRIscreen * sPriv)
   struct pipe_screen *pscreen = NULL;
   const struct drm_conf_ret *throttle_ret;
   const struct drm_conf_ret *dmabuf_ret;
+   int fd = -1;

   screen = CALLOC_STRUCT(dri_screen);
   if (!screen)
@@ -1457,7 +1458,10 @@ dri2_init_screen(__DRIscreen * sPriv)

   sPriv->driverPrivate = (void *)screen;

-   if (pipe_loader_drm_probe_fd(&screen->dev, dup(screen->fd)))
+   if (screen->fd < 0 || (fd = dup(screen->fd)) < 0)
+      goto fail;
+
+   if (pipe_loader_drm_probe_fd(&screen->dev, fd))
      pscreen = pipe_loader_create_screen(screen->dev);

   if (!pscreen)
@@ -1502,6 +1506,8 @@ fail:
   dri_destroy_screen_helper(screen);
   if (screen->dev)
      pipe_loader_release(&screen->dev, 1);
+   else
+      close(fd);
   FREE(screen);
   return NULL;
 }
@@ -1519,6 +1525,7 @@ dri_kms_init_screen(__DRIscreen * sPriv)
   struct dri_screen *screen;
   struct pipe_screen *pscreen = NULL;
   uint64_t cap;
+   int fd = -1;

   screen = CALLOC_STRUCT(dri_screen);
   if (!screen)
@@ -1529,7 +1536,10 @@ dri_kms_init_screen(__DRIscreen * sPriv)

   sPriv->driverPrivate = (void *)screen;

-   if (pipe_loader_sw_probe_kms(&screen->dev, dup(screen->fd)))
+   if (screen->fd < 0 || (fd = dup(screen->fd)) < 0)
+      goto fail;
+
+   if (pipe_loader_sw_probe_kms(&screen->dev, fd))
      pscreen = pipe_loader_create_screen(screen->dev);

   if (!pscreen)
@@ -1557,6 +1567,8 @@ fail:
   dri_destroy_screen_helper(screen);
   if (screen->dev)
      pipe_loader_release(&screen->dev, 1);
+   else
+      close(fd);
   FREE(screen);
 #endif // GALLIUM_SOFTPIPE
   return NULL;
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -28,10 +28,14 @@

 #include "pipe/p_screen.h"

+#include "util/u_video.h"
+
 #include "vl/vl_winsys.h"

 #include "va_private.h"

+DEBUG_GET_ONCE_BOOL_OPTION(mpeg4, "VAAPI_MPEG4_ENABLED", false)
+
 VAStatus
 vlVaQueryConfigProfiles(VADriverContextP ctx, VAProfile *profile_list, int *num_profiles)
 {
@@ -45,12 +49,16 @@ vlVaQueryConfigProfiles(VADriverContextP ctx, VAProfile *profile_list, int *num_
   *num_profiles = 0;

   pscreen = VL_VA_PSCREEN(ctx);
-   for (p = PIPE_VIDEO_PROFILE_MPEG2_SIMPLE; p <= PIPE_VIDEO_PROFILE_HEVC_MAIN_444; ++p)
+   for (p = PIPE_VIDEO_PROFILE_MPEG2_SIMPLE; p <= PIPE_VIDEO_PROFILE_HEVC_MAIN_444; ++p) {
+      if (u_reduce_video_profile(p) == PIPE_VIDEO_FORMAT_MPEG4 && !debug_get_option_mpeg4())
+         continue;
+
      if (pscreen->get_video_param(pscreen, p, PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_SUPPORTED)) {
         vap = PipeToProfile(p);
         if (vap != VAProfileNone)
            profile_list[(*num_profiles)++] = vap;
      }
+   }

   /* Support postprocessing through vl_compositor */
   profile_list[(*num_profiles)++] = VAProfileNone;
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -152,11 +152,15 @@ xa_tracker_create(int drm_fd)
    struct xa_tracker *xa = calloc(1, sizeof(struct xa_tracker));
    enum xa_surface_type stype;
    unsigned int num_formats;
+    int fd = -1;

    if (!xa)
 	return NULL;

-    if (pipe_loader_drm_probe_fd(&xa->dev, dup(drm_fd)))
+    if (drm_fd < 0 || (fd = dup(drm_fd)) < 0)
+	goto out_no_fd;
+
+    if (pipe_loader_drm_probe_fd(&xa->dev, fd))
 	xa->screen = pipe_loader_create_screen(xa->dev);

    if (!xa->screen)
@@ -208,6 +212,9 @@ xa_tracker_create(int drm_fd)
 out_no_screen:
    if (xa->dev)
 	pipe_loader_release(&xa->dev, 1);
+    fd = -1;
+ out_no_fd:
+    close(fd);
    free(xa);
    return NULL;
 }
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -31,6 +31,7 @@
 #include "pipe/p_state.h"

 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"
 #include "state_tracker/drm_driver.h"

 #include "d3dadapter/d3dadapter9.h"
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -1,4 +1,5 @@
 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"

 #include "dri_screen.h"

--- a/src/gallium/targets/omx/target.c
+++ b/src/gallium/targets/omx/target.c
@@ -1 +1,2 @@
 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -20,7 +20,7 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(ELF_LIB) \
-	-ldl \
+	$(DLOPEN_LIBS) \
 	-lclangCodeGen \
 	-lclangFrontendTool \
 	-lclangFrontend \
--- a/src/gallium/targets/va/target.c
+++ b/src/gallium/targets/va/target.c
@@ -1 +1,2 @@
 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"
--- a/src/gallium/targets/vdpau/target.c
+++ b/src/gallium/targets/vdpau/target.c
@@ -1 +1,2 @@
 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"
--- a/src/gallium/targets/xa/target.c
+++ b/src/gallium/targets/xa/target.c
@@ -1 +1,2 @@
 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"
--- a/src/gallium/targets/xvmc/target.c
+++ b/src/gallium/targets/xvmc/target.c
@@ -1 +1,2 @@
 #include "target-helpers/drm_helper.h"
+#include "target-helpers/sw_helper.h"
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -1737,7 +1737,7 @@ ast_function_expression::handle_method(exec_list *instructions,
            result = new(ctx) ir_constant(op->type->array_size());
         }
      } else if (op->type->is_vector()) {
-         if (state->ARB_shading_language_420pack_enable) {
+         if (state->has_420pack()) {
            /* .length() returns int. */
            result = new(ctx) ir_constant((int) op->type->vector_elements);
         } else {
@@ -1746,7 +1746,7 @@ ast_function_expression::handle_method(exec_list *instructions,
            goto fail;
         }
      } else if (op->type->is_matrix()) {
-         if (state->ARB_shading_language_420pack_enable) {
+         if (state->has_420pack()) {
            /* .length() returns int. */
            result = new(ctx) ir_constant((int) op->type->matrix_columns);
         } else {
@@ -2075,7 +2075,7 @@ ast_aggregate_initializer::hir(exec_list *instructions,
   }
   const glsl_type *const constructor_type = this->constructor_type;

-   if (!state->ARB_shading_language_420pack_enable) {
+   if (!state->has_420pack()) {
      _mesa_glsl_error(&loc, state, "C-style initialization requires the "
                       "GL_ARB_shading_language_420pack extension");
      return ir_rvalue::error_value(ctx);
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2649,7 +2649,9 @@ apply_explicit_binding(struct _mesa_glsl_parse_state *state,

         return;
      }
-   } else if (state->is_version(420, 310) && base_type->is_image()) {
+   } else if ((state->is_version(420, 310) ||
+               state->ARB_shading_language_420pack_enable) &&
+              base_type->is_image()) {
      assert(ctx->Const.MaxImageUnits <= MAX_IMAGE_UNITS);
      if (max_index >= ctx->Const.MaxImageUnits) {
         _mesa_glsl_error(loc, state, "Image binding %d exceeds the "
@@ -3736,7 +3738,7 @@ process_initializer(ir_variable *var, ast_declaration *decl,
             * expressions. Const-qualified global variables must still be
             * initialized with constant expressions.
             */
-            if (!state->ARB_shading_language_420pack_enable
+            if (!state->has_420pack()
                || state->current_function == NULL) {
               _mesa_glsl_error(& initializer_loc, state,
                                "initializer of %s variable `%s' must be a "
@@ -5365,7 +5367,7 @@ ast_jump_statement::hir(exec_list *instructions,
         if (state->current_function->return_type != ret_type) {
            YYLTYPE loc = this->get_location();

-            if (state->ARB_shading_language_420pack_enable) {
+            if (state->has_420pack()) {
               if (!apply_implicit_conversion(state->current_function->return_type,
                                              ret, state)) {
                  _mesa_glsl_error(& loc, state,
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -948,7 +948,7 @@ parameter_qualifier:
      if (($1.flags.q.in || $1.flags.q.out) && ($2.flags.q.in || $2.flags.q.out))
         _mesa_glsl_error(&@1, state, "duplicate in/out/inout qualifier");

-      if (!state->has_420pack() && $2.flags.q.constant)
+      if (!state->has_420pack_or_es31() && $2.flags.q.constant)
         _mesa_glsl_error(&@1, state, "in/out/inout must come after const "
                                      "or precise");

@@ -960,7 +960,7 @@ parameter_qualifier:
      if ($2.precision != ast_precision_none)
         _mesa_glsl_error(&@1, state, "duplicate precision qualifier");

-      if (!(state->has_420pack() || state->is_version(420, 310)) &&
+      if (!state->has_420pack_or_es31() &&
          $2.flags.i != 0)
         _mesa_glsl_error(&@1, state, "precision qualifiers must come last");

@@ -1482,7 +1482,7 @@ layout_qualifier_id:
         $$.index = $3;
      }

-      if ((state->has_420pack() ||
+      if ((state->has_420pack_or_es31() ||
           state->has_atomic_counters() ||
           state->has_shader_storage_buffer_objects()) &&
          match_layout_qualifier("binding", $1, state) == 0) {
@@ -1714,7 +1714,7 @@ type_qualifier:
      if ($2.flags.q.invariant)
         _mesa_glsl_error(&@1, state, "duplicate \"invariant\" qualifier");

-      if (!state->has_420pack() && $2.flags.q.precise)
+      if (!state->has_420pack_or_es31() && $2.flags.q.precise)
         _mesa_glsl_error(&@1, state,
                          "\"invariant\" must come after \"precise\"");

@@ -1747,7 +1747,7 @@ type_qualifier:
      if ($2.has_interpolation())
         _mesa_glsl_error(&@1, state, "duplicate interpolation qualifier");

-      if (!state->has_420pack() &&
+      if (!state->has_420pack_or_es31() &&
          ($2.flags.q.precise || $2.flags.q.invariant)) {
         _mesa_glsl_error(&@1, state, "interpolation qualifiers must come "
                          "after \"precise\" or \"invariant\"");
@@ -1767,7 +1767,7 @@ type_qualifier:
       * precise qualifiers since these are useful in ARB_separate_shader_objects.
       * There is no clear spec guidance on this either.
       */
-      if (!state->has_420pack() && $2.has_layout())
+      if (!state->has_420pack_or_es31() && $2.has_layout())
         _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");

      $$ = $1;
@@ -1785,7 +1785,7 @@ type_qualifier:
                          "duplicate auxiliary storage qualifier (centroid or sample)");
      }

-      if (!state->has_420pack() &&
+      if (!state->has_420pack_or_es31() &&
          ($2.flags.q.precise || $2.flags.q.invariant ||
           $2.has_interpolation() || $2.has_layout())) {
         _mesa_glsl_error(&@1, state, "auxiliary storage qualifiers must come "
@@ -1803,7 +1803,7 @@ type_qualifier:
      if ($2.has_storage())
         _mesa_glsl_error(&@1, state, "duplicate storage qualifier");

-      if (!state->has_420pack() &&
+      if (!state->has_420pack_or_es31() &&
          ($2.flags.q.precise || $2.flags.q.invariant || $2.has_interpolation() ||
           $2.has_layout() || $2.has_auxiliary_storage())) {
         _mesa_glsl_error(&@1, state, "storage qualifiers must come after "
@@ -1819,7 +1819,7 @@ type_qualifier:
      if ($2.precision != ast_precision_none)
         _mesa_glsl_error(&@1, state, "duplicate precision qualifier");

-      if (!(state->has_420pack() || state->is_version(420, 310)) &&
+      if (!(state->has_420pack_or_es31()) &&
          $2.flags.i != 0)
         _mesa_glsl_error(&@1, state, "precision qualifiers must come last");

@@ -2575,7 +2575,7 @@ interface_block:
   {
      ast_interface_block *block = (ast_interface_block *) $2;

-      if (!state->has_420pack() && block->layout.has_layout() &&
+      if (!state->has_420pack_or_es31() && block->layout.has_layout() &&
          !block->layout.is_default_qualifier) {
         _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
         YYERROR;
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -477,7 +477,7 @@ _mesa_glsl_msg(const YYLTYPE *locp, _mesa_glsl_parse_state *state,
   struct gl_context *ctx = state->ctx;

   /* Report the error via GL_ARB_debug_output. */
-   _mesa_shader_debug(ctx, type, &msg_id, msg, strlen(msg));
+   _mesa_shader_debug(ctx, type, &msg_id, msg);

   ralloc_strcat(&state->info_log, "\n");
 }
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -255,6 +255,11 @@ struct _mesa_glsl_parse_state {
      return ARB_shading_language_420pack_enable || is_version(420, 0);
   }

+   bool has_420pack_or_es31() const
+   {
+      return ARB_shading_language_420pack_enable || is_version(420, 310);
+   }
+
   bool has_compute_shader() const
   {
      return ARB_compute_shader_enable || is_version(430, 310);
--- a/src/glsl/hir_field_selection.cpp
+++ b/src/glsl/hir_field_selection.cpp
@@ -57,8 +57,7 @@ _mesa_ast_field_selection_to_hir(const ast_expression *expr,
 			  expr->primary_expression.identifier);
      }
   } else if (op->type->is_vector() ||
-              (state->ARB_shading_language_420pack_enable &&
-               op->type->is_scalar())) {
+              (state->has_420pack() && op->type->is_scalar())) {
      ir_swizzle *swiz = ir_swizzle::create(op,
 					    expr->primary_expression.identifier,
 					    op->type->vector_elements);
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1669,6 +1669,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
   this->data.pixel_center_integer = false;
   this->data.depth_layout = ir_depth_layout_none;
   this->data.used = false;
+   this->data.always_active_io = false;
   this->data.read_only = false;
   this->data.centroid = false;
   this->data.sample = false;
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -658,6 +658,13 @@ public:
       */
      unsigned assigned:1;

+      /**
+       * When separate shader programs are enabled, only input/outputs between
+       * the stages of a multi-stage separate program can be safely removed
+       * from the shader interface. Other input/outputs must remains active.
+       */
+      unsigned always_active_io:1;
+
      /**
       * Enum indicating how the variable was declared.  See
       * ir_var_declaration_type.
--- a/Show More
+++ b/Show More