docs: add release notes for 11.1.2

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Update version to 11.1.2
2016-02-11 00:03:22 +00:00 · 2016-02-10 23:59:40 +00:00 · 2016-02-04 12:35:13 +00:00 · 2016-02-04 10:06:27 +00:00 · 2016-02-04 10:06:27 +00:00 · 2016-02-04 10:06:27 +00:00
229 changed files with 5458 additions and 1443 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-11.1.0-devel
+11.1.2
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,9 @@
+# As per Marek http://lists.freedesktop.org/archives/mesa-stable/2015-December/003600.html
+37208c4fd7b1ec679d10992b42a2811cab8245a5 Revert "radeonsi: disable DCC on Stoney"
+
+# causes regression in xwayland, kde/plasma, mpv, steam ... fdo#92759
+839793680f99b8387bee9489733d5071c10f3ace i965: Use MESA_FORMAT_B8G8R8X8_SRGB for RGB visuals
+
+# .num_slices isn't available in 11.1
+f2c891353609b48459f27f205407d42823dd7d03 Add missing platform information for KBL
+
--- a/configure.ac
+++ b/configure.ac
@@ -98,7 +98,7 @@ AC_PROG_CXX
 AM_PROG_CC_C_O
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
-AC_CHECK_PROGS([PYTHON2], [python2 python])
+AC_CHECK_PROGS([PYTHON2], [python2.7 python2 python])
 AC_PROG_SED
 AC_PROG_MKDIR_P

@@ -376,10 +376,11 @@ save_CFLAGS="$CFLAGS"
 CFLAGS="$SSE41_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #include <smmintrin.h>
+int param;
 int main () {
-    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
    c = _mm_max_epu32(a, b);
-    return 0;
+    return _mm_cvtsi128_si32(c);
 }]])], SSE41_SUPPORTED=1)
 CFLAGS="$save_CFLAGS"
 if test "x$SSE41_SUPPORTED" = x1; then
@@ -767,6 +768,11 @@ linux*)
    dri3_default=no
    ;;
 esac
+
+if test "x$enable_dri" = xno; then
+    dri3_default=no
+fi
+
 AC_ARG_ENABLE([dri3],
    [AS_HELP_STRING([--enable-dri3],
        [enable DRI3 @<:@default=auto@:>@])],
@@ -1700,7 +1706,15 @@ AC_ARG_WITH([clang-libdir],
   [CLANG_LIBDIR=''])

 PKG_CHECK_EXISTS([libclc], [have_libclc=yes], [have_libclc=no])
-AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;ELF_LIB=-lelf])
+PKG_CHECK_MODULES([LIBELF], [libelf], [have_libelf=yes], [have_libelf=no])
+
+if test "x$have_libelf" = xno; then
+   LIBELF_LIBS=''
+   LIBELF_CFLAGS=''
+   AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;LIBELF_LIBS=-lelf], [have_libelf=no])
+   AC_SUBST([LIBELF_LIBS])
+   AC_SUBST([LIBELF_CFLAGS])
+fi

 if test "x$enable_opencl" = xyes; then
    if test -z "$with_gallium_drivers"; then
@@ -2173,7 +2187,9 @@ if test -n "$with_gallium_drivers"; then
            gallium_require_drm_loader

            PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
-                              [USE_VC4_SIMULATOR=yes], [USE_VC4_SIMULATOR=no])
+                              [USE_VC4_SIMULATOR=yes;
+                               DEFINES="$DEFINES -DUSE_VC4_SIMULATOR"],
+                              [USE_VC4_SIMULATOR=no])
            ;;
        xvirgl)
            HAVE_GALLIUM_VIRGL=yes
@@ -2285,8 +2301,6 @@ if test "x$USE_VC4_SIMULATOR" = xyes -a "x$HAVE_GALLIUM_ILO" = xyes; then
    AC_MSG_ERROR([VC4 simulator on x86 replaces i965 driver build, so ilo must be disabled.])
 fi

-AC_SUBST([ELF_LIB])
-
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -238,6 +238,12 @@ for details.
 </ul>


+<h3>VA-API state tracker environment variables</h3>
+<ul>
+<li>VAAPI_MPEG4_ENABLED - enable MPEG4 for VA-API, disabled by default.
+</ul>
+
+
 <p>
 Other Gallium drivers have their own environment variables.  These may change
 frequently so the source code should be consulted for details.
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">

-<h1>Mesa 11.1.0 Release Notes / TBD</h1>
+<h1>Mesa 11.1.0 Release Notes / 15 December 2015</h1>

 <p>
 Mesa 11.1.0 is a new development release.
@@ -33,7 +33,8 @@ because compatibility contexts are not supported.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD.
+e3bc44be4df5e4dc728dfda7b55b1aaeadfce36eca6a367b76cc07598070cb2d  mesa-11.1.0.tar.gz
+9befe03b04223eb1ede177fa8cac001e2850292c8c12a3ec9929106afad9cf1f  mesa-11.1.0.tar.xz
 </pre>


@@ -51,14 +52,20 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_arrays_of_arrays on i965</li>
 <li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
 <li>GL_ARB_clear_texture on nv50, nvc0</li>
+<li>GL_ARB_clip_control on freedreno/a4xx</li>
 <li>GL_ARB_copy_image on nv50, nvc0, radeonsi</li>
+<li>GL_ARB_depth_clamp on freedreno/a4xx</li>
+<li>GL_ARB_fragment_layer_viewport on i965 (gen6+)</li>
 <li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
 <li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
+<li>GL_ARB_seamless_cubemap_per_texture on freedreno/a4xx</li>
 <li>GL_ARB_shader_clock on i965 (gen7+)</li>
 <li>GL_ARB_shader_stencil_export on i965 (gen9+)</li>
 <li>GL_ARB_shader_storage_buffer_object on i965</li>
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
+<li>GL_ARB_texture_buffer_range on freedreno/a3xx</li>
+<li>GL_ARB_texture_compression_bptc on freedreno/a4xx</li>
 <li>GL_ARB_texture_query_lod on softpipe</li>
 <li>GL_ARB_texture_view on radeonsi and r600 (for evergeen and newer)</li>
 <li>GL_ARB_vertex_type_2_10_10_10_rev on freedreno (a3xx, a4xx)</li>
@@ -78,11 +85,196 @@ Note: some of the new features are only available with certain drivers.

 <h2>Bug fixes</h2>

-TBD.
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28130">Bug 28130</a> - vbo: premature flushing breaks GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=38109">Bug 38109</a> - i915 driver crashes if too few vertices are submitted (Mesa 7.10.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=49779">Bug 49779</a> - Extra line segments in GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55552">Bug 55552</a> - Compile errors with --enable-mangling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71789">Bug 71789</a> - [r300g] Visuals not found in (default) depth = 24</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=79783">Bug 79783</a> - Distorted output in obs-studio where other vendors &quot;work&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=80821">Bug 80821</a> - When LIBGL_ALWAYS_SOFTWARE is set, KHR_create_context is not supported</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=81174">Bug 81174</a> - Gallium: GL_LINE_LOOP broken with more than 512 points</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=83508">Bug 83508</a> - [UBO] Assertion for array of blocks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84677">Bug 84677</a> - Triangle disappears with glPolygonMode GL_LINE</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86469">Bug 86469</a> - Unreal Engine demo doesn't run</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89014">Bug 89014</a> - PIPE_QUERY_GPU_FINISHED is not acting as expected on SI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90175">Bug 90175</a> - [hsw bisected][PATCH] atomic counters doesn't work for a binding point different to zero</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90348">Bug 90348</a> - Spilling failure of b96 merged value</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90631">Bug 90631</a> - Compilation failure for fragment shader with many branches on Sandy Bridge</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90734">Bug 90734</a> - glBufferSubData is corrupting data when buffer is &gt; 32k</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90887">Bug 90887</a> - PhiMovesPass in register allocator broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91044">Bug 91044</a> - piglit spec/egl_khr_create_context/valid debug flag gles* fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91114">Bug 91114</a> - ES3-CTS.gtf.GL3Tests.shadow.shadow_execution_vert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91254">Bug 91254</a> - (regresion) video using VA-API on Intel slow and freeze system with mesa 10.6 or 10.6.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91292">Bug 91292</a> - [BDW+] glVertexAttribDivisor not working in combination with glPolygonMode</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91342">Bug 91342</a> - Very dark textures on some objects in indoors environments in Postal 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91526">Bug 91526</a> - World of Warcraft (on Wine) has UI corruption with nouveau</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91551">Bug 91551</a> - DXTn compressed normal maps produce severe artifacts on all NV5x and NVDx chipsets</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91716">Bug 91716</a> - [bisected] piglit.shaders.glsl-vs-int-attrib regresses on 32 bit BYT, HSW, IVB, SNB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91718">Bug 91718</a> - piglit.spec.arb_shader_image_load_store.invalid causes intermittent GPU HANG</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91719">Bug 91719</a> - [SNB,HSW,BYT] dEQP regressions associated with using NIR for vertex shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91726">Bug 91726</a> - R600 asserts in tgsi_cmp/make_src_for_op3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91780">Bug 91780</a> - Rendering issues with geometry shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91785">Bug 91785</a> - make check DispatchSanity_test.GLES31 regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91847">Bug 91847</a> - glGenerateTextureMipmap not working (no errors) unless glActiveTexture(GL_TEXTURE1) is called before</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91857">Bug 91857</a> - Mesa 10.6.3 linker is slow</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91881">Bug 91881</a> - regression: GPU lockups since mesa-11.0.0_rc1 on RV620 (r600) driver</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91890">Bug 91890</a> - [nve7] witcher2: blurry image &amp; DATA_ERRORs (class 0xa097 mthd 0x2380/0x238c)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91898">Bug 91898</a> - src/util/mesa-sha1.c:250:25: fatal error: openssl/sha.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91927">Bug 91927</a> - [SKL] [regression] piglit compressed textures tests fail  with kernel upgrade</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91930">Bug 91930</a> - Program with GtkGLArea widget does not redraw</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91970">Bug 91970</a> - [BSW regression] dEQP-GLES3.functional.shaders.precision.int.highp_mul_vertex</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91985">Bug 91985</a> - [regression, bisected] FTBFS with commit f9caabe8f1: R600_UCP_CONST_BUFFER is undefined</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92009">Bug 92009</a> - ES3-CTS.gtf.GL3Tests.packed_pixels.packed_pixels fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92033">Bug 92033</a> - [SNB,regression,dEQP,bisected] functional.shaders.random tests regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92052">Bug 92052</a> - nir/nir_builder.h:79: error: expected primary-expression before ‘.’ token</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92054">Bug 92054</a> - make check gbm-symbols-check regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92066">Bug 92066</a> - [ILK,G45,regression] New assertion on BRW_MAX_MRF breaks ilk and g45</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92072">Bug 92072</a> - Wine breakage since d082c5324 (st/mesa: don't call st_validate_state in BlitFramebuffer)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92095">Bug 92095</a> - [Regression, bisected] arb_shader_atomic_counters.compiler.builtins.frag</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92122">Bug 92122</a> - [bisected, cts] Regression with Assault Android Cactus</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92124">Bug 92124</a> - shader_query.cpp:841:34: error: ‘strndup’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92183">Bug 92183</a> - linker.cpp:3187:46: error: ‘strtok_r’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92193">Bug 92193</a> - [SKL] ES2-CTS.gtf.GL2ExtensionTests.compressed_astc_texture.compressed_astc_texture fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92221">Bug 92221</a> - Unintended code changes in _mesa_base_tex_format commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92265">Bug 92265</a> - Black windows in weston after update mesa to 11.0.2-1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92363">Bug 92363</a> - [BSW/BDW] ogles1conform Gets test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92438">Bug 92438</a> - Segfault in pushbuf_kref when running the android emulator (qemu) on nv50</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92588">Bug 92588</a> - [HSW,BDW,BSW,SKL-Y][GLES 3.1 CTS] ES31-CTS.arrays_of_arrays.InteractionFunctionCalls2 - assert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92621">Bug 92621</a> - [G965 ILK G45] Regression: 24 piglit regressions in glsl-1.10</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92634">Bug 92634</a> - gallium's vl_mpeg12_decoder does not work with st/va</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92639">Bug 92639</a> - [Regression bisected] Ogles1conform mustpass.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92641">Bug 92641</a> - [SKL BSW] [Regression] Ogles1conform userclip.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92645">Bug 92645</a> - kodi vdpau interop fails since  mesa,meta: move gl_texture_object::TargetIndex initializations</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92705">Bug 92705</a> - [clover] fail to build with llvm-svn/clang-svn 3.8</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92709">Bug 92709</a> - &quot;LLVM triggered Diagnostic Handler: unsupported call to function ldexpf in main&quot; when starting race in stuntrally</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92738">Bug 92738</a> - Randon R7 240 doesn't work on 16KiB page size platform</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92744">Bug 92744</a> - [g965 Regression bisected] Performance regression and piglit assertions due to liveness analysis</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92770">Bug 92770</a> - [SNB, regression, dEQP] deqp-gles3.functional.shaders.discard.dynamic_loop_texture</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92824">Bug 92824</a> - [regression, bisected] `make check` dispatch-sanity broken by GL_EXT_buffer_storage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92859">Bug 92859</a> - [regression, bisected] validate_intrinsic_instr: Assertion triggered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92860">Bug 92860</a> - [radeonsi][bisected] st/mesa: implement ARB_copy_image - Corruption in ARK Survival Evolved</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92900">Bug 92900</a> - [regression bisected] About 700 piglit regressions is what could go wrong</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92985">Bug 92985</a> - Mac OS X build error &quot;ar: no archive members specified&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93015">Bug 93015</a> - Tonga Elemental segfault + VM faults since  radeon: implement r600_query_hw_get_result via function pointers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93048">Bug 93048</a> - [CTS regression] mesa af2723 breaks GL Conformance for debug extension</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93063">Bug 93063</a> - drm_helper.h:227:1: error: static declaration of ‘pipe_virgl_create_screen’ follows non-static declaration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93091">Bug 93091</a> - [opencl] segfault when running any opencl programs (like clinfo)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93126">Bug 93126</a> - wrongly claim supporting GL_EXT_texture_rg</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93180">Bug 93180</a> - [regression] arb_separate_shader_objects.active sampler conflict fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93235">Bug 93235</a> - [regression] dispatch sanity broken by GetPointerv</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+

 <h2>Changes</h2>

-TBD.
+<li>MPEG4 decoding has been disabled by default in the VAAPI driver</li>

 </div>
 </body>
--- a/docs/relnotes/11.1.1.html
+++ b/docs/relnotes/11.1.1.html
@@ -0,0 +1,197 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.1.1 Release Notes / January 13, 2016</h1>
+
+<p>
+Mesa 11.1.1 is a bug fix release which fixes bugs found since the 11.1.0 release.
+</p>
+<p>
+Mesa 11.1.1 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+b15089817540ba0bffd0aad323ecf3a8ff6779568451827c7274890b4a269d58  mesa-11.1.1.tar.gz
+64db074fc514136b5fb3890111f0d50604db52f0b1e94ba3fcb0fe8668a7fd20  mesa-11.1.1.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92229">Bug 92229</a> - [APITRACE] SOMA have serious graphical errors</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92233">Bug 92233</a> - Unigine Heaven 4.0 silhuette run</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93257">Bug 93257</a> - [SKL, bisected] ASTC dEQP tests segfault</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>st/mesa: check state-&gt;mesa in early return check in st_validate_state()</li>
+</ul>
+
+<p>Dave Airlie (6):</p>
+<ul>
+  <li>mesa/varray: set double arrays to non-normalised.</li>
+  <li>mesa/shader: return correct attribute location for double matrix arrays</li>
+  <li>glsl: pass stage into mark function</li>
+  <li>glsl/fp64: add helper for dual slot double detection.</li>
+  <li>glsl: fix count_attribute_slots to allow for different 64-bit handling</li>
+  <li>glsl: only update doubles inputs for vertex inputs.</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.1</li>
+  <li>cherry-ignore: drop the "re-enable" DCC on Stoney</li>
+  <li>cherry-ignore: don't pick a specific i965 formats patch</li>
+  <li>Update version to 11.1.1</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>vc4: Warn instead of abort()ing on exec ioctl failures.</li>
+  <li>vc4: Keep sample mask writes from being reordered after TLB writes</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>r600: fix constant buffer size programming</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
+</ul>
+
+<p>Ilia Mirkin (9):</p>
+<ul>
+  <li>nv50/ir: can't have predication and immediates</li>
+  <li>gk104/ir: simplify and fool-proof texbar algorithm</li>
+  <li>glsl: assign varying locations to tess shaders when doing SSO</li>
+  <li>glx/dri3: a drawable might not be bound at wait time</li>
+  <li>nvc0: don't forget to reset VTX_TMP bufctx slot after blit completion</li>
+  <li>nv50/ir: float(s32 &amp; 0xff) = float(u8), not s8</li>
+  <li>nv50,nvc0: make sure there's pushbuf space and that we ref the bo early</li>
+  <li>nv50,nvc0: fix crash when increasing bsp bo size for h264</li>
+  <li>nvc0: scale up inter_bo size so that it's 16M for a 4K video</li>
+</ul>
+
+<p>Jonathan Gray (2):</p>
+<ul>
+  <li>configure.ac: use pkg-config for libelf</li>
+  <li>configure: check for python2.7 for PYTHON2</li>
+</ul>
+
+<p>Kenneth Graunke (5):</p>
+<ul>
+  <li>ralloc: Fix ralloc_adopt() to the old context's last child's parent.</li>
+  <li>drirc: Disable ARB_blend_func_extended for Heaven 4.0/Valley 1.0.</li>
+  <li>glsl: Fix varying struct locations when varying packing is disabled.</li>
+  <li>nvc0: Set winding order regardless of domain.</li>
+  <li>nir: Add a lower_fdiv option, turn fdiv into fmul/frcp.</li>
+</ul>
+
+<p>Marek Olšák (7):</p>
+<ul>
+  <li>tgsi/scan: add flag colors_written</li>
+  <li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
+  <li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
+  <li>radeonsi: apply the streamout workaround to Fiji as well</li>
+  <li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
+  <li>program: add _mesa_reserve_parameter_storage</li>
+  <li>st/mesa: fix GLSL uniform updates for glBitmap &amp; glDrawPixels (v2)</li>
+</ul>
+
+<p>Mark Janes (1):</p>
+<ul>
+  <li>Add missing platform information for KBL</li>
+</ul>
+
+<p>Miklós Máté (1):</p>
+<ul>
+  <li>mesa: Don't leak ATIfs instructions in DeleteFragmentShader</li>
+</ul>
+
+<p>Neil Roberts (3):</p>
+<ul>
+  <li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
+  <li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
+  <li>i965: Fix crash when calling glViewport with no surface bound</li>
+</ul>
+
+<p>Nicolai Hähnle (2):</p>
+<ul>
+  <li>gallium/radeon: only dispose locally created target machine in radeon_llvm_compile</li>
+  <li>gallium/radeon: fix regression in a number of driver queries</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>configura.ac: fix test for SSE4.1 assembler support</li>
+</ul>
+
+<p>Patrick Rudolph (2):</p>
+<ul>
+  <li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
+  <li>gallium/util: return correct number of bound vertex buffers</li>
+</ul>
+
+<p>Rob Herring (1):</p>
+<ul>
+  <li>freedreno/ir3: fix 32-bit builds with pointer-to-int-cast error enabled</li>
+</ul>
+
+<p>Samuel Pitoiset (3):</p>
+<ul>
+  <li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
+  <li>nv50,nvc0: free memory allocated by performance metrics</li>
+  <li>nv50: free memory allocated by the prog which reads MP perf counters</li>
+</ul>
+
+<p>Sarah Sharp (1):</p>
+<ul>
+  <li>mesa: Add KBL PCI IDs and platform information.</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/11.1.2.html
+++ b/docs/relnotes/11.1.2.html
@@ -0,0 +1,181 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.1.2 Release Notes / February 10, 2016</h1>
+
+<p>
+Mesa 11.1.2 is a bug fix release which fixes bugs found since the 11.1.1 release.
+</p>
+<p>
+Mesa 11.1.2 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93628">Bug 93628</a> - Exception: attempt to use unavailable module DRM when building MesaGL 11.1.0 on windows</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93648">Bug 93648</a> - Random lines being rendered when playing Dolphin (geometry shaders related, w/ apitrace)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93650">Bug 93650</a> - GL_ARB_separate_shader_objects is buggy (PCSX2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93717">Bug 93717</a> - Meta mipmap generation can corrupt texture state</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93722">Bug 93722</a> - Segfault when compiling shader with a subroutine that takes a parameter</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93731">Bug 93731</a> - glUniformSubroutinesuiv segfaults when subroutine uniform is bound to a specific location</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93761">Bug 93761</a> - A conditional discard in a fragment shader causes no depth writing at all</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/bxt: Fix conservative wm thread counts.</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>glsl: fix subroutine lowering reusing actual parmaters</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.1.1</li>
+  <li>cherry-ignore: drop the i965/kbl .num_slices patch</li>
+  <li>i915: correctly parse/set the context flags</li>
+  <li>targets/dri: android: use WHOLE static libraries</li>
+  <li>egl/dri2: expose srgb configs when KHR_gl_colorspace is available</li>
+  <li>Update version to 11.1.2</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>vc4: Don't record the seqno of a failed job submit.</li>
+  <li>vc4: Throttle outstanding rendering after submission.</li>
+</ul>
+
+<p>François Tigeot (1):</p>
+<ul>
+  <li>gallium: Add DragonFly support</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>r600g: don't leak driver const buffers</li>
+</ul>
+
+<p>Ian Romanick (2):</p>
+<ul>
+  <li>meta/blit: Restore GL_DEPTH_STENCIL_TEXTURE_MODE state for GL_TEXTURE_RECTANGLE</li>
+  <li>meta: Use internal functions to set texture parameters</li>
+</ul>
+
+<p>Ilia Mirkin (6):</p>
+<ul>
+  <li>st/mesa: use surface format to generate mipmaps when available</li>
+  <li>glsl: always compute proper varying type, irrespective of varying packing</li>
+  <li>nvc0: avoid crashing when there are holes in vertex array bindings</li>
+  <li>nv50,nvc0: fix buffer clearing to respect engine alignment requirements</li>
+  <li>nv50/ir: fix false global CSE on instructions with multiple defs</li>
+  <li>st/mesa: treat a write as a read for range purposes</li>
+</ul>
+
+<p>Jason Ekstrand (3):</p>
+<ul>
+  <li>i965/vec4: Use UW type for multiply into accumulator on GEN8+</li>
+  <li>i965/fs/generator: Take an actual shader stage rather than a string</li>
+  <li>i965/fs: Always set channel 2 of texture headers in some stages</li>
+</ul>
+
+<p>Jose Fonseca (2):</p>
+<ul>
+  <li>scons: Conditionally use DRM module on pipe-loader.</li>
+  <li>pipe-loader: Fix PATH_MAX define on MSVC.</li>
+</ul>
+
+<p>Karol Herbst (1):</p>
+<ul>
+  <li>nv50/ir: fix memory corruption when spilling and redoing RA</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>glsl: Make bitfield_insert/extract and bfi/bfm non-vectorizable.</li>
+  <li>glsl: Allow implicit int -&gt; uint conversions for bitwise operators (&amp;, ^, |).</li>
+</ul>
+
+<p>Leo Liu (2):</p>
+<ul>
+  <li>vl: add zig zag scan for list 4x4</li>
+  <li>st/omx/dec/h264: fix corruption when scaling matrix present flag set</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>radeonsi: don't miss changes to SPI_TMPRING_SIZE</li>
+</ul>
+
+<p>Nicolai Hähnle (11):</p>
+<ul>
+  <li>mesa/bufferobj: make _mesa_delete_buffer_object externally accessible</li>
+  <li>st/mesa: use _mesa_delete_buffer_object</li>
+  <li>radeon: use _mesa_delete_buffer_object</li>
+  <li>i915: use _mesa_delete_buffer_object</li>
+  <li>i965: use _mesa_delete_buffer_object</li>
+  <li>util/u_pstipple.c: copy immediates during transformation</li>
+  <li>radeonsi: extract the VGT_GS_MODE calculation into its own function</li>
+  <li>radeonsi: ensure that VGT_GS_MODE is sent when necessary</li>
+  <li>radeonsi: add DCC buffer for sampler views on new CS</li>
+  <li>st/mesa: use the correct address generation functions in st_TexSubImage blit</li>
+  <li>radeonsi: fix discard-only fragment shaders (11.1 version)</li>
+</ul>
+
+<p>Timothy Arceri (4):</p>
+<ul>
+  <li>glsl: fix segfault linking subroutine uniform with explicit location</li>
+  <li>mesa: fix segfault in glUniformSubroutinesuiv()</li>
+  <li>glsl: fix interface block error message</li>
+  <li>glsl: create helper to remove outer vertex index array used by some stages</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/include/GLES2/gl2ext.h
+++ b/include/GLES2/gl2ext.h
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -132,6 +132,28 @@ CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x5902, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5906, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590A, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590B, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x590E, kbl_gt1, "Intel(R) Kabylake GT1")
+CHIPSET(0x5913, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5915, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5917, kbl_gt1_5, "Intel(R) Kabylake GT1.5")
+CHIPSET(0x5912, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5916, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591A, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591B, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591D, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x591E, kbl_gt2, "Intel(R) Kabylake GT2")
+CHIPSET(0x5921, kbl_gt2, "Intel(R) Kabylake GT2F")
+CHIPSET(0x5926, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592A, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x592B, kbl_gt3, "Intel(R) Kabylake GT3")
+CHIPSET(0x5932, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593A, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x593D, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -235,6 +235,8 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,

      case __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE:
         srgb = value != 0;
+         if (!disp->Extensions.KHR_gl_colorspace && srgb)
+            return NULL;
         break;

      default:
--- a/src/gallium/auxiliary/pipe-loader/Makefile.am
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -37,12 +37,12 @@ libpipe_loader_static_la_SOURCES += \
 libpipe_loader_dynamic_la_SOURCES += \
 	$(DRM_SOURCES)

+endif
+
 libpipe_loader_static_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la

 libpipe_loader_dynamic_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la

-endif
-
 EXTRA_DIST = SConscript
--- a/src/gallium/auxiliary/pipe-loader/SConscript
+++ b/src/gallium/auxiliary/pipe-loader/SConscript
@@ -17,12 +17,11 @@ env.Append(CPPDEFINES = [

 source = env.ParseSourceList('Makefile.sources', 'COMMON_SOURCES')

-#if HAVE_LIBDRM
-source += env.ParseSourceList('Makefile.sources', 'DRM_SOURCES')
+if env['HAVE_DRM']:
+    source += env.ParseSourceList('Makefile.sources', 'DRM_SOURCES')

-env.PkgUseModules('DRM')
-env.Append(LIBS = [libloader])
-#endif
+    env.PkgUseModules('DRM')
+    env.Append(LIBS = [libloader])

 pipe_loader = env.ConvenienceLibrary(
    target = 'pipe_loader',
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
@@ -32,6 +32,11 @@
 #include "util/u_string.h"
 #include "util/u_dl.h"

+#ifdef _MSC_VER
+#include <stdlib.h>
+#define PATH_MAX _MAX_PATH
+#endif
+
 #define MODULE_PREFIX "pipe_"

 static int (*backends[])(struct pipe_loader_device **, int) = {
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -94,6 +94,18 @@ static const struct drm_driver_descriptor driver_descriptors[] = {
        .create_screen = pipe_i915_create_screen,
        .configuration = configuration_query,
    },
+#ifdef USE_VC4_SIMULATOR
+    /* VC4 simulator and ILO (i965) are mutually exclusive (error at
+     * configure). As the latter is unconditionally added, keep this one above
+     * it.
+     */
+    {
+        .name = "i965",
+        .driver_name = "vc4",
+        .create_screen = pipe_vc4_create_screen,
+        .configuration = configuration_query,
+    },
+#endif
    {
        .name = "i965",
        .driver_name = "i915",
@@ -154,14 +166,6 @@ static const struct drm_driver_descriptor driver_descriptors[] = {
        .create_screen = pipe_vc4_create_screen,
        .configuration = configuration_query,
    },
-#ifdef USE_VC4_SIMULATOR
-    {
-        .name = "i965",
-        .driver_name = "vc4",
-        .create_screen = pipe_vc4_create_screen,
-        .configuration = configuration_query,
-    },
-#endif
 };
 #endif

--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -33,9 +33,10 @@
 #include "sw/kms-dri/kms_dri_sw_winsys.h"
 #include "sw/null/null_sw_winsys.h"
 #include "sw/wrapper/wrapper_sw_winsys.h"
-#include "target-helpers/inline_sw_helper.h"
+#include "target-helpers/sw_helper_public.h"
 #include "state_tracker/drisw_api.h"
 #include "state_tracker/sw_driver.h"
+#include "state_tracker/sw_winsys.h"

 struct pipe_loader_sw_device {
   struct pipe_loader_device base;
@@ -136,7 +137,7 @@ pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, struct drisw_loader_f
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "dri") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys(drisw_lf);
         break;
@@ -168,7 +169,7 @@ pipe_loader_sw_probe_kms(struct pipe_loader_device **devs, int fd)
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "kms_dri") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys(fd);
         break;
@@ -199,7 +200,7 @@ pipe_loader_sw_probe_null(struct pipe_loader_device **devs)
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "null") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys();
         break;
@@ -222,7 +223,7 @@ pipe_loader_sw_probe(struct pipe_loader_device **devs, int ndev)
 {
   int i = 1;

-   if (i < ndev) {
+   if (i <= ndev) {
      if (!pipe_loader_sw_probe_null(devs)) {
         i--;
      }
@@ -244,7 +245,7 @@ pipe_loader_sw_probe_wrapped(struct pipe_loader_device **dev,
   if (!pipe_loader_sw_probe_init_common(sdev))
      goto fail;

-   for (i = 0; sdev->dd->winsys; i++) {
+   for (i = 0; sdev->dd->winsys[i].name; i++) {
      if (strcmp(sdev->dd->winsys[i].name, "wrapped") == 0) {
         sdev->ws = sdev->dd->winsys[i].create_winsys(screen);
         break;
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -223,7 +223,7 @@ pipe_freedreno_create_screen(int fd)
 #include "virgl/drm/virgl_drm_public.h"
 #include "virgl/virgl_public.h"

-static struct pipe_screen *
+struct pipe_screen *
 pipe_virgl_create_screen(int fd)
 {
   struct virgl_winsys *vws;
--- a/src/gallium/auxiliary/target-helpers/sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/sw_helper.h
@@ -0,0 +1,73 @@
+
+#ifndef SW_HELPER_H
+#define SW_HELPER_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "target-helpers/sw_helper_public.h"
+#include "state_tracker/sw_winsys.h"
+
+
+/* Helper function to choose and instantiate one of the software rasterizers:
+ * llvmpipe, softpipe.
+ */
+
+#ifdef GALLIUM_SOFTPIPE
+#include "softpipe/sp_public.h"
+#endif
+
+#ifdef GALLIUM_LLVMPIPE
+#include "llvmpipe/lp_public.h"
+#endif
+
+#ifdef GALLIUM_VIRGL
+#include "virgl/virgl_public.h"
+#include "virgl/vtest/virgl_vtest_public.h"
+#endif
+
+static inline struct pipe_screen *
+sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
+{
+   struct pipe_screen *screen = NULL;
+
+#if defined(GALLIUM_LLVMPIPE)
+   if (screen == NULL && strcmp(driver, "llvmpipe") == 0)
+      screen = llvmpipe_create_screen(winsys);
+#endif
+
+#if defined(GALLIUM_VIRGL)
+   if (screen == NULL && strcmp(driver, "virpipe") == 0) {
+      struct virgl_winsys *vws;
+      vws = virgl_vtest_winsys_wrap(winsys);
+      screen = virgl_create_screen(vws);
+   }
+#endif
+
+#if defined(GALLIUM_SOFTPIPE)
+   if (screen == NULL)
+      screen = softpipe_create_screen(winsys);
+#endif
+
+   return screen;
+}
+
+
+struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   const char *default_driver;
+   const char *driver;
+
+#if defined(GALLIUM_LLVMPIPE)
+   default_driver = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   default_driver = "softpipe";
+#else
+   default_driver = "";
+#endif
+
+   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
+   return sw_screen_create_named(winsys, driver);
+}
+
+#endif
--- a/src/gallium/auxiliary/target-helpers/sw_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/sw_helper_public.h
@@ -0,0 +1,10 @@
+#ifndef _SW_HELPER_PUBLIC_H
+#define _SW_HELPER_PUBLIC_H
+
+struct pipe_screen;
+struct sw_winsys;
+
+struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys);
+
+#endif /* _SW_HELPER_PUBLIC_H */
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -365,6 +365,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                  info->output_semantic_index[reg] = (ubyte) semIndex;
                  info->num_outputs++;

+                  if (semName == TGSI_SEMANTIC_COLOR)
+                     info->colors_written |= 1 << semIndex;
+
                  if (procType == TGSI_PROCESSOR_VERTEX ||
                      procType == TGSI_PROCESSOR_GEOMETRY ||
                      procType == TGSI_PROCESSOR_TESS_CTRL ||
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -77,6 +77,7 @@ struct tgsi_shader_info

   uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */

+   ubyte colors_written;
   boolean reads_position; /**< does fragment shader read position? */
   boolean reads_z; /**< does fragment shader read depth? */
   boolean writes_z;  /**< does fragment shader write Z value? */
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -52,7 +52,7 @@
 #include <machine/cpu.h>
 #endif

-#if defined(PIPE_OS_FREEBSD)
+#if defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_DRAGONFLY)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
--- a/src/gallium/auxiliary/util/u_helpers.c
+++ b/src/gallium/auxiliary/util/u_helpers.c
@@ -81,7 +81,13 @@ void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
                                   const struct pipe_vertex_buffer *src,
                                   unsigned start_slot, unsigned count)
 {
-   uint32_t enabled_buffers = (1ull << *dst_count) - 1;
+   unsigned i;
+   uint32_t enabled_buffers = 0;
+
+   for (i = 0; i < *dst_count; i++) {
+      if (dst[i].buffer || dst[i].user_buffer)
+         enabled_buffers |= (1ull << i);
+   }

   util_set_vertex_buffers_mask(dst, &enabled_buffers, src, start_slot,
                                count);
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -229,6 +229,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx,
   struct pstip_transform_context *pctx =
      (struct pstip_transform_context *) ctx;
   pctx->numImmed++;
+   ctx->emit_immediate(ctx, immed);
 }


--- a/src/gallium/auxiliary/vl/vl_video_buffer.c
+++ b/src/gallium/auxiliary/vl/vl_video_buffer.c
@@ -115,7 +115,7 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format)
      return const_resource_formats_VUYA;

   case PIPE_FORMAT_R8G8B8X8_UNORM:
-      return const_resource_formats_VUYX;
+      return const_resource_formats_YUVX;

   case PIPE_FORMAT_B8G8R8X8_UNORM:
      return const_resource_formats_VUYX;
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -392,7 +392,7 @@ vl_dri2_screen_create(Display *display, int screen)
      goto free_connect;

   if (drmGetMagic(fd, &magic))
-      goto free_connect;
+      goto close_fd;

   authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn,
                                                         get_xcb_screen(s, screen)->root,
@@ -402,7 +402,7 @@ vl_dri2_screen_create(Display *display, int screen)
   if (authenticate == NULL || !authenticate->authenticated)
      goto free_authenticate;

-   if (pipe_loader_drm_probe_fd(&scrn->base.dev, dup(fd)))
+   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
      scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev);

   if (!scrn->base.pscreen)
@@ -428,8 +428,11 @@ vl_dri2_screen_create(Display *display, int screen)
 release_pipe:
   if (scrn->base.dev)
      pipe_loader_release(&scrn->base.dev, 1);
+   fd = -1;
 free_authenticate:
   free(authenticate);
+close_fd:
+   close(fd);
 free_connect:
   free(connect);
 free_query:
--- a/src/gallium/auxiliary/vl/vl_winsys_drm.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -41,12 +41,16 @@ struct vl_screen *
 vl_drm_screen_create(int fd)
 {
   struct vl_screen *vscreen;
+   int new_fd = -1;

   vscreen = CALLOC_STRUCT(vl_screen);
   if (!vscreen)
      return NULL;

-   if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd)))
+   if (fd < 0 || (new_fd = dup(fd)) < 0)
+      goto error;
+
+   if (pipe_loader_drm_probe_fd(&vscreen->dev, new_fd))
      vscreen->pscreen = pipe_loader_create_screen(vscreen->dev);

   if (!vscreen->pscreen)
@@ -63,6 +67,8 @@ vl_drm_screen_create(int fd)
 error:
   if (vscreen->dev)
      pipe_loader_release(&vscreen->dev, 1);
+   else
+      close(new_fd);

   FREE(vscreen);
   return NULL;
--- a/src/gallium/auxiliary/vl/vl_zscan.c
+++ b/src/gallium/auxiliary/vl/vl_zscan.c
@@ -49,6 +49,13 @@ enum VS_OUTPUT
   VS_O_VTEX = 0
 };

+const int vl_zscan_normal_16[] =
+{
+   /* Zig-Zag scan pattern */
+    0, 1, 4, 8, 5, 2, 3, 6,
+    9,12,13,10, 7,11,14,15
+};
+
 const int vl_zscan_linear[] =
 {
   /* Linear scan pattern */
--- a/src/gallium/auxiliary/vl/vl_zscan.h
+++ b/src/gallium/auxiliary/vl/vl_zscan.h
@@ -64,6 +64,7 @@ struct vl_zscan_buffer
   struct pipe_surface *dst;
 };

+extern const int vl_zscan_normal_16[];
 extern const int vl_zscan_linear[];
 extern const int vl_zscan_normal[];
 extern const int vl_zscan_alternate[];
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -627,7 +627,7 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_ENABLE_BLEND(uint32_t val)
 {
 	return ((val) << A4XX_RB_FS_OUTPUT_ENABLE_BLEND__SHIFT) & A4XX_RB_FS_OUTPUT_ENABLE_BLEND__MASK;
 }
-#define A4XX_RB_FS_OUTPUT_FAST_CLEAR				0x00000100
+#define A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND			0x00000100
 #define A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK			0xffff0000
 #define A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT			16
 static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
@@ -137,7 +137,8 @@ fd4_blend_state_create(struct pipe_context *pctx,
 			so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS);
 	}

-	so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend);
+	so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend) |
+		COND(cso->independent_blend_enable, A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND);

 	return so;
 }
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -194,7 +194,7 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			if (view->base.texture) {
 				struct fd_resource *rsc = fd_resource(view->base.texture);
 				uint32_t offset = fd_resource_offset(rsc, start, 0);
-				OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+				OUT_RELOC(ring, rsc->bo, offset, view->texconst4, 0);
 			} else {
 				OUT_RING(ring, 0x00000000);
 			}
@@ -497,11 +497,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RINGP(ring, val, &fd4_context(ctx)->rbrc_patches);
 	}

-	if (dirty & FD_DIRTY_ZSA) {
+	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) {
 		struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa);
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		uint32_t rb_alpha_control = zsa->rb_alpha_control;
+
+		if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])))
+			rb_alpha_control &= ~A4XX_RB_ALPHA_CONTROL_ALPHA_TEST;

 		OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
-		OUT_RING(ring, zsa->rb_alpha_control);
+		OUT_RING(ring, rb_alpha_control);

 		OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2);
 		OUT_RING(ring, zsa->rb_stencil_control);
@@ -628,10 +633,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 			enum pipe_format format = pipe_surface_format(
 					ctx->framebuffer.cbufs[i]);
+			bool is_int = util_format_is_pure_integer(format);
 			bool has_alpha = util_format_has_alpha(format);
 			uint32_t control = blend->rb_mrt[i].control;
 			uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha;

+			if (is_int) {
+				control &= A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK;
+				control |= A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
+			}
+
 			if (has_alpha) {
 				blend_control |= blend->rb_mrt[i].blend_control_rgb;
 			} else {
@@ -651,19 +662,48 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff));
 	}

-	if (dirty & FD_DIRTY_BLEND_COLOR) {
+	if (dirty & (FD_DIRTY_BLEND_COLOR | FD_DIRTY_FRAMEBUFFER)) {
 		struct pipe_blend_color *bcolor = &ctx->blend_color;
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		float factor = 65535.0;
+		int i;
+
+		for (i = 0; i < pfb->nr_cbufs; i++) {
+			enum pipe_format format = pipe_surface_format(pfb->cbufs[i]);
+			const struct util_format_description *desc =
+				util_format_description(format);
+			int j;
+
+			if (desc->is_mixed)
+				continue;
+
+			j = util_format_get_first_non_void_channel(format);
+			if (j == -1)
+				continue;
+
+			if (desc->channel[j].size > 8 || !desc->channel[j].normalized ||
+				desc->channel[j].pure_integer)
+				continue;
+
+			/* Just use the first unorm8/snorm8 render buffer. Can't keep
+			 * everyone happy.
+			 */
+			if (desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED)
+				factor = 32767.0;
+			break;
+		}
+
 		OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
-		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * factor) |
 				A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
 		OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
-		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * factor) |
 				A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
 		OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
-		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * factor) |
 				A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
 		OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
-		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * factor) |
 				A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
 		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
 	}
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -214,6 +214,7 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	struct fd_resource *rsc = fd_resource(prsc);
 	unsigned lvl = fd_sampler_first_level(cso);
 	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
+	uint32_t sz2 = 0;

 	if (!so)
 		return NULL;
@@ -259,7 +260,10 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	case PIPE_TEXTURE_3D:
 		so->texconst3 =
 			A4XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) |
-			A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[0].size0);
+			A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[lvl].size0);
+		while (lvl < cso->u.tex.last_level && sz2 != rsc->slices[lvl+1].size0)
+			sz2 = rsc->slices[++lvl].size0;
+		so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ(sz2);
 		break;
 	default:
 		so->texconst3 = 0x00000000;
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
@@ -51,7 +51,7 @@ fd4_sampler_stateobj(struct pipe_sampler_state *samp)

 struct fd4_pipe_sampler_view {
 	struct pipe_sampler_view base;
-	uint32_t texconst0, texconst1, texconst2, texconst3, textconst4;
+	uint32_t texconst0, texconst1, texconst2, texconst3, texconst4;
 };

 static inline struct fd4_pipe_sampler_view *
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -551,7 +551,7 @@ fd_resource_create(struct pipe_screen *pscreen,
 	struct fd_resource *rsc = CALLOC_STRUCT(fd_resource);
 	struct pipe_resource *prsc = &rsc->base.b;
 	enum pipe_format format = tmpl->format;
-	uint32_t size;
+	uint32_t size, alignment;

 	DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, "
 			"nr_samples=%u, usage=%u, bind=%x, flags=%x",
@@ -583,6 +583,7 @@ fd_resource_create(struct pipe_screen *pscreen,

 	assert(rsc->cpp);

+	alignment = slice_alignment(pscreen, tmpl);
 	if (is_a4xx(fd_screen(pscreen))) {
 		switch (tmpl->target) {
 		case PIPE_TEXTURE_3D:
@@ -590,11 +591,12 @@ fd_resource_create(struct pipe_screen *pscreen,
 			break;
 		default:
 			rsc->layer_first = true;
+			alignment = 1;
 			break;
 		}
 	}

-	size = setup_slices(rsc, slice_alignment(pscreen, tmpl), format);
+	size = setup_slices(rsc, alignment, format);

 	if (rsc->layer_first) {
 		rsc->layer_size = align(size, 4096);
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -143,7 +143,7 @@ block_id(struct ir3_block *block)
 #ifdef DEBUG
 	return block->serialno;
 #else
-	return (uint32_t)(uint64_t)block;
+	return (uint32_t)(unsigned long)block;
 #endif
 }

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)

   if (b->prev)
      b->prev->next = b;
-   if (a->prev)
+   if (a->next)
      a->next->prev = a;
 }

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -575,8 +575,8 @@ CodeEmitterGK110::emitIMUL(const Instruction *i)
   if (isLIMM(i->src(1), TYPE_S32)) {
      emitForm_L(i, 0x280, 2, Modifier(0));

-      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
-
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 24;
      if (i->sType == TYPE_S32)
         code[1] |= 3 << 25;
   } else {
@@ -695,14 +695,9 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
   if (i->sType == TYPE_S32)
      code[1] |= (1 << 19) | (1 << 24);

-   if (code[0] & 0x1) {
-      assert(!i->subOp);
-      SAT_(39);
-   } else {
-      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-         code[1] |= 1 << 25;
-      SAT_(35);
-   }
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[1] |= 1 << 25;
+   SAT_(35);
 }

 void
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2893,6 +2893,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
      }
      setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+
+      // If the loop never breaks (e.g. only has RET's inside), then there
+      // will be no way to get to the break bb. However BGNLOOP will have
+      // already made a PREBREAK to it, so it must be in the CFG.
+      if (getBB()->cfg.incidentCount() == 0)
+         loopBB->cfg.attach(&getBB()->cfg, Graph::Edge::TREE);
   }
      break;
   case TGSI_OPCODE_BRK:
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -202,7 +202,8 @@ NV50LegalizePostRA::visit(Function *fn)
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
-   if (prog->maxGPR < 63)
+   // GPR units on nv50 are in half-regs
+   if (prog->maxGPR < 126)
      r63->reg.data.id = 63;
   else
      r63->reg.data.id = 127;
@@ -832,7 +833,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
-   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -186,91 +186,67 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
      uses.push_back(TexUse(usei, texi));
 }

+// While it might be tempting to use the an algorithm that just looks at tex
+// uses, not all texture results are guaranteed to be used on all paths. In
+// the case where along some control flow path a texture result is never used,
+// we might reuse that register for something else, creating a
+// write-after-write hazard. So we have to manually look through all
+// instructions looking for ones that reference the registers in question.
 void
-NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
-                                        Instruction *insn,
-                                        const BasicBlock *term,
-                                        std::list<TexUse> &uses)
+NVC0LegalizePostRA::findFirstUses(
+   Instruction *texi, std::list<TexUse> &uses)
 {
-   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
-      insn = insn->getSrc(0)->getUniqueInsn();
+   int minGPR = texi->def(0).rep()->reg.data.id;
+   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

-   // NOTE: the tex itself is, of course, not an overwriting definition
-   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
-      return;
-
-   switch (insn->op) {
-   /* Values not connected to the tex's definition through any of these should
-    * not be conflicting.
-    */
-   case OP_SPLIT:
-   case OP_MERGE:
-   case OP_PHI:
-   case OP_UNION:
-      /* recurse again */
-      for (int s = 0; insn->srcExists(s); ++s)
-         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
-                             uses);
-      break;
-   default:
-      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
-      addTexUse(uses, insn, texi);
-      break;
-   }
+   unordered_set<const BasicBlock *> visited;
+   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
 }

 void
-NVC0LegalizePostRA::findFirstUses(
-      const Instruction *texi,
-      const Instruction *insn,
-      std::list<TexUse> &uses,
-      unordered_set<const Instruction *>& visited)
+NVC0LegalizePostRA::findFirstUsesBB(
+   int minGPR, int maxGPR, Instruction *start,
+   const Instruction *texi, std::list<TexUse> &uses,
+   unordered_set<const BasicBlock *> &visited)
 {
-   for (int d = 0; insn->defExists(d); ++d) {
-      Value *v = insn->getDef(d);
-      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
-         Instruction *usei = (*u)->getInsn();
+   const BasicBlock *bb = start->bb;

-         // NOTE: In case of a loop that overwrites a value but never uses
-         // it, it can happen that we have a cycle of uses that consists only
-         // of phis and no-op moves and will thus cause an infinite loop here
-         // since these are not considered actual uses.
-         // The most obvious (and perhaps the only) way to prevent this is to
-         // remember which instructions we've already visited.
+   // We don't process the whole bb the first time around. This is correct,
+   // however we might be in a loop and hit this BB again, and need to process
+   // the full thing. So only mark a bb as visited if we processed it from the
+   // beginning.
+   if (start == bb->getEntry()) {
+      if (visited.find(bb) != visited.end())
+         return;
+      visited.insert(bb);
+   }

-         if (visited.find(usei) != visited.end())
+   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
+      if (insn->isNop())
+         continue;
+
+      for (int d = 0; insn->defExists(d); ++d) {
+         if (insn->def(d).getFile() != FILE_GPR ||
+             insn->def(d).rep()->reg.data.id < minGPR ||
+             insn->def(d).rep()->reg.data.id > maxGPR)
            continue;
-
-         visited.insert(usei);
-
-         if (usei->op == OP_PHI || usei->op == OP_UNION) {
-            // need a barrier before WAW cases, like:
-            //   %r0 = tex
-            //   if ...
-            //     texbar <- is required or tex might replace x again
-            //     %r1 = x <- overwriting def
-            //   %r2 = phi %r0, %r1
-            for (int s = 0; usei->srcExists(s); ++s) {
-               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
-               if (defi && &usei->src(s) != *u)
-                  findOverwritingDefs(texi, defi, usei->bb, uses);
-            }
-         }
-
-         if (usei->op == OP_SPLIT ||
-             usei->op == OP_MERGE ||
-             usei->op == OP_PHI ||
-             usei->op == OP_UNION) {
-            // these uses don't manifest in the machine code
-            findFirstUses(texi, usei, uses, visited);
-         } else
-         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
-             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
-            findFirstUses(texi, usei, uses, visited);
-         } else {
-            addTexUse(uses, usei, texi);
-         }
+         addTexUse(uses, insn, texi);
+         return;
      }
+
+      for (int s = 0; insn->srcExists(s); ++s) {
+         if (insn->src(s).getFile() != FILE_GPR ||
+             insn->src(s).rep()->reg.data.id < minGPR ||
+             insn->src(s).rep()->reg.data.id > maxGPR)
+            continue;
+         addTexUse(uses, insn, texi);
+         return;
+      }
+   }
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
+                      texi, uses, visited);
   }
 }

@@ -323,8 +299,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
-      unordered_set<const Instruction *> visited;
-      findFirstUses(texes[i], texes[i], uses[i], visited);
+      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
@@ -686,7 +661,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
-      } else if (i->tex.r == i->tex.s) {
+      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -69,12 +69,10 @@ private:
   };
   bool insertTextureBarriers(Function *);
   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
-   void findFirstUses(const Instruction *tex, const Instruction *def,
-                      std::list<TexUse>&,
-                      unordered_set<const Instruction *>&);
-   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
-                            const BasicBlock *term,
-                            std::list<TexUse>&);
+   void findFirstUses(Instruction *texi, std::list<TexUse> &uses);
+   void findFirstUsesBB(int minGPR, int maxGPR, Instruction *start,
+                        const Instruction *texi, std::list<TexUse> &uses,
+                        unordered_set<const BasicBlock *> &visited);
   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
   const Instruction *recurseDef(const Instruction *);

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -858,6 +858,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
+      } else
+      if (i->postFactor && i->sType == TYPE_F32) {
+         /* Can't emit a postfactor with an immediate, have to fold it in */
+         i->setSrc(s, new_ImmediateValue(
+                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+         i->postFactor = 0;
      }
      break;
   case OP_MAD:
@@ -1705,6 +1711,9 @@ AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
+      // We just AND'd the high bits away, which means this is effectively an
+      // unsigned value.
+      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
@@ -2653,8 +2662,11 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
             i->getSrc(0)->reg.data.id >= 64)
            break;

+         if (i->getPredicate())
+            break;
+
         def = i->getSrc(1)->getInsn();
-         if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
            vtmp = i->getSrc(1);
            i->setSrc(1, def->getSrc(0));

@@ -2802,6 +2814,8 @@ GlobalCSE::visit(BasicBlock *bb)
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
+      if (ik->defCount(0xff) > 1)
+         continue; // too painful to check if we can really push this forward
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
@@ -2956,6 +2970,16 @@ DeadCodeElim::visit(BasicBlock *bb)
   return true;
 }

+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
 void
 DeadCodeElim::checkSplitLoad(Instruction *ld1)
 {
@@ -2976,6 +3000,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
+
+   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
@@ -2989,16 +3015,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
         break;
      }
   }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
      }
   }

+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1499,6 +1499,9 @@ GCRA::cleanup(const bool success)

   delete[] nodes;
   nodes = NULL;
+   hi.next = hi.prev = &hi;
+   lo[0].next = lo[0].prev = &lo[0];
+   lo[1].next = lo[1].prev = &lo[1];
 }

 Symbol *
@@ -1573,10 +1576,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)

   Instruction *st;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
-      st = new_Instruction(func, OP_STORE, ty);
-      st->setSrc(0, slot);
-      st->setSrc(1, lval);
      lval->noSpill = 1;
+      if (ty != TYPE_B96) {
+         st = new_Instruction(func, OP_STORE, ty);
+         st->setSrc(0, slot);
+         st->setSrc(1, lval);
+      } else {
+         st = new_Instruction(func, OP_SPLIT, ty);
+         st->setSrc(0, lval);
+         for (int d = 0; d < lval->reg.size / 4; ++d)
+            st->setDef(d, new_LValue(func, FILE_GPR));
+
+         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+            Value *tmp = cloneShallow(func, slot);
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+            s->setSrc(0, tmp);
+            s->setSrc(1, st->getDef(d));
+            defi->bb->insertAfter(defi, s);
+         }
+      }
   } else {
      st = new_Instruction(func, OP_CVT, ty);
      st->setDef(0, slot);
@@ -1596,7 +1617,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
   Instruction *ld;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      lval->noSpill = 1;
-      ld = new_Instruction(func, OP_LOAD, ty);
+      if (ty != TYPE_B96) {
+         ld = new_Instruction(func, OP_LOAD, ty);
+      } else {
+         ld = new_Instruction(func, OP_MERGE, ty);
+         for (int d = 0; d < lval->reg.size / 4; ++d) {
+            Value *tmp = cloneShallow(func, slot);
+            LValue *val;
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+            l->setSrc(0, tmp);
+            usei->bb->insertBefore(usei, l);
+            ld->setSrc(d, val);
+            val->noSpill = 1;
+         }
+         ld->setDef(0, lval);
+         usei->bb->insertBefore(usei, ld);
+         return lval;
+      }
   } else {
      ld = new_Instruction(func, OP_CVT, ty);
   }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -454,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -439,7 +439,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -657,8 +657,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
   if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
                             PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
      buffer->domain = NOUVEAU_BO_GART;
-   } else if (buffer->base.bind &
-              (screen->vidmem_bindings & screen->sysmem_bindings)) {
+   } else if (buffer->base.bind == 0 || (buffer->base.bind &
+              (screen->vidmem_bindings & screen->sysmem_bindings))) {
      switch (buffer->base.usage) {
      case PIPE_USAGE_DEFAULT:
      case PIPE_USAGE_IMMUTABLE:
@@ -685,6 +685,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
      if (buffer->base.bind & screen->sysmem_bindings)
         buffer->domain = NOUVEAU_BO_GART;
   }
+   /* There can be very special situations where we want non-gpu-mapped
+    * buffers, but never through this interface.
+    */
+   assert(buffer->domain);
   ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);

   if (ret == false)
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -168,9 +168,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
                                 int ref)
 {
   struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
   unsigned s, i;

-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
      assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
      for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
         if (nv50->framebuffer.cbufs[i] &&
@@ -182,7 +183,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
         }
      }
   }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nv50->framebuffer.zsbuf &&
          nv50->framebuffer.zsbuf->texture == res) {
         nv50->dirty |= NV50_NEW_FRAMEBUFFER;
@@ -192,11 +193,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
      }
   }

-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_SAMPLER_VIEW)) {

      assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
      for (i = 0; i < nv50->num_vtxbufs; ++i) {
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -113,6 +113,12 @@ static void
 nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q)
 {
   struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nv50, hq);
+      return;
+   }
+
   nv50_hw_query_allocate(nv50, q, 0);
   nouveau_fence_ref(NULL, &hq->fence);
   FREE(hq);
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
@@ -71,7 +71,8 @@ nv50_hw_metric_destroy_query(struct nv50_context *nv50,
   unsigned i;

   for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
   FREE(hmq);
 }

--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -153,7 +153,9 @@ static void
 nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
 {
   struct nv50_query *q = &hq->base;
-   q->funcs->destroy_query(nv50, q);
+   nv50_hw_query_allocate(nv50, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }

 static boolean
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -405,6 +405,11 @@ nv50_screen_destroy(struct pipe_screen *pscreen)

   if (screen->blitter)
      nv50_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nv50_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
+   }

   nouveau_bo_ref(NULL, &screen->code);
   nouveau_bo_ref(NULL, &screen->tls_bo);
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -994,6 +994,9 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);
   unsigned i;

+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+   nv50->dirty |= NV50_NEW_ARRAYS;
+
   util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                 start_slot, count);

@@ -1017,10 +1020,6 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
         nv50->vbo_constant &= ~(1 << dst_index);
      }
   }
-
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-
-   nv50->dirty |= NV50_NEW_ARRAYS;
 }

 static void
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -591,6 +591,82 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
   PUSH_DATA (push, nv50->rt_array_mode);
 }

+static void
+nv50_clear_buffer_push(struct pipe_context *pipe,
+                       struct pipe_resource *res,
+                       unsigned offset, unsigned size,
+                       const void *data, int data_size)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv04_resource *buf = nv04_resource(res);
+   unsigned count = (size + 3) / 4;
+   unsigned xcoord = offset & 0xff;
+   unsigned tmp, i;
+
+   if (data_size == 1) {
+      tmp = *(unsigned char *)data;
+      tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
+      data = &tmp;
+      data_size = 4;
+   } else if (data_size == 2) {
+      tmp = *(unsigned short *)data;
+      tmp = (tmp << 16) | tmp;
+      data = &tmp;
+      data_size = 4;
+   }
+
+   unsigned data_words = data_size / 4;
+
+   nouveau_bufctx_refn(nv50->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, nv50->bufctx);
+   nouveau_pushbuf_validate(push);
+
+   offset &= ~0xff;
+
+   BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2);
+   PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_2D(DST_PITCH), 5);
+   PUSH_DATA (push, 262144);
+   PUSH_DATA (push, 65536);
+   PUSH_DATA (push, 1);
+   PUSH_DATAh(push, buf->address + offset);
+   PUSH_DATA (push, buf->address + offset);
+   BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+   BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10);
+   PUSH_DATA (push, size);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, xcoord);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+
+   while (count) {
+      unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
+      unsigned nr = nr_data * data_words;
+
+      BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
+      for (i = 0; i < nr_data; i++)
+         PUSH_DATAp(push, data, data_words);
+
+      count -= nr;
+   }
+
+   if (buf->mm) {
+      nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
+      nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
+   }
+
+   nouveau_bufctx_reset(nv50->bufctx, 0);
+}
+
 static void
 nv50_clear_buffer(struct pipe_context *pipe,
                  struct pipe_resource *res,
@@ -640,9 +716,22 @@ nv50_clear_buffer(struct pipe_context *pipe,

   assert(size % data_size == 0);

+   if (offset & 0xff) {
+      unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
+      assert(fixup_size % data_size == 0);
+      nv50_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
+      offset += fixup_size;
+      size -= fixup_size;
+      if (!size)
+         return;
+   }
+
   elements = size / data_size;
   height = (elements + 8191) / 8192;
   width = elements / height;
+   if (height > 1)
+      width &= ~0xff;
+   assert(width > 0);

   BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4);
   PUSH_DATAf(push, color.f[0]);
@@ -666,13 +755,13 @@ nv50_clear_buffer(struct pipe_context *pipe,
   BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
   PUSH_DATA (push, 1);
   BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5);
-   PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
-   PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
+   PUSH_DATAh(push, buf->address + offset);
+   PUSH_DATA (push, buf->address + offset);
   PUSH_DATA (push, nv50_format_table[dst_fmt].rt);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0);
   BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
-   PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
+   PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | align(width * data_size, 0x100));
   PUSH_DATA (push, height);
   BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
   PUSH_DATA (push, 0);
@@ -691,25 +780,20 @@ nv50_clear_buffer(struct pipe_context *pipe,
   BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
   PUSH_DATA (push, 0x3c);

-   if (width * height != elements) {
-      offset += width * height * data_size;
-      width = elements - width * height;
-      height = 1;
-      BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 2);
-      PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
-      PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
-      BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
-      PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
-      PUSH_DATA (push, height);
-      BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
-      PUSH_DATA (push, 0x3c);
-   }
-
   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
   PUSH_DATA (push, nv50->cond_condmode);

-   nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
-   nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
+   if (buf->mm) {
+      nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
+      nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
+   }
+
+   if (width * height != elements) {
+      offset += width * height * data_size;
+      width = elements - width * height;
+      nv50_clear_buffer_push(pipe, res, offset, width * data_size,
+                             data, data_size);
+   }

   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -633,8 +633,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
         BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
         PUSH_DATA (push, prim);

-         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
         nouveau_pushbuf_space(push, 8, 0, 1);
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);

         switch (index_size) {
         case 4:
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
      bsp_size += (1 << 20) - 1;
      bsp_size &= ~((1 << 20) - 1);

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
      if (ret) {
         debug_printf("reallocating bsp %u -> %u failed with %i\n",
                      bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
      struct nouveau_bo *tmp_bo = NULL;

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
      if (ret) {
         debug_printf("reallocating inter %u -> %u failed with %i\n",
                      inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
                                 int ref)
 {
   struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
   unsigned s, i;

-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
      for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
         if (nvc0->framebuffer.cbufs[i] &&
             nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
         }
      }
   }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nvc0->framebuffer.zsbuf &&
          nvc0->framebuffer.zsbuf->texture == res) {
         nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
      }
   }

-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_COMMAND_ARGS_BUFFER |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_COMMAND_ARGS_BUFFER |
+               PIPE_BIND_SAMPLER_VIEW)) {
      for (i = 0; i < nvc0->num_vtxbufs; ++i) {
         if (nvc0->vtxbuf[i].buffer == res) {
            nvc0->dirty |= NVC0_NEW_ARRAYS;
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -285,8 +285,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
      break;
   case PIPE_PRIM_TRIANGLES:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
-      if (info->prop.tp.winding > 0)
-         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
      break;
   case PIPE_PRIM_QUADS:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
@@ -295,6 +293,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
      tp->tp.tess_mode = ~0;
      return;
   }
+
+   if (info->prop.tp.winding > 0)
+      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+
   if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -116,6 +116,12 @@ static void
 nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
   struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nvc0, hq);
+      return;
+   }
+
   nvc0_hw_query_allocate(nvc0, q, 0);
   nouveau_fence_ref(NULL, &hq->fence);
   FREE(hq);
@@ -467,7 +473,6 @@ nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))

   PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
                        NVC0_IB_ENTRY_1_NO_PREFETCH);
 }
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -187,7 +187,8 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
   unsigned i;

   for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
   FREE(hmq);
 }

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -851,7 +851,9 @@ static void
 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
 {
   struct nvc0_query *q = &hq->base;
-   q->funcs->destroy_query(nvc0, q);
+   nvc0_hw_query_allocate(nvc0, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }

 static boolean
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -428,6 +428,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
   if (screen->pm.prog) {
      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
      nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
   }

   nouveau_bo_ref(NULL, &screen->text);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -320,6 +320,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)

      if (!targ->clean)
         nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq));
+      nouveau_pushbuf_space(push, 0, 0, 1);
      BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
      PUSH_DATA (push, 1);
      PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -1000,6 +1000,9 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    unsigned i;

+    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+    nvc0->dirty |= NVC0_NEW_ARRAYS;
+
    util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb,
                                  start_slot, count);

@@ -1023,9 +1026,6 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
          nvc0->constant_vbos &= ~(1 << dst_index);
       }
    }
-
-    nvc0->dirty |= NVC0_NEW_ARRAYS;
-    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
 }

 static void
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -355,27 +355,132 @@ nvc0_clear_render_target(struct pipe_context *pipe,
 }

 static void
-nvc0_clear_buffer_cpu(struct pipe_context *pipe,
-                      struct pipe_resource *res,
-                      unsigned offset, unsigned size,
-                      const void *data, int data_size)
+nvc0_clear_buffer_push_nvc0(struct pipe_context *pipe,
+                            struct pipe_resource *res,
+                            unsigned offset, unsigned size,
+                            const void *data, int data_size)
 {
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nv04_resource *buf = nv04_resource(res);
-   struct pipe_transfer *pt;
-   struct pipe_box box;
-   unsigned elements, i;
+   unsigned i;

-   elements = size / data_size;
+   nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, nvc0->bufctx);
+   nouveau_pushbuf_validate(push);

-   u_box_1d(offset, size, &box);
+   unsigned count = (size + 3) / 4;
+   unsigned data_words = data_size / 4;

-   uint8_t *map = buf->vtbl->transfer_map(pipe, res, 0, PIPE_TRANSFER_WRITE,
-                                          &box, &pt);
+   while (count) {
+      unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
+      unsigned nr = nr_data * data_words;

-   for (i = 0; i < elements; ++i)
-      memcpy(&map[i*data_size], data, data_size);
+      if (!PUSH_SPACE(push, nr + 9))
+         break;

-   buf->vtbl->transfer_unmap(pipe, pt);
+      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
+      PUSH_DATAh(push, buf->address + offset);
+      PUSH_DATA (push, buf->address + offset);
+      BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, MIN2(size, nr * 4));
+      PUSH_DATA (push, 1);
+      BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
+      PUSH_DATA (push, 0x100111);
+
+      /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
+      BEGIN_NIC0(push, NVC0_M2MF(DATA), nr);
+      for (i = 0; i < nr_data; i++)
+         PUSH_DATAp(push, data, data_words);
+
+      count -= nr;
+      offset += nr * 4;
+      size -= nr * 4;
+   }
+
+   if (buf->mm) {
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+   }
+
+   nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+static void
+nvc0_clear_buffer_push_nve4(struct pipe_context *pipe,
+                            struct pipe_resource *res,
+                            unsigned offset, unsigned size,
+                            const void *data, int data_size)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv04_resource *buf = nv04_resource(res);
+   unsigned i;
+
+   nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, nvc0->bufctx);
+   nouveau_pushbuf_validate(push);
+
+   unsigned count = (size + 3) / 4;
+   unsigned data_words = data_size / 4;
+
+   while (count) {
+      unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
+      unsigned nr = nr_data * data_words;
+
+      if (!PUSH_SPACE(push, nr + 10))
+         break;
+
+      BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, buf->address + offset);
+      PUSH_DATA (push, buf->address + offset);
+      BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, MIN2(size, nr * 4));
+      PUSH_DATA (push, 1);
+      /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
+      BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), nr + 1);
+      PUSH_DATA (push, 0x1001);
+      for (i = 0; i < nr_data; i++)
+         PUSH_DATAp(push, data, data_words);
+
+      count -= nr;
+      offset += nr * 4;
+      size -= nr * 4;
+   }
+
+   if (buf->mm) {
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+   }
+
+   nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+static void
+nvc0_clear_buffer_push(struct pipe_context *pipe,
+                       struct pipe_resource *res,
+                       unsigned offset, unsigned size,
+                       const void *data, int data_size)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   unsigned tmp;
+
+   if (data_size == 1) {
+      tmp = *(unsigned char *)data;
+      tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
+      data = &tmp;
+      data_size = 4;
+   } else if (data_size == 2) {
+      tmp = *(unsigned short *)data;
+      tmp = (tmp << 16) | tmp;
+      data = &tmp;
+      data_size = 4;
+   }
+
+   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
+      nvc0_clear_buffer_push_nvc0(pipe, res, offset, size, data, data_size);
+   else
+      nvc0_clear_buffer_push_nve4(pipe, res, offset, size, data, data_size);
 }

 static void
@@ -400,10 +505,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
      memcpy(&color.ui, data, 16);
      break;
   case 12:
-      /* This doesn't work, RGB32 is not a valid RT format.
-       * dst_fmt = PIPE_FORMAT_R32G32B32_UINT;
-       * memcpy(&color.ui, data, 12);
-       * memset(&color.ui[3], 0, 4);
+      /* RGB32 is not a valid RT format. This will be handled by the pushbuf
+       * uploader.
       */
      break;
   case 8:
@@ -435,14 +538,26 @@ nvc0_clear_buffer(struct pipe_context *pipe,
   assert(size % data_size == 0);

   if (data_size == 12) {
-      /* TODO: Find a way to do this with the GPU! */
-      nvc0_clear_buffer_cpu(pipe, res, offset, size, data, data_size);
+      nvc0_clear_buffer_push(pipe, res, offset, size, data, data_size);
      return;
   }

+   if (offset & 0xff) {
+      unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
+      assert(fixup_size % data_size == 0);
+      nvc0_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
+      offset += fixup_size;
+      size -= fixup_size;
+      if (!size)
+         return;
+   }
+
   elements = size / data_size;
   height = (elements + 16383) / 16384;
   width = elements / height;
+   if (height > 1)
+      width &= ~0xff;
+   assert(width > 0);

   if (!PUSH_SPACE(push, 40))
      return;
@@ -463,7 +578,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
   BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 9);
   PUSH_DATAh(push, buf->address + offset);
   PUSH_DATA (push, buf->address + offset);
-   PUSH_DATA (push, width * data_size);
+   PUSH_DATA (push, align(width * data_size, 0x100));
   PUSH_DATA (push, height);
   PUSH_DATA (push, nvc0_format_table[dst_fmt].rt);
   PUSH_DATA (push, NVC0_3D_RT_TILE_MODE_LINEAR);
@@ -478,24 +593,20 @@ nvc0_clear_buffer(struct pipe_context *pipe,

   IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);

+   IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
+   if (buf->mm) {
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
+      nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
+   }
+
   if (width * height != elements) {
      offset += width * height * data_size;
      width = elements - width * height;
-      height = 1;
-
-      BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 4);
-      PUSH_DATAh(push, buf->address + offset);
-      PUSH_DATA (push, buf->address + offset);
-      PUSH_DATA (push, width * data_size);
-      PUSH_DATA (push, height);
-
-      IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
+      nvc0_clear_buffer_push(pipe, res, offset, width * data_size,
+                             data, data_size);
   }

-   IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
-
-   nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
-   nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
   nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
 }

@@ -1026,9 +1137,11 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
      nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query,
                                       nvc0->cond_cond, nvc0->cond_mode);

+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+   nouveau_scratch_done(&nvc0->base);

   nvc0->dirty = blit->saved.dirty |
      (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK |
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -331,7 +331,7 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
      b = ve->pipe.vertex_buffer_index;
      vb = &nvc0->vtxbuf[b];

-      if (!vb->buffer) {
+      if (nvc0->vbo_user & (1 << b)) {
         if (!(nvc0->constant_vbos & (1 << b))) {
            if (ve->pipe.instance_divisor) {
               BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_DIVISOR(i)), 1);
@@ -349,13 +349,13 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)

      if (unlikely(ve->pipe.instance_divisor)) {
         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 4);
-         PUSH_DATA (push, (1 << 12) | vb->stride);
+         PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
         PUSH_DATAh(push, res->address + offset);
         PUSH_DATA (push, res->address + offset);
         PUSH_DATA (push, ve->pipe.instance_divisor);
      } else {
         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 3);
-         PUSH_DATA (push, (1 << 12) | vb->stride);
+         PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
         PUSH_DATAh(push, res->address + offset);
         PUSH_DATA (push, res->address + offset);
      }
@@ -392,6 +392,10 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
         }
         /* address/value set in nvc0_update_user_vbufs_shared */
         continue;
+      } else if (!vb->buffer) {
+         /* there can be holes in the vertex buffer lists */
+         IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
+         continue;
      }
      buf = nv04_resource(vb->buffer);
      offset = vb->buffer_offset;
@@ -407,6 +411,12 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)

      BCTX_REFN(nvc0->bufctx_3d, VTX, buf, RD);
   }
+   /* If there are more elements than buffers, we might not have unset
+    * fetching on the later elements.
+    */
+   for (; b < nvc0->vertex->num_elements; ++b)
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 0);
+
   if (nvc0->vbo_user)
      nvc0_update_user_vbufs_shared(nvc0);
 }
@@ -784,7 +794,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
   }

   while (num_instances--) {
-      PUSH_SPACE(push, 8);
+      nouveau_pushbuf_space(push, 9, 0, 1);
      BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
      PUSH_DATA (push, mode);
      BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
@@ -811,7 +821,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
      IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);

-   PUSH_SPACE(push, 8);
+   nouveau_pushbuf_space(push, 8, 0, 1);
+   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
   if (info->indexed) {
      assert(nvc0->idxbuf.buffer);
      assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
@@ -829,8 +840,6 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   }
   PUSH_DATA(push, nvc0_prim_gl(info->mode));
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push,
                        buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
 }
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -169,9 +169,12 @@ nvc0_create_decoder(struct pipe_context *context,
   for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                           0, 1 << 20, &cfg, &dec->bsp_bo[i]);
-   if (!ret)
+   if (!ret) {
+      /* total fudge factor... just has to be bigger for higher bitrates? */
+      unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20);
      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
-                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+                           0x100, inter_size, &cfg, &dec->inter_bo[0]);
+   }
   if (!ret) {
      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                           0x100, dec->inter_bo[0]->size, &cfg,
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -81,7 +81,7 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
      bsp_size += (1 << 20) - 1;
      bsp_size &= ~((1 << 20) - 1);

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
      if (ret) {
         debug_printf("reallocating bsp %u -> %u failed with %i\n",
                      bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -98,7 +98,7 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
      cfg.nvc0.tile_mode = 0x10;
      cfg.nvc0.memtype = 0xfe;

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
      if (ret) {
         debug_printf("reallocating inter %u -> %u failed with %i\n",
                      inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1555,12 +1555,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 				     S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) |
 				     S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else {
 		radeon_set_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }

@@ -1939,7 +1944,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,

 		if (!gs_ring_buffer) {
 			radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4,
-						    ALIGN_DIVUP(cb->buffer_size >> 4, 16), pkt_flags);
+						    ALIGN_DIVUP(cb->buffer_size, 256), pkt_flags);
 			radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8,
 						    pkt_flags);
 		}
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -68,6 +68,7 @@ static const struct debug_named_value r600_debug_options[] = {
 static void r600_destroy_context(struct pipe_context *context)
 {
 	struct r600_context *rctx = (struct r600_context *)context;
+	unsigned sh;

 	r600_isa_destroy(rctx->isa);

@@ -76,6 +77,11 @@ static void r600_destroy_context(struct pipe_context *context)
 	pipe_resource_reference((struct pipe_resource**)&rctx->dummy_cmask, NULL);
 	pipe_resource_reference((struct pipe_resource**)&rctx->dummy_fmask, NULL);

+	for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
+		rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL);
+		free(rctx->driver_consts[sh].constants);
+	}
+
 	if (rctx->dummy_pixel_shader) {
 		rctx->b.b.delete_fs_state(&rctx->b.b, rctx->dummy_pixel_shader);
 	}
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -59,7 +59,7 @@

 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
-#define R600_MAX_DRAW_CS_DWORDS		47
+#define R600_MAX_DRAW_CS_DWORDS		52
 #define R600_TRACE_CS_DWORDS		7

 #define R600_MAX_USER_CONST_BUFFERS 13
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -598,6 +598,106 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back
 	return 0;
 }

+/* execute a single slot ALU calculation */
+static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = op;
+			alu.src[0].sel = src0_sel;
+			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[0].value = src0_chan_val;
+			else
+				alu.src[0].chan = src0_chan_val;
+			alu.src[1].sel = src1_sel;
+			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[1].value = src1_chan_val;
+			else
+				alu.src[1].chan = src1_chan_val;
+			alu.dst.sel = dst_sel;
+			alu.dst.chan = i;
+			alu.dst.write = i == dst_chan;
+			alu.last = (i == 3);
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		return 0;
+	}
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+/* execute a single slot ALU calculation */
+static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val,
+			  int src2_sel, unsigned src2_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	/* validate this for other ops */
+	assert(op == ALU_OP3_MULADD_UINT24);
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.src[2].sel = src2_sel;
+	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[2].value = src2_chan_val;
+	else
+		alu.src[2].chan = src2_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.is_op3 = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
+{
+	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
+}
+
 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
 {
 	int i;
@@ -1129,6 +1229,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	unsigned vtx_id = src->Dimension.Index;
 	int offset_reg = vtx_id / 3;
 	int offset_chan = vtx_id % 3;
+	int t2 = 0;

 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
@@ -1136,13 +1237,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	if (offset_reg == 0 && offset_chan == 2)
 		offset_chan = 3;

+	if (src->Dimension.Indirect || src->Register.Indirect)
+		t2 = r600_get_temp(ctx);
+
 	if (src->Dimension.Indirect) {
 		int treg[3];
-		int t2;
 		struct r600_bytecode_alu alu;
 		int r, i;
-
-		/* you have got to be shitting me -
+		unsigned addr_reg;
+		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
+		if (src->DimIndirect.Index > 0) {
+			r = single_alu_op2(ctx, ALU_OP1_MOV,
+					   ctx->bc->ar_reg, 0,
+					   addr_reg, 0,
+					   0, 0);
+			if (r)
+				return r;
+		}
+		/*
 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
 		   at least this is what fglrx seems to do. */
 		for (i = 0; i < 3; i++) {
@@ -1150,7 +1262,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		}
 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

-		t2 = r600_get_temp(ctx);
 		for (i = 0; i < 3; i++) {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
@@ -1175,8 +1286,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		if (r)
 			return r;
 		offset_reg = t2;
+		offset_chan = 0;
 	}

+	if (src->Register.Indirect) {
+		int addr_reg;
+		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
+
+		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
+
+		/* pull the value from index_reg */
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   t2, 1,
+				   addr_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, first);
+		if (r)
+			return r;
+		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+				   t2, 0,
+				   t2, 1,
+				   V_SQ_ALU_SRC_LITERAL, 4,
+				   offset_reg, offset_chan);
+		if (r)
+			return r;
+		offset_reg = t2;
+		offset_chan = 0;
+		index = src->Register.Index - first;
+	}

 	memset(&vtx, 0, sizeof(vtx));
 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
@@ -1222,6 +1358,7 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)

 			fetch_gs_input(ctx, src, treg);
 			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
 		}
 	}
 	return 0;
@@ -1498,7 +1635,7 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 		*last_exp_pos = NULL, *last_exp_param = NULL;
 	int i, j, next_clip_pos = 61, next_param = 0;
 	int ring;
-
+	bool only_ring_0 = true;
 	cshader = calloc(1, sizeof(struct r600_pipe_shader));
 	if (!cshader)
 		return 0;
@@ -1570,6 +1707,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 		for (i = 0; i < so->num_outputs; i++) {
 			if (so->output[i].stream == ring) {
 				enabled = true;
+				if (ring > 0)
+					only_ring_0 = false;
 				break;
 			}
 		}
@@ -1604,7 +1743,7 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 		cf_jump = ctx.bc->cf_last;

 		if (enabled)
-			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
+			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
 		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
 	}

@@ -2042,7 +2181,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.nliterals = 0;
 	ctx.literals = NULL;

-	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
+	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+			       ctx.info.colors_written == 1;
 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
 	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];

@@ -2206,6 +2346,11 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 			struct r600_bytecode_alu alu;
 			int r;
+
+			/* GS thread with no output workaround - emit a cut at start of GS */
+			if (ctx.bc->chip_class == R600)
+				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
+
 			for (j = 0; j < 4; j++) {
 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 				alu.op = ALU_OP1_MOV;
@@ -7180,7 +7325,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_alu alu;
 	int r;
 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
-	unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;
+	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

 	assert(inst->Dst[0].Register.Index < 3);
 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1768,7 +1768,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,

 		if (!gs_ring_buffer) {
 			radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4,
-					       ALIGN_DIVUP(cb->buffer_size >> 4, 16));
+					       ALIGN_DIVUP(cb->buffer_size, 256));
 			radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8);
 		}

@@ -2213,10 +2213,11 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 		num_temp_gprs = 4;
 		num_gs_gprs = 0;
 		num_es_gprs = 0;
-		num_ps_threads = 136;
-		num_vs_threads = 48;
-		num_gs_threads = 4;
-		num_es_threads = 4;
+		/* use limits 40 VS and at least 16 ES/GS */
+		num_ps_threads = 120;
+		num_vs_threads = 40;
+		num_gs_threads = 16;
+		num_es_threads = 16;
 		num_ps_stack_entries = 40;
 		num_vs_stack_entries = 40;
 		num_gs_stack_entries = 32;
@@ -2675,6 +2676,9 @@ void r600_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 		S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport);
 }

+#define RV610_GSVS_ALIGN 32
+#define R600_GSVS_ALIGN 16
+
 void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *shader)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
@@ -2684,6 +2688,23 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 	unsigned gsvs_itemsize =
 			(cp_shader->ring_item_sizes[0] * shader->selector->gs_max_out_vertices) >> 2;

+	/* some r600s needs gsvs itemsize aligned to cacheline size
+	   this was fixed in rs780 and above. */
+	switch (rctx->b.family) {
+	case CHIP_RV610:
+		gsvs_itemsize = align(gsvs_itemsize, RV610_GSVS_ALIGN);
+		break;
+	case CHIP_R600:
+	case CHIP_RV630:
+	case CHIP_RV670:
+	case CHIP_RV620:
+	case CHIP_RV635:
+		gsvs_itemsize = align(gsvs_itemsize, R600_GSVS_ALIGN);
+		break;
+	default:
+		break;
+	}
+
 	r600_init_command_buffer(cb, 64);

 	/* VGT_GS_MODE is written by r600_emit_shader_stages */
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1770,6 +1770,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					(info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
 	}

+	/* SMX returns CONTEXT_DONE too early workaround */
+	if (rctx->b.family == CHIP_R600 ||
+	    rctx->b.family == CHIP_RV610 ||
+	    rctx->b.family == CHIP_RV630 ||
+	    rctx->b.family == CHIP_RV635) {
+		/* if we have gs shader or streamout
+		   we need to do a wait idle after every draw */
+		if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+			radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
+		}
+	}
+
+	/* ES ring rolling over at EOP - workaround */
+	if (rctx->b.chip_class == R600) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
+	}
+
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -130,6 +130,7 @@
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
 #define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
 #define EVENT_TYPE_VGT_FLUSH                   0x24
+#define EVENT_TYPE_SQ_NON_EVENT                0x26
 #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
--- a/src/gallium/drivers/radeon/Makefile.am
+++ b/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM

 AM_CFLAGS += \
-	$(LLVM_CFLAGS)
+	$(LLVM_CFLAGS) \
+	$(LIBELF_CFLAGS)

 libradeon_la_SOURCES += \
 	$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
 	$(CLOCK_LIB) \
 	$(LLVM_LIBS) \
-	$(ELF_LIB)
+	$(LIBELF_LIBS)

 libradeon_la_LDFLAGS = \
 	$(LLVM_LDFLAGS)
--- a/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/src/gallium/drivers/radeon/cayman_msaa.c
@@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
-					     EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+					       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		} else if (overrast_samples > 1) {
 			radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		}
 	} else {
 		radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
@@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 		radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -136,8 +136,12 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
 	/* suspend queries */
-	if (!LIST_IS_EMPTY(&ctx->active_nontimer_queries))
+	if (ctx->num_cs_dw_nontimer_queries_suspend) {
+		/* Since non-timer queries are suspended during blits,
+		 * we have to guard against double-suspends. */
 		r600_suspend_nontimer_queries(ctx);
+		ctx->nontimer_queries_suspended_by_flush = true;
+	}
 	if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
 		r600_suspend_timer_queries(ctx);

@@ -158,8 +162,10 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 	/* resume queries */
 	if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
 		r600_resume_timer_queries(ctx);
-	if (!LIST_IS_EMPTY(&ctx->active_nontimer_queries))
+	if (ctx->nontimer_queries_suspended_by_flush) {
+		ctx->nontimer_queries_suspended_by_flush = false;
 		r600_resume_nontimer_queries(ctx);
+	}
 }

 static void r600_flush_from_st(struct pipe_context *ctx,
@@ -233,8 +239,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->family = rscreen->family;
 	rctx->chip_class = rscreen->chip_class;

-	if (rscreen->family == CHIP_HAWAII)
-		rctx->max_db = 16;
+	if (rscreen->chip_class >= CIK)
+		rctx->max_db = MAX2(8, rscreen->info.r600_num_backends);
 	else if (rscreen->chip_class >= EVERGREEN)
 		rctx->max_db = 8;
 	else
@@ -550,10 +556,11 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
-	case CHIP_FIJI: return "fiji";
 #if HAVE_LLVM <= 0x0307
+	case CHIP_FIJI: return "tonga";
 	case CHIP_STONEY: return "carrizo";
 #else
+	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
 #endif
 	default: return "";
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -392,6 +392,7 @@ struct r600_common_context {
 	struct list_head		active_nontimer_queries;
 	struct list_head		active_timer_queries;
 	unsigned			num_cs_dw_nontimer_queries_suspend;
+	bool				nontimer_queries_suspended_by_flush;
 	unsigned			num_cs_dw_timer_queries_suspend;
 	/* Additional hardware info. */
 	unsigned			backend_mask;
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -119,7 +119,7 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
 		rctx->b.flush(&rctx->b, &query->fence, 0);
 		break;
 	case R600_QUERY_DRAW_CALLS:
-		query->begin_result = rctx->num_draw_calls;
+		query->end_result = rctx->num_draw_calls;
 		break;
 	case R600_QUERY_REQUESTED_VRAM:
 	case R600_QUERY_REQUESTED_GTT:
@@ -141,10 +141,10 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
 		query->begin_result = 0;
 		break;
 	case R600_QUERY_NUM_COMPILATIONS:
-		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
 		break;
 	case R600_QUERY_NUM_SHADERS_CREATED:
-		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
 		break;
 	default:
 		unreachable("r600_query_sw_end: bad query type");
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -489,6 +489,10 @@ static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen,
 	if (rscreen->debug_flags & DBG_NO_DCC)
 		return;

+	/* TODO: DCC is broken on Stoney */
+	if (rscreen->family == CHIP_STONEY)
+		return;
+
 	rtex->dcc_buffer = (struct r600_resource *)
 		r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
 				   PIPE_USAGE_DEFAULT, rtex->surface.dcc_size, rtex->surface.dcc_alignment);
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -168,6 +168,8 @@

 #define EG_R_028A4C_PA_SC_MODE_CNTL_1                0x028A4C
 #define   EG_S_028A4C_PS_ITER_SAMPLE(x)                 (((x) & 0x1) << 16)
+#define   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)        (((x) & 0x1) << 25)
+#define   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x)           (((x) & 0x1) << 26)

 #define CM_R_028804_DB_EQAA                          0x00028804
 #define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -188,8 +188,8 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	if (mem_err) {
 		fprintf(stderr, "%s: %s", __FUNCTION__, err);
 		FREE(err);
-		LLVMDisposeTargetMachine(tm);
-		return 1;
+		rval = 1;
+		goto out;
 	}

 	if (0 != rval) {
@@ -205,6 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);

+out:
 	if (dispose_tm) {
 		LLVMDisposeTargetMachine(tm);
 	}
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -1539,7 +1539,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
 	bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.exp2.f32";
+	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp.";
 	bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32";
 	bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem;
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -958,6 +958,8 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.db_pitch = dec->base.width;

 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
+	if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
+		dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2;

 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -394,7 +394,10 @@ struct ruvd_msg {
 			uint32_t	dt_chroma_top_offset;
 			uint32_t	dt_chroma_bottom_offset;
 			uint32_t	dt_surf_tile_config;
-			uint32_t	dt_reserved[3];
+			uint32_t	dt_uv_surf_tile_config;
+			// re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney
+			uint32_t	dt_wa_chroma_top_offset;
+			uint32_t	dt_wa_chroma_bottom_offset;

 			uint32_t	reserved[16];

--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -389,6 +389,11 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	struct radeon_surf *tmp_surf;
 	unsigned cpb_size;

+	if (rscreen->info.family == CHIP_STONEY) {
+		RVID_ERR("Stoney VCE is not supported!\n");
+		return NULL;
+	}
+
 	if (!rscreen->info.vce_fw_version) {
 		RVID_ERR("Kernel doesn't supports VCE!\n");
 		return NULL;
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -34,11 +34,6 @@

 #define MAX_GLOBAL_BUFFERS 20

-/* XXX: Even though we don't pass the scratch buffer via user sgprs any more
- * LLVM still expects that we specify 4 USER_SGPRS so it can remain compatible
- * with older mesa. */
-#define NUM_USER_SGPRS 4
-
 struct si_compute {
 	struct si_context *ctx;

@@ -238,7 +233,6 @@ static void si_launch_grid(
 	uint64_t kernel_args_va;
 	uint64_t scratch_buffer_va = 0;
 	uint64_t shader_va;
-	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
 	unsigned i;
 	struct si_shader *shader = &program->shader;
 	unsigned lds_blocks;
@@ -366,20 +360,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);

-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
-		/* We always use at least 3 VGPRS, these come from
-		 * TIDIG_COMP_CNT.
-		 * XXX: The compiler should account for this.
-		 */
-		S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4)
-		/* We always use at least 4 + arg_user_sgpr_count.  The 4 extra
-		 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN
-		 * XXX: The compiler should account for this.
-		 */
-		|  S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count,
-		                        shader->num_sgprs)) - 1) / 8)
-		|  S_00B028_FLOAT_MODE(shader->float_mode))
-		;
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);

 	lds_blocks = shader->lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
@@ -395,17 +376,10 @@ static void si_launch_grid(

 	assert(lds_blocks <= 0xFF);

-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2,
-		S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)
-		| S_00B84C_USER_SGPR(arg_user_sgpr_count)
-		| S_00B84C_TGID_X_EN(1)
-		| S_00B84C_TGID_Y_EN(1)
-		| S_00B84C_TGID_Z_EN(1)
-		| S_00B84C_TG_SIZE_EN(1)
-		| S_00B84C_TIDIG_COMP_CNT(2)
-		| S_00B84C_LDS_SIZE(lds_blocks)
-		| S_00B84C_EXCP_EN(0))
-		;
+	shader->rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);

 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -632,7 +632,7 @@ void si_check_vm_faults(struct si_context *sctx)
 	/* Use conservative timeout 800ms, after which we won't wait any
 	 * longer and assume the GPU is hung.
 	 */
-	screen->fence_finish(screen, sctx->last_gfx_fence, 800*1000*1000);
+	sctx->b.ws->fence_wait(sctx->b.ws, sctx->last_gfx_fence, 800*1000*1000);

 	if (!si_vm_fault_occured(sctx, &addr))
 		return;
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -138,6 +138,22 @@ static void si_release_sampler_views(struct si_sampler_views *views)
 	si_release_descriptors(&views->desc);
 }

+static void si_sampler_view_add_buffers(struct si_context *sctx,
+					struct si_sampler_view *rview)
+{
+	if (rview->resource) {
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+			rview->resource, RADEON_USAGE_READ,
+			r600_get_sampler_view_priority(rview->resource));
+	}
+
+	if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) {
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+			rview->dcc_buffer, RADEON_USAGE_READ,
+			RADEON_PRIO_DCC);
+	}
+}
+
 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 					  struct si_sampler_views *views)
 {
@@ -149,12 +165,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 		struct si_sampler_view *rview =
 			(struct si_sampler_view*)views->views[i];

-		if (!rview->resource)
-			continue;
-
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				      rview->resource, RADEON_USAGE_READ,
-				      r600_get_sampler_view_priority(rview->resource));
+		si_sampler_view_add_buffers(sctx, rview);
 	}

 	if (!views->desc.buffer)
@@ -176,15 +187,7 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 		struct si_sampler_view *rview =
 			(struct si_sampler_view*)view;

-		if (rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				rview->resource, RADEON_USAGE_READ,
-				r600_get_sampler_view_priority(rview->resource));
-
-		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				rview->dcc_buffer, RADEON_USAGE_READ,
-				RADEON_PRIO_DCC);
+		si_sampler_view_add_buffers(sctx, rview);

 		pipe_sampler_view_reference(&views->views[slot], view);
 		memcpy(views->desc.list + slot*8, view_desc, 8*4);
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -594,6 +594,14 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 			    lp_build_const_int32(gallivm, swizzle));

 	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+				       lp_build_const_int32(gallivm, swizzle + 1));
+		value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
+	}
+
 	return LLVMBuildBitCast(gallivm->builder, value,
 				tgsi2llvmtype(bld_base, type), "");
 }
@@ -733,6 +741,7 @@ static LLVMValueRef fetch_input_gs(
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
 	unsigned param;
+	LLVMValueRef value;

 	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
 		return get_primitive_id(bld_base, swizzle);
@@ -774,11 +783,22 @@ static LLVMValueRef fetch_input_gs(
 	args[7] = uint->zero; /* SLC */
 	args[8] = uint->zero; /* TFE */

+	value = lp_build_intrinsic(gallivm->builder,
+				   "llvm.SI.buffer.load.dword.i32.i32",
+				   i32, args, 9,
+				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
+		value2 = lp_build_intrinsic(gallivm->builder,
+					    "llvm.SI.buffer.load.dword.i32.i32",
+					    i32, args, 9,
+					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+		return radeon_llvm_emit_fetch_double(bld_base,
+						     value, value2);
+	}
 	return LLVMBuildBitCast(gallivm->builder,
-				lp_build_intrinsic(gallivm->builder,
-						"llvm.SI.buffer.load.dword.i32.i32",
-						i32, args, 9,
-						LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
+				value,
 				tgsi2llvmtype(bld_base, type), "");
 }

@@ -2271,6 +2291,9 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 		last_args[6]= uint->zero;
 		last_args[7]= uint->zero;
 		last_args[8]= uint->zero;
+
+		if (info->uses_kill)
+			si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
 	}

 	/* Specify whether the EXEC mask represents the valid mask */
@@ -3745,12 +3768,14 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
 			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 			shader->float_mode =  G_00B028_FLOAT_MODE(value);
+			shader->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
 			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
 			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
+			shader->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -290,8 +290,8 @@ struct si_shader {
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */

-	unsigned		ls_rsrc1;
-	unsigned		ls_rsrc2;
+	unsigned		rsrc1;
+	unsigned		rsrc2;
 };

 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;

 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->ls_rsrc2;
+	ls_rsrc2 = ls->current->rsrc2;

 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->ls_rsrc1);
+	radeon_emit(cs, ls->current->rsrc1);
 	radeon_emit(cs, ls_rsrc2);

 	/* Compute userdata SGPRs. */
@@ -216,6 +216,18 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
 }

+static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
+{
+	switch (info->mode) {
+	case PIPE_PRIM_PATCHES:
+		return info->count / info->vertices_per_patch;
+	case R600_PRIM_RECTANGLE_LIST:
+		return info->count / 3;
+	default:
+		return u_prims_for_vertices(info->mode, info->count);
+	}
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 					  const struct pipe_draw_info *info,
 					  unsigned num_patches)
@@ -320,7 +332,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
 	    (info->indirect ||
 	     (info->instance_count > 1 &&
-	      u_prims_for_vertices(info->mode, info->count) <= 1)))
+	      si_num_prims_for_vertices(info) <= 1)))
 		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;

 	return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
@@ -872,7 +884,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)

 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-	if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
+	if ((sctx->b.family == CHIP_HAWAII ||
+	     sctx->b.family == CHIP_TONGA ||
+	     sctx->b.family == CHIP_FIJI) &&
 	    (sctx->b.streamout.streamout_enabled ||
 	     sctx->b.streamout.prims_gen_query_enabled)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -121,11 +121,11 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);

-	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
 			   S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
-	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
 }

@@ -212,13 +212,37 @@ static void si_shader_es(struct si_shader *shader)
 		si_set_tesseval_regs(shader, pm4);
 }

+/**
+ * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
+ * geometry shader.
+ */
+static uint32_t si_vgt_gs_mode(struct si_shader *shader)
+{
+	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
+	unsigned cut_mode;
+
+	if (gs_max_vert_out <= 128) {
+		cut_mode = V_028A40_GS_CUT_128;
+	} else if (gs_max_vert_out <= 256) {
+		cut_mode = V_028A40_GS_CUT_256;
+	} else if (gs_max_vert_out <= 512) {
+		cut_mode = V_028A40_GS_CUT_512;
+	} else {
+		assert(gs_max_vert_out <= 1024);
+		cut_mode = V_028A40_GS_CUT_1024;
+	}
+
+	return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
+	       S_028A40_CUT_MODE(cut_mode)|
+	       S_028A40_ES_WRITE_OPTIMIZE(1) |
+	       S_028A40_GS_WRITE_OPTIMIZE(1);
+}
+
 static void si_shader_gs(struct si_shader *shader)
 {
 	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
-	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
 	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
-	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
@@ -232,22 +256,7 @@ static void si_shader_gs(struct si_shader *shader)
 	if (pm4 == NULL)
 		return;

-	if (gs_max_vert_out <= 128) {
-		cut_mode = V_028A40_GS_CUT_128;
-	} else if (gs_max_vert_out <= 256) {
-		cut_mode = V_028A40_GS_CUT_256;
-	} else if (gs_max_vert_out <= 512) {
-		cut_mode = V_028A40_GS_CUT_512;
-	} else {
-		assert(gs_max_vert_out <= 1024);
-		cut_mode = V_028A40_GS_CUT_1024;
-	}
-
-	si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
-		       S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
-		       S_028A40_CUT_MODE(cut_mode)|
-		       S_028A40_ES_WRITE_OPTIMIZE(1) |
-		       S_028A40_GS_WRITE_OPTIMIZE(1));
+	si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(shader));

 	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
 	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
@@ -255,7 +264,7 @@ static void si_shader_gs(struct si_shader *shader)

 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));

-	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
+	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices);

 	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
 	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
@@ -289,7 +298,14 @@ static void si_shader_gs(struct si_shader *shader)
 		       S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }

-static void si_shader_vs(struct si_shader *shader)
+/**
+ * Compute the state for \p shader, which will run as a vertex shader on the
+ * hardware.
+ *
+ * If \p gs is non-NULL, it points to the geometry shader for which this shader
+ * is the copy shader.
+ */
+static void si_shader_vs(struct si_shader *shader, struct si_shader *gs)
 {
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
@@ -304,15 +320,21 @@ static void si_shader_vs(struct si_shader *shader)
 	if (pm4 == NULL)
 		return;

-	/* If this is the GS copy shader, the GS state writes this register.
-	 * Otherwise, the VS state writes it.
+	/* We always write VGT_GS_MODE in the VS state, because every switch
+	 * between different shader pipelines involving a different GS or no
+	 * GS at all involves a switch of the VS (different GS use different
+	 * copy shaders). On the other hand, when the API switches from a GS to
+	 * no GS and then back to the same GS used originally, the GS state is
+	 * not sent again.
 	 */
-	if (!shader->is_gs_copy_shader) {
+	if (!gs) {
 		si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
 			       S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0));
 		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id);
-	} else
+	} else {
+		si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(gs));
 		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0);
+	}

 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
@@ -473,7 +495,7 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 		else if (shader->key.vs.as_es)
 			si_shader_es(shader);
 		else
-			si_shader_vs(shader);
+			si_shader_vs(shader, NULL);
 		break;
 	case PIPE_SHADER_TESS_CTRL:
 		si_shader_hs(shader);
@@ -482,11 +504,11 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 		if (shader->key.tes.as_es)
 			si_shader_es(shader);
 		else
-			si_shader_vs(shader);
+			si_shader_vs(shader, NULL);
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		si_shader_gs(shader);
-		si_shader_vs(shader->gs_copy_shader);
+		si_shader_vs(shader->gs_copy_shader, shader);
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		si_shader_ps(shader);
@@ -1281,6 +1303,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 		si_get_max_scratch_bytes_per_wave(sctx);
 	unsigned scratch_needed_size = scratch_bytes_per_wave *
 		sctx->scratch_waves;
+	unsigned spi_tmpring_size;
 	int r;

 	if (scratch_needed_size > 0) {
@@ -1350,8 +1373,12 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 	assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
 		"scratch size should already be aligned correctly.");

-	sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
-				S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
+	spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
+			   S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
+	if (spi_tmpring_size != sctx->spi_tmpring_size) {
+		sctx->spi_tmpring_size = spi_tmpring_size;
+		sctx->emit_scratch_reloc = true;
+	}
 	return true;
 }

--- a/src/gallium/drivers/vc4/Automake.inc
+++ b/src/gallium/drivers/vc4/Automake.inc
@@ -6,8 +6,4 @@ TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/winsys/vc4/drm/libvc4drm.la \
 	$(top_builddir)/src/gallium/drivers/vc4/libvc4.la

-if USE_VC4_SIMULATOR
-TARGET_CPPFLAGS += -DUSE_VC4_SIMULATOR
-endif
-
 endif
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -23,7 +23,6 @@ include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc

 if USE_VC4_SIMULATOR
-SIM_CFLAGS = -DUSE_VC4_SIMULATOR=1
 SIM_LDFLAGS = -lsimpenrose
 endif

--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -21,6 +21,7 @@ C_SOURCES := \
 	vc4_job.c \
 	vc4_nir_lower_blend.c \
 	vc4_nir_lower_io.c \
+	vc4_nir_lower_txf_ms.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
 	vc4_opt_copy_propagation.c \
--- a/src/gallium/drivers/vc4/kernel/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -121,6 +121,11 @@ enum vc4_packet {
 #define VC4_PACKET_TILE_COORDINATES_SIZE				3
 #define VC4_PACKET_GEM_HANDLES_SIZE					9

+/* Number of multisamples supported. */
+#define VC4_MAX_SAMPLES							4
+/* Size of a full resolution color or Z tile buffer load/store. */
+#define VC4_TILE_BUFFER_SIZE			(64 * 64 * 4)
+
 #define VC4_MASK(high, low) (((1 << ((high) - (low) + 1)) - 1) << (low))
 /* Using the GNU statement expression extension */
 #define VC4_SET_FIELD(value, field)                                       \
@@ -151,6 +156,16 @@ enum vc4_packet {
 #define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              (1 << 1)
 #define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           (1 << 0)

+/** @{
+ *
+ * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and
+ * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER.
+ */
+#define VC4_LOADSTORE_FULL_RES_EOF                     (1 << 3)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL       (1 << 2)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              (1 << 1)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           (1 << 0)
+
 /** @{
 *
 * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -36,9 +36,11 @@

 struct vc4_rcl_setup {
 	struct drm_gem_cma_object *color_read;
-	struct drm_gem_cma_object *color_ms_write;
+	struct drm_gem_cma_object *color_write;
 	struct drm_gem_cma_object *zs_read;
 	struct drm_gem_cma_object *zs_write;
+	struct drm_gem_cma_object *msaa_color_write;
+	struct drm_gem_cma_object *msaa_zs_write;

 	struct drm_gem_cma_object *rcl;
 	u32 next_offset;
@@ -62,7 +64,6 @@ static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val)
 	setup->next_offset += 4;
 }

-
 /*
 * Emits a no-op STORE_TILE_BUFFER_GENERAL.
 *
@@ -81,6 +82,22 @@ static void vc4_store_before_load(struct vc4_rcl_setup *setup)
 	rcl_u32(setup, 0); /* no address, since we're in None mode */
 }

+/*
+ * Calculates the physical address of the start of a tile in a RCL surface.
+ *
+ * Unlike the other load/store packets,
+ * VC4_PACKET_LOAD/STORE_FULL_RES_TILE_BUFFER don't look at the tile
+ * coordinates packet, and instead just store to the address given.
+ */
+static uint32_t vc4_full_res_offset(struct vc4_exec_info *exec,
+				    struct drm_gem_cma_object *bo,
+				    struct drm_vc4_submit_rcl_surface *surf,
+				    uint8_t x, uint8_t y)
+{
+	return bo->paddr + surf->offset + VC4_TILE_BUFFER_SIZE *
+		(DIV_ROUND_UP(exec->args->width, 32) * y + x);
+}
+
 /*
 * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
 *
@@ -108,22 +125,41 @@ static void emit_tile(struct vc4_exec_info *exec,
 	 * may be outstanding at a time.
 	 */
 	if (setup->color_read) {
-		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, args->color_read.bits);
-		rcl_u32(setup,
-			setup->color_read->paddr + args->color_read.offset);
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->color_read,
+						    &args->color_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_ZS);
+		} else {
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->color_read.bits);
+			rcl_u32(setup, setup->color_read->paddr +
+				args->color_read.offset);
+		}
 	}

 	if (setup->zs_read) {
-		if (setup->color_read) {
-			/* Exec previous load. */
-			vc4_tile_coordinates(setup, x, y);
-			vc4_store_before_load(setup);
-		}
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->zs_read,
+						    &args->zs_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_COLOR);
+		} else {
+			if (setup->color_read) {
+				/* Exec previous load. */
+				vc4_tile_coordinates(setup, x, y);
+				vc4_store_before_load(setup);
+			}

-		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, args->zs_read.bits);
-		rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset);
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->zs_read.bits);
+			rcl_u32(setup, setup->zs_read->paddr +
+				args->zs_read.offset);
+		}
 	}

 	/* Clipping depends on tile coordinates having been
@@ -144,20 +180,60 @@ static void emit_tile(struct vc4_exec_info *exec,
 				(y * exec->bin_tiles_x + x) * 32));
 	}

+	if (setup->msaa_color_write) {
+		bool last_tile_write = (!setup->msaa_zs_write &&
+					!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_ZS;
+
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_color_write,
+					    &args->msaa_color_write, x, y) |
+			bits);
+	}
+
+	if (setup->msaa_zs_write) {
+		bool last_tile_write = (!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_COLOR;
+
+		if (setup->msaa_color_write)
+			vc4_tile_coordinates(setup, x, y);
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_zs_write,
+					    &args->msaa_zs_write, x, y) |
+			bits);
+	}
+
 	if (setup->zs_write) {
+		bool last_tile_write = !setup->color_write;
+
+		if (setup->msaa_color_write || setup->msaa_zs_write)
+			vc4_tile_coordinates(setup, x, y);
+
 		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
 		rcl_u16(setup, args->zs_write.bits |
-			(setup->color_ms_write ?
-			 VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0));
+			(last_tile_write ?
+			 0 : VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR));
 		rcl_u32(setup,
 			(setup->zs_write->paddr + args->zs_write.offset) |
-			((last && !setup->color_ms_write) ?
+			((last && last_tile_write) ?
 			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
 	}

-	if (setup->color_ms_write) {
-		if (setup->zs_write) {
-			/* Reset after previous store */
+	if (setup->color_write) {
+		if (setup->msaa_color_write || setup->msaa_zs_write ||
+		    setup->zs_write) {
 			vc4_tile_coordinates(setup, x, y);
 		}

@@ -192,14 +268,26 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	}

 	if (setup->color_read) {
-		loop_body_size += (VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE);
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		}
 	}
 	if (setup->zs_read) {
-		if (setup->color_read) {
-			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
-			loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			if (setup->color_read &&
+			    !(args->color_read.flags &
+			      VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES)) {
+				loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+				loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+			}
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
 		}
-		loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
 	}

 	if (has_bin) {
@@ -207,13 +295,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 		loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE;
 	}

+	if (setup->msaa_color_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+	if (setup->msaa_zs_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+
 	if (setup->zs_write)
 		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
-	if (setup->color_ms_write) {
-		if (setup->zs_write)
-			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+	if (setup->color_write)
 		loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE;
-	}
+
+	/* We need a VC4_PACKET_TILE_COORDINATES in between each store. */
+	loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE *
+		((setup->msaa_color_write != NULL) +
+		 (setup->msaa_zs_write != NULL) +
+		 (setup->color_write != NULL) +
+		 (setup->zs_write != NULL) - 1);
+
 	size += xtiles * ytiles * loop_body_size;

 	setup->rcl = drm_gem_cma_create(dev, size);
@@ -224,13 +322,12 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,

 	rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
 	rcl_u32(setup,
-		(setup->color_ms_write ?
-		 (setup->color_ms_write->paddr +
-		  args->color_ms_write.offset) :
+		(setup->color_write ? (setup->color_write->paddr +
+				       args->color_write.offset) :
 		 0));
 	rcl_u16(setup, args->width);
 	rcl_u16(setup, args->height);
-	rcl_u16(setup, args->color_ms_write.bits);
+	rcl_u16(setup, args->color_write.bits);

 	/* The tile buffer gets cleared when the previous tile is stored.  If
 	 * the clear values changed between frames, then the tile buffer has
@@ -255,6 +352,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 		for (x = min_x_tile; x <= max_x_tile; x++) {
 			bool first = (x == min_x_tile && y == min_y_tile);
 			bool last = (x == max_x_tile && y == max_y_tile);
+
 			emit_tile(exec, setup, x, y, first, last);
 		}
 	}
@@ -266,6 +364,56 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	return 0;
 }

+static int vc4_full_res_bounds_check(struct vc4_exec_info *exec,
+				     struct drm_gem_cma_object *obj,
+				     struct drm_vc4_submit_rcl_surface *surf)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	u32 render_tiles_stride = DIV_ROUND_UP(exec->args->width, 32);
+
+	if (surf->offset > obj->base.size) {
+		DRM_ERROR("surface offset %d > BO size %zd\n",
+			  surf->offset, obj->base.size);
+		return -EINVAL;
+	}
+
+	if ((obj->base.size - surf->offset) / VC4_TILE_BUFFER_SIZE <
+	    render_tiles_stride * args->max_y_tile + args->max_x_tile) {
+		DRM_ERROR("MSAA tile %d, %d out of bounds "
+			  "(bo size %zd, offset %d).\n",
+			  args->max_x_tile, args->max_y_tile,
+			  obj->base.size,
+			  surf->offset);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int vc4_rcl_msaa_surface_setup(struct vc4_exec_info *exec,
+				      struct drm_gem_cma_object **obj,
+				      struct drm_vc4_submit_rcl_surface *surf)
+{
+	if (surf->flags != 0 || surf->bits != 0) {
+		DRM_ERROR("MSAA surface had nonzero flags/bits\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	*obj = vc4_use_bo(exec, surf->hindex);
+	if (!*obj)
+		return -EINVAL;
+
+	if (surf->offset & 0xf) {
+		DRM_ERROR("MSAA write must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	return vc4_full_res_bounds_check(exec, *obj, surf);
+}
+
 static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 				 struct drm_gem_cma_object **obj,
 				 struct drm_vc4_submit_rcl_surface *surf)
@@ -277,9 +425,10 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	uint8_t format = VC4_GET_FIELD(surf->bits,
 				       VC4_LOADSTORE_TILE_BUFFER_FORMAT);
 	int cpp;
+	int ret;

-	if (surf->pad != 0) {
-		DRM_ERROR("Padding unset\n");
+	if (surf->flags & ~VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		DRM_ERROR("Extra flags set\n");
 		return -EINVAL;
 	}

@@ -290,6 +439,25 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	if (!*obj)
 		return -EINVAL;

+	if (surf->flags & VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		if (surf == &exec->args->zs_write) {
+			DRM_ERROR("general zs write may not be a full-res.\n");
+			return -EINVAL;
+		}
+
+		if (surf->bits != 0) {
+			DRM_ERROR("load/store general bits set with "
+				  "full res load/store.\n");
+			return -EINVAL;
+		}
+
+		ret = vc4_full_res_bounds_check(exec, *obj, surf);
+		if (!ret)
+			return ret;
+
+		return 0;
+	}
+
 	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
 			   VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK |
 			   VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) {
@@ -341,9 +509,10 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 }

 static int
-vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
-			 struct drm_gem_cma_object **obj,
-			 struct drm_vc4_submit_rcl_surface *surf)
+vc4_rcl_render_config_surface_setup(struct vc4_exec_info *exec,
+				    struct vc4_rcl_setup *setup,
+				    struct drm_gem_cma_object **obj,
+				    struct drm_vc4_submit_rcl_surface *surf)
 {
 	uint8_t tiling = VC4_GET_FIELD(surf->bits,
 				       VC4_RENDER_CONFIG_MEMORY_FORMAT);
@@ -351,13 +520,15 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
 				       VC4_RENDER_CONFIG_FORMAT);
 	int cpp;

-	if (surf->pad != 0) {
-		DRM_ERROR("Padding unset\n");
+	if (surf->flags != 0) {
+		DRM_ERROR("No flags supported on render config.\n");
 		return -EINVAL;
 	}

 	if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK |
-			   VC4_RENDER_CONFIG_FORMAT_MASK)) {
+			   VC4_RENDER_CONFIG_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_MS_MODE_4X |
+			   VC4_RENDER_CONFIG_DECIMATE_MODE_4X)) {
 		DRM_ERROR("Unknown bits in render config: 0x%04x\n",
 			  surf->bits);
 		return -EINVAL;
@@ -414,18 +585,20 @@ int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
 	if (has_bin &&
 	    (args->max_x_tile > exec->bin_tiles_x ||
 	     args->max_y_tile > exec->bin_tiles_y)) {
-		DRM_ERROR("Render tiles (%d,%d) outside of bin config (%d,%d)\n",
+		DRM_ERROR("Render tiles (%d,%d) outside of bin config "
+			  "(%d,%d)\n",
 			  args->max_x_tile, args->max_y_tile,
 			  exec->bin_tiles_x, exec->bin_tiles_y);
 		return -EINVAL;
 	}

-	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
+	ret = vc4_rcl_render_config_surface_setup(exec, &setup,
+						  &setup.color_write,
+						  &args->color_write);
 	if (ret)
 		return ret;

-	ret = vc4_rcl_ms_surface_setup(exec, &setup.color_ms_write,
-				       &args->color_ms_write);
+	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
 	if (ret)
 		return ret;

@@ -437,10 +610,21 @@ int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
 	if (ret)
 		return ret;

+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_color_write,
+					 &args->msaa_color_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_zs_write,
+					 &args->msaa_zs_write);
+	if (ret)
+		return ret;
+
 	/* We shouldn't even have the job submitted to us if there's no
 	 * surface to write out.
 	 */
-	if (!setup.color_ms_write && !setup.zs_write) {
+	if (!setup.color_write && !setup.zs_write &&
+	    !setup.msaa_color_write && !setup.msaa_zs_write) {
 		DRM_ERROR("RCL requires color or Z/S write\n");
 		return -EINVAL;
 	}
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -47,7 +47,6 @@
 	void *validated,				\
 	void *untrusted

-
 /** Return the width in pixels of a 64-byte microtile. */
 static uint32_t
 utile_width(int cpp)
@@ -191,7 +190,7 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,

 	if (size + offset < size ||
 	    size + offset > fbo->base.size) {
-		DRM_ERROR("Overflow in %dx%d (%dx%d) fbo size (%d + %d > %d)\n",
+		DRM_ERROR("Overflow in %dx%d (%dx%d) fbo size (%d + %d > %zd)\n",
 			  width, height,
 			  aligned_width, aligned_height,
 			  size, offset, fbo->base.size);
@@ -201,7 +200,6 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
 	return true;
 }

-
 static int
 validate_flush(VALIDATE_ARGS)
 {
@@ -270,7 +268,7 @@ validate_indexed_prim_list(VALIDATE_ARGS)

 	if (offset > ib->base.size ||
 	    (ib->base.size - offset) / index_size < length) {
-		DRM_ERROR("IB access overflow (%d + %d*%d > %d)\n",
+		DRM_ERROR("IB access overflow (%d + %d*%d > %zd)\n",
 			  offset, length, index_size, ib->base.size);
 		return -EINVAL;
 	}
@@ -361,9 +359,8 @@ validate_tile_binning_config(VALIDATE_ARGS)
 	}

 	if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
-		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT |
-		     VC4_BIN_CONFIG_MS_MODE_4X)) {
-		DRM_ERROR("unsupported bining config flags 0x%02x\n", flags);
+		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT)) {
+		DRM_ERROR("unsupported binning config flags 0x%02x\n", flags);
 		return -EINVAL;
 	}

@@ -424,8 +421,8 @@ validate_gem_handles(VALIDATE_ARGS)
 	return 0;
 }

-#define VC4_DEFINE_PACKET(packet, name, func) \
-	[packet] = { packet ## _SIZE, name, func }
+#define VC4_DEFINE_PACKET(packet, func) \
+	[packet] = { packet ## _SIZE, #packet, func }

 static const struct cmd_info {
 	uint16_t len;
@@ -433,42 +430,42 @@ static const struct cmd_info {
 	int (*func)(struct vc4_exec_info *exec, void *validated,
 		    void *untrusted);
 } cmd_info[] = {
-	VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning),
-	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore),
+	VC4_DEFINE_PACKET(VC4_PACKET_HALT, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_NOP, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, validate_flush),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING,
+			  validate_start_tile_binning),
+	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE,
+			  validate_increment_semaphore),

-	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, "Indexed Primitive List", validate_indexed_prim_list),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE,
+			  validate_indexed_prim_list),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE,
+			  validate_gl_array_primitive),

-	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, "Vertex Array Primitives", validate_gl_array_primitive),
+	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, NULL),

-	/* This is only used by clipped primitives (packets 48 and 49), which
-	 * we don't support parsing yet.
-	 */
-	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, validate_gl_shader_state),

-	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state),
-	/* We don't support validating NV shader states. */
-
-	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, "point size", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, "line width", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, "RHT X boundary", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, "Depth Offset", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, "Clip Window", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, "Viewport Offset", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, "Clipper XY Scaling", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, NULL),
 	/* Note: The docs say this was also 105, but it was 106 in the
 	 * initial userland code drop.
 	 */
-	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, "Clipper Z Scale and Offset", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, NULL),

-	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, "tile binning configuration", validate_tile_binning_config),
+	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG,
+			  validate_tile_binning_config),

-	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, "GEM handles", validate_gem_handles),
+	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, validate_gem_handles),
 };

 int
@@ -500,11 +497,6 @@ vc4_validate_bin_cl(struct drm_device *dev,
 			return -EINVAL;
 		}

-#if 0
-		DRM_INFO("0x%08x: packet %d (%s) size %d processing...\n",
-			 src_offset, cmd, info->name, info->len);
-#endif
-
 		if (src_offset + info->len > len) {
 			DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x "
 				  "exceeds bounds (0x%08x)\n",
@@ -519,8 +511,7 @@ vc4_validate_bin_cl(struct drm_device *dev,
 		if (info->func && info->func(exec,
 					     dst_pkt + 1,
 					     src_pkt + 1)) {
-			DRM_ERROR("0x%08x: packet %d (%s) failed to "
-				  "validate\n",
+			DRM_ERROR("0x%08x: packet %d (%s) failed to validate\n",
 				  src_offset, cmd, info->name);
 			return -EINVAL;
 		}
@@ -588,12 +579,14 @@ reloc_tex(struct vc4_exec_info *exec,

 	if (sample->is_direct) {
 		uint32_t remaining_size = tex->base.size - p0;
+
 		if (p0 > tex->base.size - 4) {
 			DRM_ERROR("UBO offset greater than UBO size\n");
 			goto fail;
 		}
 		if (p1 > remaining_size - 4) {
-			DRM_ERROR("UBO clamp would allow reads outside of UBO\n");
+			DRM_ERROR("UBO clamp would allow reads "
+				  "outside of UBO\n");
 			goto fail;
 		}
 		*validated_p0 = tex->paddr + p0;
@@ -866,7 +859,7 @@ validate_gl_shader_rec(struct drm_device *dev,

 		if (vbo->base.size < offset ||
 		    vbo->base.size - offset < attr_size) {
-			DRM_ERROR("BO offset overflow (%d + %d > %d)\n",
+			DRM_ERROR("BO offset overflow (%d + %d > %zd)\n",
 				  offset, attr_size, vbo->base.size);
 			return -EINVAL;
 		}
@@ -875,7 +868,8 @@ validate_gl_shader_rec(struct drm_device *dev,
 			max_index = ((vbo->base.size - offset - attr_size) /
 				     stride);
 			if (state->max_index > max_index) {
-				DRM_ERROR("primitives use index %d out of supplied %d\n",
+				DRM_ERROR("primitives use index %d out of "
+					  "supplied %d\n",
 					  state->max_index, max_index);
 				return -EINVAL;
 			}
--- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -24,24 +24,16 @@
 /**
 * DOC: Shader validator for VC4.
 *
- * The VC4 has no IOMMU between it and system memory.  So, a user with access
- * to execute shaders could escalate privilege by overwriting system memory
- * (using the VPM write address register in the general-purpose DMA mode) or
- * reading system memory it shouldn't (reading it as a texture, or uniform
- * data, or vertex data).
+ * The VC4 has no IOMMU between it and system memory, so a user with
+ * access to execute shaders could escalate privilege by overwriting
+ * system memory (using the VPM write address register in the
+ * general-purpose DMA mode) or reading system memory it shouldn't
+ * (reading it as a texture, or uniform data, or vertex data).
 *
- * This walks over a shader starting from some offset within a BO, ensuring
- * that its accesses are appropriately bounded, and recording how many texture
- * accesses are made and where so that we can do relocations for them in the
+ * This walks over a shader BO, ensuring that its accesses are
+ * appropriately bounded, and recording how many texture accesses are
+ * made and where so that we can do relocations for them in the
 * uniform stream.
- *
- * The kernel API has shaders stored in user-mapped BOs.  The BOs will be
- * forcibly unmapped from the process before validation, and any cache of
- * validated state will be flushed if the mapping is faulted back in.
- *
- * Storing the shaders in BOs means that the validation process will be slow
- * due to uncached reads, but since shaders are long-lived and shader BOs are
- * never actually modified, this shouldn't be a problem.
 */

 #include "vc4_drv.h"
@@ -71,7 +63,6 @@ waddr_to_live_reg_index(uint32_t waddr, bool is_b)
 		else
 			return waddr;
 	} else if (waddr <= QPU_W_ACC3) {
-
 		return 64 + waddr - QPU_W_ACC0;
 	} else {
 		return ~0;
@@ -86,15 +77,14 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

-	if (add_a == QPU_MUX_A) {
+	if (add_a == QPU_MUX_A)
 		return raddr_a;
-	} else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM) {
+	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
 		return 32 + raddr_b;
-	} else if (add_a <= QPU_MUX_R3) {
+	else if (add_a <= QPU_MUX_R3)
 		return 64 + add_a;
-	} else {
+	else
 		return ~0;
-	}
 }

 static bool
@@ -112,9 +102,9 @@ is_tmu_write(uint32_t waddr)
 }

 static bool
-record_validated_texture_sample(struct vc4_validated_shader_info *validated_shader,
-				struct vc4_shader_validation_state *validation_state,
-				int tmu)
+record_texture_sample(struct vc4_validated_shader_info *validated_shader,
+		      struct vc4_shader_validation_state *validation_state,
+		      int tmu)
 {
 	uint32_t s = validated_shader->num_texture_samples;
 	int i;
@@ -227,8 +217,8 @@ check_tmu_write(uint64_t inst,
 		validated_shader->uniforms_size += 4;

 	if (submit) {
-		if (!record_validated_texture_sample(validated_shader,
-						     validation_state, tmu)) {
+		if (!record_texture_sample(validated_shader,
+					   validation_state, tmu)) {
 			return false;
 		}

@@ -239,10 +229,10 @@ check_tmu_write(uint64_t inst,
 }

 static bool
-check_register_write(uint64_t inst,
-		     struct vc4_validated_shader_info *validated_shader,
-		     struct vc4_shader_validation_state *validation_state,
-		     bool is_mul)
+check_reg_write(uint64_t inst,
+		struct vc4_validated_shader_info *validated_shader,
+		struct vc4_shader_validation_state *validation_state,
+		bool is_mul)
 {
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
@@ -298,7 +288,7 @@ check_register_write(uint64_t inst,
 		return true;

 	case QPU_W_TLB_STENCIL_SETUP:
-                return true;
+		return true;
 	}

 	return true;
@@ -361,7 +351,7 @@ track_live_clamps(uint64_t inst,
 		}

 		validation_state->live_max_clamp_regs[lri_add] = true;
-	} if (op_add == QPU_A_MIN) {
+	} else if (op_add == QPU_A_MIN) {
 		/* Track live clamps of a value clamped to a minimum of 0 and
 		 * a maximum of some uniform's offset.
 		 */
@@ -393,8 +383,10 @@ check_instruction_writes(uint64_t inst,
 		return false;
 	}

-	ok = (check_register_write(inst, validated_shader, validation_state, false) &&
-	      check_register_write(inst, validated_shader, validation_state, true));
+	ok = (check_reg_write(inst, validated_shader, validation_state,
+			      false) &&
+	      check_reg_write(inst, validated_shader, validation_state,
+			      true));

 	track_live_clamps(inst, validated_shader, validation_state);

@@ -442,7 +434,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	shader = shader_obj->vaddr;
 	max_ip = shader_obj->base.size / sizeof(uint64_t);

-	validated_shader = kcalloc(sizeof(*validated_shader), 1, GFP_KERNEL);
+	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
 	if (!validated_shader)
 		return NULL;

@@ -498,7 +490,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)

 	if (ip == max_ip) {
 		DRM_ERROR("shader failed to terminate before "
-			  "shader BO end at %d\n",
+			  "shader BO end at %zd\n",
 			  shader_obj->base.size);
 		goto fail;
 	}
@@ -514,6 +506,9 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	return validated_shader;

 fail:
-	kfree(validated_shader);
+	if (validated_shader) {
+		kfree(validated_shader->texture_samples);
+		kfree(validated_shader);
+	}
 	return NULL;
 }
--- a/Show More
+++ b/Show More