docs: add sha256 checksums for 11.0.9

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
docs: add release notes for 11.0.9
2016-01-22 15:40:17 +00:00 · 2016-01-22 14:51:19 +00:00 · 2016-01-22 14:47:40 +00:00 · 2016-01-22 14:46:39 +00:00 · 2016-01-22 14:46:28 +00:00 · 2016-01-22 14:46:14 +00:00
206 changed files with 5083 additions and 908 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -32,6 +32,7 @@ AM_DISTCHECK_CONFIGURE_FLAGS = \
 	--enable-vdpau \
 	--enable-xa \
 	--enable-xvmc \
+	--disable-llvm-shared-libs \
 	--with-egl-platforms=x11,wayland,drm \
 	--with-dri-drivers=i915,i965,nouveau,radeon,r200,swrast \
 	--with-gallium-drivers=i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-11.0.3
+11.0.9
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,14 @@
+# The commit base differs greatly between 11.0 and master
+2832ca95ecce064c7d841a3a374c2179f56161be glsl: fix stream qualifier for blocks with an instance name
+
+# Somewhat of a mixed feature/bugfix patch, causing some 200 piglit regressions
+2b676570960277d47477822ffeccc672613f9142 gallium/swrast: fix front buffer blitting. (v2)
+
+# causes regression in xwayland, kde/plasma, mpv, steam ... fdo#92759
+839793680f99b8387bee9489733d5071c10f3ace i965: Use MESA_FORMAT_B8G8R8X8_SRGB for RGB visuals
+
+# already picked as commit 94ac4b3e84737b8c5faa371834670fd25502e024
+b5b87c4ed1dfd58aec8905e0514c9ba92ba83e1d r600g: write all MRTs only if there is exactly one output (fixes a hang)
+
+# patch not applicable on branch (null check already exists)
+f7b71451231c75c36771e8b7b0d78f05e0d50f65 glx/dri3: a drawable might not be bound at wait time
--- a/bin/get-extra-pick-list.sh
+++ b/bin/get-extra-pick-list.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+# Script for generating a list of candidates which fix commits that have been
+# previously cherry-picked to a stable branch.
+#
+# Usage examples:
+#
+# $ bin/get-extra-pick-list.sh
+# $ bin/get-extra-pick-list.sh > picklist
+# $ bin/get-extra-pick-list.sh | tee picklist
+
+# Use the last branchpoint as our limit for the search
+# XXX: there should be a better way for this
+latest_branchpoint=`git branch | grep \* | cut -c 3-`-branchpoint
+
+# Grep for commits with "cherry picked from commit" in the commit message.
+git log --reverse --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
+	grep "cherry picked from commit" |\
+	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' |\
+	cut -c -8 |\
+while read sha
+do
+	# Check if the original commit is referenced in master
+	git log -n1 --pretty=oneline --grep=$sha $latest_branchpoint..origin/master |\
+		cut -c -8 |\
+	while read candidate
+	do
+		# Check that the potential fix hasn't landed in the branch yet.
+		found=`git log -n1 --pretty=oneline --reverse --grep=$candidate $latest_branchpoint..HEAD |wc -l`
+		if test $found = 0
+		then
+			echo Commit $candidate might need to be picked, as it references $sha
+		fi
+	done
+done
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -14,7 +14,7 @@ git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked

 # Grep for commits that were marked as a candidate for the stable tree.
-git log --reverse --pretty=%H -i --grep='^\([[:space:]]*NOTE: .*[Cc]andidate\|CC:.*mesa-stable\)' HEAD..origin/master |\
+git log --reverse --pretty=%H -i --grep='^\([[:space:]]*NOTE: .*[Cc]andidate\|CC:.*11\.0.*mesa-stable\)' HEAD..origin/master |\
 while read sha
 do
 	# Check to see whether the patch is on the ignore list.
--- a/configure.ac
+++ b/configure.ac
@@ -97,7 +97,7 @@ AC_PROG_CXX
 AM_PROG_CC_C_O
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
-AC_CHECK_PROGS([PYTHON2], [python2 python])
+AC_CHECK_PROGS([PYTHON2], [python2.7 python2 python])
 AC_PROG_SED
 AC_PROG_MKDIR_P

@@ -106,6 +106,8 @@ AC_SYS_LARGEFILE
 LT_PREREQ([2.2])
 LT_INIT([disable-static])

+AC_CHECK_PROG(RM, rm, [rm -f])
+
 AX_PROG_BISON([],
              AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.c"],
                    [AC_MSG_ERROR([bison not found - unable to compile glcpp-parse.y])]))
@@ -373,10 +375,11 @@ save_CFLAGS="$CFLAGS"
 CFLAGS="$SSE41_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #include <smmintrin.h>
+int param;
 int main () {
-    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
    c = _mm_max_epu32(a, b);
-    return 0;
+    return _mm_cvtsi128_si32(c);
 }]])], SSE41_SUPPORTED=1)
 CFLAGS="$save_CFLAGS"
 if test "x$SSE41_SUPPORTED" = x1; then
@@ -1527,7 +1530,15 @@ AC_ARG_WITH([clang-libdir],
   [CLANG_LIBDIR=''])

 PKG_CHECK_EXISTS([libclc], [have_libclc=yes], [have_libclc=no])
-AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;ELF_LIB=-lelf])
+PKG_CHECK_MODULES([LIBELF], [libelf], [have_libelf=yes], [have_libelf=no])
+
+if test "x$have_libelf" = xno; then
+   LIBELF_LIBS=''
+   LIBELF_CFLAGS=''
+   AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;LIBELF_LIBS=-lelf], [have_libelf=no])
+   AC_SUBST([LIBELF_LIBS])
+   AC_SUBST([LIBELF_CFLAGS])
+fi

 if test "x$enable_opencl" = xyes; then
    if test -z "$with_gallium_drivers"; then
@@ -2125,8 +2136,6 @@ if test "x$USE_VC4_SIMULATOR" = xyes -a "x$HAVE_GALLIUM_ILO" = xyes; then
    AC_MSG_ERROR([VC4 simulator on x86 replaces i965 driver build, so ilo must be disabled.])
 fi

-AC_SUBST([ELF_LIB])
-
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
--- a/docs/relnotes/11.0.3.html
+++ b/docs/relnotes/11.0.3.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+c2210e3daecc10ed9fdcea500327652ed6effc2f47c4b9cee63fb08f560d7117  mesa-11.0.3.tar.gz
+ab2992eece21adc23c398720ef8c6933cb69ea42e1b2611dc09d031e17e033d6  mesa-11.0.3.tar.xz
 </pre>


--- a/docs/relnotes/11.0.4.html
+++ b/docs/relnotes/11.0.4.html
@@ -0,0 +1,168 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.4 Release Notes / October 24, 2015</h1>
+
+<p>
+Mesa 11.0.4 is a bug fix release which fixes bugs found since the 11.0.3 release.
+</p>
+<p>
+Mesa 11.0.4 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+ed412ca6a46d1bd055120e5c12806c15419ae8c4dd6d3f6ea20a83091d5c78bf  mesa-11.0.4.tar.gz
+40201bf7fc6fa12a6d9edfe870b41eb4dd6669154e3c42c48a96f70805f5483d  mesa-11.0.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alejandro Piñeiro (2):</p>
+<ul>
+  <li>i965/vec4: check writemask when bailing out at register coalesce</li>
+  <li>i965/vec4: fill src_reg type using the constructor type parameter</li>
+</ul>
+
+<p>Brian Paul (2):</p>
+<ul>
+  <li>vbo: fix incorrect switch statement in init_mat_currval()</li>
+  <li>mesa: fix incorrect opcode in save_BlendFunci()</li>
+</ul>
+
+<p>Chih-Wei Huang (3):</p>
+<ul>
+  <li>mesa: android: Fix the incorrect path of sse_minmax.c</li>
+  <li>nv50/ir: use C++11 standard std::unordered_map if possible</li>
+  <li>nv30: include the header of ffs prototype</li>
+</ul>
+
+<p>Chris Wilson (1):</p>
+<ul>
+  <li>i965: Remove early release of DRI2 miptree</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>mesa/uniforms: fix get_uniform for doubles (v2)</li>
+</ul>
+
+<p>Emil Velikov (1):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.3</li>
+</ul>
+
+<p>Francisco Jerez (5):</p>
+<ul>
+  <li>i965: Don't tell the hardware about our UAV access.</li>
+  <li>mesa: Expose function to calculate whether a shader image unit is valid.</li>
+  <li>mesa: Skip redundant texture completeness checking during image validation.</li>
+  <li>i965: Use _mesa_is_image_unit_valid() instead of gl_image_unit::_Valid.</li>
+  <li>mesa: Get rid of texture-dependent image unit derived state.</li>
+</ul>
+
+<p>Ian Romanick (8):</p>
+<ul>
+  <li>glsl: Allow built-in functions as constant expressions in OpenGL ES 1.00</li>
+  <li>ff_fragment_shader: Use binding to set the sampler unit</li>
+  <li>glsl/linker: Use constant_initializer instead of constant_value to initialize uniforms</li>
+  <li>glsl: Use constant_initializer instead of constant_value to determine whether to keep an unused uniform</li>
+  <li>glsl: Only set ir_variable::constant_value for const-decorated variables</li>
+  <li>glsl: Restrict initializers for global variables to constant expression in ES</li>
+  <li>glsl: Add method to determine whether an expression contains the sequence operator</li>
+  <li>glsl: In later GLSL versions, sequence operator is cannot be a constant expression</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nouveau: make sure there's always room to emit a fence</li>
+</ul>
+
+<p>Indrajit Das (1):</p>
+<ul>
+  <li>st/va: Used correct parameter to derive the value of the "h" variable in vlVaCreateImage</li>
+</ul>
+
+<p>Jonathan Gray (1):</p>
+<ul>
+  <li>configure.ac: ensure RM is set</li>
+</ul>
+
+<p>Krzysztof Sobiecki (1):</p>
+<ul>
+  <li>st/fbo: use pipe_surface_release instead of pipe_surface_reference</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>st/omx/dec/h264: fix field picture type 0 poc disorder</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>st/mesa: fix clip state dependencies</li>
+  <li>radeonsi: fix a GS copy shader leak</li>
+  <li>gallium: add PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>u_vbuf: fix vb slot assignment for translated buffers</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>freedreno/a3xx: cache-flush is needed after MEM_WRITE</li>
+</ul>
+
+<p>Tapani Pälli (3):</p>
+<ul>
+  <li>mesa: add GL_UNSIGNED_INT_24_8 to _mesa_pack_depth_span</li>
+  <li>mesa: Set api prefix to version string when overriding version</li>
+  <li>mesa: fix ARRAY_SIZE query for GetProgramResourceiv</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/11.0.5.html
+++ b/docs/relnotes/11.0.5.html
@@ -0,0 +1,174 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.5 Release Notes / November 11, 2015</h1>
+
+<p>
+Mesa 11.0.5 is a bug fix release which fixes bugs found since the 11.0.4 release.
+</p>
+<p>
+Mesa 11.0.5 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+8495ef5c06f7f726452462b7d408a5b40048373ff908f2283a3b4d1f49b45ee6  mesa-11.0.5.tar.gz
+9c255a2a6695fcc6ef4a279e1df0aeaf417dc142f39ee59dfb533d80494bb67a  mesa-11.0.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>radeon/uvd: don't expose HEVC on old UVD hw (v3)</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/skl: Add GT4 PCI IDs</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.4</li>
+  <li>cherry-ignore: ignore a possible wrong nomination</li>
+  <li>Revert "mesa/glformats: Undo code changes from _mesa_base_tex_format() move"</li>
+  <li>Update version to 11.0.5</li>
+</ul>
+
+<p>Emmanuel Gil Peyrot (1):</p>
+<ul>
+  <li>gbm.h: Add a missing stddef.h include for size_t.</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>vc4: When the create ioctl fails, free our cache and try again.</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>i965: Fix is-renderable check in intel_image_target_renderbuffer_storage</li>
+</ul>
+
+<p>Ilia Mirkin (3):</p>
+<ul>
+  <li>nvc0: respect edgeflag attribute width</li>
+  <li>nouveau: set MaxDrawBuffers to the same value as MaxColorAttachments</li>
+  <li>nouveau: relax fence emit space assert</li>
+</ul>
+
+<p>Ivan Kalvachev (1):</p>
+<ul>
+  <li>r600g: Fix special negative immediate constants when using ABS modifier.</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>nir/lower_vec_to_movs: Pass the shader around directly</li>
+  <li>nir: Report progress from lower_vec_to_movs().</li>
+</ul>
+
+<p>Jose Fonseca (2):</p>
+<ul>
+  <li>gallivm: Translate all util_cpu_caps bits to LLVM attributes.</li>
+  <li>gallivm: Explicitly disable unsupported CPU features.</li>
+</ul>
+
+<p>Julien Isorce (4):</p>
+<ul>
+  <li>st/va: pass picture desc to begin and decode</li>
+  <li>nvc0: fix crash when nv50_miptree_from_handle fails</li>
+  <li>st/va: do not destroy old buffer when new one failed</li>
+  <li>st/va: add more errors checks in vlVaBufferSetNumElements and vlVaMapBuffer</li>
+</ul>
+
+<p>Kenneth Graunke (6):</p>
+<ul>
+  <li>i965: Fix missing BRW_NEW_*_PROG_DATA flagging caused by cache reuse.</li>
+  <li>nir: Report progress from nir_split_var_copies().</li>
+  <li>nir: Properly invalidate metadata in nir_split_var_copies().</li>
+  <li>nir: Properly invalidate metadata in nir_opt_copy_prop().</li>
+  <li>nir: Properly invalidate metadata in nir_lower_vec_to_movs().</li>
+  <li>nir: Properly invalidate metadata in nir_opt_remove_phis().</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>radeonsi: add register definitions for Stoney</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>mesa/glformats: Undo code changes from _mesa_base_tex_format() move</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>st/mesa: fix mipmap generation for immutable textures with incomplete pyramids</li>
+</ul>
+
+<p>Nigel Stewart (1):</p>
+<ul>
+  <li>osmesa: Expose GL entry points for Windows build via DEF file.</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>gallivm: disable f16c when not using AVX</li>
+</ul>
+
+<p>Samuel Li (2):</p>
+<ul>
+  <li>radeonsi: add support for Stoney asics (v3)</li>
+  <li>radeonsi: add Stoney pci ids</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/11.0.6.html
+++ b/docs/relnotes/11.0.6.html
@@ -0,0 +1,145 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.6 Release Notes / November 21, 2015</h1>
+
+<p>
+Mesa 11.0.6 is a bug fix release which fixes bugs found since the 11.0.5 release.
+</p>
+<p>
+Mesa 11.0.6 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+4bdf054af66ebabf3eca0616f9f5e44c2f234695661b570261c391bc2f4f7482  mesa-11.0.6.tar.gz
+8340e64cdc91999840404c211496f3de38e7b4cb38db34e2f72f1642c5134760  mesa-11.0.6.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91780">Bug 91780</a> - Rendering issues with geometry shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92588">Bug 92588</a> - [HSW,BDW,BSW,SKL-Y][GLES 3.1 CTS] ES31-CTS.arrays_of_arrays.InteractionFunctionCalls2 - assert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92738">Bug 92738</a> - Randon R7 240 doesn't work on 16KiB page size platform</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92860">Bug 92860</a> - [radeonsi][bisected] st/mesa: implement ARB_copy_image - Corruption in ARK Survival Evolved</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92900">Bug 92900</a> - [regression bisected] About 700 piglit regressions is what could go wrong</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>radeonsi: enable optimal raster config setting for fiji (v2)</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/skl/gt4: Fix URB programming restriction.</li>
+</ul>
+
+<p>Boyuan Zhang (2):</p>
+<ul>
+  <li>st/vaapi: fix vaapi VC-1 simple/main corruption v2</li>
+  <li>radeon/uvd: fix VC-1 simple/main profile decode v2</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>r600: initialised PGM_RESOURCES_2 for ES/GS</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.5</li>
+  <li>cherry-ignore: add the swrast front buffer support</li>
+  <li>automake: use static llvm for make distcheck</li>
+  <li>Update version to 11.0.6</li>
+</ul>
+
+<p>Eric Anholt (3):</p>
+<ul>
+  <li>vc4: Return GL_OUT_OF_MEMORY when buffer allocation fails.</li>
+  <li>vc4: Return NULL when we can't make our shadow for a sampler view.</li>
+  <li>vc4: Add support for nir_op_uge, using the carry bit on QPU_A_SUB.</li>
+</ul>
+
+<p>Ian Romanick (2):</p>
+<ul>
+  <li>meta/generate_mipmap: Don't leak the sampler object</li>
+  <li>meta/generate_mipmap: Only modify the draw framebuffer binding in fallback_required</li>
+</ul>
+
+<p>Ilia Mirkin (2):</p>
+<ul>
+  <li>mesa/copyimage: allow width/height to not be multiples of block</li>
+  <li>nouveau: don't expose HEVC decoding support</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>nir/vars_to_ssa: Rework copy set handling in lower_copies_to_load_store</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>glsl: Allow implicit int -&gt; uint conversions for the % operator.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>radeonsi: initialize SX_PS_DOWNCONVERT to 0 on Stoney</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>winsys/radeon: Use CPU page size instead of hardcoding 4096 bytes v3</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>llvmpipe: use simple coeffs calc for 128bit vectors</li>
+</ul>
+
+<p>Roland Scheidegger (2):</p>
+<ul>
+  <li>radeon: fix bgrx8/xrgb8 blits</li>
+  <li>r200: fix bgrx8/xrgb8 blits</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/11.0.7.html
+++ b/docs/relnotes/11.0.7.html
@@ -0,0 +1,154 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.7 Release Notes / December 9, 2015</h1>
+
+<p>
+Mesa 11.0.7 is a bug fix release which fixes bugs found since the 11.0.6 release.
+</p>
+<p>
+Mesa 11.0.7 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+07c27004ff68b288097d17b2faa7bdf15ec73c96b7e6c9835266e544adf0a62f  mesa-11.0.7.tar.gz
+e7e90a332ede6c8fd08eff90786a3fd1605a4e62ebf3a9b514047838194538cb  mesa-11.0.7.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90348">Bug 90348</a> - Spilling failure of b96 merged value</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92363">Bug 92363</a> - [BSW/BDW] ogles1conform Gets test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92438">Bug 92438</a> - Segfault in pushbuf_kref when running the android emulator (qemu) on nv50</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93110">Bug 93110</a> - [NVE4] textureSize() and textureQueryLevels() uses a texture bound during the previous draw call</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93126">Bug 93126</a> - wrongly claim supporting GL_EXT_texture_rg</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Chris Wilson (1):</p>
+<ul>
+  <li>meta: Compute correct buffer size with SkipRows/SkipPixels</li>
+</ul>
+
+<p>Daniel Stone (1):</p>
+<ul>
+  <li>egl/wayland: Ignore rects from SwapBuffersWithDamage</li>
+</ul>
+
+<p>Dave Airlie (4):</p>
+<ul>
+  <li>texgetimage: consolidate 1D array handling code.</li>
+  <li>r600: geometry shader gsvs itemsize workaround</li>
+  <li>r600: rv670 use at least 16es/gs threads</li>
+  <li>r600: workaround empty geom shader.</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.6</li>
+  <li>get-pick-list.sh: Require explicit "11.0" for nominating stable patches</li>
+  <li>mesa; add get-extra-pick-list.sh script into bin/</li>
+  <li>Update version to 11.0.7</li>
+</ul>
+
+<p>François Tigeot (1):</p>
+<ul>
+  <li>xmlconfig: Add support for DragonFly</li>
+</ul>
+
+<p>Ian Romanick (22):</p>
+<ul>
+  <li>mesa: Make bind_vertex_buffer avilable outside varray.c</li>
+  <li>mesa: Refactor update_array_format to make _mesa_update_array_format_public</li>
+  <li>mesa: Refactor enable_vertex_array_attrib to make _mesa_enable_vertex_array_attrib</li>
+  <li>i965: Pass brw_context instead of gl_context to brw_draw_rectlist</li>
+  <li>i965: Use DSA functions for VBOs in brw_meta_fast_clear</li>
+  <li>i965: Use internal functions for buffer object access</li>
+  <li>i965: Don't pollute the buffer object namespace in brw_meta_fast_clear</li>
+  <li>meta: Use DSA functions for PBO in create_texture_for_pbo</li>
+  <li>meta: Use _mesa_NamedBufferData and _mesa_NamedBufferSubData for users of _mesa_meta_setup_vertex_objects</li>
+  <li>i965: Use _mesa_NamedBufferSubData for users of _mesa_meta_setup_vertex_objects</li>
+  <li>meta: Don't leave the VBO bound after _mesa_meta_setup_vertex_objects</li>
+  <li>meta: Track VBO using gl_buffer_object instead of GL API object handle</li>
+  <li>meta: Use DSA functions for VBOs in _mesa_meta_setup_vertex_objects</li>
+  <li>meta: Use internal functions for buffer object and VAO access</li>
+  <li>meta: Don't pollute the buffer object namespace in _mesa_meta_setup_vertex_objects</li>
+  <li>meta: Partially convert _mesa_meta_DrawTex to DSA</li>
+  <li>meta: Track VBO using gl_buffer_object instead of GL API object handle in _mesa_meta_DrawTex</li>
+  <li>meta: Use internal functions for buffer object and VAO access in _mesa_meta_DrawTex</li>
+  <li>meta: Don't pollute the buffer object namespace in _mesa_meta_DrawTex</li>
+  <li>meta/TexSubImage: Don't pollute the buffer object namespace</li>
+  <li>meta/generate_mipmap: Don't leak the framebuffer object</li>
+  <li>glsl: Fix off-by-one error in array size check assertion</li>
+</ul>
+
+<p>Ilia Mirkin (7):</p>
+<ul>
+  <li>nvc0/ir: actually emit AFETCH on kepler</li>
+  <li>nir: fix typo in idiv lowering, causing large-udiv-udiv failures</li>
+  <li>nouveau: use the buffer usage to determine placement when no binding</li>
+  <li>nv50,nvc0: properly handle buffer storage invalidation on dsa buffer</li>
+  <li>nv50/ir: fix (un)spilling of 3-wide results</li>
+  <li>mesa: support GL_RED/GL_RG in ES2 contexts when driver support exists</li>
+  <li>nvc0/ir: start offset at texBindBase for txq, like regular texturing</li>
+</ul>
+
+<p>Jonathan Gray (1):</p>
+<ul>
+  <li>automake: fix some occurrences of hardcoded -ldl and -lpthread</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>radeon/vce: disable Stoney VCE for 11.0</li>
+</ul>
+
+<p>Marta Lofstedt (1):</p>
+<ul>
+  <li>gles2: Update gl2ext.h to revision: 32120</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>llvmpipe: disable VSX in ppc due to LLVM PPC bug</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/11.0.8.html
+++ b/docs/relnotes/11.0.8.html
@@ -0,0 +1,200 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.8 Release Notes / December 21, 2015</h1>
+
+<p>
+Mesa 11.0.8 is a bug fix release which fixes bugs found since the 11.0.7 release.
+</p>
+<p>
+Mesa 11.0.8 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+ab9db87b54d7525e4b611b82577ea9a9eae55927558df57b190059d5ecd9406f  mesa-11.0.8.tar.gz
+5696e4730518b6805d2ed5def393c4293f425a2c2c01bd5ed4bdd7ad62f7ad75  mesa-11.0.8.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Boyuan Zhang (1):</p>
+<ul>
+  <li>radeon/uvd: uv pitch separation for stoney</li>
+</ul>
+
+<p>Dave Airlie (9):</p>
+<ul>
+  <li>r600: do SQ flush ES ring rolling workaround</li>
+  <li>r600: SMX returns CONTEXT_DONE early workaround</li>
+  <li>r600/shader: split address get out to a function.</li>
+  <li>r600/shader: add utility functions to do single slot arithmatic</li>
+  <li>r600g: fix geom shader input indirect indexing.</li>
+  <li>r600: handle geometry dynamic input array index</li>
+  <li>radeonsi: handle doubles in lds load path.</li>
+  <li>mesa/varray: set double arrays to non-normalised.</li>
+  <li>mesa/shader: return correct attribute location for double matrix arrays</li>
+</ul>
+
+<p>Emil Velikov (8):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.7</li>
+  <li>cherry-ignore: don't pick a specific i965 formats patch</li>
+  <li>Revert "i965/nir: Remove unused indirect handling"</li>
+  <li>Revert "i965/state: Get rid of dword_pitch arguments to buffer functions"</li>
+  <li>Revert "i965/vec4: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/fs: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge"</li>
+  <li>Update version to 11.0.8</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965: Resolve color and flush for all active shader images in intel_update_state().</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
+</ul>
+
+<p>Ilia Mirkin (17):</p>
+<ul>
+  <li>freedreno/a4xx: support lod_bias</li>
+  <li>freedreno/a4xx: fix 5_5_5_1 texture sampler format</li>
+  <li>freedreno/a4xx: point regid to "red" even for alpha-only rb formats</li>
+  <li>nvc0/ir: fold postfactor into immediate</li>
+  <li>nv50/ir: deal with loops with no breaks</li>
+  <li>nv50/ir: the mad source might not have a defining instruction</li>
+  <li>nv50/ir: fix instruction permutation logic</li>
+  <li>nv50/ir: don't forget to mark flagsDef on cvt in txb lowering</li>
+  <li>nv50/ir: fix DCE to not generate 96-bit loads</li>
+  <li>nv50/ir: avoid looking at uninitialized srcMods entries</li>
+  <li>gk110/ir: fix imul hi emission with limm arg</li>
+  <li>gk104/ir: sampler doesn't matter for txf</li>
+  <li>gk110/ir: fix imad sat/hi flag emission for immediate args</li>
+  <li>nv50/ir: fix cutoff for using r63 vs r127 when replacing zero</li>
+  <li>nv50/ir: can't have predication and immediates</li>
+  <li>glsl: assign varying locations to tess shaders when doing SSO</li>
+  <li>ttn: add TEX2 support</li>
+</ul>
+
+<p>Jason Ekstrand (5):</p>
+<ul>
+  <li>i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge</li>
+  <li>i965/fs: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/vec4: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/state: Get rid of dword_pitch arguments to buffer functions</li>
+  <li>i965/nir: Remove unused indirect handling</li>
+</ul>
+
+<p>Jonathan Gray (2):</p>
+<ul>
+  <li>configure.ac: use pkg-config for libelf</li>
+  <li>configure: check for python2.7 for PYTHON2</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Fix fragment shader struct inputs.</li>
+  <li>i965: Fix scalar vertex shader struct outputs.</li>
+</ul>
+
+<p>Marek Olšák (8):</p>
+<ul>
+  <li>radeonsi: fix occlusion queries on Fiji</li>
+  <li>radeonsi: fix a hang due to uninitialized border color registers</li>
+  <li>radeonsi: fix Fiji for LLVM &lt;= 3.7</li>
+  <li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
+  <li>radeonsi: apply the streamout workaround to Fiji as well</li>
+  <li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
+  <li>tgsi/scan: add flag colors_written</li>
+  <li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>glsl: Allow binding of image variables with 420pack.</li>
+</ul>
+
+<p>Neil Roberts (2):</p>
+<ul>
+  <li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
+  <li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>configura.ac: fix test for SSE4.1 assembler support</li>
+</ul>
+
+<p>Patrick Rudolph (2):</p>
+<ul>
+  <li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
+  <li>gallium/util: return correct number of bound vertex buffers</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>i965: use _Shader to get fragment program when updating surface state</li>
+</ul>
+
+<p>Tom Stellard (2):</p>
+<ul>
+  <li>radeonsi: Rename si_shader::ls_rsrc{1,2} to si_shader::rsrc{1,2}</li>
+  <li>radeonsi/compute: Use the compiler's COMPUTE_PGM_RSRC* register values</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/11.0.9.html
+++ b/docs/relnotes/11.0.9.html
@@ -0,0 +1,127 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.9 Release Notes / January 22, 2016</h1>
+
+<p>
+Mesa 11.0.9 is a bug fix release which fixes bugs found since the 11.0.8 release.
+</p>
+<p>
+Mesa 11.0.9 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+1597c2e983f476f98efdd6cd58b5298896d18479ff542bdeff28b98b129ede05  mesa-11.0.9.tar.gz
+a1262ff1c66a16ccf341186cf0e57b306b8589eb2cc5ce92ffb6788ab01d2b01  mesa-11.0.9.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92229">Bug 92229</a> - [APITRACE] SOMA have serious graphical errors</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93257">Bug 93257</a> - [SKL, bisected] ASTC dEQP tests segfault</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.8</li>
+  <li>cherry-ignore: add patch already in branch</li>
+  <li>cherry-ignore: add the dri3 glx null check patch</li>
+  <li>i915: correctly parse/set the context flags</li>
+  <li>egl/dri2: expose srgb configs when KHR_gl_colorspace is available</li>
+  <li>Update version to 11.0.9</li>
+</ul>
+
+<p>Grazvydas Ignotas (1):</p>
+<ul>
+  <li>r600: fix constant buffer size programming</li>
+</ul>
+
+<p>Ilia Mirkin (5):</p>
+<ul>
+  <li>nvc0: don't forget to reset VTX_TMP bufctx slot after blit completion</li>
+  <li>nv50/ir: float(s32 &amp; 0xff) = float(u8), not s8</li>
+  <li>nv50,nvc0: make sure there's pushbuf space and that we ref the bo early</li>
+  <li>nv50,nvc0: fix crash when increasing bsp bo size for h264</li>
+  <li>nvc0: scale up inter_bo size so that it's 16M for a 4K video</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>ralloc: Fix ralloc_adopt() to the old context's last child's parent.</li>
+  <li>nvc0: Set winding order regardless of domain.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>radeonsi: don't miss changes to SPI_TMPRING_SIZE</li>
+</ul>
+
+<p>Miklós Máté (1):</p>
+<ul>
+  <li>mesa: Don't leak ATIfs instructions in DeleteFragmentShader</li>
+</ul>
+
+<p>Neil Roberts (1):</p>
+<ul>
+  <li>i965: Fix crash when calling glViewport with no surface bound</li>
+</ul>
+
+<p>Nicolai Hähnle (6):</p>
+<ul>
+  <li>gallium/radeon: only dispose locally created target machine in radeon_llvm_compile</li>
+  <li>mesa/bufferobj: make _mesa_delete_buffer_object externally accessible</li>
+  <li>st/mesa: use _mesa_delete_buffer_object</li>
+  <li>radeon: use _mesa_delete_buffer_object</li>
+  <li>i915: use _mesa_delete_buffer_object</li>
+  <li>i965: use _mesa_delete_buffer_object</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>llvmpipe: use vpkswss when dst is signed</li>
+</ul>
+
+<p>Rob Herring (1):</p>
+<ul>
+  <li>freedreno/ir3: fix 32-bit builds with pointer-to-int-cast error enabled</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/include/GLES2/gl2ext.h
+++ b/include/GLES2/gl2ext.h
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -124,6 +124,10 @@ CHIPSET(0x1921, skl_gt2, "Intel(R) Skylake ULT GT2F")
 CHIPSET(0x1926, skl_gt3, "Intel(R) Skylake ULT GT3")
 CHIPSET(0x192A, skl_gt3, "Intel(R) Skylake SRV GT3")
 CHIPSET(0x192B, skl_gt3, "Intel(R) Skylake Halo GT3")
+CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -181,3 +181,5 @@ CHIPSET(0x9876, CARRIZO_, CARRIZO)
 CHIPSET(0x9877, CARRIZO_, CARRIZO)

 CHIPSET(0x7300, FIJI_, FIJI)
+
+CHIPSET(0x98E4, STONEY_, STONEY)
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -236,6 +236,8 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,

      case __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE:
         srgb = value != 0;
+         if (!disp->Extensions.KHR_gl_colorspace && srgb)
+            return NULL;
         break;

      default:
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -703,18 +703,10 @@ dri2_wl_swap_buffers_with_damage(_EGLDriver *drv,
   dri2_surf->dx = 0;
   dri2_surf->dy = 0;

-   if (n_rects == 0) {
-      wl_surface_damage(dri2_surf->wl_win->surface,
-                        0, 0, INT32_MAX, INT32_MAX);
-   } else {
-      for (i = 0; i < n_rects; i++) {
-         const int *rect = &rects[i * 4];
-         wl_surface_damage(dri2_surf->wl_win->surface,
-                           rect[0],
-                           dri2_surf->base.Height - rect[1] - rect[3],
-                           rect[2], rect[3]);
-      }
-   }
+   /* We deliberately ignore the damage region and post maximum damage, due to
+    * https://bugs.freedesktop.org/78190 */
+   wl_surface_damage(dri2_surf->wl_win->surface,
+                     0, 0, INT32_MAX, INT32_MAX);

   if (dri2_dpy->is_different_gpu) {
      _EGLContext *ctx = _eglGetCurrentContext();
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -427,6 +427,7 @@ lp_build_init(void)
       */
      util_cpu_caps.has_avx = 0;
      util_cpu_caps.has_avx2 = 0;
+      util_cpu_caps.has_f16c = 0;
   }

 #ifdef PIPE_ARCH_PPC_64
@@ -458,7 +459,9 @@ lp_build_init(void)
   util_cpu_caps.has_sse3 = 0;
   util_cpu_caps.has_ssse3 = 0;
   util_cpu_caps.has_sse4_1 = 0;
+   util_cpu_caps.has_sse4_2 = 0;
   util_cpu_caps.has_avx = 0;
+   util_cpu_caps.has_avx2 = 0;
   util_cpu_caps.has_f16c = 0;
 #endif

--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -137,6 +137,8 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
   case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
      return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
   }
   /* if we get here, we missed a shader cap above (and should have seen
    * a compiler warning.)
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -497,20 +497,57 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
 #endif
   }

-   llvm::SmallVector<std::string, 1> MAttrs;
-   if (util_cpu_caps.has_avx) {
-      /*
-       * AVX feature is not automatically detected from CPUID by the X86 target
-       * yet, because the old (yet default) JIT engine is not capable of
-       * emitting the opcodes. On newer llvm versions it is and at least some
-       * versions (tested with 3.3) will emit avx opcodes without this anyway.
-       */
-      MAttrs.push_back("+avx");
-      if (util_cpu_caps.has_f16c) {
-         MAttrs.push_back("+f16c");
-      }
-      builder.setMAttrs(MAttrs);
+   llvm::SmallVector<std::string, 16> MAttrs;
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * We need to unset attributes because sometimes LLVM mistakenly assumes
+    * certain features are present given the processor name.
+    *
+    * https://bugs.freedesktop.org/show_bug.cgi?id=92214
+    * http://llvm.org/PR25021
+    * http://llvm.org/PR19429
+    * http://llvm.org/PR16721
+    */
+   MAttrs.push_back(util_cpu_caps.has_sse    ? "+sse"    : "-sse"   );
+   MAttrs.push_back(util_cpu_caps.has_sse2   ? "+sse2"   : "-sse2"  );
+   MAttrs.push_back(util_cpu_caps.has_sse3   ? "+sse3"   : "-sse3"  );
+   MAttrs.push_back(util_cpu_caps.has_ssse3  ? "+ssse3"  : "-ssse3" );
+#if HAVE_LLVM >= 0x0304
+   MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1");
+#else
+   MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse41"  : "-sse41" );
+#endif
+#if HAVE_LLVM >= 0x0304
+   MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2");
+#else
+   MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse42"  : "-sse42" );
+#endif
+   /*
+    * AVX feature is not automatically detected from CPUID by the X86 target
+    * yet, because the old (yet default) JIT engine is not capable of
+    * emitting the opcodes. On newer llvm versions it is and at least some
+    * versions (tested with 3.3) will emit avx opcodes without this anyway.
+    */
+   MAttrs.push_back(util_cpu_caps.has_avx  ? "+avx"  : "-avx");
+   MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
+   MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
+#endif
+
+#if defined(PIPE_ARCH_PPC)
+   MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
+#if HAVE_LLVM >= 0x0304
+   /*
+    * Make sure VSX instructions are disabled
+    * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7
+    */
+   if (util_cpu_caps.has_altivec) {
+      MAttrs.push_back("-vsx");
   }
+#endif
+#endif
+
+   builder.setMAttrs(MAttrs);

 #if HAVE_LLVM >= 0x0305
   StringRef MCPU = llvm::sys::getHostCPUName();
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -461,50 +461,49 @@ lp_build_pack2(struct gallivm_state *gallivm,
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
-   if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
-       src_type.width * src_type.length >= 128) {
+   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
+        src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;
      boolean swap_intrinsic_operands = FALSE;

      switch(src_type.width) {
      case 32:
         if (util_cpu_caps.has_sse2) {
-           if(dst_type.sign) {
+           if (dst_type.sign) {
              intrinsic = "llvm.x86.sse2.packssdw.128";
-           }
-           else {
+           } else {
              if (util_cpu_caps.has_sse4_1) {
                 intrinsic = "llvm.x86.sse41.packusdw";
              }
           }
         } else if (util_cpu_caps.has_altivec) {
            if (dst_type.sign) {
-              intrinsic = "llvm.ppc.altivec.vpkswus";
-           } else {
-              intrinsic = "llvm.ppc.altivec.vpkuwus";
-           }
+               intrinsic = "llvm.ppc.altivec.vpkswss";
+            } else {
+               intrinsic = "llvm.ppc.altivec.vpkuwus";
+            }
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-           swap_intrinsic_operands = TRUE;
+            swap_intrinsic_operands = TRUE;
 #endif
         }
         break;
      case 16:
         if (dst_type.sign) {
            if (util_cpu_caps.has_sse2) {
-              intrinsic = "llvm.x86.sse2.packsswb.128";
+               intrinsic = "llvm.x86.sse2.packsswb.128";
            } else if (util_cpu_caps.has_altivec) {
-              intrinsic = "llvm.ppc.altivec.vpkshss";
+               intrinsic = "llvm.ppc.altivec.vpkshss";
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-              swap_intrinsic_operands = TRUE;
+               swap_intrinsic_operands = TRUE;
 #endif
            }
         } else {
            if (util_cpu_caps.has_sse2) {
-              intrinsic = "llvm.x86.sse2.packuswb.128";
+               intrinsic = "llvm.x86.sse2.packuswb.128";
            } else if (util_cpu_caps.has_altivec) {
-	      intrinsic = "llvm.ppc.altivec.vpkshus";
+               intrinsic = "llvm.ppc.altivec.vpkshus";
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-              swap_intrinsic_operands = TRUE;
+               swap_intrinsic_operands = TRUE;
 #endif
            }
         }
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1087,6 +1087,11 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
      op = nir_texop_tex;
      num_srcs = 1;
      break;
+   case TGSI_OPCODE_TEX2:
+      op = nir_texop_tex;
+      num_srcs = 1;
+      samp = 2;
+      break;
   case TGSI_OPCODE_TXP:
      op = nir_texop_tex;
      num_srcs = 2;
@@ -1242,10 +1247,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
   }

   if (instr->is_shadow) {
-      if (instr->coord_components < 3)
-         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z));
-      else
+      if (instr->coord_components == 4)
+         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[1], X));
+      else if (instr->coord_components == 3)
         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
+      else
+         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z));

      instr->src[src_number].src_type = nir_tex_src_comparitor;
      src_number++;
@@ -1651,6 +1658,7 @@ ttn_emit_instruction(struct ttn_compile *c)
   case TGSI_OPCODE_TXL:
   case TGSI_OPCODE_TXB:
   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TEX2:
   case TGSI_OPCODE_TXL2:
   case TGSI_OPCODE_TXB2:
   case TGSI_OPCODE_TXQ_LZ:
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -463,6 +463,8 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
   case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
      return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
   }
   /* if we get here, we missed a shader cap above (and should have seen
    * a compiler warning.)
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -258,6 +258,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                  info->output_semantic_index[reg] = (ubyte) semIndex;
                  info->num_outputs++;

+                  if (semName == TGSI_SEMANTIC_COLOR)
+                     info->colors_written |= 1 << semIndex;
+
                  if (procType == TGSI_PROCESSOR_VERTEX ||
                      procType == TGSI_PROCESSOR_GEOMETRY ||
                      procType == TGSI_PROCESSOR_TESS_CTRL ||
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -76,6 +76,7 @@ struct tgsi_shader_info

   uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */

+   ubyte colors_written;
   boolean reads_position; /**< does fragment shader read position? */
   boolean reads_z; /**< does fragment shader read depth? */
   boolean writes_z;  /**< does fragment shader write Z value? */
--- a/src/gallium/auxiliary/util/u_helpers.c
+++ b/src/gallium/auxiliary/util/u_helpers.c
@@ -81,7 +81,13 @@ void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
                                   const struct pipe_vertex_buffer *src,
                                   unsigned start_slot, unsigned count)
 {
-   uint32_t enabled_buffers = (1ull << *dst_count) - 1;
+   unsigned i;
+   uint32_t enabled_buffers = 0;
+
+   for (i = 0; i < *dst_count; i++) {
+      if (dst[i].buffer || dst[i].user_buffer)
+         enabled_buffers |= (1ull << i);
+   }

   util_set_vertex_buffers_mask(dst, &enabled_buffers, src, start_slot,
                                count);
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -545,6 +545,7 @@ u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr,

         index = ffs(unused_vb_mask) - 1;
         fallback_vbs[type] = index;
+         unused_vb_mask &= ~(1 << index);
         /*printf("found slot=%i for type=%i\n", index, type);*/
      }
   }
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -355,6 +355,10 @@ to be 0.
  are supported.
 * ``PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE``: Whether the driver doesn't
  ignore tgsi_declaration_range::Last for shader inputs and outputs.
+* ``PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT``: The maximum number of
+  iterations a loop may have and still be considered for unrolling. It is
+  only a hint to state trackers; whether any loop is actually unrolled is
+  not guaranteed.


 .. _pipe_compute_cap:
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -828,11 +828,7 @@ fd3_emit_restore(struct fd_context *ctx)
 	OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) |
 			A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0));

-	OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
-	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
-	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
-			A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
-			A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
+	fd3_emit_cache_flush(ctx, ring);

 	OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
 	OUT_RING(ring, 0x00000000);                  /* GRAS_CL_CLIP_CNTL */
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -90,4 +90,15 @@ void fd3_emit_restore(struct fd_context *ctx);

 void fd3_emit_init(struct pipe_context *pctx);

+static inline void
+fd3_emit_cache_flush(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	fd_wfi(ctx, ring);
+	OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
+	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
+	OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
+			A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
+			A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
+}
+
 #endif /* FD3_EMIT_H */
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -558,6 +558,8 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, fui(x1));
 	OUT_RING(ring, fui(y1));

+	fd3_emit_cache_flush(ctx, ring);
+
 	for (i = 0; i < 4; i++) {
 		OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -153,7 +153,7 @@ enum a4xx_vtx_fmt {

 enum a4xx_tex_fmt {
 	TFMT4_5_6_5_UNORM = 11,
-	TFMT4_5_5_5_1_UNORM = 10,
+	TFMT4_5_5_5_1_UNORM = 9,
 	TFMT4_4_4_4_4_UNORM = 8,
 	TFMT4_X8Z24_UNORM = 71,
 	TFMT4_10_10_10_2_UNORM = 33,
@@ -2718,6 +2718,12 @@ static inline uint32_t A4XX_TEX_SAMP_0_ANISO(enum a4xx_tex_aniso val)
 {
 	return ((val) << A4XX_TEX_SAMP_0_ANISO__SHIFT) & A4XX_TEX_SAMP_0_ANISO__MASK;
 }
+#define A4XX_TEX_SAMP_0_LOD_BIAS__MASK				0xfff80000
+#define A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT				19
+static inline uint32_t A4XX_TEX_SAMP_0_LOD_BIAS(float val)
+{
+	return ((((int32_t)(val * 256.0))) << A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT) & A4XX_TEX_SAMP_0_LOD_BIAS__MASK;
+}

 #define REG_A4XX_TEX_SAMP_1					0x00000001
 #define A4XX_TEX_SAMP_1_COMPARE_FUNC__MASK			0x0000000e
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -250,14 +250,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 		}
 	}

-	/* adjust regids for alpha output formats. there is no alpha render
-	 * format, so it's just treated like red
-	 */
-	for (i = 0; i < nr; i++)
-		if (util_format_is_alpha(pipe_surface_format(bufs[i])))
-			color_regid[i] += 3;
-
-
 	/* TODO get these dynamically: */
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -111,6 +111,7 @@ fd4_sampler_state_create(struct pipe_context *pctx,
 		COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS);

 	if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+		so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias);
 		so->texsamp1 |=
 			A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) |
 			A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod);
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -407,6 +407,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -143,7 +143,7 @@ block_id(struct ir3_block *block)
 #ifdef DEBUG
 	return block->serialno;
 #else
-	return (uint32_t)(uint64_t)block;
+	return (uint32_t)(unsigned long)block;
 #endif
 }

--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -167,6 +167,8 @@ i915_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_sha
      case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
         return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
      default:
         debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap);
         return 0;
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -138,6 +138,8 @@ ilo_get_shader_param(struct pipe_screen *screen, unsigned shader,
      return PIPE_SHADER_IR_TGSI;
   case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
      return 1;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;

   default:
      return 0;
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,

   pos_init(bld, x0, y0);

-   if (coeff_type.length > 4) {
+   /*
+    * Simple method (single step interpolation) may be slower if vector length
+    * is just 4, but the results are different (generally less accurate) with
+    * the other method, so always use the more accurate version.
+    */
+   if (1) {
      bld->simple_interp = TRUE;
      {
         /* XXX this should use a global static table */
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)

   if (b->prev)
      b->prev->next = b;
-   if (a->prev)
+   if (a->next)
      a->next->prev = a;
 }

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -575,8 +575,8 @@ CodeEmitterGK110::emitIMUL(const Instruction *i)
   if (isLIMM(i->src(1), TYPE_S32)) {
      emitForm_L(i, 0x280, 2, Modifier(0));

-      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
-
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 24;
      if (i->sType == TYPE_S32)
         code[1] |= 3 << 25;
   } else {
@@ -695,14 +695,9 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
   if (i->sType == TYPE_S32)
      code[1] |= (1 << 19) | (1 << 24);

-   if (code[0] & 0x1) {
-      assert(!i->subOp);
-      SAT_(39);
-   } else {
-      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-         code[1] |= 1 << 25;
-      SAT_(35);
-   }
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[1] |= 1 << 25;
+   SAT_(35);
 }

 void
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2322,6 +2322,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
   case OP_PFETCH:
      emitPFETCH(insn);
      break;
+   case OP_AFETCH:
+      emitAFETCH(insn);
+      break;
   case OP_EMIT:
   case OP_RESTART:
      emitOUT(insn);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2870,6 +2870,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
      }
      setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+
+      // If the loop never breaks (e.g. only has RET's inside), then there
+      // will be no way to get to the break bb. However BGNLOOP will have
+      // already made a PREBREAK to it, so it must be in the CFG.
+      if (getBB()->cfg.incidentCount() == 0)
+         loopBB->cfg.attach(&getBB()->cfg, Graph::Edge::TREE);
   }
      break;
   case TGSI_OPCODE_BRK:
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -202,7 +202,8 @@ NV50LegalizePostRA::visit(Function *fn)
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
-   if (prog->maxGPR < 63)
+   // GPR units on nv50 are in half-regs
+   if (prog->maxGPR < 126)
      r63->reg.data.id = 63;
   else
      r63->reg.data.id = 127;
@@ -831,7 +832,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
-   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -686,7 +686,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
-      } else if (i->tex.r == i->tex.s) {
+      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
@@ -962,11 +962,14 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 bool
 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
 {
+   const int chipset = prog->getTarget()->getChipset();
+   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
+      txq->tex.r += prog->driver->io.texBindBase / 4;
+
   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();
-   const int chipset = prog->getTarget()->getChipset();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -842,6 +842,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
+      } else
+      if (i->postFactor && i->sType == TYPE_F32) {
+         /* Can't emit a postfactor with an immediate, have to fold it in */
+         i->setSrc(s, new_ImmediateValue(
+                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+         i->postFactor = 0;
      }
      break;
   case OP_MAD:
@@ -1658,6 +1664,9 @@ AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
+      // We just AND'd the high bits away, which means this is effectively an
+      // unsigned value.
+      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
@@ -2606,8 +2615,11 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
             i->getSrc(0)->reg.data.id >= 64)
            break;

+         if (i->getPredicate())
+            break;
+
         def = i->getSrc(1)->getInsn();
-         if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
            vtmp = i->getSrc(1);
            i->setSrc(1, def->getSrc(0));

@@ -2909,6 +2921,16 @@ DeadCodeElim::visit(BasicBlock *bb)
   return true;
 }

+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
 void
 DeadCodeElim::checkSplitLoad(Instruction *ld1)
 {
@@ -2929,6 +2951,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
+
+   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
@@ -2942,16 +2966,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
         break;
      }
   }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
      }
   }

+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -25,10 +25,24 @@

 #include <stack>
 #include <limits>
+#if __cplusplus >= 201103L
+#include <unordered_map>
+#else
 #include <tr1/unordered_map>
+#endif

 namespace nv50_ir {

+#if __cplusplus >= 201103L
+using std::hash;
+using std::unordered_map;
+#elif !defined(ANDROID)
+using std::tr1::hash;
+using std::tr1::unordered_map;
+#else
+#error Android release before Lollipop is not supported!
+#endif
+
 #define MAX_REGISTER_FILE_SIZE 256

 class RegisterSet
@@ -349,12 +363,12 @@ RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p)

 struct PhiMapHash {
   size_t operator()(const std::pair<Instruction *, BasicBlock *>& val) const {
-      return std::tr1::hash<Instruction*>()(val.first) * 31 +
-         std::tr1::hash<BasicBlock*>()(val.second);
+      return hash<Instruction*>()(val.first) * 31 +
+         hash<BasicBlock*>()(val.second);
   }
 };

-typedef std::tr1::unordered_map<
+typedef unordered_map<
   std::pair<Instruction *, BasicBlock *>, Value *, PhiMapHash> PhiMap;

 // Critical edges need to be split up so that work can be inserted along
@@ -1559,10 +1573,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)

   Instruction *st;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
-      st = new_Instruction(func, OP_STORE, ty);
-      st->setSrc(0, slot);
-      st->setSrc(1, lval);
      lval->noSpill = 1;
+      if (ty != TYPE_B96) {
+         st = new_Instruction(func, OP_STORE, ty);
+         st->setSrc(0, slot);
+         st->setSrc(1, lval);
+      } else {
+         st = new_Instruction(func, OP_SPLIT, ty);
+         st->setSrc(0, lval);
+         for (int d = 0; d < lval->reg.size / 4; ++d)
+            st->setDef(d, new_LValue(func, FILE_GPR));
+
+         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+            Value *tmp = cloneShallow(func, slot);
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+            s->setSrc(0, tmp);
+            s->setSrc(1, st->getDef(d));
+            defi->bb->insertAfter(defi, s);
+         }
+      }
   } else {
      st = new_Instruction(func, OP_CVT, ty);
      st->setDef(0, slot);
@@ -1582,7 +1614,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
   Instruction *ld;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      lval->noSpill = 1;
-      ld = new_Instruction(func, OP_LOAD, ty);
+      if (ty != TYPE_B96) {
+         ld = new_Instruction(func, OP_LOAD, ty);
+      } else {
+         ld = new_Instruction(func, OP_MERGE, ty);
+         for (int d = 0; d < lval->reg.size / 4; ++d) {
+            Value *tmp = cloneShallow(func, slot);
+            LValue *val;
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+            l->setSrc(0, tmp);
+            usei->bb->insertBefore(usei, l);
+            ld->setSrc(d, val);
+            val->noSpill = 1;
+         }
+         ld->setDef(0, lval);
+         usei->bb->insertBefore(usei, ld);
+         return lval;
+      }
   } else {
      ld = new_Instruction(func, OP_CVT, ty);
   }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -454,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -426,7 +426,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -656,8 +656,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
   if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
                             PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
      buffer->domain = NOUVEAU_BO_GART;
-   } else if (buffer->base.bind &
-              (screen->vidmem_bindings & screen->sysmem_bindings)) {
+   } else if (buffer->base.bind == 0 || (buffer->base.bind &
+              (screen->vidmem_bindings & screen->sysmem_bindings))) {
      switch (buffer->base.usage) {
      case PIPE_USAGE_DEFAULT:
      case PIPE_USAGE_IMMUTABLE:
@@ -684,6 +684,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
      if (buffer->base.bind & screen->sysmem_bindings)
         buffer->domain = NOUVEAU_BO_GART;
   }
+   /* There can be very special situations where we want non-gpu-mapped
+    * buffers, but never through this interface.
+    */
+   assert(buffer->domain);
   ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);

   if (ret == false)
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -190,8 +190,14 @@ nouveau_fence_wait(struct nouveau_fence *fence)
   /* wtf, someone is waiting on a fence in flush_notify handler? */
   assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);

-   if (fence->state < NOUVEAU_FENCE_STATE_EMITTED)
-      nouveau_fence_emit(fence);
+   if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) {
+      PUSH_SPACE(screen->pushbuf, 8);
+      /* The space allocation might trigger a flush, which could emit the
+       * current fence. So check again.
+       */
+      if (fence->state < NOUVEAU_FENCE_STATE_EMITTED)
+         nouveau_fence_emit(fence);
+   }

   if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED)
      if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel))
@@ -224,8 +230,12 @@ nouveau_fence_wait(struct nouveau_fence *fence)
 void
 nouveau_fence_next(struct nouveau_screen *screen)
 {
-   if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING)
-      nouveau_fence_emit(screen->fence.current);
+   if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING) {
+      if (screen->fence.current->ref > 1)
+         nouveau_fence_emit(screen->fence.current);
+      else
+         return;
+   }

   nouveau_fence_ref(NULL, &screen->fence.current);

--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.c
@@ -437,6 +437,7 @@ nouveau_vp3_screen_get_video_param(struct pipe_screen *pscreen,
      /* VP3 does not support MPEG4, VP4+ do. */
      return entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM &&
         profile >= PIPE_VIDEO_PROFILE_MPEG1 &&
+         profile < PIPE_VIDEO_PROFILE_HEVC_MAIN &&
         (!vp3 || codec != PIPE_VIDEO_FORMAT_MPEG4) &&
         firmware_present(pscreen, profile);
   case PIPE_VIDEO_CAP_NPOT_TEXTURES:
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -24,6 +24,8 @@ PUSH_AVAIL(struct nouveau_pushbuf *push)
 static inline bool
 PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size)
 {
+   /* Provide a buffer so that fences always have room to be emitted */
+   size += 8;
   if (PUSH_AVAIL(push) < size)
      return nouveau_pushbuf_space(push, size, 0, 0) == 0;
   return true;
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -261,6 +261,8 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
      case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
         return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
      default:
         debug_printf("unknown vertex shader param %d\n", param);
         return 0;
@@ -302,6 +304,8 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
      case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
         return 0;
+      case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+         return 32;
      default:
         debug_printf("unknown fragment shader param %d\n", param);
         return 0;
@@ -345,7 +349,9 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence)

   *sequence = ++screen->base.fence.sequence;

-   BEGIN_NV04(push, NV30_3D(FENCE_OFFSET), 2);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3);
+   PUSH_DATA (push, NV30_3D_FENCE_OFFSET |
+              (2 /* size */ << 18) | (7 /* subchan */ << 13));
   PUSH_DATA (push, 0);
   PUSH_DATA (push, *sequence);
 }
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -1,3 +1,4 @@
+#include <strings.h>
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -159,9 +159,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
                                 int ref)
 {
   struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
   unsigned s, i;

-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
      assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
      for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
         if (nv50->framebuffer.cbufs[i] &&
@@ -173,7 +174,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
         }
      }
   }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nv50->framebuffer.zsbuf &&
          nv50->framebuffer.zsbuf->texture == res) {
         nv50->dirty |= NV50_NEW_FRAMEBUFFER;
@@ -183,11 +184,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
      }
   }

-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_SAMPLER_VIEW)) {

      assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
      for (i = 0; i < nv50->num_vtxbufs; ++i) {
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -297,6 +297,8 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
      return 0;
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
   default:
      NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
      return 0;
@@ -386,6 +388,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
   /* we need to do it after possible flush in MARK_RING */
   *sequence = ++screen->base.fence.sequence;

+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
   PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4));
   PUSH_DATAh(push, screen->fence.bo->offset);
   PUSH_DATA (push, screen->fence.bo->offset);
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -65,14 +65,9 @@ nv50_constbufs_validate(struct nv50_context *nv50)
               PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
            }
            while (words) {
-               unsigned nr;
-
-               if (!PUSH_SPACE(push, 16))
-                  break;
-               nr = PUSH_AVAIL(push);
-               assert(nr >= 16);
-               nr = MIN2(MIN2(nr - 3, words), NV04_PFIFO_MAX_PACKET_LEN);
+               unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN);

+               PUSH_SPACE(push, nr + 3);
               BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
               PUSH_DATA (push, (start << 8) | b);
               BEGIN_NI04(push, NV50_3D(CB_DATA(0)), nr);
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -960,6 +960,9 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);
   unsigned i;

+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+   nv50->dirty |= NV50_NEW_ARRAYS;
+
   util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                 start_slot, count);

@@ -983,10 +986,6 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
         nv50->vbo_constant &= ~(1 << dst_index);
      }
   }
-
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-
-   nv50->dirty |= NV50_NEW_ARRAYS;
 }

 static void
--- a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
@@ -187,14 +187,7 @@ nv50_sifc_linear_u8(struct nouveau_context *nv,
   PUSH_DATA (push, 0);

   while (count) {
-      unsigned nr;
-
-      if (!PUSH_SPACE(push, 16))
-         break;
-      nr = PUSH_AVAIL(push);
-      assert(nr >= 16);
-      nr = MIN2(count, nr - 1);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);
+      unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);

      BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
      PUSH_DATAp(push, src, nr);
@@ -395,12 +388,9 @@ nv50_cb_push(struct nouveau_context *nv,
   nouveau_pushbuf_validate(push);

   while (words) {
-      unsigned nr;
-
-      nr = PUSH_AVAIL(push);
-      nr = MIN2(nr - 7, words);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);
+      unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN);

+      PUSH_SPACE(push, nr + 7);
      BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
      PUSH_DATAh(push, bo->offset + base);
      PUSH_DATA (push, bo->offset + base);
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -641,8 +641,8 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
         BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
         PUSH_DATA (push, prim);

-         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
         nouveau_pushbuf_space(push, 8, 0, 1);
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);

         switch (index_size) {
         case 4:
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -77,7 +77,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
      bsp_size += (1 << 20) - 1;
      bsp_size &= ~((1 << 20) - 1);

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, NULL, &tmp_bo);
      if (ret) {
         debug_printf("reallocating bsp %u -> %u failed with %i\n",
                      bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -90,7 +90,7 @@ nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
   if (!inter_bo || bsp_bo->size * 4 > inter_bo->size) {
      struct nouveau_bo *tmp_bo = NULL;

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, NULL, &tmp_bo);
      if (ret) {
         debug_printf("reallocating inter %u -> %u failed with %i\n",
                      inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
                                 int ref)
 {
   struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
   unsigned s, i;

-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
      for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
         if (nvc0->framebuffer.cbufs[i] &&
             nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
         }
      }
   }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nvc0->framebuffer.zsbuf &&
          nvc0->framebuffer.zsbuf->texture == res) {
         nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
      }
   }

-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_COMMAND_ARGS_BUFFER |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_COMMAND_ARGS_BUFFER |
+               PIPE_BIND_SAMPLER_VIEW)) {
      for (i = 0; i < nvc0->num_vtxbufs; ++i) {
         if (nvc0->vtxbuf[i].buffer == res) {
            nvc0->dirty |= NVC0_NEW_ARRAYS;
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -287,8 +287,6 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
      break;
   case PIPE_PRIM_TRIANGLES:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
-      if (info->prop.tp.winding > 0)
-         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
      break;
   case PIPE_PRIM_QUADS:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
@@ -297,6 +295,10 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
      tp->tp.tess_mode = ~0;
      return;
   }
+
+   if (info->prop.tp.winding > 0)
+      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+
   if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -618,7 +618,6 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))

   PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
-   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
                        NVC0_IB_ENTRY_1_NO_PREFETCH);
 }
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
@@ -26,7 +26,8 @@ nvc0_resource_from_handle(struct pipe_screen * screen,
   } else {
      struct pipe_resource *res = nv50_miptree_from_handle(screen,
                                                           templ, whandle);
-      nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
+      if (res)
+         nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
      return res;
   }
 }
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -310,6 +310,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
      return 16; /* would be 32 in linked (OpenGL-style) mode */
   case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
      return 16; /* XXX not sure if more are really safe */
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+      return 32;
   default:
      NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
      return 0;
@@ -415,6 +417,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
   if (screen->pm.prog) {
      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
      nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
   }

   nouveau_bo_ref(NULL, &screen->text);
@@ -535,7 +538,8 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
   /* we need to do it after possible flush in MARK_RING */
   *sequence = ++screen->base.fence.sequence;

-   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
+   PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4));
   PUSH_DATAh(push, screen->fence.bo->offset);
   PUSH_DATA (push, screen->fence.bo->offset);
   PUSH_DATA (push, *sequence);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -273,6 +273,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)

      if (!targ->clean)
         nvc0_query_fifo_wait(push, targ->pq);
+      nouveau_pushbuf_space(push, 0, 0, 1);
      BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
      PUSH_DATA (push, 1);
      PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -998,6 +998,9 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    unsigned i;

+    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+    nvc0->dirty |= NVC0_NEW_ARRAYS;
+
    util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb,
                                  start_slot, count);

@@ -1021,9 +1024,6 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
          nvc0->constant_vbos &= ~(1 << dst_index);
       }
    }
-
-    nvc0->dirty |= NVC0_NEW_ARRAYS;
-    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
 }

 static void
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1006,9 +1006,11 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
      nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query,
                                       nvc0->cond_cond, nvc0->cond_mode);

+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+   nouveau_scratch_done(&nvc0->base);

   nvc0->dirty = blit->saved.dirty |
      (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK |
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -188,14 +188,10 @@ nvc0_m2mf_push_linear(struct nouveau_context *nv,
   nouveau_pushbuf_validate(push);

   while (count) {
-      unsigned nr;
+      unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);

-      if (!PUSH_SPACE(push, 16))
+      if (!PUSH_SPACE(push, nr + 9))
         break;
-      nr = PUSH_AVAIL(push);
-      assert(nr >= 16);
-      nr = MIN2(count, nr - 9);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);

      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
      PUSH_DATAh(push, dst->offset + offset);
@@ -234,14 +230,10 @@ nve4_p2mf_push_linear(struct nouveau_context *nv,
   nouveau_pushbuf_validate(push);

   while (count) {
-      unsigned nr;
+      unsigned nr = MIN2(count, (NV04_PFIFO_MAX_PACKET_LEN - 1));

-      if (!PUSH_SPACE(push, 16))
+      if (!PUSH_SPACE(push, nr + 10))
         break;
-      nr = PUSH_AVAIL(push);
-      assert(nr >= 16);
-      nr = MIN2(count, nr - 8);
-      nr = MIN2(nr, (NV04_PFIFO_MAX_PACKET_LEN - 1));

      BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, dst->offset + offset);
@@ -571,9 +563,7 @@ nvc0_cb_bo_push(struct nouveau_context *nv,
   PUSH_DATA (push, bo->offset + base);

   while (words) {
-      unsigned nr = PUSH_AVAIL(push);
-      nr = MIN2(nr, words);
-      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);
+      unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN - 1);

      PUSH_SPACE(push, nr + 2);
      PUSH_REFN (push, bo, NOUVEAU_BO_WR | domain);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -783,7 +783,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0,
   }

   while (num_instances--) {
-      PUSH_SPACE(push, 8);
+      nouveau_pushbuf_space(push, 9, 0, 1);
      BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
      PUSH_DATA (push, mode);
      BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
@@ -810,7 +810,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
      IMMED_NVC0(push, SUBC_3D(NV10_SUBCHAN_REF_CNT), 0);

-   PUSH_SPACE(push, 8);
+   nouveau_pushbuf_space(push, 8, 0, 1);
+   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
   if (info->indexed) {
      assert(nvc0->idxbuf.buffer);
      assert(nouveau_resource_mapped_by_gpu(nvc0->idxbuf.buffer));
@@ -828,8 +829,6 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   }
   PUSH_DATA(push, nvc0_prim_gl(info->mode));
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
-   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
-   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push,
                        buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
 }
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -27,6 +27,7 @@ struct push_context {
   struct {
      bool enabled;
      bool value;
+      uint8_t width;
      unsigned stride;
      const uint8_t *data;
   } edgeflag;
@@ -53,6 +54,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
   /* silence warnings */
   ctx->edgeflag.data = NULL;
   ctx->edgeflag.stride = 0;
+   ctx->edgeflag.width = 0;
 }

 static inline void
@@ -100,6 +102,7 @@ nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
   struct nv04_resource *buf = nv04_resource(vb->buffer);

   ctx->edgeflag.stride = vb->stride;
+   ctx->edgeflag.width = util_format_get_blocksize(ve->src_format);
   if (buf) {
      unsigned offset = vb->buffer_offset + ve->src_offset;
      ctx->edgeflag.data = nouveau_resource_map_offset(&nvc0->base,
@@ -137,10 +140,17 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
 }

 static inline bool
-ef_value(const struct push_context *ctx, uint32_t index)
+ef_value_8(const struct push_context *ctx, uint32_t index)
 {
-   float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
-   return *pf ? true : false;
+   uint8_t *pf = (uint8_t *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
+   return !!*pf;
+}
+
+static inline bool
+ef_value_32(const struct push_context *ctx, uint32_t index)
+{
+   uint32_t *pf = (uint32_t *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
+   return !!*pf;
 }

 static inline bool
@@ -154,7 +164,11 @@ static inline unsigned
 ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
 {
   unsigned i;
-   for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, elts[i]) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, elts[i]) == ef; ++i);
   return i;
 }

@@ -162,7 +176,11 @@ static inline unsigned
 ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
 {
   unsigned i;
-   for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, elts[i]) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, elts[i]) == ef; ++i);
   return i;
 }

@@ -170,7 +188,11 @@ static inline unsigned
 ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
 {
   unsigned i;
-   for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, elts[i]) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, elts[i]) == ef; ++i);
   return i;
 }

@@ -178,7 +200,11 @@ static inline unsigned
 ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
 {
   unsigned i;
-   for (i = 0; i < n && ef_value(ctx, start++) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, start++) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, start++) == ef; ++i);
   return i;
 }

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -169,9 +169,12 @@ nvc0_create_decoder(struct pipe_context *context,
   for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                           0, 1 << 20, &cfg, &dec->bsp_bo[i]);
-   if (!ret)
+   if (!ret) {
+      /* total fudge factor... just has to be bigger for higher bitrates? */
+      unsigned inter_size = align(templ->width * templ->height * 2, 4 << 20);
      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
-                           0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+                           0x100, inter_size, &cfg, &dec->inter_bo[0]);
+   }
   if (!ret) {
      ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
                           0x100, dec->inter_bo[0]->size, &cfg,
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -81,7 +81,7 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
      bsp_size += (1 << 20) - 1;
      bsp_size &= ~((1 << 20) - 1);

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_size, &cfg, &tmp_bo);
      if (ret) {
         debug_printf("reallocating bsp %u -> %u failed with %i\n",
                      bsp_bo ? (unsigned)bsp_bo->size : 0, bsp_size, ret);
@@ -98,7 +98,7 @@ nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
      cfg.nvc0.tile_mode = 0x10;
      cfg.nvc0.memtype = 0xfe;

-      ret = nouveau_bo_new(dec->bitplane_bo->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
+      ret = nouveau_bo_new(dec->client->device, NOUVEAU_BO_VRAM, 0, bsp_bo->size * 4, &cfg, &tmp_bo);
      if (ret) {
         debug_printf("reallocating inter %u -> %u failed with %i\n",
                      inter_bo ? (unsigned)inter_bo->size : 0, (unsigned)bsp_bo->size * 4, ret);
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -300,6 +300,8 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
        case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
        case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
            return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+            return 32;
        case PIPE_SHADER_CAP_PREFERRED_IR:
            return PIPE_SHADER_IR_TGSI;
        }
@@ -356,6 +358,8 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
        case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
        case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
            return 0;
+        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+            return 32;
        case PIPE_SHADER_CAP_PREFERRED_IR:
            return PIPE_SHADER_IR_TGSI;
        }
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1527,12 +1527,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 				     S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) |
 				     S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else {
 		r600_write_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }

@@ -1911,7 +1916,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,

 		if (!gs_ring_buffer) {
 			r600_write_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4,
-						    ALIGN_DIVUP(cb->buffer_size >> 4, 16), pkt_flags);
+						    ALIGN_DIVUP(cb->buffer_size, 256), pkt_flags);
 			r600_write_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8,
 						    pkt_flags);
 		}
@@ -2342,6 +2347,8 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)

 	r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0);

 	/* to avoid GPU doing any preloading of constant from random address */
@@ -2781,6 +2788,8 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)

 	r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0);

 	/* to avoid GPU doing any preloading of constant from random address */
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -1497,6 +1497,7 @@
 #define   S_028878_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
 #define   G_028878_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
 #define   C_028878_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_02887C_SQ_PGM_RESOURCES_2_GS                 0x02887C

 #define R_028890_SQ_PGM_RESOURCES_ES                 0x028890
 #define   S_028890_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
@@ -1511,6 +1512,7 @@
 #define   S_028890_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
 #define   G_028890_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
 #define   C_028890_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_028894_SQ_PGM_RESOURCES_2_ES                 0x028894

 #define R_028864_SQ_PGM_RESOURCES_2_VS               0x028864
 #define   S_028864_SINGLE_ROUND(x)                     (((x) & 0x3) << 0)
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -621,7 +621,7 @@ static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
 	return 0;
 }

-void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg)
+void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg, unsigned abs)
 {
 	switch(value) {
 	case 0:
@@ -641,11 +641,11 @@ void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *ne
 		break;
 	case 0xBF800000: /* -1.0f */
 		*sel = V_SQ_ALU_SRC_1;
-		*neg ^= 1;
+		*neg ^= !abs;
 		break;
 	case 0xBF000000: /* -0.5f */
 		*sel = V_SQ_ALU_SRC_0_5;
-		*neg ^= 1;
+		*neg ^= !abs;
 		break;
 	default:
 		*sel = V_SQ_ALU_SRC_LITERAL;
@@ -1194,7 +1194,7 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
 		}
 		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
 			r600_bytecode_special_constants(nalu->src[i].value,
-				&nalu->src[i].sel, &nalu->src[i].neg);
+				&nalu->src[i].sel, &nalu->src[i].neg, nalu->src[i].abs);
 	}
 	if (nalu->dst.sel >= bc->ngpr) {
 		bc->ngpr = nalu->dst.sel + 1;
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -254,7 +254,7 @@ int r600_bytecode_add_cfinst(struct r600_bytecode *bc,
 int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
 		const struct r600_bytecode_alu *alu, unsigned type);
 void r600_bytecode_special_constants(uint32_t value,
-		unsigned *sel, unsigned *neg);
+		unsigned *sel, unsigned *neg, unsigned abs);
 void r600_bytecode_disasm(struct r600_bytecode *bc);
 void r600_bytecode_alu_read(struct r600_bytecode *bc,
 		struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1);
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -504,6 +504,12 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 		return 0;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		/* due to a bug in the shader compiler, some loops hang
+		 * if they are not unrolled, see:
+		 *    https://bugs.freedesktop.org/show_bug.cgi?id=86720
+		 */
+		return 255;
 	}
 	return 0;
 }
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -57,7 +57,7 @@

 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
-#define R600_MAX_DRAW_CS_DWORDS		47
+#define R600_MAX_DRAW_CS_DWORDS		52
 #define R600_TRACE_CS_DWORDS		7

 #define R600_MAX_USER_CONST_BUFFERS 13
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -598,6 +598,106 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back
 	return 0;
 }

+/* Emit one two-source ALU op into ctx->bc; returns 0, or the non-zero result of r600_bytecode_add_alu(). */
+static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val, /* literal value if src0_sel is V_SQ_ALU_SRC_LITERAL, else the channel */
+			  int src1_sel, unsigned src1_chan_val) /* same convention as src0 */
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) { /* NOTE(review): presumably Cayman runs MULLO_INT across all four vector slots - confirm */
+		for (i = 0; i < 4; i++) { /* same op and sources are emitted once per destination channel */
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = op;
+			alu.src[0].sel = src0_sel;
+			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[0].value = src0_chan_val;
+			else
+				alu.src[0].chan = src0_chan_val;
+			alu.src[1].sel = src1_sel;
+			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[1].value = src1_chan_val;
+			else
+				alu.src[1].chan = src1_chan_val;
+			alu.dst.sel = dst_sel;
+			alu.dst.chan = i;
+			alu.dst.write = i == dst_chan; /* only the requested channel has its write flag set */
+			alu.last = (i == 3); /* the fourth instruction is flagged as last */
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		return 0;
+	}
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu)); /* general case: a single instruction */
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+/* Emit one three-source (is_op3) ALU op into ctx->bc; returns 0, or the non-zero result of r600_bytecode_add_alu(). */
+static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val, /* literal value if src0_sel is V_SQ_ALU_SRC_LITERAL, else the channel */
+			  int src1_sel, unsigned src1_chan_val, /* same convention as src0 */
+			  int src2_sel, unsigned src2_chan_val) /* same convention as src0 */
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	/* only ALU_OP3_MULADD_UINT24 is accepted for now; validate this helper for other ops before relaxing the assert */
+	assert(op == ALU_OP3_MULADD_UINT24);
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.src[2].sel = src2_sel;
+	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[2].value = src2_chan_val;
+	else
+		alu.src[2].chan = src2_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.is_op3 = 1; /* unlike single_alu_op2, dst.write is not set here and stays 0 from the memset */
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
+{
+	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg; /* index > 0 selects index_reg[index - 1]; any other index selects ar_reg */
+}
+
 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
 {
 	int i;
@@ -1004,7 +1104,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

 			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
-			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
+			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
 			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
 				return;
 		}
@@ -1129,6 +1229,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	unsigned vtx_id = src->Dimension.Index;
 	int offset_reg = vtx_id / 3;
 	int offset_chan = vtx_id % 3;
+	int t2 = 0;

 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
@@ -1136,13 +1237,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	if (offset_reg == 0 && offset_chan == 2)
 		offset_chan = 3;

+	if (src->Dimension.Indirect || src->Register.Indirect)
+		t2 = r600_get_temp(ctx);
+
 	if (src->Dimension.Indirect) {
 		int treg[3];
-		int t2;
 		struct r600_bytecode_alu alu;
 		int r, i;
-
-		/* you have got to be shitting me -
+		unsigned addr_reg;
+		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
+		if (src->DimIndirect.Index > 0) {
+			r = single_alu_op2(ctx, ALU_OP1_MOV,
+					   ctx->bc->ar_reg, 0,
+					   addr_reg, 0,
+					   0, 0);
+			if (r)
+				return r;
+		}
+		/*
 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
 		   at least this is what fglrx seems to do. */
 		for (i = 0; i < 3; i++) {
@@ -1150,7 +1262,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		}
 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

-		t2 = r600_get_temp(ctx);
 		for (i = 0; i < 3; i++) {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
@@ -1175,8 +1286,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		if (r)
 			return r;
 		offset_reg = t2;
+		offset_chan = 0;
 	}

+	if (src->Register.Indirect) {
+		int addr_reg;
+		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
+
+		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
+
+		/* pull the value from index_reg */
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   t2, 1,
+				   addr_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, first);
+		if (r)
+			return r;
+		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+				   t2, 0,
+				   t2, 1,
+				   V_SQ_ALU_SRC_LITERAL, 4,
+				   offset_reg, offset_chan);
+		if (r)
+			return r;
+		offset_reg = t2;
+		offset_chan = 0;
+		index = src->Register.Index - first;
+	}

 	memset(&vtx, 0, sizeof(vtx));
 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
@@ -1222,6 +1358,7 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)

 			fetch_gs_input(ctx, src, treg);
 			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
 		}
 	}
 	return 0;
@@ -1972,7 +2109,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,

 	ctx.nliterals = 0;
 	ctx.literals = NULL;
-	shader->fs_write_all = FALSE;
+
+	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+			       ctx.info.colors_written == 1;

 	if (shader->vs_as_gs_a)
 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
@@ -2003,10 +2142,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		case TGSI_TOKEN_TYPE_PROPERTY:
 			property = &ctx.parse.FullToken.FullProperty;
 			switch (property->Property.PropertyName) {
-			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-				if (property->u[0].Data == 1)
-					shader->fs_write_all = TRUE;
-				break;
 			case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
 				if (property->u[0].Data == 1)
 					shader->vs_position_window_space = TRUE;
@@ -2159,6 +2294,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			struct r600_bytecode_alu alu;
 			int r;

+			/* GS thread with no output workaround - emit a cut at start of GS */
+			if (ctx.bc->chip_class == R600)
+				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
+
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
@@ -6671,7 +6810,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_alu alu;
 	int r;
 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
-	unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;
+	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

 	assert(inst->Dst[0].Register.Index < 3);
 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1732,7 +1732,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,

 		if (!gs_ring_buffer) {
 			r600_write_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4,
-					       ALIGN_DIVUP(cb->buffer_size >> 4, 16));
+					       ALIGN_DIVUP(cb->buffer_size, 256));
 			r600_write_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8);
 		}

@@ -2181,10 +2181,11 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 		num_temp_gprs = 4;
 		num_gs_gprs = 0;
 		num_es_gprs = 0;
-		num_ps_threads = 136;
-		num_vs_threads = 48;
-		num_gs_threads = 4;
-		num_es_threads = 4;
+		/* limit VS to 40 threads and give ES and GS at least 16 threads each */
+		num_ps_threads = 120;
+		num_vs_threads = 40;
+		num_gs_threads = 16;
+		num_es_threads = 16;
 		num_ps_stack_entries = 40;
 		num_vs_stack_entries = 40;
 		num_gs_stack_entries = 32;
@@ -2643,6 +2644,9 @@ void r600_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 		S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport);
 }

+#define RV610_GSVS_ALIGN 32
+#define R600_GSVS_ALIGN 16
+
 void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *shader)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
@@ -2652,6 +2656,23 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 	unsigned gsvs_itemsize =
 			(cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2;

+	/* some r600s need the gsvs itemsize aligned to the cacheline size;
+	   this was fixed in rs780 and above. */
+	switch (rctx->b.family) {
+	case CHIP_RV610:
+		gsvs_itemsize = align(gsvs_itemsize, RV610_GSVS_ALIGN);
+		break;
+	case CHIP_R600:
+	case CHIP_RV630:
+	case CHIP_RV670:
+	case CHIP_RV620:
+	case CHIP_RV635:
+		gsvs_itemsize = align(gsvs_itemsize, R600_GSVS_ALIGN);
+		break;
+	default:
+		break;
+	}
+
 	r600_init_command_buffer(cb, 64);

 	/* VGT_GS_MODE is written by r600_emit_shader_stages */
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1691,6 +1691,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					(info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
 	}

+	/* SMX returns CONTEXT_DONE too early workaround */
+	if (rctx->b.family == CHIP_R600 ||
+	    rctx->b.family == CHIP_RV610 ||
+	    rctx->b.family == CHIP_RV630 ||
+	    rctx->b.family == CHIP_RV635) {
+		/* if we have gs shader or streamout
+		   we need to do a wait idle after every draw */
+		if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+			r600_write_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
+		}
+	}
+
+	/* ES ring rolling over at EOP - workaround */
+	if (rctx->b.chip_class == R600) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
+	}
+
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -130,6 +130,7 @@
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
 #define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
 #define EVENT_TYPE_VGT_FLUSH                   0x24
+#define EVENT_TYPE_SQ_NON_EVENT                0x26
 #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
--- a/src/gallium/drivers/radeon/Makefile.am
+++ b/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM

 AM_CFLAGS += \
-	$(LLVM_CFLAGS)
+	$(LLVM_CFLAGS) \
+	$(LIBELF_CFLAGS)

 libradeon_la_SOURCES += \
 	$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
 	$(CLOCK_LIB) \
 	$(LLVM_LIBS) \
-	$(ELF_LIB)
+	$(LIBELF_LIBS)

 libradeon_la_LDFLAGS = \
 	$(LLVM_LDFLAGS)
--- a/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/src/gallium/drivers/radeon/cayman_msaa.c
@@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
-					     EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+					       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		} else if (overrast_samples > 1) {
 			r600_write_context_reg(cs, CM_R_028804_DB_EQAA,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		}
 	} else {
 		r600_write_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
@@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 		r600_write_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -226,8 +226,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->family = rscreen->family;
 	rctx->chip_class = rscreen->chip_class;

-	if (rscreen->family == CHIP_HAWAII)
-		rctx->max_db = 16;
+	if (rscreen->chip_class >= CIK)
+		rctx->max_db = MAX2(8, rscreen->info.r600_num_backends);
 	else if (rscreen->chip_class >= EVERGREEN)
 		rctx->max_db = 8;
 	else
@@ -415,6 +415,7 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
 	case CHIP_FIJI: return "AMD FIJI";
+	case CHIP_STONEY: return "AMD STONEY";
 	default: return "AMD unknown";
 	}
 }
@@ -542,7 +543,13 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
+#if HAVE_LLVM <= 0x0307
+	case CHIP_FIJI: return "tonga";
+	case CHIP_STONEY: return "carrizo";
+#else
 	case CHIP_FIJI: return "fiji";
+	case CHIP_STONEY: return "stoney";
+#endif
 	default: return "";
 	}
 }
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -168,6 +168,8 @@

 #define EG_R_028A4C_PA_SC_MODE_CNTL_1                0x028A4C
 #define   EG_S_028A4C_PS_ITER_SAMPLE(x)                 (((x) & 0x1) << 16)
+#define   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)        (((x) & 0x1) << 25)
+#define   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x)           (((x) & 0x1) << 26)

 #define CM_R_028804_DB_EQAA                          0x00028804
 #define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -194,8 +194,8 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	if (mem_err) {
 		fprintf(stderr, "%s: %s", __FUNCTION__, err);
 		FREE(err);
-		LLVMDisposeTargetMachine(tm);
-		return 1;
+		rval = 1;
+		goto out;
 	}

 	if (0 != rval) {
@@ -211,6 +211,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);

+out:
 	if (dispose_tm) {
 		LLVMDisposeTargetMachine(tm);
 	}
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -940,11 +940,19 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.width_in_samples = dec->base.width;
 	dec->msg->body.decode.height_in_samples = dec->base.height;

+	if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) ||
+	    (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) {
+		dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16;
+		dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16;
+	}
+
 	dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
 	dec->msg->body.decode.bsd_size = bs_size;
 	dec->msg->body.decode.db_pitch = dec->base.width;

 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
+	if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
+		dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2;

 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -385,7 +385,10 @@ struct ruvd_msg {
 			uint32_t	dt_chroma_top_offset;
 			uint32_t	dt_chroma_bottom_offset;
 			uint32_t	dt_surf_tile_config;
-			uint32_t	dt_reserved[3];
+			uint32_t	dt_uv_surf_tile_config;
+			// re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney
+			uint32_t	dt_wa_chroma_top_offset;
+			uint32_t	dt_wa_chroma_bottom_offset;

 			uint32_t	reserved[16];

--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -388,6 +388,11 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	struct radeon_surf *tmp_surf;
 	unsigned cpb_size;

+	if (rscreen->info.family == CHIP_STONEY) {
+		RVID_ERR("Stoney VCE is not supported!\n");
+		return NULL;
+	}
+
 	if (!rscreen->info.vce_fw_version) {
 		RVID_ERR("Kernel doesn't supports VCE!\n");
 		return NULL;
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -205,11 +205,12 @@ int rvid_get_video_param(struct pipe_screen *screen,
 			 enum pipe_video_cap param)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
+	enum pipe_video_format codec = u_reduce_video_profile(profile);

 	if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
 		switch (param) {
 		case PIPE_VIDEO_CAP_SUPPORTED:
-			return u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
+			return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
 				rvce_is_fw_version_supported(rscreen);
 	        case PIPE_VIDEO_CAP_NPOT_TEXTURES:
        	        return 1;
@@ -232,38 +233,18 @@ int rvid_get_video_param(struct pipe_screen *screen,
 		}
 	}

-	/* UVD 2.x limits */
-	if (rscreen->family < CHIP_PALM) {
-		enum pipe_video_format codec = u_reduce_video_profile(profile);
-		switch (param) {
-		case PIPE_VIDEO_CAP_SUPPORTED:
-			/* no support for MPEG4 */
-			return codec != PIPE_VIDEO_FORMAT_MPEG4 &&
-			       /* FIXME: VC-1 simple/main profile is broken */
-			       profile != PIPE_VIDEO_PROFILE_VC1_SIMPLE &&
-			       profile != PIPE_VIDEO_PROFILE_VC1_MAIN;
-		case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-		case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
-			/* MPEG2 only with shaders and no support for
-			   interlacing on R6xx style UVD */
-			return codec != PIPE_VIDEO_FORMAT_MPEG12 &&
-			       rscreen->family > CHIP_RV770;
-		default:
-			break;
-		}
-	}
-
 	switch (param) {
 	case PIPE_VIDEO_CAP_SUPPORTED:
-		switch (u_reduce_video_profile(profile)) {
+		switch (codec) {
 		case PIPE_VIDEO_FORMAT_MPEG12:
 		case PIPE_VIDEO_FORMAT_MPEG4:
 		case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-			return entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE;
+			if (rscreen->family < CHIP_PALM)
+				/* no support for MPEG4 */
+				return codec != PIPE_VIDEO_FORMAT_MPEG4;
+			return true;
 		case PIPE_VIDEO_FORMAT_VC1:
-			/* FIXME: VC-1 simple/main profile is broken */
-			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED &&
-			       entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE;
+			return true;
 		case PIPE_VIDEO_FORMAT_HEVC:
 			/* Carrizo only supports HEVC Main */
 			return rscreen->family >= CHIP_CARRIZO &&
@@ -280,13 +261,17 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 		return PIPE_FORMAT_NV12;
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
-			return false; //The hardware doesn't support interlaced HEVC.
-		return true;
 	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
-		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
-			return false; //The hardware doesn't support interlaced HEVC.
-		return true;
+		if (rscreen->family < CHIP_PALM) {
+			/* MPEG2 only with shaders and no support for
+			   interlacing on R6xx style UVD */
+			return codec != PIPE_VIDEO_FORMAT_MPEG12 &&
+			       rscreen->family > CHIP_RV770;
+		} else {
+			if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
+				return false; //The firmware doesn't support interlaced HEVC.
+			return true;
+		}
 	case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 		return true;
 	case PIPE_VIDEO_CAP_MAX_LEVEL:
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -137,6 +137,7 @@ enum radeon_family {
    CHIP_ICELAND,
    CHIP_CARRIZO,
    CHIP_FIJI,
+    CHIP_STONEY,
    CHIP_LAST,
 };

--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -33,14 +33,6 @@
 #include "sid.h"

 #define MAX_GLOBAL_BUFFERS 20
-#if HAVE_LLVM < 0x0305
-#define NUM_USER_SGPRS 2
-#else
-/* XXX: Even though we don't pass the scratch buffer via user sgprs any more
- * LLVM still expects that we specify 4 USER_SGPRS so it can remain compatible
- * with older mesa. */
-#define NUM_USER_SGPRS 4
-#endif

 struct si_compute {
 	struct si_context *ctx;
@@ -241,7 +233,6 @@ static void si_launch_grid(
 	uint64_t kernel_args_va;
 	uint64_t scratch_buffer_va = 0;
 	uint64_t shader_va;
-	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
 	unsigned i;
 	struct si_shader *shader = &program->shader;
 	unsigned lds_blocks;
@@ -365,20 +356,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);

-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
-		/* We always use at least 3 VGPRS, these come from
-		 * TIDIG_COMP_CNT.
-		 * XXX: The compiler should account for this.
-		 */
-		S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4)
-		/* We always use at least 4 + arg_user_sgpr_count.  The 4 extra
-		 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN
-		 * XXX: The compiler should account for this.
-		 */
-		|  S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count,
-		                        shader->num_sgprs)) - 1) / 8)
-		|  S_00B028_FLOAT_MODE(shader->float_mode))
-		;
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);

 	lds_blocks = shader->lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
@@ -394,17 +372,10 @@ static void si_launch_grid(

 	assert(lds_blocks <= 0xFF);

-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2,
-		S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)
-		| S_00B84C_USER_SGPR(arg_user_sgpr_count)
-		| S_00B84C_TGID_X_EN(1)
-		| S_00B84C_TGID_Y_EN(1)
-		| S_00B84C_TGID_Z_EN(1)
-		| S_00B84C_TG_SIZE_EN(1)
-		| S_00B84C_TIDIG_COMP_CNT(2)
-		| S_00B84C_LDS_SIZE(lds_blocks)
-		| S_00B84C_EXCP_EN(0))
-		;
+	shader->rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);

 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -489,6 +489,8 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
 		return 1;
+	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+		return 32;
 	}
 	return 0;
 }
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -637,6 +637,14 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 			    lp_build_const_int32(gallivm, swizzle));

 	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+				       lp_build_const_int32(gallivm, swizzle + 1));
+		value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
+	}
+
 	return LLVMBuildBitCast(gallivm->builder, value,
 				tgsi2llvmtype(bld_base, type), "");
 }
@@ -3752,12 +3760,14 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
 			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 			shader->float_mode =  G_00B028_FLOAT_MODE(value);
+			shader->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
 			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
 			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
+			shader->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
@@ -4201,8 +4211,10 @@ out:

 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
 {
-	if (shader->gs_copy_shader)
+	if (shader->gs_copy_shader) {
 		si_shader_destroy(ctx, shader->gs_copy_shader);
+		FREE(shader->gs_copy_shader);
+	}

 	if (shader->scratch_bo)
 		r600_resource_reference(&shader->scratch_bo, NULL);
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .0.3
 .0.9