Add release notes for the 10.3.3 release

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Update version to 10.3.3
2014-11-08 16:43:13 +00:00 · 2014-11-08 16:36:00 +00:00 · 2014-11-06 14:55:49 +00:00 · 2014-11-06 14:51:37 +00:00 · 2014-10-29 18:54:55 +00:00 · 2014-10-29 18:18:54 +00:00
96 changed files with 3044 additions and 787 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-10.3.1
+10.3.3
--- a/configure.ac
+++ b/configure.ac
@@ -2057,7 +2057,12 @@ if test "x$MESA_LLVM" != x0; then
           dnl already added all of these objects to LLVM_LIBS.
        fi
    else
-        AC_MSG_WARN([Building mesa with staticly linked LLVM may cause compilation issues])
+        AC_MSG_WARN([Building mesa with statically linked LLVM may cause compilation issues])
+        dnl We need to link to llvm system libs when using static libs
+        dnl However, only llvm 3.5+ provides --system-libs
+        if test $LLVM_VERSION_MAJOR -eq 3 -a $LLVM_VERSION_MINOR -ge 5; then
+            LLVM_LIBS="$LLVM_LIBS `$LLVM_CONFIG --system-libs`"
+        fi
    fi
 fi

--- a/docs/relnotes/10.3.1.html
+++ b/docs/relnotes/10.3.1.html
@@ -30,7 +30,9 @@ because compatibility contexts are not supported.

 <h2>SHA256 checksums</h2>
 <pre>
-TBA
+155afcbad17be8bb80282c761b957d5cc716c14a1fa16c4f5ee04e76df729c6d  MesaLib-10.3.1.tar.gz
+b081d077d717e5d56f2d59677490856052c41573e50378ff86d6c72456714add  MesaLib-10.3.1.tar.bz2
+07a14febfed06412d519e091a62d24513fee6745f1a6f8a8f1956bfe04b77d15  MesaLib-10.3.1.zip
 </pre>

 <h2>New features</h2>
--- a/docs/relnotes/10.3.2.html
+++ b/docs/relnotes/10.3.2.html
@@ -0,0 +1,115 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.3.2 Release Notes / October 24, 2014</h1>
+
+<p>
+Mesa 10.3.2 is a bug fix release which fixes bugs found since the 10.3.1 release.
+</p>
+<p>
+Mesa 10.3.2 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+<h2>SHA256 checksums</h2>
+<pre>
+e65f8e691f06f111c1aeb3a376b13c9cc88cb162bee2709e0e7e6b0e6628ca75  MesaLib-10.3.2.tar.gz
+e9849bcb9aa9acd98a753d6d46d2e7d7238d3367036e11357a60efd16de8bea3  MesaLib-10.3.2.tar.bz2
+427dc0d670d38e713ebff2675665ec2fe4ff7d04ce227bd54de946999fc1d234  MesaLib-10.3.2.zip
+</pre>
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=54372">Bug 54372</a> - GLX_INTEL_swap_event crashes driver when swapping window buffers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=81680">Bug 81680</a> - [r600g] Firefox crashes with hardware acceleration turned on</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84140">Bug 84140</a> - mplayer crashes playing some files using vdpau output</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84662">Bug 84662</a> - Long pauses with Unreal demo Elemental on R9270X since : Always flush the HDP cache before submitting a CS to the GPU</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=85267">Bug 85267</a> - vlc crashes with vdpau (Radeon 3850HD) [r600]</li>
+
+</ul>
+
+<h2>Changes</h2>
+
+<p>Brian Paul (3):</p>
+<ul>
+  <li>mesa: fix spurious wglGetProcAddress / GL_INVALID_OPERATION error</li>
+  <li>st/wgl: add WINAPI qualifiers on wgl function typedefs</li>
+  <li>glsl: fix several use-after-free bugs</li>
+</ul>
+
+<p>Daniel Manjarres (1):</p>
+<ul>
+  <li>glx: Fix glxUseXFont for glxWindow and glxPixmaps</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>mesa: fix GetTexImage for 1D array depth textures</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>docs: Add sha256 sums for the 10.3.1 release</li>
+  <li>Update VERSION to 10.3.2</li>
+</ul>
+
+<p>Ilia Mirkin (4):</p>
+<ul>
+  <li>gm107/ir: add dnz emission for fmul</li>
+  <li>gk110/ir: add dnz flag emission for fmul/fmad</li>
+  <li>nouveau: 3d textures are unsupported, limit 3d levels to 1</li>
+  <li>st/gbm: fix order of arguments passed to is_format_supported</li>
+</ul>
+
+<p>Kenneth Graunke (3):</p>
+<ul>
+  <li>i965: Add a BRW_MOCS_PTE #define.</li>
+  <li>i965: Use BDW_MOCS_PTE for renderbuffers.</li>
+  <li>i965: Fix register write checks.</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>st/mesa: use pipe_sampler_view_release for releasing sampler views</li>
+  <li>glsl_to_tgsi: fix the value of gl_FrontFacing with native integers</li>
+</ul>
+
+<p>Michel Dänzer (4):</p>
+<ul>
+  <li>radeonsi: Clear sampler view flags when binding a buffer</li>
+  <li>r600g,radeonsi: Always use GTT again for PIPE_USAGE_STREAM buffers</li>
+  <li>winsys/radeon: Use separate caching buffer manager for each set of flags</li>
+  <li>r600g: Drop references to destroyed blend state</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/10.3.3.html
+++ b/docs/relnotes/10.3.3.html
@@ -0,0 +1,207 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.3.3 Release Notes / November 8, 2014</h1>
+
+<p>
+Mesa 10.3.3 is a bug fix release which fixes bugs found since the 10.3.2 release.
+</p>
+<p>
+Mesa 10.3.3 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=70410">Bug 70410</a> - egl-static/Makefile: linking fails with llvm &gt;= 3.4</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=82921">Bug 82921</a> - layout(location=0) emits error &gt;= MAX_UNIFORM_LOCATIONS due to integer underflow</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=83574">Bug 83574</a> - [llvmpipe] [softpipe] piglit arb_explicit_uniform_location-use-of-unused-loc regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=85454">Bug 85454</a> - Unigine Sanctuary with Wine crashes on Mesa Git</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=85918">Bug 85918</a> - Mesa: MSVC 2010/2012 Compile error</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Anuj Phogat (2):</p>
+<ul>
+  <li>glsl: Fix crash due to negative array index</li>
+  <li>glsl: Use signed array index in update_max_array_access()</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>mesa: fix UNCLAMPED_FLOAT_TO_UBYTE() macro for MSVC</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>docs: Add sha256 sums for the 10.3.2 release</li>
+  <li>Update version to 10.3.3</li>
+</ul>
+
+<p>Ilia Mirkin (27):</p>
+<ul>
+  <li>freedreno/ir3: fix FSLT/etc handling to return 0/-1 instead of 0/1.0</li>
+  <li>freedreno/ir3: INEG operates on src0, not src1</li>
+  <li>freedreno/ir3: add UARL support</li>
+  <li>freedreno/ir3: negate result of USLT/etc</li>
+  <li>freedreno/ir3: use unsigned comparison for UIF</li>
+  <li>freedreno/ir3: add TXL support</li>
+  <li>freedreno/ir3: fix UCMP handling</li>
+  <li>freedreno/ir3: implement UMUL correctly</li>
+  <li>freedreno: add default .dir-locals.el for emacs settings</li>
+  <li>freedreno/ir3: make texture instruction construction more dynamic</li>
+  <li>freedreno/ir3: fix TXB/TXL to actually pull the bias/lod argument</li>
+  <li>freedreno/ir3: add TXQ support</li>
+  <li>freedreno/ir3: add TXB2 support</li>
+  <li>freedreno: dual-source render targets are not supported</li>
+  <li>freedreno: instanced drawing/compute not yet supported</li>
+  <li>freedreno/ir3: avoid fan-in sources referring to same instruction</li>
+  <li>freedreno/ir3: add IDIV/UDIV support</li>
+  <li>freedreno/ir3: add UMOD support, based on UDIV</li>
+  <li>freedreno/ir3: add MOD support</li>
+  <li>freedreno/ir3: add ISSG support</li>
+  <li>freedreno/ir3: add UMAD support</li>
+  <li>freedreno/ir3: make TXQ return integers, not floats</li>
+  <li>freedreno/ir3: shadow comes before array</li>
+  <li>freedreno/ir3: add texture offset support</li>
+  <li>freedreno/ir3: add TXD support and expose ARB_shader_texture_lod</li>
+  <li>freedreno/ir3: add TXF support</li>
+  <li>freedreno: positions come out as integers, not half-integers</li>
+</ul>
+
+<p>Jan Vesely (1):</p>
+<ul>
+  <li>configure: include llvm systemlibs when using static llvm</li>
+</ul>
+
+<p>Marek Olšák (5):</p>
+<ul>
+  <li>r600g: fix polygon mode for points and lines and point/line fill modes</li>
+  <li>radeonsi: fix polygon mode for points and lines and point/line fill modes</li>
+  <li>radeonsi: fix incorrect index buffer max size for lowered 8-bit indices</li>
+  <li>Revert "st/mesa: set MaxUnrollIterations = 255"</li>
+  <li>r300g: remove enabled/disabled hyperz and AA compression messages</li>
+</ul>
+
+<p>Mauro Rossi (1):</p>
+<ul>
+  <li>gallium/nouveau: fully build the driver under android</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>radeon/llvm: Dynamically allocate branch/loop stack arrays</li>
+</ul>
+
+<p>Rob Clark (62):</p>
+<ul>
+  <li>freedreno/ir3: detect scheduler fail</li>
+  <li>freedreno/ir3: add TXB</li>
+  <li>freedreno/ir3: add DDX/DDY</li>
+  <li>freedreno/ir3: bit of debug</li>
+  <li>freedreno/ir3: fix error in bail logic</li>
+  <li>freedreno/ir3: fix constlen with relative addressing</li>
+  <li>freedreno/ir3: add no-copy-propagate fallback step</li>
+  <li>freedreno: don't overflow cmdstream buffer so much</li>
+  <li>freedreno/ir3: fix potential segfault in RA</li>
+  <li>freedreno: update generated headers</li>
+  <li>freedreno/a3xx: enable hw primitive-restart</li>
+  <li>freedreno/a3xx: handle rendering to layer != 0</li>
+  <li>freedreno: update generated headers</li>
+  <li>freedreno/a3xx: format fixes</li>
+  <li>util/u_format: add _is_alpha()</li>
+  <li>freedreno/a3xx: alpha render-target shenanigans</li>
+  <li>freedreno/ir3: catch incorrect usage of tmp-dst</li>
+  <li>freedreno/ir3: add missing put_dst</li>
+  <li>freedreno: "fix" problems with excessive flushes</li>
+  <li>freedreno: update generated headers</li>
+  <li>freedreno/a3xx: 3d/array textures</li>
+  <li>freedreno: add DRM_CONF_SHARE_FD</li>
+  <li>freedreno/a3xx: more texture array fixes</li>
+  <li>freedreno/a3xx: initial texture border-color</li>
+  <li>freedreno: fix compiler warning</li>
+  <li>freedreno: don't advertise mirror-clamp support</li>
+  <li>freedreno: update generated headers</li>
+  <li>freedreno: we have more than 0 viewports!</li>
+  <li>freedreno: turn missing caps into compile warnings</li>
+  <li>freedreno/a3xx: add LOD_BIAS</li>
+  <li>freedreno/a3xx: add flat interpolation mode</li>
+  <li>freedreno/a3xx: add 32bit integer vtx formats</li>
+  <li>freedreno/a3xx: fix border color order</li>
+  <li>freedreno: move bind_sampler_states to per-generation</li>
+  <li>freedreno: add texcoord clamp support to lowering</li>
+  <li>freedreno/a3xx: add support to emulate GL_CLAMP</li>
+  <li>freedreno/a3xx: re-emit shaders on variant change</li>
+  <li>freedreno/lowering: fix token calculation for lowering</li>
+  <li>freedreno: destroy transfer pool after blitter</li>
+  <li>freedreno: max-texture-lod-bias should be 15.0f</li>
+  <li>freedreno: update generated headers</li>
+  <li>freedreno/a3xx: handle large shader program sizes</li>
+  <li>freedreno/a3xx: emit all immediates in one shot</li>
+  <li>freedreno/ir3: fix lockups with lame FRAG shaders</li>
+  <li>freedreno/a3xx: handle VS only outputting BCOLOR</li>
+  <li>freedreno: query fixes</li>
+  <li>freedreno/a3xx: refactor vertex state emit</li>
+  <li>freedreno/a3xx: refactor/optimize emit</li>
+  <li>freedreno/ir3: optimize shader key comparision</li>
+  <li>freedreno: inline fd_draw_emit()</li>
+  <li>freedreno: fix layer_stride</li>
+  <li>freedreno: update generated headers</li>
+  <li>freedreno/ir3: large const support</li>
+  <li>freedreno/a3xx: more layer/level fixes</li>
+  <li>freedreno/ir3: comment + better fxn name</li>
+  <li>freedreno/ir3: fix potential gpu lockup with kill</li>
+  <li>freedreno/a3xx: disable early-z when we have kill's</li>
+  <li>freedreno/ir3: add debug flag to disable cp</li>
+  <li>freedreno: clear vs scissor</li>
+  <li>freedreno: mark scissor state dirty when enable bit changes</li>
+  <li>freedreno/a3xx: fix viewport state during clear</li>
+  <li>freedreno/a3xx: fix depth/stencil restore format</li>
+</ul>
+
+<p>Tapani Pälli (2):</p>
+<ul>
+  <li>glsl: fix uniform location count used for glsl types</li>
+  <li>mesa: check that uniform exists in glUniform* functions</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
@@ -465,7 +465,7 @@ dd_configuration(enum drm_conf conf)
 #endif
 #if defined(GALLIUM_FREEDRENO)
   if ((strcmp(driver_name, "kgsl") == 0) || (strcmp(driver_name, "msm") == 0))
-      return NULL;
+      return configuration_query(conf);
   else
 #endif
      return NULL;
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -91,6 +91,23 @@ util_format_is_luminance(enum pipe_format format)
   return FALSE;
 }

+boolean
+util_format_is_alpha(enum pipe_format format)
+{
+   const struct util_format_description *desc =
+      util_format_description(format);
+
+   if ((desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) &&
+       desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_0 &&
+       desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_0 &&
+       desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_0 &&
+       desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_X) {
+      return TRUE;
+   }
+   return FALSE;
+}
+
 boolean
 util_format_is_pure_integer(enum pipe_format format)
 {
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -661,6 +661,8 @@ util_format_has_alpha(enum pipe_format format);
 boolean
 util_format_is_luminance(enum pipe_format format);

+boolean
+util_format_is_alpha(enum pipe_format format);

 boolean
 util_format_is_luminance_alpha(enum pipe_format format);
--- a/src/gallium/drivers/freedreno/.dir-locals.el
+++ b/src/gallium/drivers/freedreno/.dir-locals.el
@@ -0,0 +1,8 @@
+((nil
+  (indent-tabs-mode . true)
+  (tab-width . 4)
+  (c-basic-offset . 4)
+  (c-file-style . "k&r")
+  (fill-column . 78)
+  )
+ )
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (   9859 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14477 bytes, from 2014-07-19 17:20:53)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  58020 bytes, from 2014-07-19 17:21:17)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  36670 bytes, from 2014-07-19 17:18:34)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10347 bytes, from 2014-10-01 18:55:57)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14960 bytes, from 2014-07-27 17:22:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  60533 bytes, from 2014-10-15 18:32:43)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  41068 bytes, from 2014-08-01 12:22:48)

 Copyright (C) 2013-2014 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -1243,13 +1243,13 @@ static inline uint32_t A2XX_CLEAR_COLOR_ALPHA(uint32_t val)
 #define A2XX_PA_SU_POINT_SIZE_HEIGHT__SHIFT			0
 static inline uint32_t A2XX_PA_SU_POINT_SIZE_HEIGHT(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A2XX_PA_SU_POINT_SIZE_HEIGHT__SHIFT) & A2XX_PA_SU_POINT_SIZE_HEIGHT__MASK;
+	return ((((uint32_t)(val * 16.0))) << A2XX_PA_SU_POINT_SIZE_HEIGHT__SHIFT) & A2XX_PA_SU_POINT_SIZE_HEIGHT__MASK;
 }
 #define A2XX_PA_SU_POINT_SIZE_WIDTH__MASK			0xffff0000
 #define A2XX_PA_SU_POINT_SIZE_WIDTH__SHIFT			16
 static inline uint32_t A2XX_PA_SU_POINT_SIZE_WIDTH(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A2XX_PA_SU_POINT_SIZE_WIDTH__SHIFT) & A2XX_PA_SU_POINT_SIZE_WIDTH__MASK;
+	return ((((uint32_t)(val * 16.0))) << A2XX_PA_SU_POINT_SIZE_WIDTH__SHIFT) & A2XX_PA_SU_POINT_SIZE_WIDTH__MASK;
 }

 #define REG_A2XX_PA_SU_POINT_MINMAX				0x00002281
@@ -1257,13 +1257,13 @@ static inline uint32_t A2XX_PA_SU_POINT_SIZE_WIDTH(float val)
 #define A2XX_PA_SU_POINT_MINMAX_MIN__SHIFT			0
 static inline uint32_t A2XX_PA_SU_POINT_MINMAX_MIN(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A2XX_PA_SU_POINT_MINMAX_MIN__SHIFT) & A2XX_PA_SU_POINT_MINMAX_MIN__MASK;
+	return ((((uint32_t)(val * 16.0))) << A2XX_PA_SU_POINT_MINMAX_MIN__SHIFT) & A2XX_PA_SU_POINT_MINMAX_MIN__MASK;
 }
 #define A2XX_PA_SU_POINT_MINMAX_MAX__MASK			0xffff0000
 #define A2XX_PA_SU_POINT_MINMAX_MAX__SHIFT			16
 static inline uint32_t A2XX_PA_SU_POINT_MINMAX_MAX(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A2XX_PA_SU_POINT_MINMAX_MAX__SHIFT) & A2XX_PA_SU_POINT_MINMAX_MAX__MASK;
+	return ((((uint32_t)(val * 16.0))) << A2XX_PA_SU_POINT_MINMAX_MAX__SHIFT) & A2XX_PA_SU_POINT_MINMAX_MAX__MASK;
 }

 #define REG_A2XX_PA_SU_LINE_CNTL				0x00002282
@@ -1271,7 +1271,7 @@ static inline uint32_t A2XX_PA_SU_POINT_MINMAX_MAX(float val)
 #define A2XX_PA_SU_LINE_CNTL_WIDTH__SHIFT			0
 static inline uint32_t A2XX_PA_SU_LINE_CNTL_WIDTH(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A2XX_PA_SU_LINE_CNTL_WIDTH__SHIFT) & A2XX_PA_SU_LINE_CNTL_WIDTH__MASK;
+	return ((((uint32_t)(val * 16.0))) << A2XX_PA_SU_LINE_CNTL_WIDTH__SHIFT) & A2XX_PA_SU_LINE_CNTL_WIDTH__MASK;
 }

 #define REG_A2XX_PA_SC_LINE_STIPPLE				0x00002283
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -30,7 +30,6 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
-#include "util/u_pack_color.h"

 #include "freedreno_state.h"
 #include "freedreno_resource.h"
@@ -57,8 +56,8 @@ emit_cacheflush(struct fd_ringbuffer *ring)
 static void
 emit_vertexbufs(struct fd_context *ctx)
 {
-	struct fd_vertex_stateobj *vtx = ctx->vtx;
-	struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf;
+	struct fd_vertex_stateobj *vtx = ctx->vtx.vtx;
+	struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vtx.vertexbuf;
 	struct fd2_vertex_buf bufs[PIPE_MAX_ATTRIBS];
 	unsigned i;

@@ -118,14 +117,6 @@ fd2_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
 }


-static uint32_t
-pack_rgba(enum pipe_format format, const float *rgba)
-{
-	union util_color uc;
-	util_pack_color(rgba, format, &uc);
-	return uc.ui[0];
-}
-
 static void
 fd2_clear(struct fd_context *ctx, unsigned buffers,
 		const union pipe_color_union *color, double depth, unsigned stencil)
--- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
@@ -317,10 +317,10 @@ fd2_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL));
 	OUT_RING(ring, 0x00000000);

-	if (ctx->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
 		emit_mem2gmem_surf(ctx, bin_w * bin_h, pfb->zsbuf);

-	if (ctx->restore & FD_BUFFER_COLOR)
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR))
 		emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0]);

 	/* TODO blob driver seems to toss in a CACHE_FLUSH after each DRAW_INDX.. */
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
@@ -174,7 +174,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
 		struct ir2_instruction *instr = so->vfetch_instrs[i];
 		struct pipe_vertex_element *elem = &vtx->pipe[i];
 		struct pipe_vertex_buffer *vb =
-				&ctx->vertexbuf.vb[elem->vertex_buffer_index];
+				&ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index];
 		enum pipe_format format = elem->src_format;
 		const struct util_format_description *desc =
 				util_format_description(format);
@@ -258,7 +258,7 @@ fd2_program_validate(struct fd_context *ctx)

 	/* if necessary, fix up vertex fetch instructions: */
 	if (ctx->dirty & (FD_DIRTY_VTXSTATE | FD_DIRTY_PROG))
-		patch_vtx_fetches(ctx, prog->vp, ctx->vtx);
+		patch_vtx_fetches(ctx, prog->vp, ctx->vtx.vtx);

 	/* if necessary, fix up texture fetch instructions: */
 	if (ctx->dirty & (FD_DIRTY_TEXSTATE | FD_DIRTY_PROG)) {
--- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.c
@@ -101,6 +101,25 @@ fd2_sampler_state_create(struct pipe_context *pctx,
 	return so;
 }

+static void
+fd2_sampler_states_bind(struct pipe_context *pctx,
+		unsigned shader, unsigned start,
+		unsigned nr, void **hwcso)
+{
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		struct fd_context *ctx = fd_context(pctx);
+
+		/* on a2xx, since there is a flat address space for textures/samplers,
+		 * a change in # of fragment textures/samplers will trigger patching and
+		 * re-emitting the vertex shader:
+		 */
+		if (nr != ctx->fragtex.num_samplers)
+			ctx->dirty |= FD_DIRTY_TEXSTATE;
+	}
+
+	fd_sampler_states_bind(pctx, shader, start, nr, hwcso);
+}
+
 static struct pipe_sampler_view *
 fd2_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 		const struct pipe_sampler_view *cso)
@@ -154,5 +173,6 @@ void
 fd2_texture_init(struct pipe_context *pctx)
 {
 	pctx->create_sampler_state = fd2_sampler_state_create;
+	pctx->bind_sampler_states = fd2_sampler_states_bind;
 	pctx->create_sampler_view = fd2_sampler_view_create;
 }
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (   9859 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14477 bytes, from 2014-07-19 17:20:53)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  58020 bytes, from 2014-07-19 17:21:17)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  36670 bytes, from 2014-07-19 17:18:34)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10347 bytes, from 2014-10-01 18:55:57)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14960 bytes, from 2014-07-27 17:22:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  60533 bytes, from 2014-10-15 18:32:43)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  41068 bytes, from 2014-08-01 12:22:48)

 Copyright (C) 2013-2014 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -86,6 +86,14 @@ enum a3xx_vtx_fmt {
 	VFMT_NORM_USHORT_16_16 = 29,
 	VFMT_NORM_USHORT_16_16_16 = 30,
 	VFMT_NORM_USHORT_16_16_16_16 = 31,
+	VFMT_UINT_32 = 32,
+	VFMT_UINT_32_32 = 33,
+	VFMT_UINT_32_32_32 = 34,
+	VFMT_UINT_32_32_32_32 = 35,
+	VFMT_INT_32 = 36,
+	VFMT_INT_32_32 = 37,
+	VFMT_INT_32_32_32 = 38,
+	VFMT_INT_32_32_32_32 = 39,
 	VFMT_UBYTE_8 = 40,
 	VFMT_UBYTE_8_8 = 41,
 	VFMT_UBYTE_8_8_8 = 42,
@@ -112,6 +120,7 @@ enum a3xx_tex_fmt {
 	TFMT_NORM_USHORT_565 = 4,
 	TFMT_NORM_USHORT_5551 = 6,
 	TFMT_NORM_USHORT_4444 = 7,
+	TFMT_NORM_USHORT_Z16 = 9,
 	TFMT_NORM_UINT_X8Z24 = 10,
 	TFMT_NORM_UINT_NV12_UV_TILED = 17,
 	TFMT_NORM_UINT_NV12_Y_TILED = 19,
@@ -149,6 +158,7 @@ enum a3xx_color_fmt {
 	RB_R8G8B8A8_UNORM = 8,
 	RB_Z16_UNORM = 12,
 	RB_A8_UNORM = 20,
+	RB_R8_UNORM = 21,
 	RB_R16G16B16A16_FLOAT = 27,
 	RB_R32G32B32A32_FLOAT = 51,
 };
@@ -194,6 +204,11 @@ enum a3xx_rb_blend_opcode {
 	BLEND_MAX_DST_SRC = 4,
 };

+enum a3xx_intp_mode {
+	SMOOTH = 0,
+	FLAT = 1,
+};
+
 enum a3xx_tex_filter {
 	A3XX_TEX_NEAREST = 0,
 	A3XX_TEX_LINEAR = 1,
@@ -632,13 +647,13 @@ static inline uint32_t A3XX_GRAS_CL_VPORT_ZSCALE(float val)
 #define A3XX_GRAS_SU_POINT_MINMAX_MIN__SHIFT			0
 static inline uint32_t A3XX_GRAS_SU_POINT_MINMAX_MIN(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A3XX_GRAS_SU_POINT_MINMAX_MIN__SHIFT) & A3XX_GRAS_SU_POINT_MINMAX_MIN__MASK;
+	return ((((uint32_t)(val * 16.0))) << A3XX_GRAS_SU_POINT_MINMAX_MIN__SHIFT) & A3XX_GRAS_SU_POINT_MINMAX_MIN__MASK;
 }
 #define A3XX_GRAS_SU_POINT_MINMAX_MAX__MASK			0xffff0000
 #define A3XX_GRAS_SU_POINT_MINMAX_MAX__SHIFT			16
 static inline uint32_t A3XX_GRAS_SU_POINT_MINMAX_MAX(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A3XX_GRAS_SU_POINT_MINMAX_MAX__SHIFT) & A3XX_GRAS_SU_POINT_MINMAX_MAX__MASK;
+	return ((((uint32_t)(val * 16.0))) << A3XX_GRAS_SU_POINT_MINMAX_MAX__SHIFT) & A3XX_GRAS_SU_POINT_MINMAX_MAX__MASK;
 }

 #define REG_A3XX_GRAS_SU_POINT_SIZE				0x00002069
@@ -646,7 +661,7 @@ static inline uint32_t A3XX_GRAS_SU_POINT_MINMAX_MAX(float val)
 #define A3XX_GRAS_SU_POINT_SIZE__SHIFT				0
 static inline uint32_t A3XX_GRAS_SU_POINT_SIZE(float val)
 {
-	return ((((uint32_t)(val * 8.0))) << A3XX_GRAS_SU_POINT_SIZE__SHIFT) & A3XX_GRAS_SU_POINT_SIZE__MASK;
+	return ((((int32_t)(val * 16.0))) << A3XX_GRAS_SU_POINT_SIZE__SHIFT) & A3XX_GRAS_SU_POINT_SIZE__MASK;
 }

 #define REG_A3XX_GRAS_SU_POLY_OFFSET_SCALE			0x0000206c
@@ -654,7 +669,7 @@ static inline uint32_t A3XX_GRAS_SU_POINT_SIZE(float val)
 #define A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL__SHIFT		0
 static inline uint32_t A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(float val)
 {
-	return ((((uint32_t)(val * 28.0))) << A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL__SHIFT) & A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL__MASK;
+	return ((((int32_t)(val * 16384.0))) << A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL__SHIFT) & A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL__MASK;
 }

 #define REG_A3XX_GRAS_SU_POLY_OFFSET_OFFSET			0x0000206d
@@ -662,7 +677,7 @@ static inline uint32_t A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(float val)
 #define A3XX_GRAS_SU_POLY_OFFSET_OFFSET__SHIFT			0
 static inline uint32_t A3XX_GRAS_SU_POLY_OFFSET_OFFSET(float val)
 {
-	return ((((uint32_t)(val * 28.0))) << A3XX_GRAS_SU_POLY_OFFSET_OFFSET__SHIFT) & A3XX_GRAS_SU_POLY_OFFSET_OFFSET__MASK;
+	return ((((int32_t)(val * 16384.0))) << A3XX_GRAS_SU_POLY_OFFSET_OFFSET__SHIFT) & A3XX_GRAS_SU_POLY_OFFSET_OFFSET__MASK;
 }

 #define REG_A3XX_GRAS_SU_MODE_CONTROL				0x00002070
@@ -673,7 +688,7 @@ static inline uint32_t A3XX_GRAS_SU_POLY_OFFSET_OFFSET(float val)
 #define A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__SHIFT		3
 static inline uint32_t A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(float val)
 {
-	return ((((uint32_t)(val * 4.0))) << A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__SHIFT) & A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__MASK;
+	return ((((int32_t)(val * 4.0))) << A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__SHIFT) & A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__MASK;
 }
 #define A3XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET			0x00000800

@@ -1265,6 +1280,7 @@ static inline uint32_t A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(enum adreno_pa_
 {
 	return ((val) << A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE__SHIFT) & A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE__MASK;
 }
+#define A3XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART			0x00100000
 #define A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST		0x02000000
 #define A3XX_PC_PRIM_VTX_CNTL_PSIZE				0x04000000

@@ -1281,7 +1297,12 @@ static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(enum a3xx_threadsize
 #define A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART			0x00000200
 #define A3XX_HLSQ_CONTROL_0_REG_RESERVED2			0x00000400
 #define A3XX_HLSQ_CONTROL_0_REG_CHUNKDISABLE			0x04000000
-#define A3XX_HLSQ_CONTROL_0_REG_CONSTSWITCHMODE			0x08000000
+#define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__MASK			0x08000000
+#define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__SHIFT		27
+static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(uint32_t val)
+{
+	return ((val) << A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__MASK;
+}
 #define A3XX_HLSQ_CONTROL_0_REG_LAZYUPDATEDISABLE		0x10000000
 #define A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE		0x20000000
 #define A3XX_HLSQ_CONTROL_0_REG_TPFULLUPDATE			0x40000000
@@ -1537,6 +1558,7 @@ static inline uint32_t A3XX_VFD_DECODE_INSTR_REGID(uint32_t val)
 {
 	return ((val) << A3XX_VFD_DECODE_INSTR_REGID__SHIFT) & A3XX_VFD_DECODE_INSTR_REGID__MASK;
 }
+#define A3XX_VFD_DECODE_INSTR_INT				0x00100000
 #define A3XX_VFD_DECODE_INSTR_SWAP__MASK			0x00c00000
 #define A3XX_VFD_DECODE_INSTR_SWAP__SHIFT			22
 static inline uint32_t A3XX_VFD_DECODE_INSTR_SWAP(enum a3xx_color_swap val)
@@ -1604,6 +1626,102 @@ static inline uint32_t A3XX_VPC_PACK_NUMNONPOSVSVAR(uint32_t val)
 static inline uint32_t REG_A3XX_VPC_VARYING_INTERP(uint32_t i0) { return 0x00002282 + 0x1*i0; }

 static inline uint32_t REG_A3XX_VPC_VARYING_INTERP_MODE(uint32_t i0) { return 0x00002282 + 0x1*i0; }
+#define A3XX_VPC_VARYING_INTERP_MODE_C0__MASK			0x00000003
+#define A3XX_VPC_VARYING_INTERP_MODE_C0__SHIFT			0
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C0(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C0__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C0__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C1__MASK			0x0000000c
+#define A3XX_VPC_VARYING_INTERP_MODE_C1__SHIFT			2
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C1(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C1__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C1__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C2__MASK			0x00000030
+#define A3XX_VPC_VARYING_INTERP_MODE_C2__SHIFT			4
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C2(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C2__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C2__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C3__MASK			0x000000c0
+#define A3XX_VPC_VARYING_INTERP_MODE_C3__SHIFT			6
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C3(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C3__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C3__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C4__MASK			0x00000300
+#define A3XX_VPC_VARYING_INTERP_MODE_C4__SHIFT			8
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C4(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C4__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C4__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C5__MASK			0x00000c00
+#define A3XX_VPC_VARYING_INTERP_MODE_C5__SHIFT			10
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C5(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C5__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C5__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C6__MASK			0x00003000
+#define A3XX_VPC_VARYING_INTERP_MODE_C6__SHIFT			12
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C6(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C6__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C6__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C7__MASK			0x0000c000
+#define A3XX_VPC_VARYING_INTERP_MODE_C7__SHIFT			14
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C7(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C7__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C7__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C8__MASK			0x00030000
+#define A3XX_VPC_VARYING_INTERP_MODE_C8__SHIFT			16
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C8(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C8__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C8__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_C9__MASK			0x000c0000
+#define A3XX_VPC_VARYING_INTERP_MODE_C9__SHIFT			18
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_C9(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_C9__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_C9__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_CA__MASK			0x00300000
+#define A3XX_VPC_VARYING_INTERP_MODE_CA__SHIFT			20
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_CA(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_CA__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_CA__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_CB__MASK			0x00c00000
+#define A3XX_VPC_VARYING_INTERP_MODE_CB__SHIFT			22
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_CB(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_CB__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_CB__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_CC__MASK			0x03000000
+#define A3XX_VPC_VARYING_INTERP_MODE_CC__SHIFT			24
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_CC(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_CC__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_CC__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_CD__MASK			0x0c000000
+#define A3XX_VPC_VARYING_INTERP_MODE_CD__SHIFT			26
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_CD(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_CD__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_CD__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_CE__MASK			0x30000000
+#define A3XX_VPC_VARYING_INTERP_MODE_CE__SHIFT			28
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_CE(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_CE__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_CE__MASK;
+}
+#define A3XX_VPC_VARYING_INTERP_MODE_CF__MASK			0xc0000000
+#define A3XX_VPC_VARYING_INTERP_MODE_CF__SHIFT			30
+static inline uint32_t A3XX_VPC_VARYING_INTERP_MODE_CF(enum a3xx_intp_mode val)
+{
+	return ((val) << A3XX_VPC_VARYING_INTERP_MODE_CF__SHIFT) & A3XX_VPC_VARYING_INTERP_MODE_CF__MASK;
+}

 static inline uint32_t REG_A3XX_VPC_VARYING_PS_REPL(uint32_t i0) { return 0x00002286 + 0x1*i0; }

@@ -1696,7 +1814,7 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT__MASK;
 }
-#define A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING__MASK		0x3f000000
+#define A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING__MASK		0x7f000000
 #define A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING__SHIFT		24
 static inline uint32_t A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(uint32_t val)
 {
@@ -2347,17 +2465,23 @@ static inline uint32_t A3XX_TEX_SAMP_0_COMPARE_FUNC(enum adreno_compare_func val
 #define A3XX_TEX_SAMP_0_UNNORM_COORDS				0x80000000

 #define REG_A3XX_TEX_SAMP_1					0x00000001
+#define A3XX_TEX_SAMP_1_LOD_BIAS__MASK				0x000007ff
+#define A3XX_TEX_SAMP_1_LOD_BIAS__SHIFT				0
+static inline uint32_t A3XX_TEX_SAMP_1_LOD_BIAS(float val)
+{
+	return ((((int32_t)(val * 64.0))) << A3XX_TEX_SAMP_1_LOD_BIAS__SHIFT) & A3XX_TEX_SAMP_1_LOD_BIAS__MASK;
+}
 #define A3XX_TEX_SAMP_1_MAX_LOD__MASK				0x003ff000
 #define A3XX_TEX_SAMP_1_MAX_LOD__SHIFT				12
 static inline uint32_t A3XX_TEX_SAMP_1_MAX_LOD(float val)
 {
-	return ((((uint32_t)(val * 12.0))) << A3XX_TEX_SAMP_1_MAX_LOD__SHIFT) & A3XX_TEX_SAMP_1_MAX_LOD__MASK;
+	return ((((uint32_t)(val * 64.0))) << A3XX_TEX_SAMP_1_MAX_LOD__SHIFT) & A3XX_TEX_SAMP_1_MAX_LOD__MASK;
 }
 #define A3XX_TEX_SAMP_1_MIN_LOD__MASK				0xffc00000
 #define A3XX_TEX_SAMP_1_MIN_LOD__SHIFT				22
 static inline uint32_t A3XX_TEX_SAMP_1_MIN_LOD(float val)
 {
-	return ((((uint32_t)(val * 12.0))) << A3XX_TEX_SAMP_1_MIN_LOD__SHIFT) & A3XX_TEX_SAMP_1_MIN_LOD__MASK;
+	return ((((uint32_t)(val * 64.0))) << A3XX_TEX_SAMP_1_MIN_LOD__SHIFT) & A3XX_TEX_SAMP_1_MIN_LOD__MASK;
 }

 #define REG_A3XX_TEX_CONST_0					0x00000000
@@ -2448,6 +2572,24 @@ static inline uint32_t A3XX_TEX_CONST_2_SWAP(enum a3xx_color_swap val)
 }

 #define REG_A3XX_TEX_CONST_3					0x00000003
+#define A3XX_TEX_CONST_3_LAYERSZ1__MASK				0x0000000f
+#define A3XX_TEX_CONST_3_LAYERSZ1__SHIFT			0
+static inline uint32_t A3XX_TEX_CONST_3_LAYERSZ1(uint32_t val)
+{
+	return ((val >> 12) << A3XX_TEX_CONST_3_LAYERSZ1__SHIFT) & A3XX_TEX_CONST_3_LAYERSZ1__MASK;
+}
+#define A3XX_TEX_CONST_3_DEPTH__MASK				0x0ffe0000
+#define A3XX_TEX_CONST_3_DEPTH__SHIFT				17
+static inline uint32_t A3XX_TEX_CONST_3_DEPTH(uint32_t val)
+{
+	return ((val) << A3XX_TEX_CONST_3_DEPTH__SHIFT) & A3XX_TEX_CONST_3_DEPTH__MASK;
+}
+#define A3XX_TEX_CONST_3_LAYERSZ2__MASK				0xf0000000
+#define A3XX_TEX_CONST_3_LAYERSZ2__SHIFT			28
+static inline uint32_t A3XX_TEX_CONST_3_LAYERSZ2(uint32_t val)
+{
+	return ((val >> 12) << A3XX_TEX_CONST_3_LAYERSZ2__SHIFT) & A3XX_TEX_CONST_3_LAYERSZ2__MASK;
+}


 #endif /* A3XX_XML */
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -49,6 +49,9 @@ fd3_context_destroy(struct pipe_context *pctx)
 	fd_bo_del(fd3_ctx->fs_pvt_mem);
 	fd_bo_del(fd3_ctx->vsc_size_mem);

+	pctx->delete_vertex_elements_state(pctx, fd3_ctx->solid_vbuf_state.vtx);
+	pctx->delete_vertex_elements_state(pctx, fd3_ctx->blit_vbuf_state.vtx);
+
 	pipe_resource_reference(&fd3_ctx->solid_vbuf, NULL);
 	pipe_resource_reference(&fd3_ctx->blit_texcoord_vbuf, NULL);

@@ -135,7 +138,38 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv)
 	fd3_ctx->solid_vbuf = create_solid_vertexbuf(pctx);
 	fd3_ctx->blit_texcoord_vbuf = create_blit_texcoord_vertexbuf(pctx);

+	/* setup solid_vbuf_state: */
+	fd3_ctx->solid_vbuf_state.vtx = pctx->create_vertex_elements_state(
+			pctx, 1, (struct pipe_vertex_element[]){{
+				.vertex_buffer_index = 0,
+				.src_offset = 0,
+				.src_format = PIPE_FORMAT_R32G32B32_FLOAT,
+			}});
+	fd3_ctx->solid_vbuf_state.vertexbuf.count = 1;
+	fd3_ctx->solid_vbuf_state.vertexbuf.vb[0].stride = 12;
+	fd3_ctx->solid_vbuf_state.vertexbuf.vb[0].buffer = fd3_ctx->solid_vbuf;
+
+	/* setup blit_vbuf_state: */
+	fd3_ctx->blit_vbuf_state.vtx = pctx->create_vertex_elements_state(
+			pctx, 2, (struct pipe_vertex_element[]){{
+				.vertex_buffer_index = 0,
+				.src_offset = 0,
+				.src_format = PIPE_FORMAT_R32G32_FLOAT,
+			}, {
+				.vertex_buffer_index = 1,
+				.src_offset = 0,
+				.src_format = PIPE_FORMAT_R32G32B32_FLOAT,
+			}});
+	fd3_ctx->blit_vbuf_state.vertexbuf.count = 2;
+	fd3_ctx->blit_vbuf_state.vertexbuf.vb[0].stride = 8;
+	fd3_ctx->blit_vbuf_state.vertexbuf.vb[0].buffer = fd3_ctx->blit_texcoord_vbuf;
+	fd3_ctx->blit_vbuf_state.vertexbuf.vb[1].stride = 12;
+	fd3_ctx->blit_vbuf_state.vertexbuf.vb[1].buffer = fd3_ctx->solid_vbuf;
+
 	fd3_query_context_init(pctx);

+	fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096,
+			2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, 0);
+
 	return pctx;
 }
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -29,10 +29,15 @@
 #ifndef FD3_CONTEXT_H_
 #define FD3_CONTEXT_H_

+#include "util/u_upload_mgr.h"
+
 #include "freedreno_drmif.h"

 #include "freedreno_context.h"

+#include "ir3_shader.h"
+
+
 struct fd3_context {
 	struct fd_context base;

@@ -56,6 +61,55 @@ struct fd3_context {
 	/* vertex buf used for mem->gmem tex coords:
 	 */
 	struct pipe_resource *blit_texcoord_vbuf;
+
+	/* vertex state for solid_vbuf:
+	 *    - solid_vbuf / 12 / R32G32B32_FLOAT
+	 */
+	struct fd_vertex_state solid_vbuf_state;
+
+	/* vertex state for blit_prog:
+	 *    - blit_texcoord_vbuf / 8 / R32G32_FLOAT
+	 *    - solid_vbuf / 12 / R32G32B32_FLOAT
+	 */
+	struct fd_vertex_state blit_vbuf_state;
+
+
+	/*
+	 * Border color layout *appears* to be as arrays of 0x40 byte
+	 * elements, with frag shader elements starting at (16 x 0x40).
+	 * But at some point I should probably experiment more with
+	 * samplers in vertex shaders to be sure.  Unclear about why
+	 * there is this offset when there are separate VS and FS base
+	 * addr regs.
+	 *
+	 * The first 8 bytes of each entry are the requested border
+	 * color in fp16.  Unclear about the rest.. could be used for
+	 * other formats, or could simply be for aligning the pitch
+	 * to 32 pixels.
+	 */
+#define BORDERCOLOR_SIZE 0x40
+
+	struct u_upload_mgr *border_color_uploader;
+	struct pipe_resource *border_color_buf;
+
+	/* true if *any* bits are set in {v,f}saturate_{s,t,r} */
+	bool vsaturate, fsaturate;
+
+	/* bitmask of samplers which need coords clamped for vertex
+	 * shader:
+	 */
+	unsigned vsaturate_s, vsaturate_t, vsaturate_r;
+
+	/* bitmask of samplers which need coords clamped for frag
+	 * shader:
+	 */
+	unsigned fsaturate_s, fsaturate_t, fsaturate_r;
+
+	/* some state changes require a different shader variant.  Keep
+	 * track of this so we know when we need to re-emit shader state
+	 * due to variant change.  See fixup_shader_state()
+	 */
+	struct ir3_shader_key last_key;
 };

 static INLINE struct fd3_context *
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -30,6 +30,7 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
+#include "util/u_format.h"

 #include "freedreno_state.h"
 #include "freedreno_resource.h"
@@ -43,39 +44,15 @@


 static void
-emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct ir3_shader_key key)
+draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
+		struct fd3_emit *emit)
 {
-	struct fd_vertex_stateobj *vtx = ctx->vtx;
-	struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf;
-	struct fd3_vertex_buf bufs[PIPE_MAX_ATTRIBS];
-	unsigned i;
+	const struct pipe_draw_info *info = emit->info;

-	if (!vtx->num_elements)
-		return;
+	fd3_emit_state(ctx, ring, emit);

-	for (i = 0; i < vtx->num_elements; i++) {
-		struct pipe_vertex_element *elem = &vtx->pipe[i];
-		struct pipe_vertex_buffer *vb =
-				&vertexbuf->vb[elem->vertex_buffer_index];
-		bufs[i].offset = vb->buffer_offset + elem->src_offset;
-		bufs[i].stride = vb->stride;
-		bufs[i].prsc   = vb->buffer;
-		bufs[i].format = elem->src_format;
-	}
-
-	fd3_emit_vertex_bufs(ring, fd3_shader_variant(ctx->prog.vp, key),
-			bufs, vtx->num_elements);
-}
-
-static void
-draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
-		struct fd_ringbuffer *ring, unsigned dirty, struct ir3_shader_key key)
-{
-	fd3_emit_state(ctx, ring, &ctx->prog, dirty, key);
-
-	if (dirty & FD_DIRTY_VTXBUF)
-		emit_vertexbufs(ctx, ring, key);
+	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
+		fd3_emit_vertex_bufs(ring, emit);

 	OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1);
 	OUT_RING(ring, 0x0000000b);             /* PC_VERTEX_REUSE_BLOCK_CNTL */
@@ -91,27 +68,103 @@ draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
 			info->restart_index : 0xffffffff);

 	fd_draw_emit(ctx, ring,
-			key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
+			emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
 			info);
 }

+/* fixup dirty shader state in case some "unrelated" (from the state-
+ * tracker's perspective) state change causes us to switch to a
+ * different variant.
+ */
+static void
+fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
+{
+	struct fd3_context *fd3_ctx = fd3_context(ctx);
+	struct ir3_shader_key *last_key = &fd3_ctx->last_key;
+
+	if (!ir3_shader_key_equal(last_key, key)) {
+		ctx->dirty |= FD_DIRTY_PROG;
+
+		if (last_key->has_per_samp || key->has_per_samp) {
+			if ((last_key->vsaturate_s != key->vsaturate_s) ||
+					(last_key->vsaturate_t != key->vsaturate_t) ||
+					(last_key->vsaturate_r != key->vsaturate_r))
+				ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
+
+			if ((last_key->fsaturate_s != key->fsaturate_s) ||
+					(last_key->fsaturate_t != key->fsaturate_t) ||
+					(last_key->fsaturate_r != key->fsaturate_r))
+				ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
+		}
+
+		if (last_key->color_two_side != key->color_two_side)
+			ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
+
+		if (last_key->half_precision != key->half_precision)
+			ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
+
+		if (last_key->alpha != key->alpha)
+			ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
+
+		fd3_ctx->last_key = *key;
+	}
+}
+
 static void
 fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
 {
-	unsigned dirty = ctx->dirty;
-	struct ir3_shader_key key = {
+	struct fd3_context *fd3_ctx = fd3_context(ctx);
+	struct fd3_emit emit = {
+		.vtx  = &ctx->vtx,
+		.prog = &ctx->prog,
+		.info = info,
+		.key = {
 			/* do binning pass first: */
 			.binning_pass = true,
 			.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+			.alpha = util_format_is_alpha(pipe_surface_format(ctx->framebuffer.cbufs[0])),
 			// TODO set .half_precision based on render target format,
 			// ie. float16 and smaller use half, float32 use full..
 			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
+			.has_per_samp = fd3_ctx->fsaturate || fd3_ctx->vsaturate,
+			.vsaturate_s = fd3_ctx->vsaturate_s,
+			.vsaturate_t = fd3_ctx->vsaturate_t,
+			.vsaturate_r = fd3_ctx->vsaturate_r,
+			.fsaturate_s = fd3_ctx->fsaturate_s,
+			.fsaturate_t = fd3_ctx->fsaturate_t,
+			.fsaturate_r = fd3_ctx->fsaturate_r,
+		},
+		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
 	};
-	draw_impl(ctx, info, ctx->binning_ring,
-			dirty & ~(FD_DIRTY_BLEND), key);
+	unsigned dirty;
+
+	fixup_shader_state(ctx, &emit.key);
+
+	dirty = ctx->dirty;
+	emit.dirty = dirty & ~(FD_DIRTY_BLEND);
+	draw_impl(ctx, ctx->binning_ring, &emit);
+
 	/* and now regular (non-binning) pass: */
-	key.binning_pass = false;
-	draw_impl(ctx, info, ctx->ring, dirty, key);
+	emit.key.binning_pass = false;
+	emit.dirty = dirty;
+	emit.vp = NULL;   /* we changed key so need to refetch vp */
+	draw_impl(ctx, ctx->ring, &emit);
+}
+
+/* clear operations ignore viewport state, so we need to reset it
+ * based on framebuffer state:
+ */
+static void
+reset_viewport(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb)
+{
+	float half_width = pfb->width * 0.5f;
+	float half_height = pfb->height * 0.5f;
+
+	OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 4);
+	OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(half_width - 0.5));
+	OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(half_width));
+	OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(half_height - 0.5));
+	OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(-half_height));
 }

 /* binning pass cmds for a clear:
@@ -127,19 +180,19 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
 {
 	struct fd3_context *fd3_ctx = fd3_context(ctx);
 	struct fd_ringbuffer *ring = ctx->binning_ring;
-	struct ir3_shader_key key = {
+	struct fd3_emit emit = {
+		.vtx  = &fd3_ctx->solid_vbuf_state,
+		.prog = &ctx->solid_prog,
+		.key = {
 			.binning_pass = true,
 			.half_precision = true,
+		},
+		.dirty = dirty,
 	};

-	fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
-
-	fd3_emit_vertex_bufs(ring, fd3_shader_variant(ctx->solid_prog.vp, key),
-			(struct fd3_vertex_buf[]) {{
-				.prsc = fd3_ctx->solid_vbuf,
-				.stride = 12,
-				.format = PIPE_FORMAT_R32G32B32_FLOAT,
-			}}, 1);
+	fd3_emit_state(ctx, ring, &emit);
+	fd3_emit_vertex_bufs(ring, &emit);
+	reset_viewport(ring, &ctx->framebuffer);

 	OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
 	OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
@@ -168,17 +221,23 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 	struct fd_ringbuffer *ring = ctx->ring;
 	unsigned dirty = ctx->dirty;
 	unsigned ce, i;
-	struct ir3_shader_key key = {
+	struct fd3_emit emit = {
+		.vtx  = &fd3_ctx->solid_vbuf_state,
+		.prog = &ctx->solid_prog,
+		.key = {
 			.half_precision = true,
+		},
 	};

-	dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
+	dirty &= FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
 	dirty |= FD_DIRTY_PROG;
+	emit.dirty = dirty;

 	fd3_clear_binning(ctx, dirty);

 	/* emit generic state now: */
-	fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
+	fd3_emit_state(ctx, ring, &emit);
+	reset_viewport(ring, &ctx->framebuffer);

 	OUT_PKT0(ring, REG_A3XX_RB_BLEND_ALPHA, 1);
 	OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) |
@@ -269,12 +328,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1);
 	OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0));

-	fd3_emit_vertex_bufs(ring, fd3_shader_variant(ctx->solid_prog.vp, key),
-			(struct fd3_vertex_buf[]) {{
-				.prsc = fd3_ctx->solid_vbuf,
-				.stride = 12,
-				.format = PIPE_FORMAT_R32G32B32_FLOAT,
-			}}, 1);
+	fd3_emit_vertex_bufs(ring, &emit);

 	fd3_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);

--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -92,14 +92,13 @@ emit_constants(struct fd_ringbuffer *ring,
 	uint32_t enabled_mask = constbuf->enabled_mask;
 	uint32_t first_immediate;
 	uint32_t base = 0;
-	unsigned i;

 	// XXX TODO only emit dirty consts.. but we need to keep track if
 	// they are clobbered by a clear, gmem2mem, or mem2gmem..
 	constbuf->dirty_mask = enabled_mask;

-	/* in particular, with binning shader and a unneeded consts no
-	 * longer referenced, we could end up w/ constlen that is smaller
+	/* in particular, with the binning shader we may end up with unused
+	 * consts, i.e. we could end up w/ constlen that is smaller
 	 * than first_immediate.  In that case truncate the user consts
 	 * early to avoid HLSQ lockup caused by writing too many consts
 	 */
@@ -137,12 +136,21 @@ emit_constants(struct fd_ringbuffer *ring,

 	/* emit shader immediates: */
 	if (shader) {
-		for (i = 0; i < shader->immediates_count; i++) {
-			base = 4 * (shader->first_immediate + i);
-			if (base >= (4 * shader->constlen))
-				break;
+		int size = shader->immediates_count;
+		base = shader->first_immediate;
+
+		/* truncate size to avoid writing constants that shader
+		 * does not use:
+		 */
+		size = MIN2(size + base, shader->constlen) - base;
+
+		/* convert out of vec4: */
+		base *= 4;
+		size *= 4;
+
+		if (size > 0) {
 			fd3_emit_constant(ring, sb, base,
-				0, 4, shader->immediates[i].val, NULL);
+				0, size, shader->immediates[0].val, NULL);
 		}
 	}
 }
@@ -152,9 +160,8 @@ emit_constants(struct fd_ringbuffer *ring,
 #define BASETABLE_SZ    A3XX_MAX_MIP_LEVELS

 static void
-emit_textures(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_texture_stateobj *tex)
+emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
+		enum adreno_state_block sb, struct fd_texture_stateobj *tex)
 {
 	static const unsigned tex_off[] = {
 			[SB_VERT_TEX] = VERT_TEX_OFF,
@@ -164,7 +171,18 @@ emit_textures(struct fd_ringbuffer *ring,
 			[SB_VERT_TEX] = SB_VERT_MIPADDR,
 			[SB_FRAG_TEX] = SB_FRAG_MIPADDR,
 	};
-	unsigned i, j;
+	static const uint32_t bcolor_reg[] = {
+			[SB_VERT_TEX] = REG_A3XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR,
+			[SB_FRAG_TEX] = REG_A3XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR,
+	};
+	struct fd3_context *fd3_ctx = fd3_context(ctx);
+	unsigned i, j, off;
+	void *ptr;
+
+	u_upload_alloc(fd3_ctx->border_color_uploader,
+			0, 2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE, &off,
+			&fd3_ctx->border_color_buf,
+			&ptr);

 	if (tex->num_samplers > 0) {
 		/* output sampler state: */
@@ -180,6 +198,15 @@ emit_textures(struct fd_ringbuffer *ring,
 			const struct fd3_sampler_stateobj *sampler = tex->samplers[i] ?
 					fd3_sampler_stateobj(tex->samplers[i]) :
 					&dummy_sampler;
+			uint16_t *bcolor = (uint16_t *)((uint8_t *)ptr +
+					(BORDERCOLOR_SIZE * tex_off[sb]) +
+					(BORDERCOLOR_SIZE * i));
+
+			bcolor[0] = util_float_to_half(sampler->base.border_color.f[2]);
+			bcolor[1] = util_float_to_half(sampler->base.border_color.f[1]);
+			bcolor[2] = util_float_to_half(sampler->base.border_color.f[0]);
+			bcolor[3] = util_float_to_half(sampler->base.border_color.f[3]);
+
 			OUT_RING(ring, sampler->texsamp0);
 			OUT_RING(ring, sampler->texsamp1);
 		}
@@ -237,18 +264,31 @@ emit_textures(struct fd_ringbuffer *ring,
 			}
 		}
 	}
+
+	OUT_PKT0(ring, bcolor_reg[sb], 1);
+	OUT_RELOC(ring, fd_resource(fd3_ctx->border_color_buf)->bo, off, 0, 0);
+
+	u_upload_unmap(fd3_ctx->border_color_uploader);
 }

 /* emit texture state for mem->gmem restore operation.. eventually it would
 * be good to get rid of this and use normal CSO/etc state for more of these
 * special cases, but for now the compiler is not sufficient..
+ *
+ * Also, for using normal state, not quite sure how to handle the special
+ * case format (fd3_gmem_restore_format()) stuff for restoring depth/stencil.
 */
 void
 fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf)
 {
 	struct fd_resource *rsc = fd_resource(psurf->texture);
+	unsigned lvl = psurf->u.tex.level;
+	struct fd_resource_slice *slice = &rsc->slices[lvl];
+	uint32_t layer_offset = slice->size0 * psurf->u.tex.first_layer;
 	enum pipe_format format = fd3_gmem_restore_format(psurf->format);

+	debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
+
 	/* output sampler state: */
 	OUT_PKT3(ring, CP_LOAD_STATE, 4);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(FRAG_TEX_OFF) |
@@ -272,14 +312,14 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf
 			CP_LOAD_STATE_0_NUM_UNIT(1));
 	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
 			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-	OUT_RING(ring, A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(psurf->format)) |
+	OUT_RING(ring, A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(format)) |
 			A3XX_TEX_CONST_0_TYPE(A3XX_TEX_2D) |
 			fd3_tex_swiz(format,  PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN,
 					PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA));
 	OUT_RING(ring, A3XX_TEX_CONST_1_FETCHSIZE(TFETCH_DISABLE) |
 			A3XX_TEX_CONST_1_WIDTH(psurf->width) |
 			A3XX_TEX_CONST_1_HEIGHT(psurf->height));
-	OUT_RING(ring, A3XX_TEX_CONST_2_PITCH(rsc->slices[0].pitch * rsc->cpp) |
+	OUT_RING(ring, A3XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp) |
 			A3XX_TEX_CONST_2_INDX(0));
 	OUT_RING(ring, 0x00000000);

@@ -291,18 +331,21 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf
 			CP_LOAD_STATE_0_NUM_UNIT(1));
 	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
 			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-	OUT_RELOC(ring, rsc->bo, 0, 0, 0);
+	OUT_RELOC(ring, rsc->bo, layer_offset, 0, 0);
 }

 void
-fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
-		struct ir3_shader_variant *vp,
-		struct fd3_vertex_buf *vbufs, uint32_t n)
+fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 {
 	uint32_t i, j, last = 0;
 	uint32_t total_in = 0;
+	const struct fd_vertex_state *vtx = emit->vtx;
+	struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
+	unsigned n = MIN2(vtx->vtx->num_elements, vp->inputs_count);

-	n = MIN2(n, vp->inputs_count);
+	/* hw doesn't like to be configured for zero vbo's, it seems: */
+	if (vtx->vtx->num_elements == 0)
+		return;

 	for (i = 0; i < n; i++)
 		if (vp->inputs[i].compmask)
@@ -310,22 +353,25 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,

 	for (i = 0, j = 0; i <= last; i++) {
 		if (vp->inputs[i].compmask) {
-			struct pipe_resource *prsc = vbufs[i].prsc;
-			struct fd_resource *rsc = fd_resource(prsc);
-			enum pipe_format pfmt = vbufs[i].format;
+			struct pipe_vertex_element *elem = &vtx->vtx->pipe[i];
+			const struct pipe_vertex_buffer *vb =
+					&vtx->vertexbuf.vb[elem->vertex_buffer_index];
+			struct fd_resource *rsc = fd_resource(vb->buffer);
+			enum pipe_format pfmt = elem->src_format;
 			enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
 			bool switchnext = (i != last);
+			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);

 			debug_assert(fmt != ~0);

 			OUT_PKT0(ring, REG_A3XX_VFD_FETCH(j), 2);
 			OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) |
-					A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vbufs[i].stride) |
+					A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) |
 					COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) |
 					A3XX_VFD_FETCH_INSTR_0_INDEXCODE(j) |
 					A3XX_VFD_FETCH_INSTR_0_STEPRATE(1));
-			OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 0, 0);
+			OUT_RELOC(ring, rsc->bo, vb->buffer_offset + elem->src_offset, 0, 0);

 			OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(j), 1);
 			OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL |
@@ -335,6 +381,7 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
 					A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) |
 					A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) |
 					A3XX_VFD_DECODE_INSTR_LASTCOMPVALID |
+					COND(isint, A3XX_VFD_DECODE_INSTR_INT) |
 					COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT));

 			total_in += vp->inputs[i].ncomp;
@@ -354,14 +401,11 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,

 void
 fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd_program_stateobj *prog, uint32_t dirty,
-		struct ir3_shader_key key)
+		struct fd3_emit *emit)
 {
-	struct ir3_shader_variant *vp;
-	struct ir3_shader_variant *fp;
-
-	fp = fd3_shader_variant(prog->fp, key);
-	vp = fd3_shader_variant(prog->vp, key);
+	struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
+	struct ir3_shader_variant *fp = fd3_emit_get_fp(emit);
+	uint32_t dirty = emit->dirty;

 	emit_marker(ring, 5);

@@ -372,7 +416,7 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(ctx->sample_mask));
 	}

-	if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !key.binning_pass) {
+	if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) {
 		uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_render_control;

 		val |= COND(fp->frag_face, A3XX_RB_RENDER_CONTROL_FACENESS);
@@ -414,6 +458,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			val |= A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z;
 			val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE;
 		}
+		if (fp->has_kill) {
+			val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE;
+		}
 		OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
 		OUT_RING(ring, val);
 	}
@@ -444,17 +491,27 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, val);
 	}

-	if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
+	/* NOTE: since primitive_restart is not actually part of any
+	 * state object, we need to make sure that we always emit
+	 * PRIM_VTX_CNTL.. either that or be more clever and detect
+	 * when it changes.
+	 */
+	if (emit->info) {
+		const struct pipe_draw_info *info = emit->info;
 		uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer)
 				->pc_prim_vtx_cntl;

-		if (!key.binning_pass) {
+		if (!emit->key.binning_pass) {
 			uint32_t stride_in_vpc = align(fp->total_in, 4) / 4;
 			if (stride_in_vpc > 0)
 				stride_in_vpc = MAX2(stride_in_vpc, 2);
 			val |= A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(stride_in_vpc);
 		}

+		if (info->indexed && info->primitive_restart) {
+			val |= A3XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART;
+		}
+
 		val |= COND(vp->writes_psize, A3XX_PC_PRIM_VTX_CNTL_PSIZE);

 		OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
@@ -487,9 +544,8 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(ctx->viewport.scale[2]));
 	}

-	if (dirty & FD_DIRTY_PROG) {
-		fd3_program_emit(ring, prog, key);
-	}
+	if (dirty & FD_DIRTY_PROG)
+		fd3_program_emit(ring, emit);

 	/* TODO we should not need this or fd_wfi() before emit_constants():
 	 */
@@ -498,15 +554,15 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,

 	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
 			/* evil hack to deal sanely with clear path: */
-			(prog == &ctx->prog)) {
+			(emit->prog == &ctx->prog)) {
 		fd_wfi(ctx, ring);
 		emit_constants(ring,  SB_VERT_SHADER,
 				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				(prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
-		if (!key.binning_pass) {
+				(emit->prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
+		if (!emit->key.binning_pass) {
 			emit_constants(ring, SB_FRAG_SHADER,
 					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					(prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+					(emit->prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
 		}
 	}

@@ -541,14 +597,14 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,

 	if (dirty & FD_DIRTY_VERTTEX) {
 		if (vp->has_samp)
-			emit_textures(ring, SB_VERT_TEX, &ctx->verttex);
+			emit_textures(ctx, ring, SB_VERT_TEX, &ctx->verttex);
 		else
 			dirty &= ~FD_DIRTY_VERTTEX;
 	}

 	if (dirty & FD_DIRTY_FRAGTEX) {
 		if (fp->has_samp)
-			emit_textures(ring, SB_FRAG_TEX, &ctx->fragtex);
+			emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->fragtex);
 		else
 			dirty &= ~FD_DIRTY_FRAGTEX;
 	}
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -33,6 +33,7 @@

 #include "freedreno_context.h"
 #include "fd3_util.h"
+#include "fd3_program.h"
 #include "ir3_shader.h"

 struct fd_ringbuffer;
@@ -46,21 +47,44 @@ void fd3_emit_constant(struct fd_ringbuffer *ring,
 void fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
 		struct pipe_surface *psurf);

-/* NOTE: this just exists because we don't have proper vertex/vertexbuf
- * state objs for clear, and mem2gmem/gmem2mem operations..
- */
-struct fd3_vertex_buf {
-	unsigned offset, stride;
-	struct pipe_resource *prsc;
-	enum pipe_format format;
+/* grouped together emit-state for prog/vertex/state emit: */
+struct fd3_emit {
+	const struct fd_vertex_state *vtx;
+	const struct fd_program_stateobj *prog;
+	const struct pipe_draw_info *info;
+	struct ir3_shader_key key;
+	uint32_t dirty;
+	bool rasterflat;
+
+	/* cached to avoid repeated lookups of same variants: */
+	struct ir3_shader_variant *vp, *fp;
 };

-void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
-		struct ir3_shader_variant *vp,
-		struct fd3_vertex_buf *vbufs, uint32_t n);
+static inline struct ir3_shader_variant *
+fd3_emit_get_vp(struct fd3_emit *emit)
+{
+	if (!emit->vp) {
+		struct fd3_shader_stateobj *so = emit->prog->vp;
+		emit->vp = ir3_shader_variant(so->shader, emit->key);
+	}
+	return emit->vp;
+}
+
+static inline struct ir3_shader_variant *
+fd3_emit_get_fp(struct fd3_emit *emit)
+{
+	if (!emit->fp) {
+		struct fd3_shader_stateobj *so = emit->prog->fp;
+		emit->fp = ir3_shader_variant(so->shader, emit->key);
+	}
+	return emit->fp;
+}
+
+void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit);
+
 void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd_program_stateobj *prog, uint32_t dirty,
-		struct ir3_shader_key key);
+		struct fd3_emit *emit);
+
 void fd3_emit_restore(struct fd_context *ctx);

 #endif /* FD3_EMIT_H */
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -69,6 +69,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		struct fd_resource_slice *slice = NULL;
 		uint32_t stride = 0;
 		uint32_t base = 0;
+		uint32_t layer_offset = 0;

 		if ((i < nr_bufs) && bufs[i]) {
 			struct pipe_surface *psurf = bufs[i];
@@ -78,6 +79,10 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			format = fd3_pipe2color(psurf->format);
 			swap = fd3_pipe2swap(psurf->format);

+			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
+
+			layer_offset = slice->size0 * psurf->u.tex.first_layer;
+
 			if (bin_w) {
 				stride = bin_w * rsc->cpp;

@@ -97,7 +102,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		if (bin_w || (i >= nr_bufs)) {
 			OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base));
 		} else {
-			OUT_RELOCW(ring, rsc->bo, slice->offset, 0, -1);
+			OUT_RELOCW(ring, rsc->bo,
+					slice->offset + layer_offset, 0, -1);
 		}

 		OUT_PKT0(ring, REG_A3XX_SP_FS_IMAGE_OUTPUT_REG(i), 1);
@@ -152,6 +158,11 @@ emit_binning_workaround(struct fd_context *ctx)
 	struct fd3_context *fd3_ctx = fd3_context(ctx);
 	struct fd_gmem_stateobj *gmem = &ctx->gmem;
 	struct fd_ringbuffer *ring = ctx->ring;
+	struct fd3_emit emit = {
+			.vtx = &fd3_ctx->solid_vbuf_state,
+			.prog = &ctx->solid_prog,
+			.key = key,
+	};

 	OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2);
 	OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) |
@@ -177,13 +188,8 @@ emit_binning_workaround(struct fd_context *ctx)
 			A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
 			A3XX_GRAS_SC_CONTROL_RASTER_MODE(1));

-	fd3_program_emit(ring, &ctx->solid_prog, key);
-	fd3_emit_vertex_bufs(ring, fd3_shader_variant(ctx->solid_prog.vp, key),
-			(struct fd3_vertex_buf[]) {{
-				.prsc = fd3_ctx->solid_vbuf,
-				.stride = 12,
-				.format = PIPE_FORMAT_R32G32B32_FLOAT,
-			}}, 1);
+	fd3_program_emit(ring, &emit);
+	fd3_emit_vertex_bufs(ring, &emit);

 	OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4);
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
@@ -303,12 +309,16 @@ emit_gmem2mem_surf(struct fd_context *ctx,
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct fd_resource *rsc = fd_resource(psurf->texture);
 	struct fd_resource_slice *slice = &rsc->slices[psurf->u.tex.level];
+	uint32_t layer_offset = slice->size0 * psurf->u.tex.first_layer;
+
+	debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);

 	OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4);
 	OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) |
 			A3XX_RB_COPY_CONTROL_MODE(mode) |
 			A3XX_RB_COPY_CONTROL_GMEM_BASE(base));
-	OUT_RELOCW(ring, rsc->bo, slice->offset, 0, -1);    /* RB_COPY_DEST_BASE */
+
+	OUT_RELOCW(ring, rsc->bo, slice->offset + layer_offset, 0, -1);    /* RB_COPY_DEST_BASE */
 	OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp));
 	OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) |
 			A3XX_RB_COPY_DEST_INFO_FORMAT(fd3_pipe2color(psurf->format)) |
@@ -326,6 +336,11 @@ fd3_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 	struct fd3_context *fd3_ctx = fd3_context(ctx);
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	struct fd3_emit emit = {
+			.vtx = &fd3_ctx->solid_vbuf_state,
+			.prog = &ctx->solid_prog,
+			.key = key,
+	};

 	OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
 	OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER));
@@ -398,13 +413,8 @@ fd3_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, 0);            /* VFD_INSTANCEID_OFFSET */
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */

-	fd3_program_emit(ring, &ctx->solid_prog, key);
-	fd3_emit_vertex_bufs(ring, fd3_shader_variant(ctx->solid_prog.vp, key),
-			(struct fd3_vertex_buf[]) {{
-				.prsc = fd3_ctx->solid_vbuf,
-				.stride = 12,
-				.format = PIPE_FORMAT_R32G32B32_FLOAT,
-			}}, 1);
+	fd3_program_emit(ring, &emit);
+	fd3_emit_vertex_bufs(ring, &emit);

 	if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
 		uint32_t base = depth_base(ctx);
@@ -448,6 +458,11 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	struct fd_gmem_stateobj *gmem = &ctx->gmem;
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	struct fd3_emit emit = {
+			.vtx = &fd3_ctx->blit_vbuf_state,
+			.prog = &ctx->blit_prog,
+			.key = key,
+	};
 	float x0, y0, x1, y1;
 	unsigned bin_w = tile->bin_w;
 	unsigned bin_h = tile->bin_h;
@@ -542,17 +557,8 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, 0);            /* VFD_INSTANCEID_OFFSET */
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */

-	fd3_program_emit(ring, &ctx->blit_prog, key);
-	fd3_emit_vertex_bufs(ring, fd3_shader_variant(ctx->blit_prog.vp, key),
-			(struct fd3_vertex_buf[]) {{
-				.prsc = fd3_ctx->blit_texcoord_vbuf,
-				.stride = 8,
-				.format = PIPE_FORMAT_R32G32_FLOAT,
-			}, {
-				.prsc = fd3_ctx->solid_vbuf,
-				.stride = 12,
-				.format = PIPE_FORMAT_R32G32B32_FLOAT,
-			}}, 2);
+	fd3_program_emit(ring, &emit);
+	fd3_emit_vertex_bufs(ring, &emit);

 	/* for gmem pitch/base calculations, we need to use the non-
 	 * truncated tile sizes:
@@ -560,10 +566,10 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	bin_w = gmem->bin_w;
 	bin_h = gmem->bin_h;

-	if (ctx->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
 		emit_mem2gmem_surf(ctx, depth_base(ctx), pfb->zsbuf, bin_w);

-	if (ctx->restore & FD_BUFFER_COLOR)
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR))
 		emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0], bin_w);

 	OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
@@ -603,8 +609,11 @@ fd3_emit_sysmem_prep(struct fd_context *ctx)
 	struct fd_ringbuffer *ring = ctx->ring;
 	uint32_t pitch = 0;

-	if (pfb->cbufs[0])
-		pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch;
+	if (pfb->cbufs[0]) {
+		struct pipe_surface *psurf = pfb->cbufs[0];
+		unsigned lvl = psurf->u.tex.level;
+		pitch = fd_resource(psurf->texture)->slices[lvl].pitch;
+	}

 	fd3_emit_restore(ctx);

--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -140,13 +140,21 @@ find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
 	 * in the vertex shader.. but the fragment shader doesn't know this
 	 * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
 	 * at link time if there is no matching OUT.BCOLOR[n], we must map
-	 * OUT.COLOR[n] to IN.BCOLOR[n].
+	 * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
+	 * an OUT.BCOLOR[n] but no matching OUT.COLOR[n].
 	 */
 	if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) {
 		unsigned idx = sem2idx(semantic);
-		return find_output(so, ir3_semantic_name(TGSI_SEMANTIC_COLOR, idx));
+		semantic = ir3_semantic_name(TGSI_SEMANTIC_COLOR, idx);
+	} else if (sem2name(semantic) == TGSI_SEMANTIC_COLOR) {
+		unsigned idx = sem2idx(semantic);
+		semantic = ir3_semantic_name(TGSI_SEMANTIC_BCOLOR, idx);
 	}

+	for (j = 0; j < so->outputs_count; j++)
+		if (so->outputs[j].semantic == semantic)
+			return j;
+
 	debug_assert(0);

 	return 0;
@@ -172,27 +180,72 @@ find_output_regid(const struct ir3_shader_variant *so, ir3_semantic semantic)
 }

 void
-fd3_program_emit(struct fd_ringbuffer *ring,
-		struct fd_program_stateobj *prog, struct ir3_shader_key key)
+fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 {
 	const struct ir3_shader_variant *vp, *fp;
 	const struct ir3_info *vsi, *fsi;
+	enum a3xx_instrbuffermode fpbuffer, vpbuffer;
+	uint32_t fpbuffersz, vpbuffersz, fsoff;
 	uint32_t pos_regid, posz_regid, psize_regid, color_regid;
+	int constmode;
 	int i, j, k;

-	vp = fd3_shader_variant(prog->vp, key);
+	vp = fd3_emit_get_vp(emit);

-	if (key.binning_pass) {
+	if (emit->key.binning_pass) {
 		/* use dummy stateobj to simplify binning vs non-binning: */
 		static const struct ir3_shader_variant binning_fp = {};
 		fp = &binning_fp;
 	} else {
-		fp = fd3_shader_variant(prog->fp, key);
+		fp = fd3_emit_get_fp(emit);
 	}

 	vsi = &vp->info;
 	fsi = &fp->info;

+	fpbuffer = BUFFER;
+	vpbuffer = BUFFER;
+	fpbuffersz = fp->instrlen;
+	vpbuffersz = vp->instrlen;
+
+	/*
+	 * Decide whether to use BUFFER or CACHE mode for VS and FS.  It
+	 * appears like 256 is the hard limit, but when the combined size
+	 * exceeds 128 then blob will try to keep FS in BUFFER mode and
+	 * switch to CACHE for VS until VS is too large.  The blob seems
+	 * to switch FS out of BUFFER mode at slightly under 128.  But
+	 * a bit fuzzy on the decision tree, so use slightly conservative
+	 * limits.
+	 *
+	 * TODO check if these thresholds for BUFFER vs CACHE mode are the
+	 *      same for all a3xx or whether we need to consider the gpuid
+	 */
+
+	if ((fpbuffersz + vpbuffersz) > 128) {
+		if (fpbuffersz < 112) {
+			/* FP:BUFFER   VP:CACHE  */
+			vpbuffer = CACHE;
+			vpbuffersz = 256 - fpbuffersz;
+		} else if (vpbuffersz < 112) {
+			/* FP:CACHE    VP:BUFFER */
+			fpbuffer = CACHE;
+			fpbuffersz = 256 - vpbuffersz;
+		} else {
+			/* FP:CACHE    VP:CACHE  */
+			vpbuffer = fpbuffer = CACHE;
+			vpbuffersz = fpbuffersz = 192;
+		}
+	}
+
+	if (fpbuffer == BUFFER) {
+		fsoff = 128 - fpbuffersz;
+	} else {
+		fsoff = 256 - fpbuffersz;
+	}
+
+	/* seems like if vs->constlen + fs->constlen > 256, then CONSTMODE=1 */
+	constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0;
+
 	pos_regid = find_output_regid(vp,
 		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
 	posz_regid = find_output_regid(fp,
@@ -208,6 +261,7 @@ fd3_program_emit(struct fd_ringbuffer *ring,

 	OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6);
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
+			A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
 			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
 			 * flush some caches? I think we only need to set those
 			 * bits if we have updated const or shader..
@@ -221,14 +275,14 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid));
 	OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
 			A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
-			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vp->instrlen));
+			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz));
 	OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) |
 			A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
-			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fp->instrlen));
+			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz));

 	OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
-	OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) |
-			COND(key.binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) |
+	OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) |
+			COND(emit->key.binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) |
 			A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
 			A3XX_SP_SP_CTRL_REG_L0MODE(0));

@@ -237,18 +291,18 @@ fd3_program_emit(struct fd_ringbuffer *ring,

 	OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
 	OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
-			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
-			A3XX_SP_VS_CTRL_REG0_CACHEINVALID |
+			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) |
+			COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
 			A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
 			A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
 			A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
 			A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
 			A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
 			COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
-			A3XX_SP_VS_CTRL_REG0_LENGTH(vp->instrlen));
+			A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
 	OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
 			A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
-			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vsi->max_const, 0)));
+			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0)));
 	OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
 			A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
 			A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4));
@@ -301,7 +355,7 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 			A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
 	OUT_RELOC(ring, vp->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */

-	if (key.binning_pass) {
+	if (emit->key.binning_pass) {
 		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
 		OUT_RING(ring, 0x00000000);

@@ -309,35 +363,37 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
 				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
 		OUT_RING(ring, 0x00000000);
+
+		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1);
+		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
+				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
 	} else {
 		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
 		OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));

 		OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
 		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
-				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
-				A3XX_SP_FS_CTRL_REG0_CACHEINVALID |
+				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) |
+				COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
 				A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
 				A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
 				A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
 				A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
 				A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
 				COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
-				A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen));
+				A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz));
 		OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
 				A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
-				A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) |
+				A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) |
 				A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
+
 		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
-		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
-				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
+		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(
+					MAX2(128, vp->constlen)) |
+				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff));
 		OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
 	}

-	OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
-	OUT_RING(ring, 0x00000000);        /* SP_FS_FLAT_SHAD_MODE_REG_0 */
-	OUT_RING(ring, 0x00000000);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
-
 	OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1);
 	if (fp->writes_pos) {
 		OUT_RING(ring, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE |
@@ -353,13 +409,37 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
 	OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));

-	if (key.binning_pass) {
+	if (emit->key.binning_pass) {
 		OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
 		OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) |
 				A3XX_VPC_ATTR_LMSIZE(1) |
 				COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE));
 		OUT_RING(ring, 0x00000000);
 	} else {
+		uint32_t vinterp[4] = {0}, flatshade[2] = {0};
+
+		/* figure out VARYING_INTERP / FLAT_SHAD register values: */
+		for (j = -1; (j = next_varying(fp, j)) < (int)fp->inputs_count; ) {
+			uint32_t interp = fp->inputs[j].interpolate;
+			if ((interp == TGSI_INTERPOLATE_CONSTANT) ||
+					((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) {
+				/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
+				 * instead.. rather than -8 everywhere else..
+				 */
+				uint32_t loc = fp->inputs[j].inloc - 8;
+
+				/* currently assuming varyings aligned to 4 (not
+				 * packed):
+				 */
+				debug_assert((loc % 4) == 0);
+
+				for (i = 0; i < 4; i++, loc++) {
+					vinterp[loc / 16] |= FLAT << ((loc % 16) * 2);
+					flatshade[loc / 32] |= 1 << (loc % 32);
+				}
+			}
+		}
+
 		OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
 		OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
 				A3XX_VPC_ATTR_THRDASSIGN(1) |
@@ -369,29 +449,35 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 				A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));

 		OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
-		OUT_RING(ring, fp->shader->vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
-		OUT_RING(ring, fp->shader->vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
-		OUT_RING(ring, fp->shader->vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
-		OUT_RING(ring, fp->shader->vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */
+		OUT_RING(ring, vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
+		OUT_RING(ring, vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
+		OUT_RING(ring, vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
+		OUT_RING(ring, vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */

 		OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
 		OUT_RING(ring, fp->shader->vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
 		OUT_RING(ring, fp->shader->vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
 		OUT_RING(ring, fp->shader->vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
 		OUT_RING(ring, fp->shader->vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */
+
+		OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
+		OUT_RING(ring, flatshade[0]);        /* SP_FS_FLAT_SHAD_MODE_REG_0 */
+		OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
 	}

 	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
 	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
 			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));

-	emit_shader(ring, vp);
+	if (vpbuffer == BUFFER)
+		emit_shader(ring, vp);

 	OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 	OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */

-	if (!key.binning_pass) {
-		emit_shader(ring, fp);
+	if (!emit->key.binning_pass) {
+		if (fpbuffer == BUFFER)
+			emit_shader(ring, fp);

 		OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 		OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
@@ -37,8 +37,9 @@ struct fd3_shader_stateobj {
 	struct ir3_shader *shader;
 };

-void fd3_program_emit(struct fd_ringbuffer *ring,
-		struct fd_program_stateobj *prog, struct ir3_shader_key key);
+struct fd3_emit;
+
+void fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit);

 void fd3_prog_init(struct pipe_context *pctx);

--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -119,14 +119,14 @@ occlusion_predicate_accumulate_result(struct fd_context *ctx,

 static const struct fd_hw_sample_provider occlusion_counter = {
 		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
-		.active = FD_STAGE_DRAW, /* | FD_STAGE_CLEAR ??? */
+		.active = FD_STAGE_DRAW,
 		.get_sample = occlusion_get_sample,
 		.accumulate_result = occlusion_counter_accumulate_result,
 };

 static const struct fd_hw_sample_provider occlusion_predicate = {
 		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
-		.active = FD_STAGE_DRAW, /* | FD_STAGE_CLEAR ??? */
+		.active = FD_STAGE_DRAW,
 		.get_sample = occlusion_get_sample,
 		.accumulate_result = occlusion_predicate_accumulate_result,
 };
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -36,28 +36,31 @@
 #include "fd3_util.h"

 static enum a3xx_tex_clamp
-tex_clamp(unsigned wrap)
+tex_clamp(unsigned wrap, bool clamp_to_edge)
 {
-	/* hardware probably supports more, but we can't coax all the
-	 * wrap/clamp modes out of the GLESv2 blob driver.
-	 *
-	 * TODO once we have basics working, go back and just try
-	 * different values and see what happens
-	 */
+	/* Hardware does not support _CLAMP, but we emulate it: */
+	if (wrap == PIPE_TEX_WRAP_CLAMP) {
+		wrap = (clamp_to_edge) ?
+			PIPE_TEX_WRAP_CLAMP_TO_EDGE : PIPE_TEX_WRAP_CLAMP_TO_BORDER;
+	}
+
 	switch (wrap) {
 	case PIPE_TEX_WRAP_REPEAT:
 		return A3XX_TEX_REPEAT;
-	case PIPE_TEX_WRAP_CLAMP:
 	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 		return A3XX_TEX_CLAMP_TO_EDGE;
 	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 		return A3XX_TEX_CLAMP_TO_BORDER;
-	case PIPE_TEX_WRAP_MIRROR_CLAMP:
-	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		/* only works for PoT.. need to emulate otherwise! */
 		return A3XX_TEX_MIRROR_CLAMP;
 	case PIPE_TEX_WRAP_MIRROR_REPEAT:
 		return A3XX_TEX_MIRROR_REPEAT;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		/* these two we could perhaps emulate, but we currently
+		 * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP
+		 */
 	default:
 		DBG("invalid wrap: %u", wrap);
 		return 0;
@@ -84,6 +87,7 @@ fd3_sampler_state_create(struct pipe_context *pctx,
 {
 	struct fd3_sampler_stateobj *so = CALLOC_STRUCT(fd3_sampler_stateobj);
 	bool miplinear = false;
+	bool clamp_to_edge;

 	if (!so)
 		return NULL;
@@ -93,20 +97,36 @@ fd3_sampler_state_create(struct pipe_context *pctx,

 	so->base = *cso;

+	/*
+	 * For nearest filtering, _CLAMP means _CLAMP_TO_EDGE;  for linear
+	 * filtering, _CLAMP means _CLAMP_TO_BORDER while additionally
+	 * clamping the texture coordinates to [0.0, 1.0].
+	 *
+	 * The clamping will be taken care of in the shaders.  There are two
+	 * filters here, but let the minification one have a say.
+	 */
+	clamp_to_edge = (cso->min_img_filter == PIPE_TEX_FILTER_NEAREST);
+	if (!clamp_to_edge) {
+		so->saturate_s = (cso->wrap_s == PIPE_TEX_WRAP_CLAMP);
+		so->saturate_t = (cso->wrap_t == PIPE_TEX_WRAP_CLAMP);
+		so->saturate_r = (cso->wrap_r == PIPE_TEX_WRAP_CLAMP);
+	}
+
 	so->texsamp0 =
 			COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) |
 			COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) |
 			A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter)) |
 			A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter)) |
-			A3XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s)) |
-			A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t)) |
-			A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r));
+			A3XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, clamp_to_edge)) |
+			A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, clamp_to_edge)) |
+			A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge));

 	if (cso->compare_mode)
 		so->texsamp0 |= A3XX_TEX_SAMP_0_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */

 	if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
 		so->texsamp1 =
+				A3XX_TEX_SAMP_1_LOD_BIAS(cso->lod_bias) |
 				A3XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) |
 				A3XX_TEX_SAMP_1_MAX_LOD(cso->max_lod);
 	} else {
@@ -116,6 +136,50 @@ fd3_sampler_state_create(struct pipe_context *pctx,
 	return so;
 }

+static void
+fd3_sampler_states_bind(struct pipe_context *pctx,
+		unsigned shader, unsigned start,
+		unsigned nr, void **hwcso)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd3_context *fd3_ctx = fd3_context(ctx);
+	uint16_t saturate_s = 0, saturate_t = 0, saturate_r = 0;
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		if (hwcso[i]) {
+			struct fd3_sampler_stateobj *sampler =
+					fd3_sampler_stateobj(hwcso[i]);
+			if (sampler->saturate_s)
+				saturate_s |= (1 << i);
+			if (sampler->saturate_t)
+				saturate_t |= (1 << i);
+			if (sampler->saturate_r)
+				saturate_r |= (1 << i);
+		}
+	}
+
+	fd_sampler_states_bind(pctx, shader, start, nr, hwcso);
+
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		fd3_ctx->fsaturate =
+			(saturate_s != 0) ||
+			(saturate_t != 0) ||
+			(saturate_r != 0);
+		fd3_ctx->fsaturate_s = saturate_s;
+		fd3_ctx->fsaturate_t = saturate_t;
+		fd3_ctx->fsaturate_r = saturate_r;
+	} else if (shader == PIPE_SHADER_VERTEX) {
+		fd3_ctx->vsaturate =
+			(saturate_s != 0) ||
+			(saturate_t != 0) ||
+			(saturate_r != 0);
+		fd3_ctx->vsaturate_s = saturate_s;
+		fd3_ctx->vsaturate_t = saturate_t;
+		fd3_ctx->vsaturate_r = saturate_r;
+	}
+}
+
 static enum a3xx_tex_type
 tex_type(unsigned target)
 {
@@ -175,7 +239,24 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	/* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */
 	so->texconst2 =
 			A3XX_TEX_CONST_2_PITCH(rsc->slices[lvl].pitch * rsc->cpp);
-	so->texconst3 = 0x00000000;  /* ??? */
+	switch (prsc->target) {
+	case PIPE_TEXTURE_1D_ARRAY:
+	case PIPE_TEXTURE_2D_ARRAY:
+		so->texconst3 =
+				A3XX_TEX_CONST_3_DEPTH(prsc->array_size - 1) |
+				A3XX_TEX_CONST_3_LAYERSZ1(rsc->slices[0].size0) |
+				A3XX_TEX_CONST_3_LAYERSZ2(rsc->slices[0].size0);
+		break;
+	case PIPE_TEXTURE_3D:
+		so->texconst3 =
+				A3XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) |
+				A3XX_TEX_CONST_3_LAYERSZ1(rsc->slices[0].size0) |
+				A3XX_TEX_CONST_3_LAYERSZ2(rsc->slices[0].size0);
+		break;
+	default:
+		so->texconst3 = 0x00000000;
+		break;
+	}

 	return &so->base;
 }
@@ -184,5 +265,6 @@ void
 fd3_texture_init(struct pipe_context *pctx)
 {
 	pctx->create_sampler_state = fd3_sampler_state_create;
+	pctx->bind_sampler_states = fd3_sampler_states_bind;
 	pctx->create_sampler_view = fd3_sampler_view_create;
 }
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
@@ -40,6 +40,7 @@
 struct fd3_sampler_stateobj {
 	struct pipe_sampler_state base;
 	uint32_t texsamp0, texsamp1;
+	bool saturate_s, saturate_t, saturate_r;
 };

 static INLINE struct fd3_sampler_stateobj *
--- a/src/gallium/drivers/freedreno/a3xx/fd3_util.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_util.c
@@ -134,6 +134,14 @@ fd3_pipe2vtx(enum pipe_format format)
 	case PIPE_FORMAT_R16G16_SNORM:
 		return VFMT_NORM_SHORT_16_16;

+	case PIPE_FORMAT_R32_UINT:
+	case PIPE_FORMAT_R32_USCALED:
+		return VFMT_UINT_32;
+
+	case PIPE_FORMAT_R32_SINT:
+	case PIPE_FORMAT_R32_SSCALED:
+		return VFMT_INT_32;
+
 	case PIPE_FORMAT_R10G10B10A2_UNORM:
 		return VFMT_NORM_UINT_10_10_10_2;

@@ -196,6 +204,14 @@ fd3_pipe2vtx(enum pipe_format format)
 	case PIPE_FORMAT_R16G16B16A16_FLOAT:
 		return VFMT_FLOAT_16_16_16_16;

+	case PIPE_FORMAT_R32G32_UINT:
+	case PIPE_FORMAT_R32G32_USCALED:
+		return VFMT_UINT_32_32;
+
+	case PIPE_FORMAT_R32G32_SINT:
+	case PIPE_FORMAT_R32G32_SSCALED:
+		return VFMT_INT_32_32;
+
 	/* 96-bit buffers. */
 	case PIPE_FORMAT_R32G32B32_FLOAT:
 		return VFMT_FLOAT_32_32_32;
@@ -203,6 +219,14 @@ fd3_pipe2vtx(enum pipe_format format)
 	case PIPE_FORMAT_R32G32B32_FIXED:
 		return VFMT_FIXED_32_32_32;

+	case PIPE_FORMAT_R32G32B32_UINT:
+	case PIPE_FORMAT_R32G32B32_USCALED:
+		return VFMT_UINT_32_32_32;
+
+	case PIPE_FORMAT_R32G32B32_SINT:
+	case PIPE_FORMAT_R32G32B32_SSCALED:
+		return VFMT_INT_32_32_32;
+
 	/* 128-bit buffers. */
 	case PIPE_FORMAT_R32G32B32A32_FLOAT:
 		return VFMT_FLOAT_32_32_32_32;
@@ -210,26 +234,20 @@ fd3_pipe2vtx(enum pipe_format format)
 	case PIPE_FORMAT_R32G32B32A32_FIXED:
 		return VFMT_FIXED_32_32_32_32;

-/* TODO probably need gles3 blob drivers to find the 32bit int formats:
+	case PIPE_FORMAT_R32G32B32A32_UINT:
+	case PIPE_FORMAT_R32G32B32A32_USCALED:
+		return VFMT_UINT_32_32_32_32;
+
+	case PIPE_FORMAT_R32G32B32A32_SINT:
+	case PIPE_FORMAT_R32G32B32A32_SSCALED:
+		return VFMT_INT_32_32_32_32;
+
+/* TODO normalized 32bit int formats do not appear to be supported
+ * natively.. will require either shader variant or VFD_DECODE
+ * gymnastics like the blob driver does..
 	case PIPE_FORMAT_R32G32B32A32_SNORM:
 	case PIPE_FORMAT_R32G32B32A32_UNORM:
-	case PIPE_FORMAT_R32G32B32A32_SINT:
-	case PIPE_FORMAT_R32G32B32A32_UINT:
-
-	case PIPE_FORMAT_R32_UINT:
-	case PIPE_FORMAT_R32_SINT:
-	case PIPE_FORMAT_A32_UINT:
-	case PIPE_FORMAT_A32_SINT:
-	case PIPE_FORMAT_L32_UINT:
-	case PIPE_FORMAT_L32_SINT:
-	case PIPE_FORMAT_I32_UINT:
-	case PIPE_FORMAT_I32_SINT:
-
-	case PIPE_FORMAT_R32G32_SINT:
-	case PIPE_FORMAT_R32G32_UINT:
-	case PIPE_FORMAT_L32A32_UINT:
-	case PIPE_FORMAT_L32A32_SINT:
-*/
+ */

 	default:
 		return ~0;
@@ -246,6 +264,9 @@ fd3_pipe2tex(enum pipe_format format)
 	case PIPE_FORMAT_I8_UNORM:
 		return TFMT_NORM_UINT_8;

+	case PIPE_FORMAT_R8G8_UNORM:
+		return TFMT_NORM_UINT_8_8;
+
 	case PIPE_FORMAT_B8G8R8A8_UNORM:
 	case PIPE_FORMAT_B8G8R8X8_UNORM:
 	case PIPE_FORMAT_R8G8B8A8_UNORM:
@@ -257,13 +278,11 @@ fd3_pipe2tex(enum pipe_format format)
 		return TFMT_NORM_UINT_8_8_8_8;

 	case PIPE_FORMAT_Z24X8_UNORM:
+	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 		return TFMT_NORM_UINT_X8Z24;

-	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-		return TFMT_NORM_UINT_8_8_8_8;
-
 	case PIPE_FORMAT_Z16_UNORM:
-		return TFMT_NORM_UINT_8_8;
+		return TFMT_NORM_USHORT_Z16;

 	case PIPE_FORMAT_R16G16B16A16_FLOAT:
 	case PIPE_FORMAT_R16G16B16X16_FLOAT:
@@ -331,6 +350,8 @@ fd3_pipe2color(enum pipe_format format)

 	case PIPE_FORMAT_R8_UNORM:
 	case PIPE_FORMAT_L8_UNORM:
+		return RB_R8_UNORM;
+
 	case PIPE_FORMAT_A8_UNORM:
 		return RB_A8_UNORM;

@@ -360,8 +381,9 @@ fd3_gmem_restore_format(enum pipe_format format)
 	switch (format) {
 	case PIPE_FORMAT_Z24X8_UNORM:
 	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+		return PIPE_FORMAT_R8G8B8A8_UNORM;
 	case PIPE_FORMAT_Z16_UNORM:
-		return PIPE_FORMAT_B8G8R8A8_UNORM;
+		return PIPE_FORMAT_R8G8_UNORM;
 	default:
 		return format;
 	}
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (   9859 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14477 bytes, from 2014-07-19 17:20:53)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  58020 bytes, from 2014-07-19 17:21:17)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  36670 bytes, from 2014-07-19 17:18:34)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10347 bytes, from 2014-10-01 18:55:57)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14960 bytes, from 2014-07-27 17:22:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  60533 bytes, from 2014-10-15 18:32:43)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  41068 bytes, from 2014-08-01 12:22:48)

 Copyright (C) 2013-2014 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -132,6 +132,7 @@ enum a3xx_threadmode {
 };

 enum a3xx_instrbuffermode {
+	CACHE = 0,
 	BUFFER = 1,
 };

--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (   9859 bytes, from 2014-06-02 15:21:30)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14477 bytes, from 2014-07-19 17:20:53)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  58020 bytes, from 2014-07-19 17:21:17)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  36670 bytes, from 2014-07-19 17:18:34)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10347 bytes, from 2014-10-01 18:55:57)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14960 bytes, from 2014-07-27 17:22:13)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  60533 bytes, from 2014-10-15 18:32:43)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  41068 bytes, from 2014-08-01 12:22:48)

 Copyright (C) 2013-2014 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -163,12 +163,16 @@ enum adreno_pm4_type3_packets {
 	CP_INDIRECT_BUFFER_PFE = 63,
 	CP_SET_BIN = 76,
 	CP_TEST_TWO_MEMS = 113,
+	CP_REG_WR_NO_CTXT = 120,
+	CP_RECORD_PFP_TIMESTAMP = 17,
 	CP_WAIT_FOR_ME = 19,
 	CP_SET_DRAW_STATE = 67,
 	CP_DRAW_INDX_OFFSET = 56,
 	CP_DRAW_INDIRECT = 40,
 	CP_DRAW_INDX_INDIRECT = 41,
 	CP_DRAW_AUTO = 36,
+	CP_UNKNOWN_1A = 26,
+	CP_WIDE_REG_WRITE = 116,
 	IN_IB_PREFETCH_END = 23,
 	IN_SUBBLK_PREFETCH = 31,
 	IN_INSTR_PREFETCH = 32,
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -100,7 +100,7 @@ fd_context_render(struct pipe_context *pctx)
 	if (!ctx->needs_flush)
 		return;

-	fd_gmem_render_tiles(pctx);
+	fd_gmem_render_tiles(ctx);

 	DBG("%p/%p/%p", ctx->ring->start, ctx->ring->cur, ctx->ring->end);

@@ -111,7 +111,7 @@ fd_context_render(struct pipe_context *pctx)
 		fd_context_next_rb(pctx);

 	ctx->needs_flush = false;
-	ctx->cleared = ctx->restore = ctx->resolve = 0;
+	ctx->cleared = ctx->partial_cleared = ctx->restore = ctx->resolve = 0;
 	ctx->gmem_reason = 0;
 	ctx->num_draws = 0;

@@ -148,8 +148,6 @@ fd_context_destroy(struct pipe_context *pctx)
 	fd_prog_fini(pctx);
 	fd_hw_query_fini(pctx);

-	util_slab_destroy(&ctx->transfer_pool);
-
 	util_dynarray_fini(&ctx->draw_patches);

 	if (ctx->blitter)
@@ -158,6 +156,8 @@ fd_context_destroy(struct pipe_context *pctx)
 	if (ctx->primconvert)
 		util_primconvert_destroy(ctx->primconvert);

+	util_slab_destroy(&ctx->transfer_pool);
+
 	fd_ringmarker_del(ctx->draw_start);
 	fd_ringmarker_del(ctx->draw_end);
 	fd_ringmarker_del(ctx->binning_start);
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -83,6 +83,15 @@ struct fd_vertex_stateobj {
 	unsigned num_elements;
 };

+/* group together the vertex and vertexbuf state.. for ease of passing
+ * around, and because various internal operations (gmem<->mem, etc)
+ * need their own vertex state:
+ */
+struct fd_vertex_state {
+	struct fd_vertex_stateobj *vtx;
+	struct fd_vertexbuf_stateobj vertexbuf;
+};
+
 /* Bitmask of stages in rendering that a particular query query is
 * active.  Queries will be automatically started/stopped (generating
 * additional fd_hw_sample_period's) on entrance/exit from stages that
@@ -174,6 +183,10 @@ struct fd_context {
 	 * there was a glClear() that invalidated the entire previous buffer
 	 * contents.  Keep track of which buffer(s) are cleared, or needs
 	 * restore.  Masks of PIPE_CLEAR_*
+	 *
+	 * The 'cleared' bits will be set for buffers which are *entirely*
+	 * cleared, and 'partial_cleared' bits will be set if you must
+	 * check cleared_scissor.
 	 */
 	enum {
 		/* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */
@@ -181,7 +194,7 @@ struct fd_context {
 		FD_BUFFER_DEPTH   = PIPE_CLEAR_DEPTH,
 		FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL,
 		FD_BUFFER_ALL     = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL,
-	} cleared, restore, resolve;
+	} cleared, partial_cleared, restore, resolve;

 	bool needs_flush;

@@ -222,6 +235,14 @@ struct fd_context {
 	struct fd_ringbuffer *rings[8];
 	unsigned rings_idx;

+	/* NOTE: currently using a single ringbuffer for both draw and
+	 * tiling commands, so we need to make sure we leave enough
+	 * room at the end to append the tiling commands when we flush.
+	 * 0x7000 dwords should be a couple times more than we ever need
+	 * so should be a nice conservative threshold.
+	 */
+#define FD_TILING_COMMANDS_DWORDS 0x7000
+
 	/* normal draw/clear cmds: */
 	struct fd_ringbuffer *ring;
 	struct fd_ringmarker *draw_start, *draw_end;
@@ -260,6 +281,14 @@ struct fd_context {
 	 */
 	struct pipe_scissor_state max_scissor;

+	/* Track the cleared scissor for color/depth/stencil, so we know
+	 * which, if any, tiles need to be restored (mem2gmem).  Only valid
+	 * if the corresponding bit in ctx->partial_cleared is set.
+	 */
+	struct {
+		struct pipe_scissor_state color, depth, stencil;
+	} cleared_scissor;
+
 	/* Current gmem/tiling configuration.. gets updated on render_tiles()
 	 * if out of date with current maximal-scissor/cpp:
 	 */
@@ -297,7 +326,7 @@ struct fd_context {

 	struct fd_program_stateobj prog;

-	struct fd_vertex_stateobj *vtx;
+	struct fd_vertex_state vtx;

 	struct pipe_blend_color blend_color;
 	struct pipe_stencil_ref stencil_ref;
@@ -306,7 +335,6 @@ struct fd_context {
 	struct pipe_poly_stipple stipple;
 	struct pipe_viewport_state viewport;
 	struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
-	struct fd_vertexbuf_stateobj vertexbuf;
 	struct pipe_index_buffer indexbuf;

 	/* GMEM/tile handling fxns: */
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -40,51 +40,6 @@
 #include "freedreno_util.h"


-static enum pc_di_index_size
-size2indextype(unsigned index_size)
-{
-	switch (index_size) {
-	case 1: return INDEX_SIZE_8_BIT;
-	case 2: return INDEX_SIZE_16_BIT;
-	case 4: return INDEX_SIZE_32_BIT;
-	}
-	DBG("unsupported index size: %d", index_size);
-	assert(0);
-	return INDEX_SIZE_IGN;
-}
-
-/* this is same for a2xx/a3xx, so split into helper: */
-void
-fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		enum pc_di_vis_cull_mode vismode,
-		const struct pipe_draw_info *info)
-{
-	struct pipe_index_buffer *idx = &ctx->indexbuf;
-	struct fd_bo *idx_bo = NULL;
-	enum pc_di_index_size idx_type = INDEX_SIZE_IGN;
-	enum pc_di_src_sel src_sel;
-	uint32_t idx_size, idx_offset;
-
-	if (info->indexed) {
-		assert(!idx->user_buffer);
-
-		idx_bo = fd_resource(idx->buffer)->bo;
-		idx_type = size2indextype(idx->index_size);
-		idx_size = idx->index_size * info->count;
-		idx_offset = idx->offset + (info->start * idx->index_size);
-		src_sel = DI_SRC_SEL_DMA;
-	} else {
-		idx_bo = NULL;
-		idx_type = INDEX_SIZE_IGN;
-		idx_size = 0;
-		idx_offset = 0;
-		src_sel = DI_SRC_SEL_AUTO_INDEX;
-	}
-
-	fd_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
-			info->count, idx_type, idx_size, idx_offset, idx_bo);
-}
-
 static void
 fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 {
@@ -152,13 +107,30 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	ctx->stats.prims_emitted +=
 		u_reduced_prims_for_vertices(info->mode, info->count);

-	/* any buffers that haven't been cleared, we need to restore: */
+	/* any buffers that haven't been cleared yet, we need to restore: */
 	ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared);
 	/* and any buffers used, need to be resolved: */
 	ctx->resolve |= buffers;

+	DBG("%x num_draws=%u (%s/%s)", buffers, ctx->num_draws,
+		util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
+		util_format_short_name(pipe_surface_format(pfb->zsbuf)));
+
 	fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW);
 	ctx->draw(ctx, info);
+
+	/* if an app (or, well, piglit test) does many thousands of draws
+	 * without flush (or anything which implicitly flushes, like
+	 * changing render targets), we can exceed the ringbuffer size.
+	 * Since we don't currently have a sane way to wrap around, and
+	 * we use the same buffer for both draw and tiling commands, for
+	 * now we need to do this hack and trigger flush if we are running
+	 * low on remaining space for cmds:
+	 */
+	if (((ctx->ring->cur - ctx->ring->start) >
+				(ctx->ring->size/4 - FD_TILING_COMMANDS_DWORDS)) ||
+			(fd_mesa_debug & FD_DBG_FLUSH))
+		fd_context_render(pctx);
 }

 /* TODO figure out how to make better use of existing state mechanism
@@ -173,8 +145,30 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
 {
 	struct fd_context *ctx = fd_context(pctx);
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
+	unsigned cleared_buffers;

-	ctx->cleared |= buffers;
+	/* for bookkeeping about which buffers have been cleared (and thus
+	 * can fully or partially skip mem2gmem) we need to ignore buffers
+	 * that have already had a draw, in case apps do silly things like
+	 * clear after draw (ie. if you only clear the color buffer, but
+	 * something like alpha-test causes side effects from the draw in
+	 * the depth buffer, etc)
+	 */
+	cleared_buffers = buffers & (FD_BUFFER_ALL & ~ctx->restore);
+
+	/* do we have full-screen scissor? */
+	if (!memcmp(scissor, &ctx->disabled_scissor, sizeof(*scissor))) {
+		ctx->cleared |= cleared_buffers;
+	} else {
+		ctx->partial_cleared |= cleared_buffers;
+		if (cleared_buffers & PIPE_CLEAR_COLOR)
+			ctx->cleared_scissor.color = *scissor;
+		if (cleared_buffers & PIPE_CLEAR_DEPTH)
+			ctx->cleared_scissor.depth = *scissor;
+		if (cleared_buffers & PIPE_CLEAR_STENCIL)
+			ctx->cleared_scissor.stencil = *scissor;
+	}
 	ctx->resolve |= buffers;
 	ctx->needs_flush = true;

--- a/src/gallium/drivers/freedreno/freedreno_draw.h
+++ b/src/gallium/drivers/freedreno/freedreno_draw.h
@@ -33,15 +33,12 @@
 #include "pipe/p_context.h"

 #include "freedreno_context.h"
+#include "freedreno_resource.h"
 #include "freedreno_screen.h"
 #include "freedreno_util.h"

 struct fd_ringbuffer;

-void fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		enum pc_di_vis_cull_mode vismode,
-		const struct pipe_draw_info *info);
-
 void fd_draw_init(struct pipe_context *pctx);

 static inline void
@@ -98,4 +95,50 @@ fd_draw(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	fd_reset_wfi(ctx);
 }

+
+static inline enum pc_di_index_size
+size2indextype(unsigned index_size)
+{
+	switch (index_size) {
+	case 1: return INDEX_SIZE_8_BIT;
+	case 2: return INDEX_SIZE_16_BIT;
+	case 4: return INDEX_SIZE_32_BIT;
+	}
+	DBG("unsupported index size: %d", index_size);
+	assert(0);
+	return INDEX_SIZE_IGN;
+}
+
+/* this is same for a2xx/a3xx, so split into helper: */
+static inline void
+fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+		enum pc_di_vis_cull_mode vismode,
+		const struct pipe_draw_info *info)
+{
+	struct pipe_index_buffer *idx = &ctx->indexbuf;
+	struct fd_bo *idx_bo = NULL;
+	enum pc_di_index_size idx_type = INDEX_SIZE_IGN;
+	enum pc_di_src_sel src_sel;
+	uint32_t idx_size, idx_offset;
+
+	if (info->indexed) {
+		assert(!idx->user_buffer);
+
+		idx_bo = fd_resource(idx->buffer)->bo;
+		idx_type = size2indextype(idx->index_size);
+		idx_size = idx->index_size * info->count;
+		idx_offset = idx->offset + (info->start * idx->index_size);
+		src_sel = DI_SRC_SEL_DMA;
+	} else {
+		idx_bo = NULL;
+		idx_type = INDEX_SIZE_IGN;
+		idx_size = 0;
+		idx_offset = 0;
+		src_sel = DI_SRC_SEL_AUTO_INDEX;
+	}
+
+	fd_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+			info->count, idx_type, idx_size, idx_offset, idx_bo);
+}
+
 #endif /* FREEDRENO_DRAW_H_ */
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -314,9 +314,8 @@ render_sysmem(struct fd_context *ctx)
 }

 void
-fd_gmem_render_tiles(struct pipe_context *pctx)
+fd_gmem_render_tiles(struct fd_context *ctx)
 {
-	struct fd_context *ctx = fd_context(pctx);
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	uint32_t timestamp = 0;
 	bool sysmem = false;
@@ -381,28 +380,50 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
 	ctx->max_scissor.minx = ctx->max_scissor.miny = ~0;
 	ctx->max_scissor.maxx = ctx->max_scissor.maxy = 0;

-	/* Note that because the per-tile setup and mem2gmem/gmem2mem are emitted
-	 * after the draw/clear calls, but executed before, we need to preemptively
-	 * flag some state as dirty before the first draw/clear call.
-	 *
-	 * TODO maybe we need to mark all state as dirty to not worry about state
-	 * being clobbered by other contexts?
-	 */
-	ctx->dirty |= FD_DIRTY_ZSA |
-			FD_DIRTY_RASTERIZER |
-			FD_DIRTY_FRAMEBUFFER |
-			FD_DIRTY_SAMPLE_MASK |
-			FD_DIRTY_VIEWPORT |
-			FD_DIRTY_CONSTBUF |
-			FD_DIRTY_PROG |
-			FD_DIRTY_SCISSOR |
-			/* probably only needed if we need to mem2gmem on the next
-			 * draw..  but not sure if there is a good way to know?
-			 */
-			FD_DIRTY_VERTTEX |
-			FD_DIRTY_FRAGTEX |
-			FD_DIRTY_BLEND;
-
-	if (fd_mesa_debug & FD_DBG_DGMEM)
-		ctx->dirty = 0xffffffff;
+	ctx->dirty = ~0;
+}
+
+/* restore (mem2gmem) can be skipped for a tile only if the tile is
+ * completely contained within the cleared scissor:
+ */
+static bool
+skip_restore(struct pipe_scissor_state *scissor, struct fd_tile *tile)
+{
+	unsigned minx = tile->xoff;
+	unsigned maxx = tile->xoff + tile->bin_w;
+	unsigned miny = tile->yoff;
+	unsigned maxy = tile->yoff + tile->bin_h;
+	return (minx >= scissor->minx) && (maxx <= scissor->maxx) &&
+			(miny >= scissor->miny) && (maxy <= scissor->maxy);
+}
+
+/* When deciding whether a tile needs mem2gmem, we need to take into
+ * account the scissor rect(s) that were cleared.  To simplify we only
+ * consider the last scissor rect for each buffer, since the common
+ * case would be a single clear.
+ */
+bool
+fd_gmem_needs_restore(struct fd_context *ctx, struct fd_tile *tile,
+		uint32_t buffers)
+{
+	if (!(ctx->restore & buffers))
+		return false;
+
+	/* if buffers partially cleared, then slow-path to figure out
+	 * if this particular tile needs restoring:
+	 */
+	if ((buffers & FD_BUFFER_COLOR) &&
+			(ctx->partial_cleared & FD_BUFFER_COLOR) &&
+			skip_restore(&ctx->cleared_scissor.color, tile))
+		return false;
+	if ((buffers & FD_BUFFER_DEPTH) &&
+			(ctx->partial_cleared & FD_BUFFER_DEPTH) &&
+			skip_restore(&ctx->cleared_scissor.depth, tile))
+		return false;
+	if ((buffers & FD_BUFFER_STENCIL) &&
+			(ctx->partial_cleared & FD_BUFFER_STENCIL) &&
+			skip_restore(&ctx->cleared_scissor.stencil, tile))
+		return false;
+
+	return true;
 }
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -55,6 +55,11 @@ struct fd_gmem_stateobj {
 	bool has_zs;  /* gmem config using depth/stencil? */
 };

-void fd_gmem_render_tiles(struct pipe_context *pctx);
+struct fd_context;
+
+void fd_gmem_render_tiles(struct fd_context *ctx);
+
+bool fd_gmem_needs_restore(struct fd_context *ctx, struct fd_tile *tile,
+		uint32_t buffers);

 #endif /* FREEDRENO_GMEM_H_ */
--- a/src/gallium/drivers/freedreno/freedreno_lowering.c
+++ b/src/gallium/drivers/freedreno/freedreno_lowering.c
@@ -52,6 +52,7 @@ struct fd_lowering_context {
 #define B 1
 	struct tgsi_full_src_register imm;
 	int emitted_decls;
+	unsigned saturate;
 };

 static inline struct fd_lowering_context *
@@ -130,12 +131,14 @@ aliases(const struct tgsi_full_dst_register *dst, unsigned dst_mask,
 static void
 create_mov(struct tgsi_transform_context *tctx,
 	const struct tgsi_full_dst_register *dst,
-	const struct tgsi_full_src_register *src, unsigned mask)
+	const struct tgsi_full_src_register *src,
+	unsigned mask, unsigned saturate)
 {
 	struct tgsi_full_instruction new_inst;

 	new_inst = tgsi_default_full_instruction();
 	new_inst.Instruction.Opcode = TGSI_OPCODE_MOV;
+	new_inst.Instruction.Saturate = saturate;
 	new_inst.Instruction.NumDstRegs = 1;
 	reg_dst(&new_inst.Dst[0], dst, mask);
 	new_inst.Instruction.NumSrcRegs = 1;
@@ -143,6 +146,25 @@ create_mov(struct tgsi_transform_context *tctx,
 	tctx->emit_instruction(tctx, &new_inst);
 }

+/* to help calculate # of tgsi tokens for a lowering.. we assume
+ * the worst case, ie. removed instructions don't have ADDR[] or
+ * anything which increases the # of tokens per src/dst and the
+ * inserted instructions do.
+ *
+ * OINST() - old instruction
+ *    1         : instruction itself
+ *    1         : dst
+ *    1 * nargs : srcN
+ *
+ * NINST() - new instruction
+ *    1         : instruction itself
+ *    2         : dst
+ *    2 * nargs : srcN
+ */
+
+#define OINST(nargs)  (1 + 1 + 1 * (nargs))
+#define NINST(nargs)  (1 + 2 + 2 * (nargs))
+
 /*
 * Lowering Translators:
 */
@@ -169,7 +191,8 @@ create_mov(struct tgsi_transform_context *tctx,
 * MOV dst.w, src1.w
 * MOV dst.x, imm{1.0}
 */
-#define DST_GROW (19 - 4)
+#define DST_GROW (NINST(1) + NINST(1) + NINST(2) + NINST(1) + \
+		NINST(1) + NINST(1) - OINST(2))
 #define DST_TMP  2
 static void
 transform_dst(struct tgsi_transform_context *tctx,
@@ -182,12 +205,12 @@ transform_dst(struct tgsi_transform_context *tctx,
 	struct tgsi_full_instruction new_inst;

 	if (aliases(dst, TGSI_WRITEMASK_Y, src0, TGSI_WRITEMASK_Z)) {
-		create_mov(tctx, &ctx->tmp[A].dst, src0, TGSI_WRITEMASK_YZ);
+		create_mov(tctx, &ctx->tmp[A].dst, src0, TGSI_WRITEMASK_YZ, 0);
 		src0 = &ctx->tmp[A].src;
 	}

 	if (aliases(dst, TGSI_WRITEMASK_YZ, src1, TGSI_WRITEMASK_W)) {
-		create_mov(tctx, &ctx->tmp[B].dst, src1, TGSI_WRITEMASK_YW);
+		create_mov(tctx, &ctx->tmp[B].dst, src1, TGSI_WRITEMASK_YW, 0);
 		src1 = &ctx->tmp[B].src;
 	}

@@ -249,7 +272,7 @@ transform_dst(struct tgsi_transform_context *tctx,
 * SUB dst.xyz, tmpA.xyz, tmpB.xyz
 * MOV dst.w, imm{1.0}
 */
-#define XPD_GROW (15 - 4)
+#define XPD_GROW (NINST(2) + NINST(2) + NINST(2) + NINST(1) - OINST(2))
 #define XPD_TMP  2
 static void
 transform_xpd(struct tgsi_transform_context *tctx,
@@ -320,7 +343,7 @@ transform_xpd(struct tgsi_transform_context *tctx,
 * SIN dst.y, src.x
 * MOV dst.zw, imm{0.0, 1.0}
 */
-#define SCS_GROW (12 - 3)
+#define SCS_GROW (NINST(1) + NINST(1) + NINST(1) + NINST(1) - OINST(1))
 #define SCS_TMP  1
 static void
 transform_scs(struct tgsi_transform_context *tctx,
@@ -332,7 +355,7 @@ transform_scs(struct tgsi_transform_context *tctx,
 	struct tgsi_full_instruction new_inst;

 	if (aliases(dst, TGSI_WRITEMASK_X, src, TGSI_WRITEMASK_X)) {
-		create_mov(tctx, &ctx->tmp[A].dst, src, TGSI_WRITEMASK_X);
+		create_mov(tctx, &ctx->tmp[A].dst, src, TGSI_WRITEMASK_X, 0);
 		src = &ctx->tmp[A].src;
 	}

@@ -382,7 +405,7 @@ transform_scs(struct tgsi_transform_context *tctx,
 * MUL tmpB, tmpB, src2
 * ADD dst, tmpA, tmpB
 */
-#define LRP_GROW (16 - 4)
+#define LRP_GROW (NINST(2) + NINST(2) + NINST(2) + NINST(2) - OINST(3))
 #define LRP_TMP  2
 static void
 transform_lrp(struct tgsi_transform_context *tctx,
@@ -448,7 +471,7 @@ transform_lrp(struct tgsi_transform_context *tctx,
 * FLR tmpA, src
 * SUB dst, src, tmpA
 */
-#define FRC_GROW (7 - 3)
+#define FRC_GROW (NINST(1) + NINST(2) - OINST(1))
 #define FRC_TMP  1
 static void
 transform_frc(struct tgsi_transform_context *tctx,
@@ -492,7 +515,7 @@ transform_frc(struct tgsi_transform_context *tctx,
 * MUL tmpA.x, src1.x, tmpA.x
 * EX2 dst, tmpA.x
 */
-#define POW_GROW (10 - 4)
+#define POW_GROW (NINST(1) + NINST(2) + NINST(1) - OINST(2))
 #define POW_TMP  1
 static void
 transform_pow(struct tgsi_transform_context *tctx,
@@ -551,7 +574,8 @@ transform_pow(struct tgsi_transform_context *tctx,
 * MOV dst.yz, tmpA.xy
 * MOV dst.xw, imm{1.0}
 */
-#define LIT_GROW (30 - 3)
+#define LIT_GROW (NINST(1) + NINST(3) + NINST(1) + NINST(2) + \
+		NINST(1) + NINST(3) + NINST(1) + NINST(1) - OINST(1))
 #define LIT_TMP  1
 static void
 transform_lit(struct tgsi_transform_context *tctx,
@@ -661,7 +685,8 @@ transform_lit(struct tgsi_transform_context *tctx,
 * MOV dst.z, tmpA.y
 * MOV dst.w, imm{1.0}
 */
-#define EXP_GROW (19 - 3)
+#define EXP_GROW (NINST(1) + NINST(1) + NINST(2) + NINST(1) + \
+		NINST(1)+ NINST(1) - OINST(1))
 #define EXP_TMP  1
 static void
 transform_exp(struct tgsi_transform_context *tctx,
@@ -755,7 +780,8 @@ transform_exp(struct tgsi_transform_context *tctx,
 * MOV dst.xz, tmpA.yx
 * MOV dst.w, imm{1.0}
 */
-#define LOG_GROW (25 - 3)
+#define LOG_GROW (NINST(1) + NINST(1) + NINST(1) + NINST(1) + \
+		NINST(2) + NINST(1) + NINST(1) - OINST(1))
 #define LOG_TMP  1
 static void
 transform_log(struct tgsi_transform_context *tctx,
@@ -879,11 +905,11 @@ transform_log(struct tgsi_transform_context *tctx,
 * }
 * ; fixup last instruction to replicate into dst
 */
-#define DP4_GROW  (19 - 4)
-#define DP3_GROW  (14 - 4)
-#define DPH_GROW  (18 - 4)
-#define DP2_GROW  ( 9 - 4)
-#define DP2A_GROW (13 - 4)
+#define DP4_GROW  (NINST(2) + NINST(3) + NINST(3) + NINST(3) - OINST(2))
+#define DP3_GROW  (NINST(2) + NINST(3) + NINST(3) - OINST(2))
+#define DPH_GROW  (NINST(2) + NINST(3) + NINST(3) + NINST(2) - OINST(2))
+#define DP2_GROW  (NINST(2) + NINST(3) - OINST(2))
+#define DP2A_GROW (NINST(2) + NINST(3) + NINST(2) - OINST(3))
 #define DOTP_TMP  1
 static void
 transform_dotp(struct tgsi_transform_context *tctx,
@@ -981,6 +1007,138 @@ transform_dotp(struct tgsi_transform_context *tctx,
 	}
 }

+/* Inserts a MOV_SAT for the needed components of tex coord.  Note that
+ * in the case of TXP, the clamping must happen *after* projection, so
+ * we need to lower TXP to TEX.
+ *
+ *   MOV tmpA, src0
+ *   if (opc == TXP) {
+ *     ; do perspective division manually before clamping:
+ *     RCP tmpB, tmpA.w
+ *     MUL tmpA.<pmask>, tmpA, tmpB.xxxx
+ *     opc = TEX;
+ *   }
+ *   MOV_SAT tmpA.<mask>, tmpA  ; <mask> is the clamped s/t/r coords
+ *   <opc> dst, tmpA, ...
+ */
+#define SAMP_GROW (NINST(1) + NINST(1) + NINST(2) + NINST(1))
+#define SAMP_TMP  2
+static int
+transform_samp(struct tgsi_transform_context *tctx,
+		struct tgsi_full_instruction *inst)
+{
+	struct fd_lowering_context *ctx = fd_lowering_context(tctx);
+	struct tgsi_full_src_register *coord = &inst->Src[0];
+	struct tgsi_full_src_register *samp;
+	struct tgsi_full_instruction new_inst;
+	/* mask is clamped coords, pmask is all coords (for projection): */
+	unsigned mask = 0, pmask = 0, smask;
+	unsigned opcode = inst->Instruction.Opcode;
+
+	if (opcode == TGSI_OPCODE_TXB2) {
+		samp = &inst->Src[2];
+	} else {
+		samp = &inst->Src[1];
+	}
+
+	/* convert sampler # to bitmask to test: */
+	smask = 1 << samp->Register.Index;
+
+	/* check if we actually need to lower this one: */
+	if (!(ctx->saturate & smask))
+		return -1;
+
+	/* figure out which coordinates need saturating:
+	 *   - RECT textures should not get saturated
+	 *   - array index coords should not get saturated
+	 */
+	switch (inst->Texture.Texture) {
+	case TGSI_TEXTURE_3D:
+	case TGSI_TEXTURE_CUBE:
+	case TGSI_TEXTURE_CUBE_ARRAY:
+	case TGSI_TEXTURE_SHADOWCUBE:
+	case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+		if (ctx->config->saturate_r & smask)
+			mask |= TGSI_WRITEMASK_Z;
+		pmask |= TGSI_WRITEMASK_Z;
+		/* fallthrough */
+
+	case TGSI_TEXTURE_2D:
+	case TGSI_TEXTURE_2D_ARRAY:
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOW2D_ARRAY:
+	case TGSI_TEXTURE_2D_MSAA:
+	case TGSI_TEXTURE_2D_ARRAY_MSAA:
+		if (ctx->config->saturate_t & smask)
+			mask |= TGSI_WRITEMASK_Y;
+		pmask |= TGSI_WRITEMASK_Y;
+		/* fallthrough */
+
+	case TGSI_TEXTURE_1D:
+	case TGSI_TEXTURE_1D_ARRAY:
+	case TGSI_TEXTURE_SHADOW1D:
+	case TGSI_TEXTURE_SHADOW1D_ARRAY:
+		if (ctx->config->saturate_s & smask)
+			mask |= TGSI_WRITEMASK_X;
+		pmask |= TGSI_WRITEMASK_X;
+		break;
+
+	/* TODO: I think we should ignore these?
+	case TGSI_TEXTURE_RECT:
+	case TGSI_TEXTURE_SHADOWRECT:
+	*/
+	}
+
+	/* sanity check.. driver could be asking to saturate a non-
+	 * existent coordinate component:
+	 */
+	if (!mask)
+		return -1;
+
+	/* MOV tmpA, src0 */
+	create_mov(tctx, &ctx->tmp[A].dst, coord, TGSI_WRITEMASK_XYZW, 0);
+
+	/* This is a bit sad.. we need to clamp *after* the coords
+	 * are projected, which means lowering TXP to TEX and doing
+	 * the projection ourselves.  But since I haven't figured out
+	 * how to make the lowering code deliver an electric shock
+	 * to anyone using GL_CLAMP, we must do this instead:
+	 */
+	if (opcode == TGSI_OPCODE_TXP) {
+		/* RCP tmpB.x, tmpA.w */
+		new_inst = tgsi_default_full_instruction();
+		new_inst.Instruction.Opcode = TGSI_OPCODE_RCP;
+		new_inst.Instruction.NumDstRegs = 1;
+		reg_dst(&new_inst.Dst[0], &ctx->tmp[B].dst, TGSI_WRITEMASK_X);
+		new_inst.Instruction.NumSrcRegs = 1;
+		reg_src(&new_inst.Src[0], &ctx->tmp[A].src, SWIZ(W,_,_,_));
+		tctx->emit_instruction(tctx, &new_inst);
+
+		/* MUL tmpA.<pmask>, tmpA, tmpB.xxxx */
+		new_inst = tgsi_default_full_instruction();
+		new_inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+		new_inst.Instruction.NumDstRegs = 1;
+		reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, pmask);
+		new_inst.Instruction.NumSrcRegs = 2;
+		reg_src(&new_inst.Src[0], &ctx->tmp[A].src, SWIZ(X,Y,Z,W));
+		reg_src(&new_inst.Src[1], &ctx->tmp[B].src, SWIZ(X,X,X,X));
+		tctx->emit_instruction(tctx, &new_inst);
+
+		opcode = TGSI_OPCODE_TEX;
+	}
+
+	/* MOV_SAT tmpA.<mask>, tmpA */
+	create_mov(tctx, &ctx->tmp[A].dst, &ctx->tmp[A].src, mask,
+			TGSI_SAT_ZERO_ONE);
+
+	/* modify the texture samp instruction to take fixed up coord: */
+	new_inst = *inst;
+	new_inst.Instruction.Opcode = opcode;
+	new_inst.Src[0] = ctx->tmp[A].src;
+	tctx->emit_instruction(tctx, &new_inst);
+
+	return 0;
+}

 /* Two-sided color emulation:
 * For each COLOR input, create a corresponding BCOLOR input, plus
@@ -990,7 +1148,7 @@ transform_dotp(struct tgsi_transform_context *tctx,
 			2 +         /* FACE */               \
 			((n) * 2) + /* IN[] BCOLOR[n] */     \
 			((n) * 1) + /* TEMP[] */             \
-			((n) * 5)   /* CMP instr */          \
+			((n) * NINST(3))   /* CMP instr */   \
 		)

 static void
@@ -1234,6 +1392,14 @@ transform_instr(struct tgsi_transform_context *tctx,
 			goto skip;
 		transform_dotp(tctx, inst);
 		break;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+	case TGSI_OPCODE_TXB:
+	case TGSI_OPCODE_TXB2:
+	case TGSI_OPCODE_TXL:
+		if (transform_samp(tctx, inst))
+			goto skip;
+		break;
 	default:
 	skip:
 		tctx->emit_instruction(tctx, inst);
@@ -1254,6 +1420,9 @@ fd_transform_lowering(const struct fd_lowering_config *config,
 	struct tgsi_token *newtoks;
 	int newlen, numtmp;

+	/* sanity check in case limit is ever increased: */
+	assert((sizeof(config->saturate_s) * 8) >= PIPE_MAX_SAMPLERS);
+
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.base.transform_instruction = transform_instr;
 	ctx.info = info;
@@ -1277,6 +1446,8 @@ fd_transform_lowering(const struct fd_lowering_config *config,
 		}
 	}

+	ctx.saturate = config->saturate_r | config->saturate_s | config->saturate_t;
+
 #define OPCS(x) ((config->lower_ ## x) ? info->opcode_count[TGSI_OPCODE_ ## x] : 0)
 	/* if there are no instructions to lower, then we are done: */
 	if (!(OPCS(DST) ||
@@ -1293,7 +1464,8 @@ fd_transform_lowering(const struct fd_lowering_config *config,
 			OPCS(DPH) ||
 			OPCS(DP2) ||
 			OPCS(DP2A) ||
-			ctx.two_side_colors))
+			ctx.two_side_colors ||
+			ctx.saturate))
 		return NULL;

 #if 0  /* debug */
@@ -1359,6 +1531,15 @@ fd_transform_lowering(const struct fd_lowering_config *config,
 		newlen += DP2A_GROW * OPCS(DP2A);
 		numtmp = MAX2(numtmp, DOTP_TMP);
 	}
+	if (ctx.saturate) {
+		int n = info->opcode_count[TGSI_OPCODE_TEX] +
+			info->opcode_count[TGSI_OPCODE_TXP] +
+			info->opcode_count[TGSI_OPCODE_TXB] +
+			info->opcode_count[TGSI_OPCODE_TXB2] +
+			info->opcode_count[TGSI_OPCODE_TXL];
+		newlen += SAMP_GROW * n;
+		numtmp = MAX2(numtmp, SAMP_TMP);
+	}

 	/* specifically don't include two_side_colors temps in the count: */
 	ctx.numtmp = numtmp;
--- a/src/gallium/drivers/freedreno/freedreno_lowering.h
+++ b/src/gallium/drivers/freedreno/freedreno_lowering.h
@@ -69,6 +69,16 @@ struct fd_lowering_config {
 	unsigned lower_DPH : 1;
 	unsigned lower_DP2 : 1;
 	unsigned lower_DP2A : 1;
+
+	/* To emulate certain texture wrap modes, this can be used
+	 * to saturate the specified tex coord to [0.0, 1.0].  The
+	 * bits are according to sampler #, ie. if, for example:
+	 *
+	 *   (conf->saturate_s & (1 << n))
+	 *
+	 * is true, then the s coord for sampler n is saturated.
+	 */
+	unsigned saturate_s, saturate_t, saturate_r;
 };

 const struct tgsi_token * fd_transform_lowering(
--- a/src/gallium/drivers/freedreno/freedreno_program.c
+++ b/src/gallium/drivers/freedreno/freedreno_program.c
@@ -67,7 +67,7 @@ static const char *solid_vp =
 static const char *blit_fp =
 	"FRAG                                        \n"
 	"PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1       \n"
-	"DCL IN[0], TEXCOORD                         \n"
+	"DCL IN[0], TEXCOORD[0], PERSPECTIVE         \n"
 	"DCL OUT[0], COLOR                           \n"
 	"DCL SAMP[0]                                 \n"
 	"  0: TEX OUT[0], IN[0], SAMP[0], 2D         \n"
@@ -77,7 +77,7 @@ static const char *blit_vp =
 	"VERT                                        \n"
 	"DCL IN[0]                                   \n"
 	"DCL IN[1]                                   \n"
-	"DCL OUT[0], TEXCOORD                        \n"
+	"DCL OUT[0], TEXCOORD[0]                     \n"
 	"DCL OUT[1], POSITION                        \n"
 	"  0: MOV OUT[0], IN[0]                      \n"
 	"  0: MOV OUT[1], IN[1]                      \n"
--- a/src/gallium/drivers/freedreno/freedreno_query_hw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c
@@ -183,12 +183,16 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
 		return false;

 	/* if the app tries to read back the query result before the
-	 * back is submitted, that forces us to flush so that there
+	 * batch is submitted, that forces us to flush so that there
 	 * are actually results to wait for:
 	 */
 	if (!LIST_IS_EMPTY(&hq->list)) {
+		/* if app didn't actually trigger any cmdstream, then
+		 * we have nothing to do:
+		 */
+		if (!ctx->needs_flush)
+			return true;
 		DBG("reading query result forces flush!");
-		ctx->needs_flush = true;
 		fd_context_render(&ctx->base);
 	}

@@ -201,9 +205,6 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
 	assert(LIST_IS_EMPTY(&hq->current_periods));
 	assert(!hq->period);

-	if (LIST_IS_EMPTY(&hq->periods))
-		return true;
-
 	/* if !wait, then check the last sample (the one most likely to
 	 * not be ready yet) and bail if it is not ready:
 	 */
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -116,7 +116,7 @@ fd_resource_transfer_map(struct pipe_context *pctx,
 	ptrans->usage = usage;
 	ptrans->box = *box;
 	ptrans->stride = slice->pitch * rsc->cpp;
-	ptrans->layer_stride = ptrans->stride;
+	ptrans->layer_stride = slice->size0;

 	if (usage & PIPE_TRANSFER_READ)
 		op |= DRM_FREEDRENO_PREP_READ;
@@ -199,9 +199,8 @@ setup_slices(struct fd_resource *rsc)

 	for (level = 0; level <= prsc->last_level; level++) {
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
-		uint32_t aligned_width = align(width, 32);

-		slice->pitch = aligned_width;
+		slice->pitch = align(width, 32);
 		slice->offset = size;
 		slice->size0 = slice->pitch * height * rsc->cpp;

@@ -215,6 +214,35 @@ setup_slices(struct fd_resource *rsc)
 	return size;
 }

+/* 2d array and 3d textures seem to want their layers aligned to
+ * page boundaries
+ */
+static uint32_t
+setup_slices_array(struct fd_resource *rsc)
+{
+	struct pipe_resource *prsc = &rsc->base.b;
+	uint32_t level, size = 0;
+	uint32_t width = prsc->width0;
+	uint32_t height = prsc->height0;
+	uint32_t depth = prsc->depth0;
+
+	for (level = 0; level <= prsc->last_level; level++) {
+		struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
+
+		slice->pitch = align(width, 32);
+		slice->offset = size;
+		slice->size0 = align(slice->pitch * height * rsc->cpp, 4096);
+
+		size += slice->size0 * depth * prsc->array_size;
+
+		width = u_minify(width, 1);
+		height = u_minify(height, 1);
+		depth = u_minify(depth, 1);
+	}
+
+	return size;
+}
+
 /**
 * Create a new texture object, using the given template info.
 */
@@ -246,7 +274,16 @@ fd_resource_create(struct pipe_screen *pscreen,

 	assert(rsc->cpp);

-	size = setup_slices(rsc);
+	switch (tmpl->target) {
+	case PIPE_TEXTURE_3D:
+	case PIPE_TEXTURE_1D_ARRAY:
+	case PIPE_TEXTURE_2D_ARRAY:
+		size = setup_slices_array(rsc);
+		break;
+	default:
+		size = setup_slices(rsc);
+		break;
+	}

 	realloc_bo(rsc, size);
 	if (!rsc->bo)
@@ -410,8 +447,8 @@ fd_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
 static void
 fd_blitter_pipe_begin(struct fd_context *ctx)
 {
-	util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertexbuf.vb);
-	util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx);
+	util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb);
+	util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx);
 	util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp);
 	util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer);
 	util_blitter_save_viewport(ctx->blitter, &ctx->viewport);
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -60,7 +60,7 @@ static const struct debug_named_value debug_options[] = {
 		{"msgs",      FD_DBG_MSGS,   "Print debug messages"},
 		{"disasm",    FD_DBG_DISASM, "Dump TGSI and adreno shader disassembly"},
 		{"dclear",    FD_DBG_DCLEAR, "Mark all state dirty after clear"},
-		{"dgmem",     FD_DBG_DGMEM,  "Mark all state dirty after GMEM tile pass"},
+		{"flush",     FD_DBG_FLUSH,  "Force flush after every draw"},
 		{"dscis",     FD_DBG_DSCIS,  "Disable scissor optimization"},
 		{"direct",    FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"},
 		{"dbypass",   FD_DBG_DBYPASS,"Disable GMEM bypass"},
@@ -70,6 +70,7 @@ static const struct debug_named_value debug_options[] = {
 		{"optmsgs",   FD_DBG_OPTMSGS,"Enable optimizater debug messages"},
 		{"optdump",   FD_DBG_OPTDUMP,"Dump shader DAG to .dot files"},
 		{"glsl130",   FD_DBG_GLSL130,"Temporary flag to enable GLSL 130 on a3xx+"},
+		{"nocp",      FD_DBG_NOCP,   "Disable copy-propagation"},
 		DEBUG_NAMED_VALUE_END
 };

@@ -156,23 +157,18 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 	case PIPE_CAP_POINT_SPRITE:
 	case PIPE_CAP_TEXTURE_SHADOW_MAP:
-	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
 	case PIPE_CAP_TEXTURE_SWIZZLE:
 	case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
 	case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
-	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
 	case PIPE_CAP_SEAMLESS_CUBE_MAP:
 	case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
 	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
-	case PIPE_CAP_TGSI_INSTANCEID:
 	case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
 	case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
 	case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
-	case PIPE_CAP_COMPUTE:
-	case PIPE_CAP_START_INSTANCE:
-	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
 	case PIPE_CAP_USER_CONSTANT_BUFFERS:
 	case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
 		return 1;
@@ -181,12 +177,23 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
 	case PIPE_CAP_CONDITIONAL_RENDER:
-	case PIPE_CAP_PRIMITIVE_RESTART:
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 	case PIPE_CAP_TEXTURE_BARRIER:
-	case PIPE_CAP_SM3:
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_CUBE_MAP_ARRAY:
+	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+	case PIPE_CAP_TGSI_INSTANCEID:
+	case PIPE_CAP_START_INSTANCE:
+	case PIPE_CAP_COMPUTE:
 		return 0;

+	case PIPE_CAP_SM3:
+	case PIPE_CAP_PRIMITIVE_RESTART:
+		return (screen->gpu_id >= 300) ? 1 : 0;
+
 	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
 		return 256;

@@ -199,7 +206,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
-	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
 	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
 	case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
@@ -220,6 +227,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 		return 0;

+	case PIPE_CAP_MAX_VIEWPORTS:
+		return 1;
+
 	/* Stream output. */
 	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
 	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
@@ -235,11 +245,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)

 	/* Texturing. */
 	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
 	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
 		return MAX_MIP_LEVELS;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 11;
+
 	case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-		return 0;  /* TODO: a3xx+ should support (required in gles3) */
+		return (screen->gpu_id >= 300) ? 256 : 0;

 	/* Render targets. */
 	case PIPE_CAP_MAX_RENDER_TARGETS:
@@ -277,11 +289,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 10;
 	case PIPE_CAP_UMA:
 		return 1;
-
-	default:
-		DBG("unknown param %d", param);
-		return 0;
 	}
+	debug_printf("unknown param %d\n", param);
+	return 0;
 }

 static float
@@ -296,16 +306,15 @@ fd_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 	case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
 		return 16.0f;
 	case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-		return 16.0f;
+		return 15.0f;
 	case PIPE_CAPF_GUARD_BAND_LEFT:
 	case PIPE_CAPF_GUARD_BAND_TOP:
 	case PIPE_CAPF_GUARD_BAND_RIGHT:
 	case PIPE_CAPF_GUARD_BAND_BOTTOM:
 		return 0.0f;
-	default:
-		DBG("unknown paramf %d", param);
-		return 0;
 	}
+	debug_printf("unknown paramf %d\n", param);
+	return 0;
 }

 static int
@@ -342,7 +351,11 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 	case PIPE_SHADER_CAP_MAX_TEMPS:
 		return 64; /* Max native temporaries. */
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
-		return ((screen->gpu_id >= 300) ? 1024 : 64) * sizeof(float[4]);
+		/* NOTE: the limit for a3xx actually seems to be 512, but
+		 * split between VS and FS.  Use a lower limit of 256 to
+		 * avoid getting into impossible situations:
+		 */
+		return ((screen->gpu_id >= 300) ? 256 : 64) * sizeof(float[4]);
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
 		return 1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
@@ -355,6 +368,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 	case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
 		return 1;
 	case PIPE_SHADER_CAP_SUBROUTINES:
+	case PIPE_SHADER_CAP_DOUBLES:
 		return 0;
 	case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
 		return 1;
@@ -368,10 +382,8 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
-	default:
-		DBG("unknown shader param %d", param);
-		return 0;
 	}
+	debug_printf("unknown shader param %d\n", param);
 	return 0;
 }

@@ -388,6 +400,9 @@ fd_screen_bo_get_handle(struct pipe_screen *pscreen,
 	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
 		whandle->handle = fd_bo_handle(bo);
 		return TRUE;
+	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+		whandle->handle = fd_bo_dmabuf(bo);
+		return TRUE;
 	} else {
 		return FALSE;
 	}
@@ -401,12 +416,17 @@ fd_screen_bo_from_handle(struct pipe_screen *pscreen,
 	struct fd_screen *screen = fd_screen(pscreen);
 	struct fd_bo *bo;

-	if (whandle->type != DRM_API_HANDLE_TYPE_SHARED) {
+	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+		bo = fd_bo_from_name(screen->dev, whandle->handle);
+	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+		bo = fd_bo_from_handle(screen->dev, whandle->handle, 0);
+	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+		bo = fd_bo_from_dmabuf(screen->dev, whandle->handle);
+	} else {
 		DBG("Attempt to import unsupported handle type %d", whandle->type);
 		return NULL;
 	}

-	bo = fd_bo_from_name(screen->dev, whandle->handle);
 	if (!bo) {
 		DBG("ref name 0x%08x failed", whandle->handle);
 		return NULL;
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -177,7 +177,7 @@ fd_set_vertex_buffers(struct pipe_context *pctx,
 		const struct pipe_vertex_buffer *vb)
 {
 	struct fd_context *ctx = fd_context(pctx);
-	struct fd_vertexbuf_stateobj *so = &ctx->vertexbuf;
+	struct fd_vertexbuf_stateobj *so = &ctx->vtx.vertexbuf;
 	int i;

 	/* on a2xx, pitch is encoded in the vtx fetch instruction, so
@@ -237,8 +237,18 @@ static void
 fd_rasterizer_state_bind(struct pipe_context *pctx, void *hwcso)
 {
 	struct fd_context *ctx = fd_context(pctx);
+	struct pipe_scissor_state *old_scissor = fd_context_get_scissor(ctx);
+
 	ctx->rasterizer = hwcso;
 	ctx->dirty |= FD_DIRTY_RASTERIZER;
+
+	/* if scissor enable bit changed we need to mark scissor
+	 * state as dirty as well:
+	 * NOTE: we can do a shallow compare, since we only care
+	 * if it changed to/from &ctx->disable_scissor
+	 */
+	if (old_scissor != fd_context_get_scissor(ctx))
+		ctx->dirty |= FD_DIRTY_SCISSOR;
 }

 static void
@@ -286,7 +296,7 @@ static void
 fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
 {
 	struct fd_context *ctx = fd_context(pctx);
-	ctx->vtx = hwcso;
+	ctx->vtx.vtx = hwcso;
 	ctx->dirty |= FD_DIRTY_VTXSTATE;
 }

--- a/src/gallium/drivers/freedreno/freedreno_texture.c
+++ b/src/gallium/drivers/freedreno/freedreno_texture.c
@@ -49,7 +49,7 @@ fd_sampler_view_destroy(struct pipe_context *pctx,
 	FREE(view);
 }

-static void bind_sampler_states(struct fd_texture_stateobj *prog,
+static void bind_sampler_states(struct fd_texture_stateobj *tex,
 		unsigned nr, void **hwcso)
 {
 	unsigned i;
@@ -58,19 +58,19 @@ static void bind_sampler_states(struct fd_texture_stateobj *prog,
 	for (i = 0; i < nr; i++) {
 		if (hwcso[i])
 			new_nr = i + 1;
-		prog->samplers[i] = hwcso[i];
-		prog->dirty_samplers |= (1 << i);
+		tex->samplers[i] = hwcso[i];
+		tex->dirty_samplers |= (1 << i);
 	}

-	for (; i < prog->num_samplers; i++) {
-		prog->samplers[i] = NULL;
-		prog->dirty_samplers |= (1 << i);
+	for (; i < tex->num_samplers; i++) {
+		tex->samplers[i] = NULL;
+		tex->dirty_samplers |= (1 << i);
 	}

-	prog->num_samplers = new_nr;
+	tex->num_samplers = new_nr;
 }

-static void set_sampler_views(struct fd_texture_stateobj *prog,
+static void set_sampler_views(struct fd_texture_stateobj *tex,
 		unsigned nr, struct pipe_sampler_view **views)
 {
 	unsigned i;
@@ -79,19 +79,19 @@ static void set_sampler_views(struct fd_texture_stateobj *prog,
 	for (i = 0; i < nr; i++) {
 		if (views[i])
 			new_nr = i + 1;
-		pipe_sampler_view_reference(&prog->textures[i], views[i]);
-		prog->dirty_samplers |= (1 << i);
+		pipe_sampler_view_reference(&tex->textures[i], views[i]);
+		tex->dirty_samplers |= (1 << i);
 	}

-	for (; i < prog->num_textures; i++) {
-		pipe_sampler_view_reference(&prog->textures[i], NULL);
-		prog->dirty_samplers |= (1 << i);
+	for (; i < tex->num_textures; i++) {
+		pipe_sampler_view_reference(&tex->textures[i], NULL);
+		tex->dirty_samplers |= (1 << i);
 	}

-	prog->num_textures = new_nr;
+	tex->num_textures = new_nr;
 }

-static void
+void
 fd_sampler_states_bind(struct pipe_context *pctx,
 		unsigned shader, unsigned start,
 		unsigned nr, void **hwcso)
@@ -101,13 +101,6 @@ fd_sampler_states_bind(struct pipe_context *pctx,
 	assert(start == 0);

 	if (shader == PIPE_SHADER_FRAGMENT) {
-		/* on a2xx, since there is a flat address space for textures/samplers,
-		 * a change in # of fragment textures/samplers will trigger patching and
-		 * re-emitting the vertex shader:
-		 */
-		if (nr != ctx->fragtex.num_samplers)
-			ctx->dirty |= FD_DIRTY_TEXSTATE;
-
 		bind_sampler_states(&ctx->fragtex, nr, hwcso);
 		ctx->dirty |= FD_DIRTY_FRAGTEX;
 	}
@@ -169,6 +162,5 @@ fd_texture_init(struct pipe_context *pctx)

 	pctx->sampler_view_destroy = fd_sampler_view_destroy;

-	pctx->bind_sampler_states = fd_sampler_states_bind;
 	pctx->set_sampler_views = fd_set_sampler_views;
 }
--- a/src/gallium/drivers/freedreno/freedreno_texture.h
+++ b/src/gallium/drivers/freedreno/freedreno_texture.h
@@ -31,6 +31,10 @@

 #include "pipe/p_context.h"

+void fd_sampler_states_bind(struct pipe_context *pctx,
+		unsigned shader, unsigned start,
+		unsigned nr, void **hwcso);
+
 void fd_texture_init(struct pipe_context *pctx);

 #endif /* FREEDRENO_TEXTURE_H_ */
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_half.h"
 #include "util/u_dynarray.h"
+#include "util/u_pack_color.h"

 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
@@ -55,7 +56,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_MSGS     0x0001
 #define FD_DBG_DISASM   0x0002
 #define FD_DBG_DCLEAR   0x0004
-#define FD_DBG_DGMEM    0x0008
+#define FD_DBG_FLUSH    0x0008
 #define FD_DBG_DSCIS    0x0010
 #define FD_DBG_DIRECT   0x0020
 #define FD_DBG_DBYPASS  0x0040
@@ -65,6 +66,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_OPTMSGS  0x0400
 #define FD_DBG_OPTDUMP  0x0800
 #define FD_DBG_GLSL130  0x1000
+#define FD_DBG_NOCP     0x2000

 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
@@ -208,6 +210,10 @@ static inline void
 OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
 		struct fd_ringmarker *end)
 {
+	uint32_t dwords = fd_ringmarker_dwords(start, end);
+
+	assert(dwords > 0);
+
 	/* for debug after a lock up, write a unique counter value
 	 * to scratch6 for each IB, to make it easier to match up
 	 * register dumps to cmdstream.  The combination of IB and
@@ -218,7 +224,7 @@ OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,

 	OUT_PKT3(ring, CP_INDIRECT_BUFFER_PFD, 2);
 	fd_ringbuffer_emit_reloc_ring(ring, start, end);
-	OUT_RING(ring, fd_ringmarker_dwords(start, end));
+	OUT_RING(ring, dwords);

 	emit_marker(ring, 6);
 }
@@ -238,4 +244,24 @@ emit_marker(struct fd_ringbuffer *ring, int scratch_idx)
 	OUT_RING(ring, ++marker_cnt);
 }

+/* helper to get a numeric value from an environment variable.  Mostly
+ * just leaving this here because it is helpful for brute-forcing
+ * unknown formats, etc, which the blob driver does not support:
+ */
+static inline uint32_t env2u(const char *envvar)
+{
+	char *str = getenv(envvar);
+	if (str)
+		return strtol(str, NULL, 0);
+	return 0;
+}
+
+static inline uint32_t
+pack_rgba(enum pipe_format format, const float *rgba)
+{
+	union util_color uc;
+	util_pack_color(rgba, format, &uc);
+	return uc.ui[0];
+}
+
 #endif /* FREEDRENO_UTIL_H_ */
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -81,6 +81,8 @@ void ir3_destroy(struct ir3 *shader)
 		shader->chunk = chunk->next;
 		free(chunk);
 	}
+	free(shader->instrs);
+	free(shader->baryfs);
 	free(shader);
 }

@@ -104,7 +106,7 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
 		val.iim_val = reg->iim_val;
 	} else {
 		int8_t components = util_last_bit(reg->wrmask);
-		int8_t max = (reg->num + repeat + components - 1) >> 2;
+		int16_t max = (reg->num + repeat + components - 1) >> 2;

 		val.comp = reg->num & 0x3;
 		val.num  = reg->num >> 2;
@@ -596,6 +598,15 @@ static void insert_instr(struct ir3 *shader,
 				shader->instrs_sz * sizeof(shader->instrs[0]));
 	}
 	shader->instrs[shader->instrs_count++] = instr;
+
+	if (is_input(instr)) {
+		if (shader->baryfs_count == shader->baryfs_sz) {
+			shader->baryfs_sz = MAX2(2 * shader->baryfs_sz, 16);
+			shader->baryfs = realloc(shader->baryfs,
+					shader->baryfs_sz * sizeof(shader->baryfs[0]));
+		}
+		shader->baryfs[shader->baryfs_count++] = instr;
+	}
 }

 struct ir3_block * ir3_block_create(struct ir3 *shader,
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -47,7 +47,7 @@ struct ir3_info {
 	 */
 	int8_t   max_reg;   /* highest GPR # used by shader */
 	int8_t   max_half_reg;
-	int8_t   max_const;
+	int16_t  max_const;
 };

 struct ir3_register {
@@ -97,6 +97,8 @@ struct ir3_register {
 	int wrmask;
 };

+#define IR3_INSTR_SRCS 10
+
 struct ir3_instruction {
 	struct ir3_block *block;
 	int category;
@@ -156,7 +158,7 @@ struct ir3_instruction {
 	} flags;
 	int repeat;
 	unsigned regs_count;
-	struct ir3_register *regs[5];
+	struct ir3_register *regs[1 + IR3_INSTR_SRCS];
 	union {
 		struct {
 			char inv;
@@ -208,7 +210,11 @@ struct ir3_instruction {
 		 * result of moving a const to a reg would have a low cost,  so to
 		 * it could make sense to duplicate the instruction at various
 		 * points where the result is needed to reduce register footprint.
+		 *
+		 * DEPTH_UNUSED is used to mark unused instructions after the
+		 * depth calculation pass.
 		 */
+#define DEPTH_UNUSED  ~0
 		unsigned depth;
 	};
 	struct ir3_instruction *next;
@@ -222,6 +228,8 @@ struct ir3_heap_chunk;
 struct ir3 {
 	unsigned instrs_count, instrs_sz;
 	struct ir3_instruction **instrs;
+	unsigned baryfs_count, baryfs_sz;
+	struct ir3_instruction **baryfs;
 	unsigned heap_idx;
 	struct ir3_heap_chunk *chunk;
 };
@@ -270,6 +278,10 @@ static inline void ir3_clear_mark(struct ir3 *shader)
 	/* TODO would be nice to drop the instruction array.. for
 	 * new compiler, _clear_mark() is all we use it for, and
 	 * we could probably manage a linked list instead..
+	 *
+	 * Also, we'll probably want to mark instructions within
+	 * a block, so tracking the list of instrs globally is
+	 * unlikely to be what we want.
 	 */
 	unsigned i;
 	for (i = 0; i < shader->instrs_count; i++) {
@@ -406,12 +418,12 @@ void ir3_block_depth(struct ir3_block *block);
 void ir3_block_cp(struct ir3_block *block);

 /* scheduling: */
-void ir3_block_sched(struct ir3_block *block);
+int ir3_block_sched(struct ir3_block *block);

 /* register assignment: */
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 		bool half_precision, bool frag_coord, bool frag_face,
-		bool *has_samp);
+		bool *has_samp, int *max_bary);

 #ifndef ARRAY_SIZE
 #  define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
@@ -444,7 +456,7 @@ static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
 {
 	unsigned idx = regmask_idx(reg);
 	unsigned i;
-	for (i = 0; i < 4; i++, idx++)
+	for (i = 0; i < IR3_INSTR_SRCS; i++, idx++)
 		if (reg->wrmask & (1 << i))
 			(*regmask)[idx / 8] |= 1 << (idx % 8);
 }
@@ -457,7 +469,7 @@ static inline void regmask_set_if_not(regmask_t *a,
 {
 	unsigned idx = regmask_idx(reg);
 	unsigned i;
-	for (i = 0; i < 4; i++, idx++)
+	for (i = 0; i < IR3_INSTR_SRCS; i++, idx++)
 		if (reg->wrmask & (1 << i))
 			if (!((*b)[idx / 8] & (1 << (idx % 8))))
 				(*a)[idx / 8] |= 1 << (idx % 8);
@@ -468,7 +480,7 @@ static inline unsigned regmask_get(regmask_t *regmask,
 {
 	unsigned idx = regmask_idx(reg);
 	unsigned i;
-	for (i = 0; i < 4; i++, idx++)
+	for (i = 0; i < IR3_INSTR_SRCS; i++, idx++)
 		if (reg->wrmask & (1 << i))
 			if ((*regmask)[idx / 8] & (1 << (idx % 8)))
 				return true;
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -34,7 +34,7 @@

 int ir3_compile_shader(struct ir3_shader_variant *so,
 		const struct tgsi_token *tokens,
-		struct ir3_shader_key key);
+		struct ir3_shader_key key, bool cp);
 int ir3_compile_shader_old(struct ir3_shader_variant *so,
 		const struct tgsi_token *tokens,
 		struct ir3_shader_key key);
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c
@@ -125,7 +125,7 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
 {
 	unsigned ret, base = 0;
 	struct tgsi_shader_info *info = &ctx->info;
-	const struct fd_lowering_config lconfig = {
+	struct fd_lowering_config lconfig = {
 			.color_two_side = so->key.color_two_side,
 			.lower_DST  = true,
 			.lower_XPD  = true,
@@ -143,6 +143,20 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
 			.lower_DP2A = true,
 	};

+	switch (so->type) {
+	case SHADER_FRAGMENT:
+	case SHADER_COMPUTE:
+		lconfig.saturate_s = so->key.fsaturate_s;
+		lconfig.saturate_t = so->key.fsaturate_t;
+		lconfig.saturate_r = so->key.fsaturate_r;
+		break;
+	case SHADER_VERTEX:
+		lconfig.saturate_s = so->key.vsaturate_s;
+		lconfig.saturate_t = so->key.vsaturate_t;
+		lconfig.saturate_r = so->key.vsaturate_r;
+		break;
+	}
+
 	ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
 	ctx->free_tokens = !!ctx->tokens;
 	if (!ctx->tokens) {
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -70,7 +70,7 @@ static void walk_children(struct ir3_instruction *instr, bool keep)
 static struct ir3_instruction *
 instr_cp_fanin(struct ir3_instruction *instr)
 {
-	unsigned i;
+	unsigned i, j;

 	/* we need to handle fanin specially, to detect cases
 	 * when we need to keep a mov
@@ -92,7 +92,15 @@ instr_cp_fanin(struct ir3_instruction *instr)
 			if (is_meta(cand) && (cand->opc == OPC_META_FO))
 				cand = instr_cp(src->instr, true);

-			src->instr = cand;
+			/* we can't have 2 registers referring to the same instruction, so
+			 * go through and check if any already refer to the candidate
+			 * instruction. if so, don't do the propagation.
+			 */
+			for (j = 1; j < instr->regs_count; j++)
+				if (instr->regs[j]->instr == cand)
+					break;
+			if (j == instr->regs_count)
+				src->instr = cand;
 		}
 	}

--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -150,10 +150,22 @@ void ir3_block_depth(struct ir3_block *block)
 		if (block->outputs[i])
 			ir3_instr_depth(block->outputs[i]);

-	/* at this point, any unvisited input is unused: */
+	/* mark un-used instructions: */
+	for (i = 0; i < block->shader->instrs_count; i++) {
+		struct ir3_instruction *instr = block->shader->instrs[i];
+
+		/* just consider instructions within this block: */
+		if (instr->block != block)
+			continue;
+
+		if (!ir3_instr_check_mark(instr))
+			instr->depth = DEPTH_UNUSED;
+	}
+
+	/* cleanup unused inputs: */
 	for (i = 0; i < block->ninputs; i++) {
 		struct ir3_instruction *in = block->inputs[i];
-		if (in && !ir3_instr_check_mark(in))
+		if (in && (in->depth == DEPTH_UNUSED))
 			block->inputs[i] = NULL;
 	}
 }
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -58,6 +58,7 @@ struct ir3_ra_ctx {
 	bool frag_face;
 	bool has_samp;
 	int cnt;
+	int max_bary;
 	bool error;
 };

@@ -253,7 +254,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx,
 				(instr->regs_count == 1)) {
 			unsigned i, base = instr->regs[0]->num & ~0x3;
 			for (i = 0; i < 4; i++) {
-				struct ir3_instruction *in = ctx->block->inputs[base + i];
+				struct ir3_instruction *in = NULL;
+				if ((base + i) < ctx->block->ninputs)
+					in = ctx->block->inputs[base + i];
 				if (in)
 					compute_clobbers(ctx, in->next, in, &liveregs);
 			}
@@ -471,7 +474,9 @@ static void ra_assign_dst_shader_input(struct ir3_visitor *v,

 	/* trigger assignment of all our companion input components: */
 	for (i = 0; i < 4; i++) {
-		struct ir3_instruction *in = instr->block->inputs[i+base];
+		struct ir3_instruction *in = NULL;
+		if ((base + i) < instr->block->ninputs)
+			in = instr->block->inputs[base + i];
 		if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
 			ra_assign(a->ctx, in, a->num + off + i);
 	}
@@ -610,6 +615,12 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (is_meta(n))
 			continue;

+		if (is_input(n)) {
+			struct ir3_register *inloc = n->regs[1];
+			assert(inloc->flags & IR3_REG_IMMED);
+			ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
+		}
+
 		for (i = 1; i < n->regs_count; i++) {
 			reg = n->regs[i];

@@ -771,7 +782,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)

 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 		bool half_precision, bool frag_coord, bool frag_face,
-		bool *has_samp)
+		bool *has_samp, int *max_bary)
 {
 	struct ir3_ra_ctx ctx = {
 			.block = block,
@@ -779,12 +790,14 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 			.half_precision = half_precision,
 			.frag_coord = frag_coord,
 			.frag_face = frag_face,
+			.max_bary = -1,
 	};
 	int ret;

 	ir3_clear_mark(block->shader);
 	ret = block_ra(&ctx, block);
 	*has_samp = ctx.has_samp;
+	*max_bary = ctx.max_bary;

 	return ret;
 }
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -64,6 +64,7 @@ struct ir3_sched_ctx {
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
 	unsigned cnt;
+	bool error;
 };

 static struct ir3_instruction *
@@ -161,7 +162,8 @@ static void schedule(struct ir3_sched_ctx *ctx,
 * Delay-slot calculation.  Follows fanin/fanout.
 */

-static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
+/* calculate delay for specified src: */
+static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned srcn)
 {
@@ -172,7 +174,7 @@ static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
 		for (i = 1; i < assigner->regs_count; i++) {
 			struct ir3_register *reg = assigner->regs[i];
 			if (reg->flags & IR3_REG_SSA) {
-				unsigned d = delay_calc2(ctx, reg->instr,
+				unsigned d = delay_calc_srcn(ctx, reg->instr,
 						consumer, srcn);
 				delay = MAX2(delay, d);
 			}
@@ -185,6 +187,7 @@ static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
 	return delay;
 }

+/* calculate delay for instruction (maximum of delay for all srcs): */
 static unsigned delay_calc(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *instr)
 {
@@ -193,7 +196,7 @@ static unsigned delay_calc(struct ir3_sched_ctx *ctx,
 	for (i = 1; i < instr->regs_count; i++) {
 		struct ir3_register *reg = instr->regs[i];
 		if (reg->flags & IR3_REG_SSA) {
-			unsigned d = delay_calc2(ctx, reg->instr,
+			unsigned d = delay_calc_srcn(ctx, reg->instr,
 					instr, i - 1);
 			delay = MAX2(delay, d);
 		}
@@ -239,6 +242,32 @@ static int trysched(struct ir3_sched_ctx *ctx,
 	if (delay)
 		return delay;

+	/* if the instruction is a kill, we need to ensure *every*
+	 * bary.f is scheduled.  The hw seems unhappy if the thread
+	 * gets killed before the end-input (ei) flag is hit.
+	 *
+	 * We could do this by adding each bary.f instruction as
+	 * virtual ssa src for the kill instruction.  But we have
+	 * fixed length instr->regs[].
+	 *
+	 * TODO this wouldn't be quite right if we had multiple
+	 * basic blocks, if any block was conditional.  We'd need
+	 * to schedule the bary.f's outside of any block which
+	 * was conditional that contained a kill.. I think..
+	 */
+	if (is_kill(instr)) {
+		struct ir3 *ir = instr->block->shader;
+		unsigned i;
+
+		for (i = 0; i < ir->baryfs_count; i++) {
+			if (ir->baryfs[i]->depth == DEPTH_UNUSED)
+				continue;
+			delay = trysched(ctx, ir->baryfs[i]);
+			if (delay)
+				return delay;
+		}
+	}
+
 	/* if this is a write to address/predicate register, and that
 	 * register is currently in use, we need to defer until it is
 	 * free:
@@ -308,7 +337,8 @@ static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
 	struct ir3_instruction *instr = block->head;
 	bool addr_in_use = false;
 	bool pred_in_use = false;
-	unsigned cnt = ~0;
+	bool all_delayed = true;
+	unsigned cnt = ~0, attempted = 0;

 	while (instr) {
 		struct ir3_instruction *next = instr->next;
@@ -317,6 +347,10 @@ static int block_sched_undelayed(struct ir3_sched_ctx *ctx,

 		if (addr || pred) {
 			int ret = trysched(ctx, instr);
+
+			if (ret != DELAYED)
+				all_delayed = false;
+
 			if (ret == SCHEDULED)
 				cnt = 0;
 			else if (ret > 0)
@@ -325,6 +359,8 @@ static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
 				addr_in_use = true;
 			if (pred)
 				pred_in_use = true;
+
+			attempted++;
 		}

 		instr = next;
@@ -336,6 +372,12 @@ static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
 	if (!pred_in_use)
 		ctx->pred = NULL;

+	/* detect if we've gotten ourselves into an impossible situation
+	 * and bail if needed
+	 */
+	if (all_delayed && (attempted > 0))
+		ctx->error = true;
+
 	return cnt;
 }

@@ -356,7 +398,7 @@ static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		}
 	}

-	while ((instr = block->head)) {
+	while ((instr = block->head) && !ctx->error) {
 		/* NOTE: always grab next *before* trysched(), in case the
 		 * instruction is actually scheduled (and therefore moved
 		 * from depth list into scheduled list)
@@ -393,9 +435,12 @@ static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	block->head = reverse(ctx->scheduled);
 }

-void ir3_block_sched(struct ir3_block *block)
+int ir3_block_sched(struct ir3_block *block)
 {
 	struct ir3_sched_ctx ctx = {0};
 	ir3_clear_mark(block->shader);
 	block_sched(&ctx, block);
+	if (ctx.error)
+		return -1;
+	return 0;
 }
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -68,7 +68,11 @@ assemble_variant(struct ir3_shader_variant *v)
 	free(bin);

 	v->instrlen = v->info.sizedwords / 8;
-	v->constlen = v->info.max_const + 1;
+	/* NOTE: if relative addressing is used, we set constlen in
+	 * the compiler (to worst-case value) since we don't know in
+	 * the assembler what the max addr reg value can be:
+	 */
+	v->constlen = MAX2(v->constlen, v->info.max_const + 1);
 }

 /* for vertex shader, the inputs are loaded into registers before the shader
@@ -81,16 +85,27 @@ fixup_vp_regfootprint(struct ir3_shader_variant *v)
 	unsigned i;
 	for (i = 0; i < v->inputs_count; i++) {
 		if (v->inputs[i].compmask) {
-			uint32_t regid = (v->inputs[i].regid + 3) >> 2;
+			int32_t regid = (v->inputs[i].regid + 3) >> 2;
 			v->info.max_reg = MAX2(v->info.max_reg, regid);
 		}
 	}
 	for (i = 0; i < v->outputs_count; i++) {
-		uint32_t regid = (v->outputs[i].regid + 3) >> 2;
+		int32_t regid = (v->outputs[i].regid + 3) >> 2;
 		v->info.max_reg = MAX2(v->info.max_reg, regid);
 	}
 }

+/* reset before attempting to compile again.. */
+static void reset_variant(struct ir3_shader_variant *v, const char *msg)
+{
+	debug_error(msg);
+	v->inputs_count = 0;
+	v->outputs_count = 0;
+	v->total_in = 0;
+	v->has_samp = false;
+	v->immediates_count = 0;
+}
+
 static struct ir3_shader_variant *
 create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 {
@@ -112,15 +127,12 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	}

 	if (!(fd_mesa_debug & FD_DBG_NOOPT)) {
-		ret = ir3_compile_shader(v, tokens, key);
+		ret = ir3_compile_shader(v, tokens, key, true);
 		if (ret) {
-			debug_error("new compiler failed, trying fallback!");
-
-			v->inputs_count = 0;
-			v->outputs_count = 0;
-			v->total_in = 0;
-			v->has_samp = false;
-			v->immediates_count = 0;
+			reset_variant(v, "new compiler failed, trying without copy propagation!");
+			ret = ir3_compile_shader(v, tokens, key, false);
+			if (ret)
+				reset_variant(v, "new compiler failed, trying fallback!");
 		}
 	} else {
 		ret = -1;  /* force fallback to old compiler */
@@ -165,16 +177,30 @@ ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	 * so normalize the key to avoid constructing multiple identical
 	 * variants:
 	 */
-	if (shader->type == SHADER_FRAGMENT) {
+	switch (shader->type) {
+	case SHADER_FRAGMENT:
+	case SHADER_COMPUTE:
 		key.binning_pass = false;
-	}
-	if (shader->type == SHADER_VERTEX) {
+		if (key.has_per_samp) {
+			key.vsaturate_s = 0;
+			key.vsaturate_t = 0;
+			key.vsaturate_r = 0;
+		}
+		break;
+	case SHADER_VERTEX:
 		key.color_two_side = false;
 		key.half_precision = false;
+		key.alpha = false;
+		if (key.has_per_samp) {
+			key.fsaturate_s = 0;
+			key.fsaturate_t = 0;
+			key.fsaturate_r = 0;
+		}
+		break;
 	}

 	for (v = shader->variants; v; v = v->next)
-		if (!memcmp(&key, &v->key, sizeof(key)))
+		if (ir3_shader_key_equal(&key, &v->key))
 			return v;

 	/* compile new variant if it doesn't exist already: */
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -54,14 +54,54 @@ static inline uint16_t sem2idx(ir3_semantic sem)
 * in hw (two sided color), binning-pass vertex shader, etc.
 */
 struct ir3_shader_key {
-	/* vertex shader variant parameters: */
-	unsigned binning_pass : 1;
+	union {
+		struct {
+			/* do we need to check {v,f}saturate_{s,t,r}? */
+			unsigned has_per_samp : 1;
+
+			/*
+			 * Vertex shader variant parameters:
+			 */
+			unsigned binning_pass : 1;
+
+			/*
+			 * Fragment shader variant parameters:
+			 */
+			unsigned color_two_side : 1;
+			unsigned half_precision : 1;
+			/* For rendering to alpha, we need a bit of special handling
+			 * since the hw always takes gl_FragColor starting from x
+			 * component, rather than figuring out to take the w component.
+			 * We could be more clever and generate variants for other
+			 * render target formats (ie. luminance formats are xxx1), but
+			 * let's start with this and see how it goes:
+			 */
+			unsigned alpha : 1;
+		};
+		uint32_t global;
+	};
+
+	/* bitmask of samplers which need coords clamped for vertex
+	 * shader:
+	 */
+	uint16_t vsaturate_s, vsaturate_t, vsaturate_r;
+
+	/* bitmask of samplers which need coords clamped for frag
+	 * shader:
+	 */
+	uint16_t fsaturate_s, fsaturate_t, fsaturate_r;

-	/* fragment shader variant parameters: */
-	unsigned color_two_side : 1;
-	unsigned half_precision : 1;
 };

+static inline bool
+ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
+{
+	/* slow-path if we need to check {v,f}saturate_{s,t,r} */
+	if (a->has_per_samp || b->has_per_samp)
+		return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
+	return a->global == b->global;
+}
+
 struct ir3_shader_variant {
 	struct fd_bo *bo;

@@ -110,9 +150,20 @@ struct ir3_shader_variant {
 		uint8_t regid;
 		uint8_t compmask;
 		uint8_t ncomp;
-		/* in theory inloc of fs should match outloc of vs: */
+		/* In theory inloc of fs should match outloc of vs.  Or
+		 * rather the outloc of the vs is 8 plus the offset passed
+		 * to bary.f.  Presumably that +8 is to account for
+		 * gl_Position/gl_PointSize?
+		 *
+		 * NOTE inloc is currently aligned to 4 (we don't try
+		 * to pack varyings).  Changing this would likely break
+		 * assumptions in a few places (like setting up of flat
+		 * shading in fd3_program) so be sure to check all the
+		 * spots where inloc is used.
+		 */
 		uint8_t inloc;
 		uint8_t bary;
+		uint8_t interpolate;
 	} inputs[16 + 2];  /* +POSITION +FACE */

 	unsigned total_in;       /* sum of inputs (scalar) */
@@ -120,6 +171,9 @@ struct ir3_shader_variant {
 	/* do we have one or more texture sample instructions: */
 	bool has_samp;

+	/* do we have kill instructions: */
+	bool has_kill;
+
 	/* const reg # of first immediate, ie. 1 == c1
 	 * (not regid, because TGSI thinks in terms of vec4 registers,
 	 * not scalar registers)
@@ -147,9 +201,9 @@ struct ir3_shader {
 	struct ir3_shader_variant *variants;

 	/* so far, only used for blit_prog shader.. values for
-	 * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE
+	 * VPC_VARYING_PS_REPL[i].MODE
 	 */
-	uint32_t vinterp[4], vpsrepl[4];
+	uint32_t vpsrepl[4];
 };


--- a/src/gallium/drivers/nouveau/Android.mk
+++ b/src/gallium/drivers/nouveau/Android.mk
@@ -32,7 +32,7 @@ LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
 	$(NV30_C_SOURCES) \
 	$(NV50_CODEGEN_SOURCES) \
-	$(NV50_C_SOURES) \
+	$(NV50_C_SOURCES) \
 	$(NVC0_CODEGEN_SOURCES) \
 	$(NVC0_C_SOURCES)

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -140,6 +140,7 @@ private:
   code[(0x##b) / 32] |= 1 << ((0x##b) % 32)

 #define FTZ_(b) if (i->ftz) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+#define DNZ_(b) if (i->dnz) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)

 #define SAT_(b) if (i->saturate) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)

@@ -464,6 +465,7 @@ CodeEmitterGK110::emitFMAD(const Instruction *i)
   SAT_(35);
   RND_(36, F);
   FTZ_(38);
+   DNZ_(39);

   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();

@@ -487,6 +489,7 @@ CodeEmitterGK110::emitFMUL(const Instruction *i)
      emitForm_L(i, 0x200, 0x2, Modifier(0));

      FTZ_(38);
+      DNZ_(39);
      SAT_(3a);
      if (neg)
         code[1] ^= 1 << 22;
@@ -499,6 +502,7 @@ CodeEmitterGK110::emitFMUL(const Instruction *i)

      RND_(2a, F);
      FTZ_(2f);
+      DNZ_(30);
      SAT_(35);

      if (code[0] & 0x1) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -432,7 +432,7 @@ CodeEmitterGM107::emitNEG2(int pos, const ValueRef &a, const ValueRef &b)
 void
 CodeEmitterGM107::emitFMZ(int pos, int len)
 {
-   emitField(pos, len, /*XXX: insn->dnz << 1 | */ insn->ftz);
+   emitField(pos, len, insn->dnz << 1 | insn->ftz);
 }

 void
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -546,9 +546,9 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 			       S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
 			       S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
 			       S_028814_FACE(!state->front_ccw) |
-			       S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
-			       S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
-			       S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) |
+			       S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
+			       S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
+			       S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
 			       S_028814_POLY_MODE(state->fill_front != PIPE_POLYGON_MODE_FILL ||
 						  state->fill_back != PIPE_POLYGON_MODE_FILL) |
 			       S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -535,9 +535,9 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 				 S_028814_CULL_FRONT(state->cull_face & PIPE_FACE_FRONT ? 1 : 0) |
 				 S_028814_CULL_BACK(state->cull_face & PIPE_FACE_BACK ? 1 : 0) |
 				 S_028814_FACE(!state->front_ccw) |
-				 S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
-				 S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
-				 S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) |
+				 S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
+				 S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
+				 S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
 				 S_028814_POLY_MODE(state->fill_front != PIPE_POLYGON_MODE_FILL ||
 									 state->fill_back != PIPE_POLYGON_MODE_FILL) |
 				 S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -158,8 +158,10 @@ static void r600_bind_blend_state(struct pipe_context *ctx, void *state)
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_blend_state *blend = (struct r600_blend_state *)state;

-	if (blend == NULL)
+	if (blend == NULL) {
+		r600_set_cso_state_with_cb(&rctx->blend_state, NULL, NULL);
 		return;
+	}

 	r600_bind_blend_state_internal(rctx, blend, rctx->force_blend_disable);
 }
@@ -447,8 +449,13 @@ static void r600_delete_sampler_state(struct pipe_context *ctx, void *state)

 static void r600_delete_blend_state(struct pipe_context *ctx, void *state)
 {
+	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_blend_state *blend = (struct r600_blend_state*)state;

+	if (rctx->blend_state.cso == state) {
+		ctx->bind_blend_state(ctx, NULL);
+	}
+
 	r600_release_command_buffer(&blend->buffer);
 	r600_release_command_buffer(&blend->buffer_no_blend);
 	FREE(blend);
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -110,11 +110,13 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	enum radeon_bo_flag flags = 0;

 	switch (res->b.b.usage) {
+	case PIPE_USAGE_STREAM:
+		flags = RADEON_FLAG_GTT_WC;
+		/* fall through */
 	case PIPE_USAGE_STAGING:
 		/* Transfers are likely to occur more often with these resources. */
 		res->domains = RADEON_DOMAIN_GTT;
 		break;
-	case PIPE_USAGE_STREAM:
 	case PIPE_USAGE_DYNAMIC:
 		/* Older kernels didn't always flush the HDP cache before
 		 * CS execution
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -33,10 +33,10 @@

 #define RADEON_LLVM_MAX_INPUTS 32 * 4
 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4
-#define RADEON_LLVM_MAX_BRANCH_DEPTH 16
-#define RADEON_LLVM_MAX_LOOP_DEPTH 16
 #define RADEON_LLVM_MAX_ARRAYS 16

+#define RADEON_LLVM_INITIAL_CF_DEPTH 4
+
 #define RADEON_LLVM_MAX_SYSTEM_VALUES 4

 struct radeon_llvm_branch {
@@ -122,11 +122,13 @@ struct radeon_llvm_context {

 	/*=== Private Members ===*/

-	struct radeon_llvm_branch branch[RADEON_LLVM_MAX_BRANCH_DEPTH];
-	struct radeon_llvm_loop loop[RADEON_LLVM_MAX_LOOP_DEPTH];
+	struct radeon_llvm_branch *branch;
+	struct radeon_llvm_loop *loop;

 	unsigned branch_depth;
+	unsigned branch_depth_max;
 	unsigned loop_depth;
+	unsigned loop_depth_max;

 	struct tgsi_declaration_range arrays[RADEON_LLVM_MAX_ARRAYS];
 	unsigned num_arrays;
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -445,7 +445,19 @@ static void bgnloop_emit(
 						endloop_block, "LOOP");
 	LLVMBuildBr(gallivm->builder, loop_block);
 	LLVMPositionBuilderAtEnd(gallivm->builder, loop_block);
-	ctx->loop_depth++;
+
+	if (++ctx->loop_depth > ctx->loop_depth_max) {
+		unsigned new_max = ctx->loop_depth_max << 1;
+
+		if (!new_max)
+			new_max = RADEON_LLVM_INITIAL_CF_DEPTH;
+
+		ctx->loop = REALLOC(ctx->loop, ctx->loop_depth_max *
+				    sizeof(ctx->loop[0]),
+				    new_max * sizeof(ctx->loop[0]));
+		ctx->loop_depth_max = new_max;
+	}
+
 	ctx->loop[ctx->loop_depth - 1].loop_block = loop_block;
 	ctx->loop[ctx->loop_depth - 1].endloop_block = endloop_block;
 }
@@ -576,7 +588,18 @@ static void if_cond_emit(
 	LLVMBuildCondBr(gallivm->builder, cond, if_block, else_block);
 	LLVMPositionBuilderAtEnd(gallivm->builder, if_block);

-	ctx->branch_depth++;
+	if (++ctx->branch_depth > ctx->branch_depth_max) {
+		unsigned new_max = ctx->branch_depth_max << 1;
+
+		if (!new_max)
+			new_max = RADEON_LLVM_INITIAL_CF_DEPTH;
+
+		ctx->branch = REALLOC(ctx->branch, ctx->branch_depth_max *
+				      sizeof(ctx->branch[0]),
+				      new_max * sizeof(ctx->branch[0]));
+		ctx->branch_depth_max = new_max;
+	}
+
 	ctx->branch[ctx->branch_depth - 1].endif_block = endif_block;
 	ctx->branch[ctx->branch_depth - 1].if_block = if_block;
 	ctx->branch[ctx->branch_depth - 1].else_block = else_block;
@@ -1439,4 +1462,10 @@ void radeon_llvm_dispose(struct radeon_llvm_context * ctx)
 	LLVMContextDispose(ctx->soa.bld_base.base.gallivm->context);
 	FREE(ctx->temps);
 	ctx->temps = NULL;
+	FREE(ctx->loop);
+	ctx->loop = NULL;
+	ctx->loop_depth_max = 0;
+	FREE(ctx->branch);
+	ctx->branch = NULL;
+	ctx->branch_depth_max = 0;
 }
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -411,6 +411,11 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
 						    NULL, NULL);
 			}
+		} else {
+			samplers->depth_texture_mask &= ~(1 << slot);
+			samplers->compressed_colortex_mask &= ~(1 << slot);
+			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
+					    NULL, NULL);
 		}
 	}

--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -599,9 +599,9 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 		S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
 		S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
 		S_028814_FACE(!state->front_ccw) |
-		S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
-		S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
-		S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) |
+		S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
+		S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
+		S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
 		S_028814_POLY_MODE(polygon_dual_mode) |
 		S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back));
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -781,7 +781,7 @@ static void si_state_draw(struct si_context *sctx,

 	if (info->indexed) {
 		uint32_t max_size = (ib->buffer->width0 - ib->offset) /
-				 sctx->index_buffer.index_size;
+				    ib->index_size;
 		uint64_t va = r600_resource(ib->buffer)->gpu_address + ib->offset;

 		si_pm4_add_bo(pm4, (struct r600_resource *)ib->buffer, RADEON_USAGE_READ,
--- a/src/gallium/state_trackers/gbm/gbm_drm.c
+++ b/src/gallium/state_trackers/gbm/gbm_drm.c
@@ -85,7 +85,7 @@ gbm_gallium_drm_is_format_supported(struct gbm_device *gbm,
   if (pf == PIPE_FORMAT_NONE)
      return 0;

-   if (!gdrm->screen->is_format_supported(gdrm->screen, PIPE_TEXTURE_2D, pf, 0,
+   if (!gdrm->screen->is_format_supported(gdrm->screen, pf, PIPE_TEXTURE_2D, 0,
                                          gbm_usage_to_gallium(usage)))
      return 0;

--- a/src/gallium/state_trackers/wgl/stw_ext_context.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_context.c
@@ -50,8 +50,8 @@
 HGLRC WINAPI
 wglCreateContextAttribsARB(HDC hDC, HGLRC hShareContext, const int *attribList)
 {
-   typedef HGLRC (*wglCreateContext_t)(HDC hdc);
-   typedef BOOL (*wglDeleteContext_t)(HGLRC hglrc);
+   typedef HGLRC (WINAPI *wglCreateContext_t)(HDC hdc);
+   typedef BOOL (WINAPI *wglDeleteContext_t)(HGLRC hglrc);
   HGLRC context;
   static HMODULE opengl_lib = 0;
   static wglCreateContext_t wglCreateContext_func = 0;
--- a/src/gallium/targets/pipe-loader/pipe_msm.c
+++ b/src/gallium/targets/pipe-loader/pipe_msm.c
@@ -17,5 +17,28 @@ create_screen(int fd)
   return screen;
 }

+static const struct drm_conf_ret throttle_ret = {
+   .type = DRM_CONF_INT,
+   .val.val_int = 2,
+};
+
+static const struct drm_conf_ret share_fd_ret = {
+   .type = DRM_CONF_BOOL,
+   .val.val_int = true,
+};
+
+static const struct drm_conf_ret *drm_configuration(enum drm_conf conf)
+{
+   switch (conf) {
+   case DRM_CONF_THROTTLE:
+      return &throttle_ret;
+   case DRM_CONF_SHARE_FD:
+      return &share_fd_ret;
+   default:
+      break;
+   }
+   return NULL;
+}
+
 PUBLIC
-DRM_DRIVER_DESCRIPTOR("msm", "freedreno", create_screen, NULL)
+DRM_DRIVER_DESCRIPTOR("msm", "freedreno", create_screen, drm_configuration)
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -811,17 +811,12 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
    desc.flags = flags;

    /* Assign a buffer manager. */
+    assert(flags < RADEON_NUM_CACHE_MANAGERS);
    if (use_reusable_pool) {
-        if (domain == RADEON_DOMAIN_VRAM) {
-            if (flags & RADEON_FLAG_GTT_WC)
-                provider = ws->cman_vram_gtt_wc;
-            else
-                provider = ws->cman_vram;
-        } else if (flags & RADEON_FLAG_GTT_WC) {
-            provider = ws->cman_gtt_wc;
-        } else {
-            provider = ws->cman_gtt;
-        }
+        if (domain == RADEON_DOMAIN_VRAM)
+            provider = ws->cman_vram[flags];
+        else
+            provider = ws->cman_gtt[flags];
    } else {
        provider = ws->kman;
    }
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -97,13 +97,11 @@ static boolean radeon_set_fd_access(struct radeon_drm_cs *applier,
    if (enable) {
        if (value) {
            *owner = applier;
-            printf("radeon: Acquired access to %s.\n", request_name);
            pipe_mutex_unlock(*mutex);
            return TRUE;
        }
    } else {
        *owner = NULL;
-        printf("radeon: Released access to %s.\n", request_name);
    }

    pipe_mutex_unlock(*mutex);
@@ -441,6 +439,7 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
 static void radeon_winsys_destroy(struct radeon_winsys *rws)
 {
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)rws;
+    int i;

    if (ws->thread) {
        ws->kill_thread = 1;
@@ -453,10 +452,10 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
    pipe_mutex_destroy(ws->cmask_owner_mutex);
    pipe_mutex_destroy(ws->cs_stack_lock);

-    ws->cman_vram->destroy(ws->cman_vram);
-    ws->cman_vram_gtt_wc->destroy(ws->cman_vram_gtt_wc);
-    ws->cman_gtt->destroy(ws->cman_gtt);
-    ws->cman_gtt_wc->destroy(ws->cman_gtt_wc);
+    for (i = 0; i < RADEON_NUM_CACHE_MANAGERS; i++) {
+        ws->cman_gtt[i]->destroy(ws->cman_gtt[i]);
+        ws->cman_vram[i]->destroy(ws->cman_vram[i]);
+    }
    ws->kman->destroy(ws->kman);
    if (ws->gen >= DRV_R600) {
        radeon_surface_manager_free(ws->surf_man);
@@ -643,6 +642,7 @@ PUBLIC struct radeon_winsys *
 radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
 {
    struct radeon_drm_winsys *ws;
+    int i;

    pipe_mutex_lock(fd_tab_mutex);
    if (!fd_tab) {
@@ -671,18 +671,16 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
    ws->kman = radeon_bomgr_create(ws);
    if (!ws->kman)
        goto fail;
-    ws->cman_vram = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0);
-    if (!ws->cman_vram)
-        goto fail;
-    ws->cman_vram_gtt_wc = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0);
-    if (!ws->cman_vram_gtt_wc)
-        goto fail;
-    ws->cman_gtt = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0);
-    if (!ws->cman_gtt)
-        goto fail;
-    ws->cman_gtt_wc = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0);
-    if (!ws->cman_gtt_wc)
-        goto fail;
+
+    for (i = 0; i < RADEON_NUM_CACHE_MANAGERS; i++) {
+        ws->cman_vram[i] = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0);
+        if (!ws->cman_vram[i])
+            goto fail;
+
+        ws->cman_gtt[i] = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0);
+        if (!ws->cman_gtt[i])
+            goto fail;
+    }

    if (ws->gen >= DRV_R600) {
        ws->surf_man = radeon_surface_manager_new(fd);
@@ -737,14 +735,12 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)

 fail:
    pipe_mutex_unlock(fd_tab_mutex);
-    if (ws->cman_gtt)
-        ws->cman_gtt->destroy(ws->cman_gtt);
-    if (ws->cman_gtt_wc)
-        ws->cman_gtt_wc->destroy(ws->cman_gtt_wc);
-    if (ws->cman_vram)
-        ws->cman_vram->destroy(ws->cman_vram);
-    if (ws->cman_vram_gtt_wc)
-        ws->cman_vram_gtt_wc->destroy(ws->cman_vram_gtt_wc);
+    for (i = 0; i < RADEON_NUM_CACHE_MANAGERS; i++) {
+        if (ws->cman_gtt[i])
+            ws->cman_gtt[i]->destroy(ws->cman_gtt[i]);
+        if (ws->cman_vram[i])
+            ws->cman_vram[i]->destroy(ws->cman_vram[i]);
+    }
    if (ws->kman)
        ws->kman->destroy(ws->kman);
    if (ws->surf_man)
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -41,6 +41,8 @@ enum radeon_generation {
    DRV_SI
 };

+#define RADEON_NUM_CACHE_MANAGERS 8
+
 struct radeon_drm_winsys {
    struct radeon_winsys base;
    struct pipe_reference reference;
@@ -58,10 +60,8 @@ struct radeon_drm_winsys {
    uint32_t accel_working2;

    struct pb_manager *kman;
-    struct pb_manager *cman_vram;
-    struct pb_manager *cman_vram_gtt_wc;
-    struct pb_manager *cman_gtt;
-    struct pb_manager *cman_gtt_wc;
+    struct pb_manager *cman_vram[RADEON_NUM_CACHE_MANAGERS];
+    struct pb_manager *cman_gtt[RADEON_NUM_CACHE_MANAGERS];
    struct radeon_surface_manager *surf_man;

    uint32_t num_cpus;      /* Number of CPUs. */
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -49,12 +49,12 @@ ast_array_specifier::print(void) const
 * loc and state to report the error.
 */
 static void
-update_max_array_access(ir_rvalue *ir, unsigned idx, YYLTYPE *loc,
+update_max_array_access(ir_rvalue *ir, int idx, YYLTYPE *loc,
                        struct _mesa_glsl_parse_state *state)
 {
   if (ir_dereference_variable *deref_var = ir->as_dereference_variable()) {
      ir_variable *var = deref_var->var;
-      if (idx > var->data.max_array_access) {
+      if (idx > (int)var->data.max_array_access) {
         var->data.max_array_access = idx;

         /* Check whether this access will, as a side effect, implicitly cause
@@ -88,7 +88,7 @@ update_max_array_access(ir_rvalue *ir, unsigned idx, YYLTYPE *loc,
            unsigned field_index =
               deref_record->record->type->field_index(deref_record->field);
            assert(field_index < interface_type->length);
-            if (idx > deref_var->var->max_ifc_array_access[field_index]) {
+            if (idx > (int)deref_var->var->max_ifc_array_access[field_index]) {
               deref_var->var->max_ifc_array_access[field_index] = idx;

               /* Check whether this access will, as a side effect, implicitly
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3760,7 +3760,7 @@ ast_declarator_list::hir(exec_list *instructions,
             earlier->data.how_declared == ir_var_declared_in_block) {
            _mesa_glsl_error(&loc, state,
                             "`%s' has already been redeclared using "
-                             "gl_PerVertex", var->name);
+                             "gl_PerVertex", earlier->name);
         }
         earlier->data.how_declared = ir_var_declared_normally;
      }
@@ -5674,17 +5674,21 @@ ast_interface_block::hir(exec_list *instructions,

         var->data.stream = this->layout.stream;

+         /* Examine var name here since var may get deleted in the next call */
+         bool var_is_gl_id = is_gl_identifier(var->name);
+
         if (redeclaring_per_vertex) {
            ir_variable *earlier =
               get_variable_being_redeclared(var, loc, state,
                                             true /* allow_all_redeclarations */);
-            if (!is_gl_identifier(var->name) || earlier == NULL) {
+            if (!var_is_gl_id || earlier == NULL) {
               _mesa_glsl_error(&loc, state,
                                "redeclaration of gl_PerVertex can only "
                                "include built-in variables");
            } else if (earlier->data.how_declared == ir_var_declared_normally) {
               _mesa_glsl_error(&loc, state,
-                                "`%s' has already been redeclared", var->name);
+                                "`%s' has already been redeclared",
+                                earlier->name);
            } else {
               earlier->data.how_declared = ir_var_declared_in_block;
               earlier->reinit_interface_type(block_type);
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -678,12 +678,17 @@ glsl_type::component_slots() const
 unsigned
 glsl_type::uniform_locations() const
 {
-   if (this->is_matrix())
-      return 1;
-
   unsigned size = 0;

   switch (this->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
+      return 1;
+
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      for (unsigned i = 0; i < this->length; i++)
@@ -692,13 +697,8 @@ glsl_type::uniform_locations() const
   case GLSL_TYPE_ARRAY:
      return this->length * this->fields.array->uniform_locations();
   default:
-      break;
+      return 0;
   }
-
-   /* The location count for many types match with component_slots() result,
-    * all expections should be handled above.
-    */
-   return component_slots();
 }

 bool
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -279,6 +279,9 @@ struct glsl_type {
   /**
    * Calculate the number of unique values from glGetUniformLocation for the
    * elements of the type.
+    *
+    * This is used to allocate slots in the UniformRemapTable; the number of
+    * locations may not match the storage space actually used by the driver.
    */
   unsigned uniform_locations() const;

--- a/src/glsl/opt_array_splitting.cpp
+++ b/src/glsl/opt_array_splitting.cpp
@@ -295,7 +295,7 @@ ir_array_splitting_visitor::split_deref(ir_dereference **deref)
   ir_constant *constant = deref_array->array_index->as_constant();
   assert(constant);

-   if (constant->value.i[0] < (int)entry->size) {
+   if (constant->value.i[0] >= 0 && constant->value.i[0] < (int)entry->size) {
      *deref = new(entry->mem_ctx)
 	 ir_dereference_variable(entry->components[constant->value.i[0]]);
   } else {
--- a/src/glx/xfont.c
+++ b/src/glx/xfont.c
@@ -221,6 +221,7 @@ DRI_glXUseXFont(struct glx_context *CC, Font font, int first, int count, int lis
   XGCValues values;
   unsigned long valuemask;
   XFontStruct *fs;
+   __GLXDRIdrawable *glxdraw;

   GLint swapbytes, lsbfirst, rowlength;
   GLint skiprows, skippixels, alignment;
@@ -233,6 +234,10 @@ DRI_glXUseXFont(struct glx_context *CC, Font font, int first, int count, int lis
   dpy = CC->currentDpy;
   win = CC->currentDrawable;

+   glxdraw = GetGLXDRIDrawable(CC->currentDpy, CC->currentDrawable);
+   if (glxdraw)
+      win = glxdraw->xDrawable;
+
   fs = XQueryFont(dpy, font);
   if (!fs) {
      __glXSetError(CC, GL_INVALID_VALUE);
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2312,9 +2312,13 @@ enum brw_wm_barycentric_interp_mode {
 #define HSW_MOCS_WB_LLC_WB_ELLC         (2 << 1)
 #define HSW_MOCS_UC_LLC_WB_ELLC         (3 << 1)

-/* Broadwell: write-back or write-through; always use all the caches. */
-#define BDW_MOCS_WB 0x78
-#define BDW_MOCS_WT 0x58
+/* Broadwell: these defines always use all available caches (L3, LLC, eLLC),
+ * and let you force write-back (WB) or write-through (WT) caching, or leave
+ * it up to the page table entry (PTE) specified by the kernel.
+ */
+#define BDW_MOCS_WB  0x78
+#define BDW_MOCS_WT  0x58
+#define BDW_MOCS_PTE 0x18

 #include "intel_chipset.h"

--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -377,7 +377,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
             horizontal_alignment(mt) |
             surface_tiling_mode(tiling);

-   surf[1] = SET_FIELD(BDW_MOCS_WT, GEN8_SURFACE_MOCS) | mt->qpitch >> 2;
+   surf[1] = SET_FIELD(BDW_MOCS_PTE, GEN8_SURFACE_MOCS) | mt->qpitch >> 2;

   surf[2] = SET_FIELD(width - 1, GEN7_SURFACE_WIDTH) |
             SET_FIELD(height - 1, GEN7_SURFACE_HEIGHT);
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -87,6 +87,7 @@ can_do_pipelined_register_writes(struct brw_context *brw)

   /* Check whether the value got written. */
   drm_intel_bo_map(brw->batch.workaround_bo, false);
+   data = brw->batch.workaround_bo->virtual;
   bool success = data[offset] == expected_value;
   drm_intel_bo_unmap(brw->batch.workaround_bo);

@@ -145,6 +146,7 @@ can_write_oacontrol(struct brw_context *brw)

   /* Check whether the value got written. */
   drm_intel_bo_map(brw->batch.workaround_bo, false);
+   data = brw->batch.workaround_bo->virtual;
   bool success = data[offset] == expected_value;
   drm_intel_bo_unmap(brw->batch.workaround_bo);

--- a/src/mesa/drivers/dri/nouveau/nouveau_context.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_context.c
@@ -189,6 +189,9 @@ nouveau_context_init(struct gl_context *ctx, gl_api api,
 	ctx->Extensions.NV_texture_env_combine4 = true;
 	ctx->Const.MaxColorAttachments = 1;

+	/* This effectively disables 3D textures */
+	ctx->Const.Max3DTextureLevels = 1;
+
 	return GL_TRUE;
 }

--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -896,7 +896,21 @@ _mesa_generic_nop(void)


 /**
- * Allocate and initialize a new dispatch table.
+ * Special no-op glFlush, see below.
+ */
+#if defined(_WIN32)
+static void GLAPIENTRY
+nop_glFlush(void)
+{
+   /* don't record an error like we do in _mesa_generic_nop() */
+}
+#endif
+
+
+/**
+ * Allocate and initialize a new dispatch table.  All the dispatch
+ * function pointers will point at the _mesa_generic_nop() function
+ * which raises GL_INVALID_OPERATION.
 */
 struct _glapi_table *
 _mesa_alloc_dispatch_table()
@@ -916,6 +930,26 @@ _mesa_alloc_dispatch_table()
      for (i = 0; i < numEntries; i++) {
         entry[i] = (_glapi_proc) _mesa_generic_nop;
      }
+
+#if defined(_WIN32)
+      /* This is a special case for Windows in the event that
+       * wglGetProcAddress is called between glBegin/End().
+       *
+       * The MS opengl32.dll library apparently calls glFlush from
+       * wglGetProcAddress().  If we're inside glBegin/End(), glFlush
+       * will dispatch to _mesa_generic_nop() and we'll generate a
+       * GL_INVALID_OPERATION error.
+       *
+       * The specific case which hits this is piglit's primitive-restart
+       * test which calls glPrimitiveRestartNV() inside glBegin/End.  The
+       * first time we call glPrimitiveRestartNV() Piglit's API dispatch
+       * code will try to resolve the function by calling wglGetProcAddress.
+       * This raises GL_INVALID_OPERATION and an assert(glGetError()==0)
+       * will fail causing the test to fail.  By suppressing the error, the
+       * assertion passes and the test continues.
+       */
+      SET_Flush(table, nop_glFlush);
+#endif
   }
   return table;
 }
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -144,10 +144,10 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
 /* This function/macro is sensitive to precision.  Test very carefully
 * if you change it!
 */
-#define UNCLAMPED_FLOAT_TO_UBYTE(UB, F)					\
+#define UNCLAMPED_FLOAT_TO_UBYTE(UB, FLT)				\
        do {								\
           fi_type __tmp;						\
-           __tmp.f = (F);						\
+           __tmp.f = (FLT);						\
           if (__tmp.i < 0)						\
              UB = (GLubyte) 0;						\
           else if (__tmp.i >= IEEE_ONE)				\
@@ -157,10 +157,10 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
              UB = (GLubyte) __tmp.i;					\
           }								\
        } while (0)
-#define CLAMPED_FLOAT_TO_UBYTE(UB, F)					\
+#define CLAMPED_FLOAT_TO_UBYTE(UB, FLT)					\
        do {								\
           fi_type __tmp;						\
-           __tmp.f = (F) * (255.0F/256.0F) + 32768.0F;			\
+           __tmp.f = (FLT) * (255.0F/256.0F) + 32768.0F;		\
           UB = (GLubyte) __tmp.i;					\
        } while (0)
 #else
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -78,8 +78,8 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
              struct gl_texture_image *texImage)
 {
   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
+   GLint height = texImage->Height;
+   GLint depth = texImage->Depth;
   GLint img, row;
   GLfloat *depthRow = malloc(width * sizeof(GLfloat));

@@ -88,6 +88,11 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
      return;
   }

+   if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY) {
+      depth = height;
+      height = 1;
+   }
+
   for (img = 0; img < depth; img++) {
      GLubyte *srcMap;
      GLint srcRowStride;
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -226,6 +226,13 @@ validate_uniform_parameters(struct gl_context *ctx,
      return NULL;
   }

+   /* Check that the given location is in bounds of uniform remap table. */
+   if (location >= (GLint) shProg->NumUniformRemapTable) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(location=%d)",
+                  caller, location);
+      return NULL;
+   }
+
   /* Page 82 (page 96 of the PDF) of the OpenGL 2.1 spec says:
    *
    *     "If any of the following conditions occur, an INVALID_OPERATION
@@ -239,19 +246,12 @@ validate_uniform_parameters(struct gl_context *ctx,
    *         - if count is greater than one, and the uniform declared in the
    *           shader is not an array variable,
    */
-   if (location < -1) {
+   if (location < -1 || !shProg->UniformRemapTable[location]) {
      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(location=%d)",
                  caller, location);
      return NULL;
   }

-   /* Check that the given location is in bounds of uniform remap table. */
-   if (location >= (GLint) shProg->NumUniformRemapTable) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(location=%d)",
-		  caller, location);
-      return NULL;
-   }
-
   /* If the driver storage pointer in remap table is -1, we ignore silently.
    *
    * GL_ARB_explicit_uniform_location spec says:
--- a/src/mesa/state_tracker/st_cb_eglimage.c
+++ b/src/mesa/state_tracker/st_cb_eglimage.c
@@ -96,6 +96,7 @@ st_bind_surface(struct gl_context *ctx, GLenum target,
                struct gl_texture_image *texImage,
                struct pipe_surface *ps)
 {
+   struct st_context *st = st_context(ctx);
   struct st_texture_object *stObj;
   struct st_texture_image *stImage;
   GLenum internalFormat;
@@ -124,7 +125,7 @@ st_bind_surface(struct gl_context *ctx, GLenum target,

   /* FIXME create a non-default sampler view from the pipe_surface? */
   pipe_resource_reference(&stObj->pt, ps->texture);
-   st_texture_release_all_sampler_views(stObj);
+   st_texture_release_all_sampler_views(st, stObj);
   pipe_resource_reference(&stImage->pt, stObj->pt);

   stObj->width0 = ps->width;
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -152,10 +152,11 @@ static void
 st_DeleteTextureObject(struct gl_context *ctx,
                       struct gl_texture_object *texObj)
 {
+   struct st_context *st = st_context(ctx);
   struct st_texture_object *stObj = st_texture_object(texObj);

   pipe_resource_reference(&stObj->pt, NULL);
-   st_texture_release_all_sampler_views(stObj);
+   st_texture_release_all_sampler_views(st, stObj);
   st_texture_free_sampler_views(stObj);
   _mesa_delete_texture_object(ctx, texObj);
 }
@@ -512,7 +513,7 @@ st_AllocTextureImageBuffer(struct gl_context *ctx,
   /* The parent texture object does not have space for this image */

   pipe_resource_reference(&stObj->pt, NULL);
-   st_texture_release_all_sampler_views(stObj);
+   st_texture_release_all_sampler_views(st, stObj);

   if (!guess_and_alloc_texture(st, stObj, stImage)) {
      /* Probably out of memory.
@@ -1564,13 +1565,13 @@ st_finalize_texture(struct gl_context *ctx,

      if (!st_obj) {
         pipe_resource_reference(&stObj->pt, NULL);
-         st_texture_release_all_sampler_views(stObj);
+         st_texture_release_all_sampler_views(st, stObj);
         return GL_TRUE;
      }

      if (st_obj->buffer != stObj->pt) {
         pipe_resource_reference(&stObj->pt, st_obj->buffer);
-         st_texture_release_all_sampler_views(stObj);
+         st_texture_release_all_sampler_views(st, stObj);
         stObj->width0 = stObj->pt->width0 / _mesa_get_format_bytes(tObj->_BufferObjectFormat);
         stObj->height0 = 1;
         stObj->depth0 = 1;
@@ -1591,7 +1592,7 @@ st_finalize_texture(struct gl_context *ctx,
       firstImage->pt != stObj->pt &&
       (!stObj->pt || firstImage->pt->last_level >= stObj->pt->last_level)) {
      pipe_resource_reference(&stObj->pt, firstImage->pt);
-      st_texture_release_all_sampler_views(stObj);
+      st_texture_release_all_sampler_views(st, stObj);
   }

   /* If this texture comes from a window system, there is nothing else to do. */
@@ -1639,7 +1640,7 @@ st_finalize_texture(struct gl_context *ctx,
          * gallium texture now.  We'll make a new one below.
          */
         pipe_resource_reference(&stObj->pt, NULL);
-         st_texture_release_all_sampler_views(stObj);
+         st_texture_release_all_sampler_views(st, stObj);
         st->dirty.st |= ST_NEW_FRAMEBUFFER;
      }
   }
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -237,8 +237,7 @@ void st_init_limits(struct pipe_screen *screen,

      if (options->EmitNoLoops)
         options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536);
-      else
-         options->MaxUnrollIterations = 255; /* SM3 limit */
+
      options->LowerClipDistance = true;
   }

--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -124,7 +124,7 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,

      /* release the old tex (will likely be freed too) */
      pipe_resource_reference(&oldTex, NULL);
-      st_texture_release_all_sampler_views(stObj);
+      st_texture_release_all_sampler_views(st, stObj);
   }
   else {
      /* Make sure that the base texture image data is present in the
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4818,15 +4818,19 @@ emit_wpos(struct st_context *st,
 * saturating the value to [0,1] does the job.
 */
 static void
-emit_face_var(struct st_translate *t)
+emit_face_var(struct gl_context *ctx, struct st_translate *t)
 {
   struct ureg_program *ureg = t->ureg;
   struct ureg_dst face_temp = ureg_DECL_temporary(ureg);
   struct ureg_src face_input = t->inputs[t->inputMapping[VARYING_SLOT_FACE]];

-   /* MOV_SAT face_temp, input[face] */
-   face_temp = ureg_saturate(face_temp);
-   ureg_MOV(ureg, face_temp, face_input);
+   if (ctx->Const.NativeIntegers) {
+      ureg_FSGE(ureg, face_temp, face_input, ureg_imm1f(ureg, 0));
+   }
+   else {
+      /* MOV_SAT face_temp, input[face] */
+      ureg_MOV(ureg, ureg_saturate(face_temp), face_input);
+   }

   /* Use face_temp as face input from here on: */
   t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
@@ -4946,7 +4950,7 @@ st_translate_program(
      }

      if (proginfo->InputsRead & VARYING_BIT_FACE)
-         emit_face_var(t);
+         emit_face_var(ctx, t);

      /*
       * Declare output attributes.
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -507,12 +507,14 @@ st_texture_release_sampler_view(struct st_context *st,
 }

 void
-st_texture_release_all_sampler_views(struct st_texture_object *stObj)
+st_texture_release_all_sampler_views(struct st_context *st,
+                                     struct st_texture_object *stObj)
 {
   GLuint i;

+   /* XXX This should use sampler_views[i]->pipe, not st->pipe */
   for (i = 0; i < stObj->num_sampler_views; ++i)
-      pipe_sampler_view_reference(&stObj->sampler_views[i], NULL);
+      pipe_sampler_view_release(st->pipe, &stObj->sampler_views[i]);
 }


--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -255,7 +255,8 @@ st_texture_release_sampler_view(struct st_context *st,
                                struct st_texture_object *stObj);

 extern void
-st_texture_release_all_sampler_views(struct st_texture_object *stObj);
+st_texture_release_all_sampler_views(struct st_context *st,
+                                     struct st_texture_object *stObj);

 void
 st_texture_free_sampler_views(struct st_texture_object *stObj);
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -139,7 +139,7 @@ st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
                              texFormat);

   pipe_resource_reference(&stObj->pt, res);
-   st_texture_release_all_sampler_views(stObj);
+   st_texture_release_all_sampler_views(st, stObj);
   pipe_resource_reference(&stImage->pt, res);

   u_sampler_view_default_template(&templ, res, res->format);
@@ -172,7 +172,7 @@ st_vdpau_unmap_surface(struct gl_context *ctx, GLenum target, GLenum access,
   struct st_texture_image *stImage = st_texture_image(texImage);

   pipe_resource_reference(&stObj->pt, NULL);
-   st_texture_release_all_sampler_views(stObj);
+   st_texture_release_all_sampler_views(st, stObj);
   pipe_resource_reference(&stImage->pt, NULL);

   _mesa_dirty_texobj(ctx, texObj);
@@ -1 +1 @@
-10.3.1
+10.3.3