docs: add release notes for 11.0.8

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Update version to 11.0.8
2015-12-21 09:22:06 +00:00 · 2015-12-21 09:16:42 +00:00 · 2015-12-21 09:13:47 +00:00 · 2015-12-21 09:13:03 +00:00 · 2015-12-21 09:09:58 +00:00 · 2015-12-21 09:07:58 +00:00
52 changed files with 746 additions and 216 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-11.0.7
+11.0.8
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -3,3 +3,6 @@

 # Somewhat of a mixed feature/bugfix patch, causing some 200 piglit regressions
 2b676570960277d47477822ffeccc672613f9142 gallium/swrast: fix front buffer blitting. (v2)
+
+# causes regression in xwayland, kde/plasma, mpv, steam ... fdo#92759
+839793680f99b8387bee9489733d5071c10f3ace i965: Use MESA_FORMAT_B8G8R8X8_SRGB for RGB visuals
--- a/configure.ac
+++ b/configure.ac
@@ -97,7 +97,7 @@ AC_PROG_CXX
 AM_PROG_CC_C_O
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
-AC_CHECK_PROGS([PYTHON2], [python2 python])
+AC_CHECK_PROGS([PYTHON2], [python2.7 python2 python])
 AC_PROG_SED
 AC_PROG_MKDIR_P

@@ -375,10 +375,11 @@ save_CFLAGS="$CFLAGS"
 CFLAGS="$SSE41_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #include <smmintrin.h>
+int param;
 int main () {
-    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
    c = _mm_max_epu32(a, b);
-    return 0;
+    return _mm_cvtsi128_si32(c);
 }]])], SSE41_SUPPORTED=1)
 CFLAGS="$save_CFLAGS"
 if test "x$SSE41_SUPPORTED" = x1; then
@@ -1529,7 +1530,15 @@ AC_ARG_WITH([clang-libdir],
   [CLANG_LIBDIR=''])

 PKG_CHECK_EXISTS([libclc], [have_libclc=yes], [have_libclc=no])
-AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;ELF_LIB=-lelf])
+PKG_CHECK_MODULES([LIBELF], [libelf], [have_libelf=yes], [have_libelf=no])
+
+if test "x$have_libelf" = xno; then
+   LIBELF_LIBS=''
+   LIBELF_CFLAGS=''
+   AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;LIBELF_LIBS=-lelf], [have_libelf=no])
+   AC_SUBST([LIBELF_LIBS])
+   AC_SUBST([LIBELF_CFLAGS])
+fi

 if test "x$enable_opencl" = xyes; then
    if test -z "$with_gallium_drivers"; then
@@ -2127,8 +2136,6 @@ if test "x$USE_VC4_SIMULATOR" = xyes -a "x$HAVE_GALLIUM_ILO" = xyes; then
    AC_MSG_ERROR([VC4 simulator on x86 replaces i965 driver build, so ilo must be disabled.])
 fi

-AC_SUBST([ELF_LIB])
-
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
--- a/docs/relnotes/11.0.7.html
+++ b/docs/relnotes/11.0.7.html
@@ -31,7 +31,8 @@ because compatibility contexts are not supported.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD
+07c27004ff68b288097d17b2faa7bdf15ec73c96b7e6c9835266e544adf0a62f  mesa-11.0.7.tar.gz
+e7e90a332ede6c8fd08eff90786a3fd1605a4e62ebf3a9b514047838194538cb  mesa-11.0.7.tar.xz
 </pre>


--- a/docs/relnotes/11.0.8.html
+++ b/docs/relnotes/11.0.8.html
@@ -0,0 +1,199 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.8 Release Notes / December 9, 2015</h1>
+
+<p>
+Mesa 11.0.8 is a bug fix release which fixes bugs found since the 11.0.7 release.
+</p>
+<p>
+Mesa 11.0.8 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Boyuan Zhang (1):</p>
+<ul>
+  <li>radeon/uvd: uv pitch separation for stoney</li>
+</ul>
+
+<p>Dave Airlie (9):</p>
+<ul>
+  <li>r600: do SQ flush ES ring rolling workaround</li>
+  <li>r600: SMX returns CONTEXT_DONE early workaround</li>
+  <li>r600/shader: split address get out to a function.</li>
+  <li>r600/shader: add utility functions to do single slot arithmatic</li>
+  <li>r600g: fix geom shader input indirect indexing.</li>
+  <li>r600: handle geometry dynamic input array index</li>
+  <li>radeonsi: handle doubles in lds load path.</li>
+  <li>mesa/varray: set double arrays to non-normalised.</li>
+  <li>mesa/shader: return correct attribute location for double matrix arrays</li>
+</ul>
+
+<p>Emil Velikov (8):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.7</li>
+  <li>cherry-ignore: don't pick a specific i965 formats patch</li>
+  <li>Revert "i965/nir: Remove unused indirect handling"</li>
+  <li>Revert "i965/state: Get rid of dword_pitch arguments to buffer functions"</li>
+  <li>Revert "i965/vec4: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/fs: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge"</li>
+  <li>Update version to 11.0.8</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965: Resolve color and flush for all active shader images in intel_update_state().</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
+</ul>
+
+<p>Ilia Mirkin (17):</p>
+<ul>
+  <li>freedreno/a4xx: support lod_bias</li>
+  <li>freedreno/a4xx: fix 5_5_5_1 texture sampler format</li>
+  <li>freedreno/a4xx: point regid to "red" even for alpha-only rb formats</li>
+  <li>nvc0/ir: fold postfactor into immediate</li>
+  <li>nv50/ir: deal with loops with no breaks</li>
+  <li>nv50/ir: the mad source might not have a defining instruction</li>
+  <li>nv50/ir: fix instruction permutation logic</li>
+  <li>nv50/ir: don't forget to mark flagsDef on cvt in txb lowering</li>
+  <li>nv50/ir: fix DCE to not generate 96-bit loads</li>
+  <li>nv50/ir: avoid looking at uninitialized srcMods entries</li>
+  <li>gk110/ir: fix imul hi emission with limm arg</li>
+  <li>gk104/ir: sampler doesn't matter for txf</li>
+  <li>gk110/ir: fix imad sat/hi flag emission for immediate args</li>
+  <li>nv50/ir: fix cutoff for using r63 vs r127 when replacing zero</li>
+  <li>nv50/ir: can't have predication and immediates</li>
+  <li>glsl: assign varying locations to tess shaders when doing SSO</li>
+  <li>ttn: add TEX2 support</li>
+</ul>
+
+<p>Jason Ekstrand (5):</p>
+<ul>
+  <li>i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge</li>
+  <li>i965/fs: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/vec4: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/state: Get rid of dword_pitch arguments to buffer functions</li>
+  <li>i965/nir: Remove unused indirect handling</li>
+</ul>
+
+<p>Jonathan Gray (2):</p>
+<ul>
+  <li>configure.ac: use pkg-config for libelf</li>
+  <li>configure: check for python2.7 for PYTHON2</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Fix fragment shader struct inputs.</li>
+  <li>i965: Fix scalar vertex shader struct outputs.</li>
+</ul>
+
+<p>Marek Olšák (8):</p>
+<ul>
+  <li>radeonsi: fix occlusion queries on Fiji</li>
+  <li>radeonsi: fix a hang due to uninitialized border color registers</li>
+  <li>radeonsi: fix Fiji for LLVM &lt;= 3.7</li>
+  <li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
+  <li>radeonsi: apply the streamout workaround to Fiji as well</li>
+  <li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
+  <li>tgsi/scan: add flag colors_written</li>
+  <li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>glsl: Allow binding of image variables with 420pack.</li>
+</ul>
+
+<p>Neil Roberts (2):</p>
+<ul>
+  <li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
+  <li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>configura.ac: fix test for SSE4.1 assembler support</li>
+</ul>
+
+<p>Patrick Rudolph (2):</p>
+<ul>
+  <li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
+  <li>gallium/util: return correct number of bound vertex buffers</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>i965: use _Shader to get fragment program when updating surface state</li>
+</ul>
+
+<p>Tom Stellard (2):</p>
+<ul>
+  <li>radeonsi: Rename si_shader::ls_rsrc{1,2} to si_shader::rsrc{1,2}</li>
+  <li>radeonsi/compute: Use the compiler's COMPUTE_PGM_RSRC* register values</li>
+</ul>
+
+
+</div>
+</body>
+</html>
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1087,6 +1087,11 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
      op = nir_texop_tex;
      num_srcs = 1;
      break;
+   case TGSI_OPCODE_TEX2:
+      op = nir_texop_tex;
+      num_srcs = 1;
+      samp = 2;
+      break;
   case TGSI_OPCODE_TXP:
      op = nir_texop_tex;
      num_srcs = 2;
@@ -1242,10 +1247,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
   }

   if (instr->is_shadow) {
-      if (instr->coord_components < 3)
-         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z));
-      else
+      if (instr->coord_components == 4)
+         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[1], X));
+      else if (instr->coord_components == 3)
         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
+      else
+         instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z));

      instr->src[src_number].src_type = nir_tex_src_comparitor;
      src_number++;
@@ -1651,6 +1658,7 @@ ttn_emit_instruction(struct ttn_compile *c)
   case TGSI_OPCODE_TXL:
   case TGSI_OPCODE_TXB:
   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TEX2:
   case TGSI_OPCODE_TXL2:
   case TGSI_OPCODE_TXB2:
   case TGSI_OPCODE_TXQ_LZ:
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -258,6 +258,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                  info->output_semantic_index[reg] = (ubyte) semIndex;
                  info->num_outputs++;

+                  if (semName == TGSI_SEMANTIC_COLOR)
+                     info->colors_written |= 1 << semIndex;
+
                  if (procType == TGSI_PROCESSOR_VERTEX ||
                      procType == TGSI_PROCESSOR_GEOMETRY ||
                      procType == TGSI_PROCESSOR_TESS_CTRL ||
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -76,6 +76,7 @@ struct tgsi_shader_info

   uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */

+   ubyte colors_written;
   boolean reads_position; /**< does fragment shader read position? */
   boolean reads_z; /**< does fragment shader read depth? */
   boolean writes_z;  /**< does fragment shader write Z value? */
--- a/src/gallium/auxiliary/util/u_helpers.c
+++ b/src/gallium/auxiliary/util/u_helpers.c
@@ -81,7 +81,13 @@ void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
                                   const struct pipe_vertex_buffer *src,
                                   unsigned start_slot, unsigned count)
 {
-   uint32_t enabled_buffers = (1ull << *dst_count) - 1;
+   unsigned i;
+   uint32_t enabled_buffers = 0;
+
+   for (i = 0; i < *dst_count; i++) {
+      if (dst[i].buffer || dst[i].user_buffer)
+         enabled_buffers |= (1ull << i);
+   }

   util_set_vertex_buffers_mask(dst, &enabled_buffers, src, start_slot,
                                count);
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -153,7 +153,7 @@ enum a4xx_vtx_fmt {

 enum a4xx_tex_fmt {
 	TFMT4_5_6_5_UNORM = 11,
-	TFMT4_5_5_5_1_UNORM = 10,
+	TFMT4_5_5_5_1_UNORM = 9,
 	TFMT4_4_4_4_4_UNORM = 8,
 	TFMT4_X8Z24_UNORM = 71,
 	TFMT4_10_10_10_2_UNORM = 33,
@@ -2718,6 +2718,12 @@ static inline uint32_t A4XX_TEX_SAMP_0_ANISO(enum a4xx_tex_aniso val)
 {
 	return ((val) << A4XX_TEX_SAMP_0_ANISO__SHIFT) & A4XX_TEX_SAMP_0_ANISO__MASK;
 }
+#define A4XX_TEX_SAMP_0_LOD_BIAS__MASK				0xfff80000
+#define A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT				19
+static inline uint32_t A4XX_TEX_SAMP_0_LOD_BIAS(float val)
+{
+	return ((((int32_t)(val * 256.0))) << A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT) & A4XX_TEX_SAMP_0_LOD_BIAS__MASK;
+}

 #define REG_A4XX_TEX_SAMP_1					0x00000001
 #define A4XX_TEX_SAMP_1_COMPARE_FUNC__MASK			0x0000000e
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -250,14 +250,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 		}
 	}

-	/* adjust regids for alpha output formats. there is no alpha render
-	 * format, so it's just treated like red
-	 */
-	for (i = 0; i < nr; i++)
-		if (util_format_is_alpha(pipe_surface_format(bufs[i])))
-			color_regid[i] += 3;
-
-
 	/* TODO get these dynamically: */
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -111,6 +111,7 @@ fd4_sampler_state_create(struct pipe_context *pctx,
 		COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS);

 	if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+		so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias);
 		so->texsamp1 |=
 			A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) |
 			A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)

   if (b->prev)
      b->prev->next = b;
-   if (a->prev)
+   if (a->next)
      a->next->prev = a;
 }

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -575,8 +575,8 @@ CodeEmitterGK110::emitIMUL(const Instruction *i)
   if (isLIMM(i->src(1), TYPE_S32)) {
      emitForm_L(i, 0x280, 2, Modifier(0));

-      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
-
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 24;
      if (i->sType == TYPE_S32)
         code[1] |= 3 << 25;
   } else {
@@ -695,14 +695,9 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
   if (i->sType == TYPE_S32)
      code[1] |= (1 << 19) | (1 << 24);

-   if (code[0] & 0x1) {
-      assert(!i->subOp);
-      SAT_(39);
-   } else {
-      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-         code[1] |= 1 << 25;
-      SAT_(35);
-   }
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[1] |= 1 << 25;
+   SAT_(35);
 }

 void
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2870,6 +2870,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
      }
      setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+
+      // If the loop never breaks (e.g. only has RET's inside), then there
+      // will be no way to get to the break bb. However BGNLOOP will have
+      // already made a PREBREAK to it, so it must be in the CFG.
+      if (getBB()->cfg.incidentCount() == 0)
+         loopBB->cfg.attach(&getBB()->cfg, Graph::Edge::TREE);
   }
      break;
   case TGSI_OPCODE_BRK:
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -202,7 +202,8 @@ NV50LegalizePostRA::visit(Function *fn)
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
-   if (prog->maxGPR < 63)
+   // GPR units on nv50 are in half-regs
+   if (prog->maxGPR < 126)
      r63->reg.data.id = 63;
   else
      r63->reg.data.id = 127;
@@ -831,7 +832,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
-   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -686,7 +686,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
-      } else if (i->tex.r == i->tex.s) {
+      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -842,6 +842,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
+      } else
+      if (i->postFactor && i->sType == TYPE_F32) {
+         /* Can't emit a postfactor with an immediate, have to fold it in */
+         i->setSrc(s, new_ImmediateValue(
+                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+         i->postFactor = 0;
      }
      break;
   case OP_MAD:
@@ -2606,8 +2612,11 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
             i->getSrc(0)->reg.data.id >= 64)
            break;

+         if (i->getPredicate())
+            break;
+
         def = i->getSrc(1)->getInsn();
-         if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
            vtmp = i->getSrc(1);
            i->setSrc(1, def->getSrc(0));

@@ -2909,6 +2918,16 @@ DeadCodeElim::visit(BasicBlock *bb)
   return true;
 }

+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
 void
 DeadCodeElim::checkSplitLoad(Instruction *ld1)
 {
@@ -2929,6 +2948,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
+
+   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
@@ -2942,16 +2963,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
         break;
      }
   }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
      }
   }

+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -454,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -426,7 +426,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
         return false;
      }
   }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
      return false;
   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -960,6 +960,9 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);
   unsigned i;

+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+   nv50->dirty |= NV50_NEW_ARRAYS;
+
   util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                 start_slot, count);

@@ -983,10 +986,6 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
         nv50->vbo_constant &= ~(1 << dst_index);
      }
   }
-
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-
-   nv50->dirty |= NV50_NEW_ARRAYS;
 }

 static void
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -417,6 +417,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
   if (screen->pm.prog) {
      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
      nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
   }

   nouveau_bo_ref(NULL, &screen->text);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -998,6 +998,9 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    unsigned i;

+    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+    nvc0->dirty |= NVC0_NEW_ARRAYS;
+
    util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb,
                                  start_slot, count);

@@ -1021,9 +1024,6 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
          nvc0->constant_vbos &= ~(1 << dst_index);
       }
    }
-
-    nvc0->dirty |= NVC0_NEW_ARRAYS;
-    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
 }

 static void
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1527,12 +1527,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 				     S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) |
 				     S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else {
 		r600_write_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }

--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -57,7 +57,7 @@

 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
-#define R600_MAX_DRAW_CS_DWORDS		47
+#define R600_MAX_DRAW_CS_DWORDS		52
 #define R600_TRACE_CS_DWORDS		7

 #define R600_MAX_USER_CONST_BUFFERS 13
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -598,6 +598,106 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back
 	return 0;
 }

+/* execute a single slot ALU calculation */
+static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = op;
+			alu.src[0].sel = src0_sel;
+			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[0].value = src0_chan_val;
+			else
+				alu.src[0].chan = src0_chan_val;
+			alu.src[1].sel = src1_sel;
+			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[1].value = src1_chan_val;
+			else
+				alu.src[1].chan = src1_chan_val;
+			alu.dst.sel = dst_sel;
+			alu.dst.chan = i;
+			alu.dst.write = i == dst_chan;
+			alu.last = (i == 3);
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		return 0;
+	}
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+/* execute a single slot ALU calculation */
+static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val,
+			  int src2_sel, unsigned src2_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	/* validate this for other ops */
+	assert(op == ALU_OP3_MULADD_UINT24);
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.src[2].sel = src2_sel;
+	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[2].value = src2_chan_val;
+	else
+		alu.src[2].chan = src2_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.is_op3 = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
+{
+	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
+}
+
 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
 {
 	int i;
@@ -1129,6 +1229,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	unsigned vtx_id = src->Dimension.Index;
 	int offset_reg = vtx_id / 3;
 	int offset_chan = vtx_id % 3;
+	int t2 = 0;

 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
@@ -1136,13 +1237,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	if (offset_reg == 0 && offset_chan == 2)
 		offset_chan = 3;

+	if (src->Dimension.Indirect || src->Register.Indirect)
+		t2 = r600_get_temp(ctx);
+
 	if (src->Dimension.Indirect) {
 		int treg[3];
-		int t2;
 		struct r600_bytecode_alu alu;
 		int r, i;
-
-		/* you have got to be shitting me -
+		unsigned addr_reg;
+		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
+		if (src->DimIndirect.Index > 0) {
+			r = single_alu_op2(ctx, ALU_OP1_MOV,
+					   ctx->bc->ar_reg, 0,
+					   addr_reg, 0,
+					   0, 0);
+			if (r)
+				return r;
+		}
+		/*
 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
 		   at least this is what fglrx seems to do. */
 		for (i = 0; i < 3; i++) {
@@ -1150,7 +1262,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		}
 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

-		t2 = r600_get_temp(ctx);
 		for (i = 0; i < 3; i++) {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
@@ -1175,8 +1286,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		if (r)
 			return r;
 		offset_reg = t2;
+		offset_chan = 0;
 	}

+	if (src->Register.Indirect) {
+		int addr_reg;
+		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
+
+		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
+
+		/* pull the value from index_reg */
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   t2, 1,
+				   addr_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, first);
+		if (r)
+			return r;
+		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+				   t2, 0,
+				   t2, 1,
+				   V_SQ_ALU_SRC_LITERAL, 4,
+				   offset_reg, offset_chan);
+		if (r)
+			return r;
+		offset_reg = t2;
+		offset_chan = 0;
+		index = src->Register.Index - first;
+	}

 	memset(&vtx, 0, sizeof(vtx));
 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
@@ -1222,6 +1358,7 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)

 			fetch_gs_input(ctx, src, treg);
 			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
 		}
 	}
 	return 0;
@@ -1972,7 +2109,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,

 	ctx.nliterals = 0;
 	ctx.literals = NULL;
-	shader->fs_write_all = FALSE;
+
+	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+			       ctx.info.colors_written == 1;

 	if (shader->vs_as_gs_a)
 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
@@ -2003,10 +2142,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		case TGSI_TOKEN_TYPE_PROPERTY:
 			property = &ctx.parse.FullToken.FullProperty;
 			switch (property->Property.PropertyName) {
-			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-				if (property->u[0].Data == 1)
-					shader->fs_write_all = TRUE;
-				break;
 			case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
 				if (property->u[0].Data == 1)
 					shader->vs_position_window_space = TRUE;
@@ -6675,7 +6810,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_alu alu;
 	int r;
 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
-	unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;
+	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

 	assert(inst->Dst[0].Register.Index < 3);
 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1691,6 +1691,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					(info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
 	}

+	/* SMX returns CONTEXT_DONE too early workaround */
+	if (rctx->b.family == CHIP_R600 ||
+	    rctx->b.family == CHIP_RV610 ||
+	    rctx->b.family == CHIP_RV630 ||
+	    rctx->b.family == CHIP_RV635) {
+		/* if we have gs shader or streamout
+		   we need to do a wait idle after every draw */
+		if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+			r600_write_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
+		}
+	}
+
+	/* ES ring rolling over at EOP - workaround */
+	if (rctx->b.chip_class == R600) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
+	}
+
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -130,6 +130,7 @@
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
 #define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
 #define EVENT_TYPE_VGT_FLUSH                   0x24
+#define EVENT_TYPE_SQ_NON_EVENT                0x26
 #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
--- a/src/gallium/drivers/radeon/Makefile.am
+++ b/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM

 AM_CFLAGS += \
-	$(LLVM_CFLAGS)
+	$(LLVM_CFLAGS) \
+	$(LIBELF_CFLAGS)

 libradeon_la_SOURCES += \
 	$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
 	$(CLOCK_LIB) \
 	$(LLVM_LIBS) \
-	$(ELF_LIB)
+	$(LIBELF_LIBS)

 libradeon_la_LDFLAGS = \
 	$(LLVM_LDFLAGS)
--- a/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/src/gallium/drivers/radeon/cayman_msaa.c
@@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
-					     EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+					       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		} else if (overrast_samples > 1) {
 			r600_write_context_reg(cs, CM_R_028804_DB_EQAA,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		}
 	} else {
 		r600_write_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
@@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 		r600_write_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -226,8 +226,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->family = rscreen->family;
 	rctx->chip_class = rscreen->chip_class;

-	if (rscreen->family == CHIP_HAWAII)
-		rctx->max_db = 16;
+	if (rscreen->chip_class >= CIK)
+		rctx->max_db = MAX2(8, rscreen->info.r600_num_backends);
 	else if (rscreen->chip_class >= EVERGREEN)
 		rctx->max_db = 8;
 	else
@@ -543,10 +543,11 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
-	case CHIP_FIJI: return "fiji";
 #if HAVE_LLVM <= 0x0307
+	case CHIP_FIJI: return "tonga";
 	case CHIP_STONEY: return "carrizo";
 #else
+	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
 #endif
 	default: return "";
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -168,6 +168,8 @@

 #define EG_R_028A4C_PA_SC_MODE_CNTL_1                0x028A4C
 #define   EG_S_028A4C_PS_ITER_SAMPLE(x)                 (((x) & 0x1) << 16)
+#define   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)        (((x) & 0x1) << 25)
+#define   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x)           (((x) & 0x1) << 26)

 #define CM_R_028804_DB_EQAA                          0x00028804
 #define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -951,6 +951,8 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.db_pitch = dec->base.width;

 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
+	if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
+		dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2;

 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -385,7 +385,10 @@ struct ruvd_msg {
 			uint32_t	dt_chroma_top_offset;
 			uint32_t	dt_chroma_bottom_offset;
 			uint32_t	dt_surf_tile_config;
-			uint32_t	dt_reserved[3];
+			uint32_t	dt_uv_surf_tile_config;
+			// re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney
+			uint32_t	dt_wa_chroma_top_offset;
+			uint32_t	dt_wa_chroma_bottom_offset;

 			uint32_t	reserved[16];

--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -33,14 +33,6 @@
 #include "sid.h"

 #define MAX_GLOBAL_BUFFERS 20
-#if HAVE_LLVM < 0x0305
-#define NUM_USER_SGPRS 2
-#else
-/* XXX: Even though we don't pass the scratch buffer via user sgprs any more
- * LLVM still expects that we specify 4 USER_SGPRS so it can remain compatible
- * with older mesa. */
-#define NUM_USER_SGPRS 4
-#endif

 struct si_compute {
 	struct si_context *ctx;
@@ -241,7 +233,6 @@ static void si_launch_grid(
 	uint64_t kernel_args_va;
 	uint64_t scratch_buffer_va = 0;
 	uint64_t shader_va;
-	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
 	unsigned i;
 	struct si_shader *shader = &program->shader;
 	unsigned lds_blocks;
@@ -365,20 +356,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);

-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
-		/* We always use at least 3 VGPRS, these come from
-		 * TIDIG_COMP_CNT.
-		 * XXX: The compiler should account for this.
-		 */
-		S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4)
-		/* We always use at least 4 + arg_user_sgpr_count.  The 4 extra
-		 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN
-		 * XXX: The compiler should account for this.
-		 */
-		|  S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count,
-		                        shader->num_sgprs)) - 1) / 8)
-		|  S_00B028_FLOAT_MODE(shader->float_mode))
-		;
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);

 	lds_blocks = shader->lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
@@ -394,17 +372,10 @@ static void si_launch_grid(

 	assert(lds_blocks <= 0xFF);

-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2,
-		S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)
-		| S_00B84C_USER_SGPR(arg_user_sgpr_count)
-		| S_00B84C_TGID_X_EN(1)
-		| S_00B84C_TGID_Y_EN(1)
-		| S_00B84C_TGID_Z_EN(1)
-		| S_00B84C_TG_SIZE_EN(1)
-		| S_00B84C_TIDIG_COMP_CNT(2)
-		| S_00B84C_LDS_SIZE(lds_blocks)
-		| S_00B84C_EXCP_EN(0))
-		;
+	shader->rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);

 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -637,6 +637,14 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 			    lp_build_const_int32(gallivm, swizzle));

 	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+				       lp_build_const_int32(gallivm, swizzle + 1));
+		value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
+	}
+
 	return LLVMBuildBitCast(gallivm->builder, value,
 				tgsi2llvmtype(bld_base, type), "");
 }
@@ -3752,12 +3760,14 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
 			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 			shader->float_mode =  G_00B028_FLOAT_MODE(value);
+			shader->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
 			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
 			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
+			shader->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -268,8 +268,8 @@ struct si_shader {
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */

-	unsigned		ls_rsrc1;
-	unsigned		ls_rsrc2;
+	unsigned		rsrc1;
+	unsigned		rsrc2;
 };

 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2979,6 +2979,28 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
 	si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo);
 }

+static void si_init_border_color_buffer(struct si_context *sctx)
+{
+	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (!pm4)
+		return;
+
+	assert(sctx->scratch_buffer == NULL);
+	r600_resource_reference(&sctx->scratch_buffer, NULL);
+	sctx->scratch_buffer = si_resource_create_custom(&sctx->screen->b.b,
+							 PIPE_USAGE_DEFAULT,
+							 4096 * 16);
+
+	uint64_t va = sctx->scratch_buffer->gpu_address;
+
+	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, va >> 8);
+	if (sctx->b.chip_class >= CIK)
+		si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, va >> 40);
+	si_pm4_add_bo(pm4, sctx->scratch_buffer, RADEON_USAGE_READ,
+		      RADEON_PRIO_SHADER_DATA);
+	si_pm4_set_state(sctx, ta_bordercolor_base, pm4);
+}
+
 static void si_init_config(struct si_context *sctx);

 void si_init_state_functions(struct si_context *sctx)
@@ -3045,6 +3067,7 @@ void si_init_state_functions(struct si_context *sctx)
 	}

 	si_init_config(sctx);
+	si_init_border_color_buffer(sctx);
 }

 static void
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;

 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->ls_rsrc2;
+	ls_rsrc2 = ls->current->rsrc2;

 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->ls_rsrc1);
+	radeon_emit(cs, ls->current->rsrc1);
 	radeon_emit(cs, ls_rsrc2);

 	/* Compute userdata SGPRs. */
@@ -216,6 +216,18 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
 }

+static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
+{
+	switch (info->mode) {
+	case PIPE_PRIM_PATCHES:
+		return info->count / info->vertices_per_patch;
+	case R600_PRIM_RECTANGLE_LIST:
+		return info->count / 3;
+	default:
+		return u_prims_for_vertices(info->mode, info->count);
+	}
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 					  const struct pipe_draw_info *info,
 					  unsigned num_patches)
@@ -305,7 +317,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
 	    (info->indirect ||
 	     (info->instance_count > 1 &&
-	      u_prims_for_vertices(info->mode, info->count) <= 1)))
+	      si_num_prims_for_vertices(info) <= 1)))
 		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;

 	/* Instancing bug on 2 SE chips. */
@@ -849,7 +861,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)

 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-	if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
+	if ((sctx->b.family == CHIP_HAWAII ||
+	     sctx->b.family == CHIP_TONGA ||
+	     sctx->b.family == CHIP_FIJI) &&
 	    (sctx->b.streamout.streamout_enabled ||
 	     sctx->b.streamout.prims_gen_query_enabled)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -119,10 +119,10 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);

-	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt);
-	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
 }

--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -2,6 +2,9 @@ include $(top_srcdir)/src/gallium/Automake.inc

 lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la

+AM_CPPFLAGS = \
+        $(LIBELF_CFLAGS)
+
 lib@OPENCL_LIBNAME@_la_LDFLAGS = \
 	$(LLVM_LDFLAGS) \
 	-no-undefined \
@@ -20,7 +23,7 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(ELF_LIB) \
+	$(LIBELF_LIBS) \
 	$(DLOPEN_LIBS) \
 	-lclangCodeGen \
 	-lclangFrontendTool \
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2281,7 +2281,9 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,

         return false;
      }
-   } else if (state->is_version(420, 310) && base_type->is_image()) {
+   } else if ((state->is_version(420, 310) ||
+               state->ARB_shading_language_420pack_enable) &&
+              base_type->is_image()) {
      assert(ctx->Const.MaxImageUnits <= MAX_IMAGE_UNITS);
      if (max_index >= ctx->Const.MaxImageUnits) {
         _mesa_glsl_error(loc, state, "Image binding %d exceeds the "
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3754,13 +3754,13 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
   if (first < MESA_SHADER_FRAGMENT) {
      gl_shader *const sh = prog->_LinkedShaders[last];

-      if (first == MESA_SHADER_GEOMETRY) {
+      if (first != MESA_SHADER_VERTEX) {
         /* There was no vertex shader, but we still have to assign varying
-          * locations for use by geometry shader inputs in SSO.
+          * locations for use by tessellation/geometry shader inputs in SSO.
          *
          * If the shader is not separable (i.e., prog->SeparateShader is
-          * false), linking will have already failed when first is
-          * MESA_SHADER_GEOMETRY.
+          * false), linking will have already failed when first is not
+          * MESA_SHADER_VERTEX.
          */
         if (!assign_varying_locations(ctx, mem_ctx, prog,
                                       NULL, prog->_LinkedShaders[first],
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -62,6 +62,15 @@ fallback_required(struct gl_context *ctx, GLenum target,
   GLuint srcLevel;
   GLenum status;

+   /* GL_DRAW_FRAMEBUFFER does not exist in OpenGL ES 1.x, and since
+    * _mesa_meta_begin hasn't been called yet, we have to work-around API
+    * difficulties.  The whole reason that GL_DRAW_FRAMEBUFFER is used instead
+    * of GL_FRAMEBUFFER is that the read framebuffer may be different.  This
+    * is moot in OpenGL ES 1.x.
+    */
+   const GLenum fbo_target = ctx->API == API_OPENGLES
+      ? GL_FRAMEBUFFER : GL_DRAW_FRAMEBUFFER;
+
   /* check for fallbacks */
   if (target == GL_TEXTURE_3D) {
      _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
@@ -102,13 +111,13 @@ fallback_required(struct gl_context *ctx, GLenum target,
    */
   if (!mipmap->FBO)
      _mesa_GenFramebuffers(1, &mipmap->FBO);
-   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, mipmap->FBO);
+   _mesa_BindFramebuffer(fbo_target, mipmap->FBO);

-   _mesa_meta_bind_fbo_image(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, baseImage, 0);
+   _mesa_meta_bind_fbo_image(fbo_target, GL_COLOR_ATTACHMENT0, baseImage, 0);

-   status = _mesa_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER);
+   status = _mesa_CheckFramebufferStatus(fbo_target);

-   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fboSave);
+   _mesa_BindFramebuffer(fbo_target, fboSave);

   if (status != GL_FRAMEBUFFER_COMPLETE_EXT) {
      _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -189,6 +189,24 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
   }

+   /* Resolve color for each active shader image. */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      const struct gl_shader *shader = ctx->_Shader->CurrentProgram[i] ?
+         ctx->_Shader->CurrentProgram[i]->_LinkedShaders[i] : NULL;
+
+      if (unlikely(shader && shader->NumImages)) {
+         for (unsigned j = 0; j < shader->NumImages; j++) {
+            struct gl_image_unit *u = &ctx->ImageUnits[shader->ImageUnits[j]];
+            tex_obj = intel_texture_object(u->TexObj);
+
+            if (tex_obj && tex_obj->mt) {
+               intel_miptree_resolve_color(brw, tex_obj->mt);
+               brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
+            }
+         }
+      }
+   }
+
   _mesa_lock_context_textures(ctx);
 }

--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1039,33 +1039,19 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 }

 void
-fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
+fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
-                                       int location, bool mod_centroid,
+                                       int *location, bool mod_centroid,
                                       bool mod_sample)
 {
-   attr.type = brw_type_for_base_type(type->get_scalar_type());
-
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

-   unsigned int array_elements;
-
-   if (type->is_array()) {
-      array_elements = type->length;
-      if (array_elements == 0) {
-         fail("dereferenced array '%s' has length 0\n", name);
-      }
-      type = type->fields.array;
-   } else {
-      array_elements = 1;
-   }
-
   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
-         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
+         *location == VARYING_SLOT_COL0 || *location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
@@ -1073,71 +1059,86 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
      }
   }

-   for (unsigned int i = 0; i < array_elements; i++) {
-      for (unsigned int j = 0; j < type->matrix_columns; j++) {
-	 if (prog_data->urb_setup[location] == -1) {
-	    /* If there's no incoming setup data for this slot, don't
-	     * emit interpolation for it.
-	     */
-	    attr = offset(attr, bld, type->vector_elements);
-	    location++;
-	    continue;
-	 }
+   if (type->is_array() || type->is_matrix()) {
+      const glsl_type *elem_type = glsl_get_array_element(type);
+      const unsigned length = glsl_get_length(type);

-	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
-	    /* Constant interpolation (flat shading) case. The SF has
-	     * handed us defined values in only the constant offset
-	     * field of the setup reg.
-	     */
-	    for (unsigned int k = 0; k < type->vector_elements; k++) {
-	       struct brw_reg interp = interp_reg(location, k);
-	       interp = suboffset(interp, 3);
-               interp.type = attr.type;
-               bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
-	       attr = offset(attr, bld, 1);
-	    }
-	 } else {
-	    /* Smooth/noperspective interpolation case. */
-	    for (unsigned int k = 0; k < type->vector_elements; k++) {
-               struct brw_reg interp = interp_reg(location, k);
-               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
-                  /* Get the pixel/sample mask into f0 so that we know
-                   * which pixels are lit.  Then, for each channel that is
-                   * unlit, replace the centroid data with non-centroid
-                   * data.
-                   */
-                  bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
-
-                  fs_inst *inst;
-                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                                      false, false);
-                  inst->predicate = BRW_PREDICATE_NORMAL;
-                  inst->predicate_inverse = true;
-                  if (devinfo->has_pln)
-                     inst->no_dd_clear = true;
-
-                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                                      mod_centroid && !key->persample_shading,
-                                      mod_sample || key->persample_shading);
-                  inst->predicate = BRW_PREDICATE_NORMAL;
-                  inst->predicate_inverse = false;
-                  if (devinfo->has_pln)
-                     inst->no_dd_check = true;
-
-               } else {
-                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
-                               mod_centroid && !key->persample_shading,
-                               mod_sample || key->persample_shading);
-               }
-               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
-                  bld.MUL(attr, attr, this->pixel_w);
-               }
-	       attr = offset(attr, bld, 1);
-	    }
-
-	 }
-	 location++;
+      for (unsigned i = 0; i < length; i++) {
+         emit_general_interpolation(attr, name, elem_type, interpolation_mode,
+                                    location, mod_centroid, mod_sample);
      }
+   } else if (type->is_record()) {
+      for (unsigned i = 0; i < type->length; i++) {
+         const glsl_type *field_type = type->fields.structure[i].type;
+         emit_general_interpolation(attr, name, field_type, interpolation_mode,
+                                    location, mod_centroid, mod_sample);
+      }
+   } else {
+      assert(type->is_scalar() || type->is_vector());
+
+      if (prog_data->urb_setup[*location] == -1) {
+         /* If there's no incoming setup data for this slot, don't
+          * emit interpolation for it.
+          */
+         *attr = offset(*attr, bld, type->vector_elements);
+         (*location)++;
+         return;
+      }
+
+      attr->type = brw_type_for_base_type(type->get_scalar_type());
+
+      if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
+         /* Constant interpolation (flat shading) case. The SF has
+          * handed us defined values in only the constant offset
+          * field of the setup reg.
+          */
+         for (unsigned int i = 0; i < type->vector_elements; i++) {
+            struct brw_reg interp = interp_reg(*location, i);
+            interp = suboffset(interp, 3);
+            interp.type = attr->type;
+            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
+            *attr = offset(*attr, bld, 1);
+         }
+      } else {
+         /* Smooth/noperspective interpolation case. */
+         for (unsigned int i = 0; i < type->vector_elements; i++) {
+            struct brw_reg interp = interp_reg(*location, i);
+            if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
+               /* Get the pixel/sample mask into f0 so that we know
+                * which pixels are lit.  Then, for each channel that is
+                * unlit, replace the centroid data with non-centroid
+                * data.
+                */
+               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+
+               fs_inst *inst;
+               inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+                                   false, false);
+               inst->predicate = BRW_PREDICATE_NORMAL;
+               inst->predicate_inverse = true;
+               if (devinfo->has_pln)
+                  inst->no_dd_clear = true;
+
+               inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+                                   mod_centroid && !key->persample_shading,
+                                   mod_sample || key->persample_shading);
+               inst->predicate = BRW_PREDICATE_NORMAL;
+               inst->predicate_inverse = false;
+               if (devinfo->has_pln)
+                  inst->no_dd_check = true;
+
+            } else {
+               emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+                            mod_centroid && !key->persample_shading,
+                            mod_sample || key->persample_shading);
+            }
+            if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
+               bld.MUL(*attr, *attr, this->pixel_w);
+            }
+            *attr = offset(*attr, bld, 1);
+         }
+      }
+      (*location)++;
   }
 }

--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -197,10 +197,10 @@ public:
   fs_reg *emit_frontfacing_interpolation();
   fs_reg *emit_samplepos_setup();
   fs_reg *emit_sampleid_setup();
-   void emit_general_interpolation(fs_reg attr, const char *name,
+   void emit_general_interpolation(fs_reg *attr, const char *name,
                                   const glsl_type *type,
                                   glsl_interp_qualifier interpolation_mode,
-                                   int location, bool mod_centroid,
+                                   int *location, bool mod_centroid,
                                   bool mod_sample);
   fs_reg *emit_vs_system_value(int location);
   void emit_interpolation_setup_gen4();
@@ -240,6 +240,8 @@ public:

   void emit_nir_code();
   void nir_setup_inputs(nir_shader *shader);
+   void nir_setup_single_output_varying(fs_reg &reg, const glsl_type *type,
+                                        unsigned &location);
   void nir_setup_outputs(nir_shader *shader);
   void nir_setup_uniforms(nir_shader *shader);
   void nir_setup_uniform(nir_variable *var);
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -105,9 +105,10 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
                                      input, reg), 0xF);
         } else {
-            emit_general_interpolation(input, var->name, var->type,
+            int location = var->data.location;
+            emit_general_interpolation(&input, var->name, var->type,
                                       (glsl_interp_qualifier) var->data.interpolation,
-                                       var->data.location, var->data.centroid,
+                                       &location, var->data.centroid,
                                       var->data.sample);
         }
         break;
@@ -115,6 +116,32 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
   }
 }

+void
+fs_visitor::nir_setup_single_output_varying(fs_reg &reg,
+                                            const glsl_type *type,
+                                            unsigned &location)
+{
+   if (type->is_array() || type->is_matrix()) {
+      const struct glsl_type *elem_type = glsl_get_array_element(type);
+      const unsigned length = glsl_get_length(type);
+
+      for (unsigned i = 0; i < length; i++) {
+         nir_setup_single_output_varying(reg, elem_type, location);
+      }
+   } else if (type->is_record()) {
+      for (unsigned i = 0; i < type->length; i++) {
+         const struct glsl_type *field_type = type->fields.structure[i].type;
+         nir_setup_single_output_varying(reg, field_type, location);
+      }
+   } else {
+      assert(type->is_scalar() || type->is_vector());
+      this->outputs[location] = reg;
+      this->output_components[location] = type->vector_elements;
+      reg = offset(reg, bld, 4);
+      location++;
+   }
+}
+
 void
 fs_visitor::nir_setup_outputs(nir_shader *shader)
 {
@@ -130,13 +157,11 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
                               : var->type->vector_elements;

      switch (stage) {
-      case MESA_SHADER_VERTEX:
-         for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) {
-            int output = var->data.location + i;
-            this->outputs[output] = offset(reg, bld, 4 * i);
-            this->output_components[output] = vector_elements;
-         }
+      case MESA_SHADER_VERTEX: {
+         unsigned location = var->data.location;
+         nir_setup_single_output_varying(reg, var->type, location);
         break;
+      }
      case MESA_SHADER_FRAGMENT:
         if (var->data.index > 0) {
            assert(var->data.location == FRAG_RESULT_DATA0);
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -379,6 +379,7 @@ brw_format_for_mesa_format(mesa_format mesa_format)
      [MESA_FORMAT_A8R8G8B8_SRGB] = 0,
      [MESA_FORMAT_R8G8B8A8_SRGB] = BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB,
      [MESA_FORMAT_X8R8G8B8_SRGB] = 0,
+      [MESA_FORMAT_B8G8R8X8_SRGB] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB,
      [MESA_FORMAT_L_SRGB8] = BRW_SURFACEFORMAT_L8_UNORM_SRGB,
      [MESA_FORMAT_L8A8_SRGB] = BRW_SURFACEFORMAT_L8A8_UNORM_SRGB,
      [MESA_FORMAT_A8L8_SRGB] = 0,
@@ -614,6 +615,10 @@ brw_init_surface_formats(struct brw_context *brw)
 	  */
 	 render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 	 break;
+      case BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB:
+         if (gen < tinfo->render_target)
+            render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+         break;
      case BRW_SURFACEFORMAT_R8G8B8X8_UNORM:
         render = BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
         break;
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -981,7 +981,7 @@ brw_upload_wm_abo_surfaces(struct brw_context *brw)
 {
   struct gl_context *ctx = &brw->ctx;
   /* _NEW_PROGRAM */
-   struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram;
+   struct gl_shader_program *prog = ctx->_Shader->_CurrentFragmentProgram;

   if (prog) {
      /* BRW_NEW_FS_PROG_DATA */
@@ -1257,7 +1257,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw)
 {
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram;
+   struct gl_shader_program *prog = ctx->_Shader->_CurrentFragmentProgram;

   if (prog) {
      /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -776,13 +776,18 @@ program_resource_location(struct gl_shader_program *shProg,
    * and user-defined attributes.
    */
   switch (res->Type) {
-   case GL_PROGRAM_INPUT:
+   case GL_PROGRAM_INPUT: {
+      const ir_variable *var = RESOURCE_VAR(res);
+
      /* If the input is an array, fail if the index is out of bounds. */
      if (array_index > 0
-          && array_index >= RESOURCE_VAR(res)->type->length) {
+          && array_index >= var->type->length) {
         return -1;
      }
-      return RESOURCE_VAR(res)->data.location + array_index - VERT_ATTRIB_GENERIC0;
+      return (var->data.location +
+	      (array_index * var->type->without_array()->matrix_columns) -
+	      VERT_ATTRIB_GENERIC0);
+   }
   case GL_PROGRAM_OUTPUT:
      /* If the output is an array, fail if the index is out of bounds. */
      if (array_index > 0
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -766,7 +766,7 @@ _mesa_VertexAttribLPointer(GLuint index, GLint size, GLenum type,

   update_array(ctx, "glVertexAttribLPointer", VERT_ATTRIB_GENERIC(index),
                legalTypes, 1, 4,
-                size, type, stride, GL_TRUE, GL_FALSE, GL_TRUE, ptr);
+                size, type, stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr);
 }
@@ -1 +1 @@
 .0.7
 .0.8