Update version to 11.2.0-rc4

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
i965: Fix brw_render_cache_set_check_flush's PIPE_CONTROLs.
2016-03-30 00:10:07 +01:00 · 2016-03-29 12:15:01 +01:00 · 2016-03-29 12:15:01 +01:00 · 2016-03-29 12:15:01 +01:00 · 2016-03-29 12:15:01 +01:00 · 2016-03-29 12:15:01 +01:00
50 changed files with 460 additions and 165 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-11.2.0-rc3
+11.2.0-rc4
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -2625,6 +2625,13 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
 	 continue;
      }

+      if (num_attr >= ARRAY_SIZE(to_assign)) {
+         linker_error(prog, "too many %s (max %u)",
+                      target_index == MESA_SHADER_VERTEX ?
+                      "vertex shader inputs" : "fragment shader outputs",
+                      (unsigned)ARRAY_SIZE(to_assign));
+         return false;
+      }
      to_assign[num_attr].slots = slots;
      to_assign[num_attr].var = var;
      num_attr++;
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1006,6 +1006,9 @@ dri2_create_image_khr_pixmap(_EGLDisplay *disp, _EGLContext *ctx,
   geometry_cookie = xcb_get_geometry (dri2_dpy->conn, drawable);
   buffers_reply = xcb_dri2_get_buffers_reply (dri2_dpy->conn,
 					       buffers_cookie, NULL);
+   if (buffers_reply == NULL)
+     return NULL;
+
   buffers = xcb_dri2_get_buffers_buffers (buffers_reply);
   if (buffers == NULL) {
      return NULL;
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -44,7 +44,6 @@
 #include "egllog.h"


-#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))


 /**
--- a/src/egl/main/egldefines.h
+++ b/src/egl/main/egldefines.h
@@ -40,9 +40,16 @@ extern "C" {

 #define _EGL_MAX_EXTENSIONS_LEN 1000

+/* Hardcoded, conservative default for EGL_LARGEST_PBUFFER,
+ * this is used to implement EGL_LARGEST_PBUFFER.
+ */
+#define _EGL_MAX_PBUFFER_WIDTH 4096
+#define _EGL_MAX_PBUFFER_HEIGHT 4096
+
 #define _EGL_VENDOR_STRING "Mesa Project"

 #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))

 #ifdef __cplusplus
 }
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -307,6 +307,12 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
   if (err != EGL_SUCCESS)
      return _eglError(err, func);

+   /* if EGL_LARGEST_PBUFFER in use, clamp width and height */
+   if (surf->LargestPbuffer) {
+      surf->Width = MIN2(surf->Width, _EGL_MAX_PBUFFER_WIDTH);
+      surf->Height = MIN2(surf->Height, _EGL_MAX_PBUFFER_HEIGHT);
+   }
+
   return EGL_TRUE;
 }

--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -108,11 +108,11 @@ emit_segment(struct draw_stage *stage, struct prim_header *header,
 }


-static inline unsigned
+static inline bool
 stipple_test(int counter, ushort pattern, int factor)
 {
   int b = (counter / factor) & 0xf;
-   return (1 << b) & pattern;
+   return !!((1 << b) & pattern);
 }


@@ -126,7 +126,7 @@ stipple_line(struct draw_stage *stage, struct prim_header *header)
   const float *pos0 = v0->data[pos];
   const float *pos1 = v1->data[pos];
   float start = 0;
-   int state = 0;
+   bool state = 0;

   float x0 = pos0[0];
   float x1 = pos1[0];
@@ -143,29 +143,29 @@ stipple_line(struct draw_stage *stage, struct prim_header *header)
      stipple->counter = 0;


-   /* XXX ToDo: intead of iterating pixel-by-pixel, use a look-up table.
+   /* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table.
    */
   for (i = 0; i < length; i++) {
-      int result = stipple_test( (int) stipple->counter+i,
-                                 (ushort) stipple->pattern, stipple->factor );
+      bool result = stipple_test((int)stipple->counter + i,
+                                 (ushort)stipple->pattern, stipple->factor);
      if (result != state) {
         /* changing from "off" to "on" or vice versa */
-	 if (state) {
-	    if (start != i) {
+         if (state) {
+            if (start != i) {
               /* finishing an "on" segment */
-	       emit_segment( stage, header, start / length, i / length );
+               emit_segment(stage, header, start / length, i / length);
            }
-	 }
-	 else {
+         }
+         else {
            /* starting an "on" segment */
-	    start = (float) i;
-	 }
-	 state = result;	   
+            start = (float)i;
+         }
+         state = result;
      }
   }

   if (state && start < length)
-      emit_segment( stage, header, start / length, 1.0 );
+      emit_segment(stage, header, start / length, 1.0);

   stipple->counter += length;
 }
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1388,7 +1388,9 @@ static boolean parse_declaration( struct translate_ctx *ctx )
         if (str_match_nocase_whole(&cur, "ATOMIC")) {
            decl.Declaration.Atomic = 1;
            ctx->cur = cur;
-         } else if (str_match_nocase_whole(&cur, "SHARED")) {
+         }
+      } else if (file == TGSI_FILE_MEMORY) {
+         if (str_match_nocase_whole(&cur, "SHARED")) {
            decl.Declaration.Shared = 1;
            ctx->cur = cur;
         }
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -116,6 +116,12 @@ struct lp_rast_plane {

   /* one-pixel sized trivial reject offsets for each plane */
   uint32_t eo;
+   /*
+    * We rely on this struct being 64bit aligned (ideally it would be 128bit
+    * but that's quite the waste) and therefore on 32bit we need padding
+    * since otherwise (even with the 64bit number in there) it wouldn't be.
+    */
+   uint32_t pad;
 };

 /**
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -94,6 +94,8 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
   unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
   struct lp_rast_triangle *tri;

+   STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0);
+
   *tri_size = (sizeof(struct lp_rast_triangle) +
                3 * input_array_sz +
                plane_sz);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1634,7 +1634,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
   code[1] |= (i->tex.mask & 0xc) << 12;

   if (i->tex.liveOnly)
-      code[1] |= 4;
+      code[1] |= 1 << 2;
+   if (i->tex.derivAll)
+      code[1] |= 1 << 3;

   defId(i->def(0), 2);

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1989,7 +1989,6 @@ Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
 void
 Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
 {
-   Value *val;
   Value *arg[4], *src[8];
   Value *lod = NULL, *shd = NULL;
   unsigned int s, c, d;
@@ -2032,17 +2031,6 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
         shd = src[n - 1];
   }

-   if (tgt.isCube()) {
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
-      val = getScratch();
-      mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
-      mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
-      mkOp1(OP_RCP, TYPE_F32, val, val);
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
-   }
-
   for (c = 0, d = 0; c < 4; ++c) {
      if (dst[c]) {
         texi->setDef(d++, dst[c]);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -67,6 +67,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
   tmp = bld.getScratch();

   for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      for (c = 0; c < dim; ++c) {
@@ -92,10 +93,25 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
         add->lanes = 1; /* abused for .ndv */
      }

+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
+
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -724,6 +724,23 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

+   /* Only normalize in the non-explicit derivatives case.
+    */
+   if (i->tex.target.isCube() && i->op != OP_TXD) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
@@ -934,12 +951,14 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+   i->tex.derivAll = true;

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -949,10 +968,24 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
-         tex->setSrc(c, crd[c]);
+         tex->setSrc(c, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -615,6 +615,24 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

+   /* Only normalize in the non-explicit derivatives case. For explicit
+    * derivatives, this is handled in handleManualTXD.
+    */
+   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
@@ -728,9 +746,13 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
-      for (int s = dim; s >= 1; --s)
-         i->setSrc(s, i->getSrc(s - 1));
-      i->setSrc(0, arrayIndex);
+      if (arrayIndex) {
+         for (int s = dim; s >= 1; --s)
+            i->setSrc(s, i->getSrc(s - 1));
+         i->setSrc(0, arrayIndex);
+      } else {
+         i->moveSources(0, 1);
+      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
@@ -852,7 +874,17 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
-   const int array = i->tex.target.isArray();
+
+   // This function is invoked after handleTEX lowering, so we have to expect
+   // the arguments in the order that the hw wants them. For Fermi, array and
+   // indirect are both in the leading arg, while for Kepler, array and
+   // indirect are separate (and both precede the coordinates). Maxwell is
+   // handled in a separate function.
+   unsigned array;
+   if (targ->getChipset() < NVISA_GK104_CHIPSET)
+      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
+   else
+      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

@@ -861,6 +893,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
@@ -870,10 +903,24 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -633,8 +633,6 @@ nv50_stream_output_validate(struct nv50_context *nv50)
   BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
   PUSH_DATA (push, ctrl);

-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
-
   for (i = 0; i < nv50->num_so_targets; ++i) {
      struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -1180,8 +1180,10 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
   }
   nv50->num_so_targets = num_targets;

-   if (nv50->so_targets_dirty)
+   if (nv50->so_targets_dirty) {
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
      nv50->dirty |= NV50_NEW_STRMOUT;
+   }
 }

 static void
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -294,7 +294,6 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)

   if (!(nvc0->dirty & NVC0_NEW_TFB_TARGETS))
      return;
-   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TFB);

   for (b = 0; b < nvc0->num_tfbbufs; ++b) {
      struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -413,7 +413,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
 {
   unsigned s, i;

-   for (s = 0; s < 5; ++s)
+   for (s = 0; s < 6; ++s)
      for (i = 0; i < nvc0_context(pipe)->num_samplers[s]; ++i)
         if (nvc0_context(pipe)->samplers[s][i] == hwcso)
            nvc0_context(pipe)->samplers[s][i] = NULL;
@@ -1184,8 +1184,10 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
   }
   nvc0->num_tfbbufs = num_targets;

-   if (nvc0->tfbbuf_dirty)
+   if (nvc0->tfbbuf_dirty) {
+      nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TFB);
      nvc0->dirty |= NVC0_NEW_TFB_TARGETS;
+   }
 }

 static void
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1203,8 +1203,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
   x0 = (float)info->src.box.x - x_range * (float)info->dst.box.x;
   y0 = (float)info->src.box.y - y_range * (float)info->dst.box.y;

-   x1 = x0 + 16384.0f * x_range;
-   y1 = y0 + 16384.0f * y_range;
+   x1 = x0 + 32768.0f * x_range;
+   y1 = y0 + 32768.0f * y_range;

   x0 *= (float)(1 << nv50_miptree(src)->ms_x);
   x1 *= (float)(1 << nv50_miptree(src)->ms_x);
@@ -1302,6 +1302,17 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
   }
   nvc0->state.num_vtxelts = 2;

+   if (nvc0->state.prim_restart) {
+      IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+      nvc0->state.prim_restart = 0;
+   }
+
+   if (nvc0->state.index_bias) {
+      IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
+      nvc0->state.index_bias = 0;
+   }
+
   for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
      if (info->dst.box.z + i) {
         BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
@@ -1314,14 +1325,14 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
      *(vbuf++) = fui(y0);
      *(vbuf++) = fui(z);

-      *(vbuf++) = fui(16384 << nv50_miptree(dst)->ms_x);
+      *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_x);
      *(vbuf++) = fui(0.0f);
      *(vbuf++) = fui(x1);
      *(vbuf++) = fui(y0);
      *(vbuf++) = fui(z);

      *(vbuf++) = fui(0.0f);
-      *(vbuf++) = fui(16384 << nv50_miptree(dst)->ms_y);
+      *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_y);
      *(vbuf++) = fui(x0);
      *(vbuf++) = fui(y1);
      *(vbuf++) = fui(z);
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -645,21 +645,21 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader,
 		if (rviews[i]) {
 			struct r600_texture *rtex =
 				(struct r600_texture*)rviews[i]->base.texture;
+			bool is_buffer = rviews[i]->base.texture->target == PIPE_BUFFER;

-			if (rviews[i]->base.texture->target != PIPE_BUFFER) {
-				if (rtex->is_depth && !rtex->is_flushing_texture) {
-					dst->views.compressed_depthtex_mask |= 1 << i;
-				} else {
-					dst->views.compressed_depthtex_mask &= ~(1 << i);
-				}
-
-				/* Track compressed colorbuffers. */
-				if (rtex->cmask.size) {
-					dst->views.compressed_colortex_mask |= 1 << i;
-				} else {
-					dst->views.compressed_colortex_mask &= ~(1 << i);
-				}
+			if (!is_buffer && rtex->is_depth && !rtex->is_flushing_texture) {
+				dst->views.compressed_depthtex_mask |= 1 << i;
+			} else {
+				dst->views.compressed_depthtex_mask &= ~(1 << i);
 			}
+
+			/* Track compressed colorbuffers. */
+			if (!is_buffer && rtex->cmask.size) {
+				dst->views.compressed_colortex_mask |= 1 << i;
+			} else {
+				dst->views.compressed_colortex_mask &= ~(1 << i);
+			}
+
 			/* Changing from array to non-arrays textures and vice versa requires
 			 * updating TEX_ARRAY_OVERRIDE in sampler states on R6xx-R7xx. */
 			if (rctx->b.chip_class <= R700 &&
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -598,9 +598,13 @@ bool expr_handler::fold_assoc(alu_node *n) {

 	unsigned op = n->bc.op;
 	bool allow_neg = false, cur_neg = false;
+	bool distribute_neg = false;

 	switch(op) {
 	case ALU_OP2_ADD:
+		distribute_neg = true;
+		allow_neg = true;
+		break;
 	case ALU_OP2_MUL:
 	case ALU_OP2_MUL_IEEE:
 		allow_neg = true;
@@ -632,7 +636,7 @@ bool expr_handler::fold_assoc(alu_node *n) {
 		if (v1->is_const()) {
 			literal arg = v1->get_const_value();
 			apply_alu_src_mod(a->bc, 1, arg);
-			if (cur_neg)
+			if (cur_neg && distribute_neg)
 				arg.f = -arg.f;

 			if (a == n)
@@ -660,7 +664,7 @@ bool expr_handler::fold_assoc(alu_node *n) {
 		if (v0->is_const()) {
 			literal arg = v0->get_const_value();
 			apply_alu_src_mod(a->bc, 0, arg);
-			if (cur_neg)
+			if (cur_neg && distribute_neg)
 				arg.f = -arg.f;

 			if (last_arg == 0) {
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -314,7 +314,8 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 		}
 	}
 	else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
-		 !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+		 !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+			    PIPE_TRANSFER_PERSISTENT)) &&
 		 !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
 		 r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
 		assert(usage & PIPE_TRANSFER_WRITE);
@@ -341,7 +342,8 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 	}
 	/* Using a staging buffer in GTT for larger reads is much faster. */
 	else if ((usage & PIPE_TRANSFER_READ) &&
-		 !(usage & PIPE_TRANSFER_WRITE) &&
+		 !(usage & (PIPE_TRANSFER_WRITE |
+			    PIPE_TRANSFER_PERSISTENT)) &&
 		 rbuffer->domains == RADEON_DOMAIN_VRAM &&
 		 r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) {
 		struct r600_resource *staging;
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -533,8 +533,14 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
 		return 0;

-	/* Overalign HTILE on Stoney to fix piglit/depthstencil-render-miplevels 585. */
-	if (rscreen->family == CHIP_STONEY)
+	/* Overalign HTILE on P2 configs to work around GPU hangs in
+	 * piglit/depthstencil-render-miplevels 585.
+	 *
+	 * This has been confirmed to help Kabini & Stoney, where the hangs
+	 * are always reproducible. I think I have seen the test hang
+	 * on Carrizo too, though it was very rare there.
+	 */
+	if (rscreen->chip_class >= CIK && num_pipes < 4)
 		num_pipes = 4;

 	switch (num_pipes) {
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -237,6 +237,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_SUPPORTED:
 		switch (codec) {
 		case PIPE_VIDEO_FORMAT_MPEG12:
+			return profile != PIPE_VIDEO_PROFILE_MPEG1;
 		case PIPE_VIDEO_FORMAT_MPEG4:
 		case PIPE_VIDEO_FORMAT_MPEG4_AVC:
 			if (rscreen->family < CHIP_PALM)
@@ -257,7 +258,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_MAX_WIDTH:
 		return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
 	case PIPE_VIDEO_CAP_MAX_HEIGHT:
-		return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
+		return (rscreen->family < CHIP_TONGA) ? 1152 : 4096;
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 		return PIPE_FORMAT_NV12;
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -303,6 +303,7 @@ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
 		 */
 		if (samplers->views.views[i] &&
 		    samplers->views.views[i]->texture &&
+		    samplers->views.views[i]->texture->target != PIPE_BUFFER &&
 		    ((struct r600_texture*)samplers->views.views[i]->texture)->fmask.size)
 			continue;

--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -2209,6 +2209,7 @@ img_filter_2d_ewa(const struct sp_sampler_view *sp_sview,
                  const float t[TGSI_QUAD_SIZE],
                  const float p[TGSI_QUAD_SIZE],
                  const uint faces[TGSI_QUAD_SIZE],
+                  const int8_t *offset,
                  unsigned level,
                  const float dudx, const float dvdx,
                  const float dudy, const float dvdy,
@@ -2268,6 +2269,8 @@ img_filter_2d_ewa(const struct sp_sampler_view *sp_sview,
   /* F *= formScale; */ /* no need to scale F as we don't use it below here */

   args.level = level;
+   args.offset = offset;
+
   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
      /* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
       * and incrementally update the value of Ax^2+Bxy*Cy^2; when this
@@ -2431,6 +2434,8 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview,
   const float dvdy = (t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]) * t_to_v;
   struct img_filter_args args;

+   args.offset = filt_args->offset;
+
   if (filt_args->control == TGSI_SAMPLER_LOD_BIAS ||
       filt_args->control == TGSI_SAMPLER_LOD_NONE ||
       /* XXX FIXME */
@@ -2503,8 +2508,8 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview,
       * seem to be worth the extra running time.
       */
      img_filter_2d_ewa(sp_sview, sp_samp, min_filter, mag_filter,
-                        s, t, p, filt_args->faces, level0,
-                        dudx, dvdx, dudy, dvdy, rgba);
+                        s, t, p, filt_args->faces, filt_args->offset,
+                        level0, dudx, dvdx, dudy, dvdy, rgba);
   }

   if (DEBUG_TEX) {
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -55,7 +55,7 @@ kernel::launch(command_queue &q,
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
-   struct pipe_grid_info info;
+   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need make
   // sure to call exec_context::bind() before retrieving them.
--- a/src/gallium/state_trackers/omx/vid_dec.c
+++ b/src/gallium/state_trackers/omx/vid_dec.c
@@ -140,7 +140,7 @@ static OMX_ERRORTYPE vid_dec_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam

   r = omx_base_filter_Constructor(comp, name);
   if (r)
-	return r;
+      return r;

   priv->profile = PIPE_VIDEO_PROFILE_UNKNOWN;

@@ -268,7 +268,7 @@ static OMX_ERRORTYPE vid_dec_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
      r = checkHeader(param, sizeof(OMX_PARAM_COMPONENTROLETYPE));
      if (r)
         return r;
- 
+
      if (!strcmp((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE)) {
         priv->profile = PIPE_VIDEO_PROFILE_MPEG2_MAIN;
      } else if (!strcmp((char *)role->cRole, OMX_VID_DEC_AVC_ROLE)) {
@@ -321,7 +321,7 @@ static OMX_ERRORTYPE vid_dec_GetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
         strcpy((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE);
      else if (priv->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH)
         strcpy((char *)role->cRole, OMX_VID_DEC_AVC_ROLE);
- 
+
      break;
   }

@@ -419,6 +419,7 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
   priv->in_buffers[i] = buf;
   priv->sizes[i] = buf->nFilledLen;
   priv->inputs[i] = buf->pBuffer;
+   priv->timestamps[i] = buf->nTimeStamp;

   while (priv->num_in_buffers > (!!(buf->nFlags & OMX_BUFFERFLAG_EOS) ? 0 : 1)) {
      bool eos = !!(priv->in_buffers[0]->nFlags & OMX_BUFFERFLAG_EOS);
@@ -469,12 +470,13 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
         priv->in_buffers[0] = priv->in_buffers[1];
         priv->sizes[0] = priv->sizes[1] - delta;
         priv->inputs[0] = priv->inputs[1] + delta;
+         priv->timestamps[0] = priv->timestamps[1];
      }

      if (r)
         return r;
   }
- 
+
   return OMX_ErrorNone;
 }

@@ -513,7 +515,7 @@ static void vid_dec_FillOutput(vid_dec_PrivateType *priv, struct pipe_video_buff

   box.width = def->nFrameWidth / 2;
   box.height = def->nFrameHeight / 2;
- 
+
   src = priv->pipe->transfer_map(priv->pipe, views[1]->texture, 0,
                                  PIPE_TRANSFER_READ, &box, &transfer);
   util_copy_rect(dst, views[1]->texture->format, def->nStride, 0, 0,
@@ -526,9 +528,13 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
 {
   vid_dec_PrivateType *priv = comp->pComponentPrivate;
   bool eos = !!(input->nFlags & OMX_BUFFERFLAG_EOS);
+   OMX_TICKS timestamp;

-   if (!input->pInputPortPrivate)
-      input->pInputPortPrivate = priv->Flush(priv);
+   if (!input->pInputPortPrivate) {
+      input->pInputPortPrivate = priv->Flush(priv, &timestamp);
+      if (timestamp != OMX_VID_DEC_TIMESTAMP_INVALID)
+         input->nTimeStamp = timestamp;
+   }

   if (input->pInputPortPrivate) {
      if (output->pInputPortPrivate) {
@@ -539,6 +545,7 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
         vid_dec_FillOutput(priv, input->pInputPortPrivate, output);
      }
      output->nFilledLen = output->nAllocLen;
+      output->nTimeStamp = input->nTimeStamp;
   }

   if (eos && input->pInputPortPrivate)
--- a/src/gallium/state_trackers/omx/vid_dec.h
+++ b/src/gallium/state_trackers/omx/vid_dec.h
@@ -59,6 +59,8 @@
 #define OMX_VID_DEC_AVC_NAME "OMX.mesa.video_decoder.avc"
 #define OMX_VID_DEC_AVC_ROLE "video_decoder.avc"

+#define OMX_VID_DEC_TIMESTAMP_INVALID ((OMX_TICKS) -1)
+
 struct vl_vlc;

 DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
@@ -69,7 +71,7 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
   struct pipe_video_codec *codec; \
   void (*Decode)(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left); \
   void (*EndFrame)(vid_dec_PrivateType *priv); \
-   struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv); \
+   struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv, OMX_TICKS *timestamp); \
   struct pipe_video_buffer *target, *shadow; \
   union { \
      struct { \
@@ -100,6 +102,9 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
   OMX_BUFFERHEADERTYPE *in_buffers[2]; \
   const void *inputs[2]; \
   unsigned sizes[2]; \
+   OMX_TICKS timestamps[2]; \
+   OMX_TICKS timestamp; \
+   bool first_buf_in_frame; \
   bool frame_finished; \
   bool frame_started; \
   unsigned bytes_left; \
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -45,6 +45,7 @@
 struct dpb_list {
   struct list_head list;
   struct pipe_video_buffer *buffer;
+   OMX_TICKS timestamp;
   unsigned poc;
 };

@@ -82,7 +83,7 @@ static const uint8_t Default_8x8_Inter[64] = {

 static void vid_dec_h264_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left);
 static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv);
-static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv);
+static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp);

 void vid_dec_h264_Init(vid_dec_PrivateType *priv)
 {
@@ -91,9 +92,10 @@ void vid_dec_h264_Init(vid_dec_PrivateType *priv)
   priv->Decode = vid_dec_h264_Decode;
   priv->EndFrame = vid_dec_h264_EndFrame;
   priv->Flush = vid_dec_h264_Flush;
-   
+
   LIST_INITHEAD(&priv->codec_data.h264.dpb_list);
   priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX;
+   priv->first_buf_in_frame = true;
 }

 static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
@@ -104,6 +106,9 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
      return;

   vid_dec_NeedTarget(priv);
+   if (priv->first_buf_in_frame)
+      priv->timestamp = priv->timestamps[0];
+   priv->first_buf_in_frame = false;

   priv->picture.h264.num_ref_frames = priv->picture.h264.pps->sps->max_num_ref_frames;

@@ -127,7 +132,8 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
   priv->frame_started = true;
 }

-static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv)
+static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv,
+                                                    OMX_TICKS *timestamp)
 {
   struct dpb_list *entry, *result = NULL;
   struct pipe_video_buffer *buf;
@@ -146,6 +152,8 @@ static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv)
      return NULL;

   buf = result->buffer;
+   if (timestamp)
+      *timestamp = result->timestamp;

   --priv->codec_data.h264.dpb_num;
   LIST_DEL(&result->list);
@@ -159,6 +167,7 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
   struct dpb_list *entry;
   struct pipe_video_buffer *tmp;
   bool top_field_first;
+   OMX_TICKS timestamp;

   if (!priv->frame_started)
      return;
@@ -181,7 +190,9 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
   if (!entry)
      return;

+   priv->first_buf_in_frame = true;
   entry->buffer = priv->target;
+   entry->timestamp = priv->timestamp;
   entry->poc = MIN2(priv->picture.h264.field_order_cnt[0], priv->picture.h264.field_order_cnt[1]);
   LIST_ADDTAIL(&entry->list, &priv->codec_data.h264.dpb_list);
   ++priv->codec_data.h264.dpb_num;
@@ -192,7 +203,8 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
      return;

   tmp = priv->in_buffers[0]->pInputPortPrivate;
-   priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv);
+   priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv, &timestamp);
+   priv->in_buffers[0]->nTimeStamp = timestamp;
   priv->target = tmp;
   priv->frame_finished = priv->in_buffers[0]->pInputPortPrivate != NULL;
 }
@@ -829,7 +841,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
         priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0];
         priv->picture.h264.field_order_cnt[1] = priv->picture.h264.field_order_cnt[0] +
            sps->offset_for_top_to_bottom_field + priv->codec_data.h264.delta_pic_order_cnt[1];
-         
+
      } else if (!priv->picture.h264.bottom_field_flag)
         priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0];
      else
@@ -859,7 +871,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
      if (!priv->picture.h264.field_pic_flag) {
         priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt;
         priv->picture.h264.field_order_cnt[1] = tempPicOrderCnt;
-         
+
      } else if (!priv->picture.h264.bottom_field_flag)
         priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt;
      else
@@ -876,7 +888,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,

   priv->picture.h264.num_ref_idx_l0_active_minus1 = pps->num_ref_idx_l0_default_active_minus1;
   priv->picture.h264.num_ref_idx_l1_active_minus1 = pps->num_ref_idx_l1_default_active_minus1;
- 
+
   if (slice_type == PIPE_H264_SLICE_TYPE_P ||
       slice_type == PIPE_H264_SLICE_TYPE_SP ||
       slice_type == PIPE_H264_SLICE_TYPE_B) {
--- a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
+++ b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
@@ -61,7 +61,7 @@ static uint8_t default_non_intra_matrix[64] = {

 static void vid_dec_mpeg12_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left);
 static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv);
-static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv);
+static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp);

 void vid_dec_mpeg12_Init(vid_dec_PrivateType *priv)
 {
@@ -131,10 +131,12 @@ static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv)
   priv->in_buffers[0]->pInputPortPrivate = done;
 }

-static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv)
+static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp)
 {
   struct pipe_video_buffer *result = priv->picture.mpeg12.ref[1];
   priv->picture.mpeg12.ref[1] = NULL;
+   if (timestamp)
+      *timestamp = OMX_VID_DEC_TIMESTAMP_INVALID;
   return result;
 }

--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -179,7 +179,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
   if (!screen->get_video_param(screen, PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
                                PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED))
      return OMX_ErrorBadParameter;
- 
+
   priv->stacked_frames_num = screen->get_video_param(screen,
                                PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
                                PIPE_VIDEO_ENTRYPOINT_ENCODE,
@@ -242,7 +242,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam

   port->Port_AllocateBuffer = vid_enc_AllocateOutBuffer;
   port->Port_FreeBuffer = vid_enc_FreeOutBuffer;
- 
+
   priv->bitrate.eControlRate = OMX_Video_ControlRateDisable;
   priv->bitrate.nTargetBitrate = 0;

@@ -253,7 +253,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
   priv->profile_level.eProfile = OMX_VIDEO_AVCProfileBaseline;
   priv->profile_level.eLevel = OMX_VIDEO_AVCLevel42;

-   priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; 
+   priv->force_pic_type.IntraRefreshVOP = OMX_FALSE;
   priv->frame_num = 0;
   priv->pic_order_cnt = 0;
   priv->restricted_b_frames = debug_get_bool_option("OMX_USE_RESTRICTED_B_FRAMES", FALSE);
@@ -380,7 +380,7 @@ static OMX_ERRORTYPE vid_enc_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i

         port = (omx_base_video_PortType *)priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX];
         port->sPortParam.nBufferSize = framesize * 512 / (16*16);
-      
+
         priv->frame_rate = def->format.video.xFramerate;

         priv->callbacks->EventHandler(comp, priv->callbackData, OMX_EventPortSettingsChanged,
@@ -532,10 +532,10 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
   vid_enc_PrivateType *priv = comp->pComponentPrivate;
   OMX_ERRORTYPE r;
   int i;
- 
+
   if (!config)
      return OMX_ErrorBadParameter;
-                         
+
   switch(idx) {
   case OMX_IndexConfigVideoIntraVOPRefresh: {
      OMX_CONFIG_INTRAREFRESHVOPTYPE *type = config;
@@ -543,9 +543,9 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
      r = checkHeader(config, sizeof(OMX_CONFIG_INTRAREFRESHVOPTYPE));
      if (r)
         return r;
-      
+
      priv->force_pic_type = *type;
-      
+
      break;
   }
   case OMX_IndexConfigCommonScale: {
@@ -568,11 +568,11 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
      priv->scale = *scale;
      if (priv->scale.xWidth != 0xffffffff && priv->scale.xHeight != 0xffffffff) {
         struct pipe_video_buffer templat = {};
- 
+
         templat.buffer_format = PIPE_FORMAT_NV12;
         templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420;
-         templat.width = priv->scale.xWidth; 
-         templat.height = priv->scale.xHeight; 
+         templat.width = priv->scale.xWidth;
+         templat.height = priv->scale.xHeight;
         templat.interlaced = false;
         for (i = 0; i < OMX_VID_ENC_NUM_SCALING_BUFFERS; ++i) {
            priv->scale_buffer[i] = priv->s_pipe->create_video_buffer(priv->s_pipe, &templat);
@@ -615,7 +615,7 @@ static OMX_ERRORTYPE vid_enc_GetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
   default:
      return omx_base_component_GetConfig(handle, idx, config);
   }
-   
+
   return OMX_ErrorNone;
 }

@@ -1010,10 +1010,10 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
   switch (priv->bitrate.eControlRate) {
   case OMX_Video_ControlRateVariable:
      rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE;
-      break; 
+      break;
   case OMX_Video_ControlRateConstant:
      rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_CONSTANT;
-      break; 
+      break;
   case OMX_Video_ControlRateVariableSkipFrames:
      rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE_SKIP;
      break;
@@ -1023,8 +1023,8 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
   default:
      rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_DISABLE;
      break;
-   } 
-      
+   }
+
   rate_ctrl->frame_rate_den = OMX_VID_ENC_CONTROL_FRAME_RATE_DEN_DEFAULT;
   rate_ctrl->frame_rate_num = ((priv->frame_rate) >> 16) * rate_ctrl->frame_rate_den;

@@ -1035,7 +1035,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
         rate_ctrl->target_bitrate = priv->bitrate.nTargetBitrate;
      else
         rate_ctrl->target_bitrate = OMX_VID_ENC_BITRATE_MAX;
-      rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate;    
+      rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate;
      if (rate_ctrl->target_bitrate < OMX_VID_ENC_BITRATE_MEDIAN)
         rate_ctrl->vbv_buffer_size = MIN2((rate_ctrl->target_bitrate * 2.75), OMX_VID_ENC_BITRATE_MEDIAN);
      else
@@ -1051,7 +1051,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
      rate_ctrl->peak_bits_picture_integer = rate_ctrl->target_bits_picture;
      rate_ctrl->peak_bits_picture_fraction = 0;
   }
-   
+
   picture->quant_i_frames = priv->quant.nQpI;
   picture->quant_p_frames = priv->quant.nQpP;
   picture->quant_b_frames = priv->quant.nQpB;
@@ -1069,7 +1069,7 @@ static void enc_HandleTask(omx_base_PortType *port, struct encode_task *task,
   unsigned size = priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX]->sPortParam.nBufferSize;
   struct pipe_video_buffer *vbuf = task->buf;
   struct pipe_h264_enc_picture_desc picture = {};
- 
+
   /* -------------- scale input image --------- */
   enc_ScaleInput(port, &vbuf, &size);
   priv->s_pipe->flush(priv->s_pipe, NULL, 0);
@@ -1160,7 +1160,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
       priv->force_pic_type.IntraRefreshVOP) {
      enc_ClearBframes(port, inp);
      picture_type = PIPE_H264_ENC_PICTURE_TYPE_IDR;
-      priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; 
+      priv->force_pic_type.IntraRefreshVOP = OMX_FALSE;
      priv->frame_num = 0;
   } else if (priv->codec->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE ||
              !(priv->pic_order_cnt % OMX_VID_ENC_P_PERIOD_DEFAULT) ||
@@ -1169,7 +1169,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
   } else {
      picture_type = PIPE_H264_ENC_PICTURE_TYPE_B;
   }
-   
+
   task->pic_order_cnt = priv->pic_order_cnt++;

   if (picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
@@ -1245,7 +1245,7 @@ static void vid_enc_BufferEncoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
   output->pBuffer = priv->t_pipe->transfer_map(priv->t_pipe, outp->bitstream, 0,
                                                PIPE_TRANSFER_READ_WRITE,
                                                &box, &outp->transfer);
- 
+
   /* ------------- get size of result ----------------- */

   priv->codec->get_feedback(priv->codec, task->feedback, &size);
--- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
@@ -52,6 +52,7 @@
 #include <unistd.h>

 #define VMW_MAX_DEFAULT_TEXTURE_SIZE   (128 * 1024 * 1024)
+#define VMW_FENCE_TIMEOUT_SECONDS 60

 struct vmw_region
 {
@@ -721,7 +722,7 @@ vmw_ioctl_fence_finish(struct vmw_winsys_screen *vws,
   memset(&arg, 0, sizeof(arg));

   arg.handle = handle;
-   arg.timeout_us = 10*1000000;
+   arg.timeout_us = VMW_FENCE_TIMEOUT_SECONDS*1000000;
   arg.lazy = 0;
   arg.flags = vflags;

--- a/src/gallium/winsys/svga/drm/vmw_surface.c
+++ b/src/gallium/winsys/svga/drm/vmw_surface.c
@@ -170,6 +170,8 @@ vmw_svga_winsys_surface_unmap(struct svga_winsys_context *swc,
      *rebind = vsrf->rebind;
      vsrf->rebind = FALSE;
      vmw_svga_winsys_buffer_unmap(&vsrf->screen->base, vsrf->buf);
+   } else {
+      *rebind = FALSE;
   }
   pipe_mutex_unlock(vsrf->mutex);
 }
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2441,8 +2441,10 @@ fs_visitor::opt_sampler_eot()
    * we have enough space, but it will make sure the dead code eliminator kills
    * the instruction that this will replace.
    */
-   if (tex_inst->header_size != 0)
+   if (tex_inst->header_size != 0) {
+      invalidate_live_intervals();
      return true;
+   }

   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
                                  load_payload->sources + 1);
@@ -2473,6 +2475,7 @@ fs_visitor::opt_sampler_eot()
   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
   tex_inst->src[0] = send_header;

+   invalidate_live_intervals();
   return true;
 }

@@ -5187,12 +5190,18 @@ fs_visitor::optimize()
 void
 fs_visitor::fixup_3src_null_dest()
 {
+   bool progress = false;
+
   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src() && inst->dst.is_null()) {
         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
+         progress = true;
      }
   }
+
+   if (progress)
+      invalidate_live_intervals();
 }

 void
@@ -5228,7 +5237,7 @@ fs_visitor::allocate_registers()
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
       */
-      if (dispatch_width == 16) {
+      if (dispatch_width == 16 && min_dispatch_width <= 8) {
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
@@ -5470,6 +5479,13 @@ fs_visitor::run_cs()
   if (shader_time_index >= 0)
      emit_shader_time_begin();

+   if (devinfo->is_haswell && prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
   emit_nir_code();

   if (failed)
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -407,6 +407,7 @@ public:
   bool spilled_any_registers;

   const unsigned dispatch_width; /**< 8 or 16 */
+   unsigned min_dispatch_width;

   int shader_time_index;

--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1021,6 +1021,18 @@ fs_visitor::init()
      unreachable("unhandled shader stage");
   }

+   if (stage == MESA_SHADER_COMPUTE) {
+      const brw_cs_prog_data *cs_prog_data =
+         (const brw_cs_prog_data *) prog_data;
+      unsigned size = cs_prog_data->local_size[0] *
+                      cs_prog_data->local_size[1] *
+                      cs_prog_data->local_size[2];
+      size = DIV_ROUND_UP(size, devinfo->max_cs_threads);
+      min_dispatch_width = size > 16 ? 32 : (size > 8 ? 16 : 8);
+   } else {
+      min_dispatch_width = 8;
+   }
+
   this->prog_data = this->stage_prog_data;

   this->failed = false;
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -338,8 +338,6 @@ brw_emit_mi_flush(struct brw_context *brw)
      }
      brw_emit_pipe_control_flush(brw, flags);
   }
-
-   brw_render_cache_set_clear(brw);
 }

 int
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -736,6 +736,22 @@ brw_notification_reg(void)
                  WRITEMASK_X);
 }

+static inline struct brw_reg
+brw_sr0_reg(void)
+{
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                  BRW_ARF_STATE,
+                  0,
+                  0,
+                  0,
+                  BRW_REGISTER_TYPE_UD,
+                  BRW_VERTICAL_STRIDE_8,
+                  BRW_WIDTH_8,
+                  BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW,
+                  WRITEMASK_XYZW);
+}
+
 static inline struct brw_reg
 brw_acc_reg(unsigned width)
 {
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1033,6 +1033,7 @@ vec4_visitor::opt_register_coalesce()

         if (is_nop_mov) {
            inst->remove(block);
+            progress = true;
            continue;
         }
      }
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -685,9 +685,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_draw_id:
-   case nir_intrinsic_load_invocation_id:
-   case nir_intrinsic_load_tess_level_inner:
-   case nir_intrinsic_load_tess_level_outer: {
+   case nir_intrinsic_load_invocation_id: {
      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
      src_reg val = src_reg(nir_system_values[sv]);
      assert(val.file != BAD_FILE);
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -402,6 +402,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
         }
      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;
+         unsigned swiz = BRW_SWIZZLE_WZYX;

         /* This is a read of gl_TessLevelOuter[], which lives in the
          * high 4 DWords of the Patch URB header, in reverse order.
@@ -414,6 +415,8 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
            dst.writemask = WRITEMASK_XYZ;
            break;
         case GL_ISOLINES:
+            /* Isolines are not reversed; swizzle .zw -> .xy */
+            swiz = BRW_SWIZZLE_ZWZW;
            dst.writemask = WRITEMASK_XY;
            return;
         default:
@@ -422,7 +425,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)

         dst_reg tmp(this, glsl_type::vec4_type);
         emit_output_urb_read(tmp, 1, src_reg());
-         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
      } else {
         emit_output_urb_read(dst, imm_offset, indirect_offset);
      }
@@ -475,8 +478,15 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          * Patch URB Header at DWords 4-7.  However, it's reversed, so
          * instead of .xyzw we have .wzyx.
          */
-         swiz = BRW_SWIZZLE_WZYX;
-         mask = writemask_for_backwards_vector(mask);
+         if (key->tes_primitive_mode == GL_ISOLINES) {
+            /* Isolines .xy should be stored in .zw, in order. */
+            swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+            mask <<= 2;
+         } else {
+            /* Other domains are reversed; store .wzyx instead of .xyzw. */
+            swiz = BRW_SWIZZLE_WZYX;
+            mask = writemask_for_backwards_vector(mask);
+         }
      }

      emit_urb_write(swizzle(value, swiz), mask,
--- a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
@@ -28,6 +28,7 @@
 */

 #include "brw_vec4_tes.h"
+#include "brw_cfg.h"

 namespace brw {

@@ -53,39 +54,10 @@ vec4_tes_visitor::make_reg_for_system_value(int location, const glsl_type *type)
 void
 vec4_tes_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
 {
-   const struct brw_tes_prog_data *tes_prog_data =
-      (const struct brw_tes_prog_data *) prog_data;
-
   switch (instr->intrinsic) {
-   case nir_intrinsic_load_tess_level_outer: {
-      dst_reg dst(this, glsl_type::vec4_type);
-      nir_system_values[SYSTEM_VALUE_TESS_LEVEL_OUTER] = dst;
-
-      dst_reg temp(this, glsl_type::vec4_type);
-      vec4_instruction *read =
-         emit(VEC4_OPCODE_URB_READ, temp, input_read_header);
-      read->offset = 1;
-      read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
-      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WZYX)));
+   case nir_intrinsic_load_tess_level_outer:
+   case nir_intrinsic_load_tess_level_inner:
      break;
-   }
-   case nir_intrinsic_load_tess_level_inner: {
-      dst_reg dst(this, glsl_type::vec2_type);
-      nir_system_values[SYSTEM_VALUE_TESS_LEVEL_INNER] = dst;
-
-      /* Set up the message header to reference the proper parts of the URB */
-      dst_reg temp(this, glsl_type::vec4_type);
-      vec4_instruction *read =
-         emit(VEC4_OPCODE_URB_READ, temp, input_read_header);
-      read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
-      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
-         emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WZYX)));
-      } else {
-         read->offset = 1;
-         emit(MOV(dst, src_reg(temp)));
-      }
-      break;
-   }
   default:
      vec4_visitor::nir_setup_system_value_intrinsic(instr);
   }
@@ -105,6 +77,25 @@ vec4_tes_visitor::setup_payload()

   reg = setup_uniforms(reg);

+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file != ATTR)
+            continue;
+
+         struct brw_reg grf =
+            brw_vec4_grf(reg + inst->src[i].nr / 2, 4 * (inst->src[i].nr % 2));
+         grf = stride(grf, 0, 4, 1);
+         grf.swizzle = inst->src[i].swizzle;
+         grf.type = inst->src[i].type;
+         grf.abs = inst->src[i].abs;
+         grf.negate = inst->src[i].negate;
+
+         inst->src[i] = grf;
+      }
+   }
+
+   reg += 8 * prog_data->urb_read_length;
+
   this->first_non_payload_grf = reg;
 }

@@ -148,12 +139,36 @@ vec4_tes_visitor::emit_urb_write_opcode(bool complete)
 void
 vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
+   const struct brw_tes_prog_data *tes_prog_data =
+      (const struct brw_tes_prog_data *) prog_data;
+
   switch (instr->intrinsic) {
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
               src_reg(brw_vec8_grf(1, 0))));
      break;
+   case nir_intrinsic_load_tess_level_outer:
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_ZWZW)));
+      } else {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_WZYX)));
+      }
+      break;
+   case nir_intrinsic_load_tess_level_inner:
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
+                          BRW_SWIZZLE_WZYX)));
+      } else {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  src_reg(ATTR, 1, glsl_type::float_type)));
+      }
+      break;
   case nir_intrinsic_load_primitive_id:
      emit(TES_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
@@ -169,6 +184,19 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
         header = src_reg(this, glsl_type::uvec4_type);
         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
              input_read_header, indirect_offset);
+      } else {
+         /* Arbitrarily only push up to 24 vec4 slots worth of data,
+          * which is 12 registers (since each holds 2 vec4 slots).
+          */
+         const unsigned max_push_slots = 24;
+         if (imm_offset < max_push_slots) {
+            emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+                     src_reg(ATTR, imm_offset, glsl_type::ivec4_type)));
+            prog_data->urb_read_length =
+               MAX2(prog_data->urb_read_length,
+                    DIV_ROUND_UP(imm_offset + 1, 2));
+            break;
+         }
      }

      dst_reg temp(this, glsl_type::ivec4_type);
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -140,9 +140,9 @@ copy_image_with_memcpy(struct brw_context *brw,
   _mesa_get_format_block_size(src_mt->format, &src_bw, &src_bh);

   assert(src_width % src_bw == 0);
-   assert(src_height % src_bw == 0);
+   assert(src_height % src_bh == 0);
   assert(src_x % src_bw == 0);
-   assert(src_y % src_bw == 0);
+   assert(src_y % src_bh == 0);

   /* If we are on the same miptree, same level, and same slice, then
    * intel_miptree_map won't let us map it twice.  We have to do things a
@@ -153,7 +153,7 @@ copy_image_with_memcpy(struct brw_context *brw,

   if (same_slice) {
      assert(dst_x % src_bw == 0);
-      assert(dst_y % src_bw == 0);
+      assert(dst_y % src_bh == 0);

      map_x1 = MIN2(src_x, dst_x);
      map_y1 = MIN2(src_y, dst_y);
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1065,7 +1065,28 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
   if (!_mesa_set_search(brw->render_cache, bo))
      return;

-   brw_emit_mi_flush(brw);
+   if (brw->gen >= 6) {
+      if (brw->gen == 6) {
+         /* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+          * Flush Enable = 1, a PIPE_CONTROL with any non-zero
+          * post-sync-op is required.
+          */
+         brw_emit_post_sync_nonzero_flush(brw);
+      }
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+   } else {
+      brw_emit_mi_flush(brw);
+   }
+
+   brw_render_cache_set_clear(brw);
 }

 /**
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -50,7 +50,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
      width <<= 1;
      if (height != 1)
         height <<= 1;
-      if (depth != 1)
+      if (intelObj->base.Target == GL_TEXTURE_3D)
         depth <<= 1;
   }

--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -387,6 +387,7 @@ st_update_renderbuffer_surface(struct st_context *st,
 {
   struct pipe_context *pipe = st->pipe;
   struct pipe_resource *resource = strb->texture;
+   struct st_texture_object *stTexObj = NULL;
   unsigned rtt_width = strb->Base.Width;
   unsigned rtt_height = strb->Base.Height;
   unsigned rtt_depth = strb->Base.Depth;
@@ -398,9 +399,18 @@ st_update_renderbuffer_surface(struct st_context *st,
    */
   boolean enable_srgb = (st->ctx->Color.sRGBEnabled &&
         _mesa_get_format_color_encoding(strb->Base.Format) == GL_SRGB);
-   enum pipe_format format = (enable_srgb) ?
-      util_format_srgb(resource->format) :
-      util_format_linear(resource->format);
+   enum pipe_format format = resource->format;
+
+   if (strb->is_rtt) {
+      stTexObj = st_texture_object(strb->Base.TexImage->TexObject);
+      if (stTexObj->surface_based)
+         format = stTexObj->surface_format;
+   }
+
+   format = (enable_srgb) ?
+      util_format_srgb(format) :
+      util_format_linear(format);
+
   unsigned first_layer, last_layer, level;

   if (resource->target == PIPE_TEXTURE_1D_ARRAY) {
@@ -431,8 +441,8 @@ st_update_renderbuffer_surface(struct st_context *st,

   /* Adjust for texture views */
   if (strb->is_rtt && resource->array_size > 1 &&
-       strb->Base.TexImage->TexObject->Immutable) {
-      struct gl_texture_object *tex = strb->Base.TexImage->TexObject;
+       stTexObj->base.Immutable) {
+      struct gl_texture_object *tex = &stTexObj->base;
      first_layer += tex->MinLayer;
      if (!strb->rtt_layered)
         last_layer += tex->MinLayer;
@@ -492,8 +502,6 @@ st_render_texture(struct gl_context *ctx,

   st_update_renderbuffer_surface(st, strb);

-   strb->Base.Format = st_pipe_format_to_mesa_format(pt->format);
-
   /* Invalidate buffer state so that the pipe's framebuffer state
    * gets updated.
    * That's where the new renderbuffer (which we just created) gets
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -2886,10 +2886,13 @@ st_finalize_texture(struct gl_context *ctx,
         /* Need to import images in main memory or held in other textures.
          */
         if (stImage && stObj->pt != stImage->pt) {
+            GLuint depth = stObj->depth0;
+            if (stObj->base.Target == GL_TEXTURE_3D)
+               depth = u_minify(depth, level);
            if (level == 0 ||
                (stImage->base.Width == u_minify(stObj->width0, level) &&
                 stImage->base.Height == u_minify(stObj->height0, level) &&
-                 stImage->base.Depth == u_minify(stObj->depth0, level))) {
+                 stImage->base.Depth == depth)) {
               /* src image fits expected dest mipmap level size */
               copy_image_data_to_texture(st, stObj, level, stImage);
            }
--- a/src/mesa/swrast/s_context.c
+++ b/src/mesa/swrast/s_context.c
@@ -900,11 +900,16 @@ void
 _swrast_render_finish( struct gl_context *ctx )
 {
   SWcontext *swrast = SWRAST_CONTEXT(ctx);
+   struct gl_query_object *query = ctx->Query.CurrentOcclusionObject;

   _swrast_flush(ctx);

   if (swrast->Driver.SpanRenderFinish)
      swrast->Driver.SpanRenderFinish( ctx );
+
+   if (query && (query->Target == GL_ANY_SAMPLES_PASSED ||
+                 query->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE))
+      query->Result = !!query->Result;
 }