Compare commits


63 Commits

Author SHA1 Message Date
Emil Velikov
a7649abe9f Update version to 12.0.0-rc2
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2016-06-07 12:35:59 +01:00
Emil Velikov
bcfda0a1fe mesa: automake: distclean git_sha1.h when building OOT
In the case of out-of-tree (OOT) builds, in particular when building
from tarball, we'll end up with the file in both srcdir and builddir.

We want the former to remain intact (since we need it on rebuild) while
the latter should be removed otherwise `make distclean' gets angry at
us.

Ideally there'll be a solution that feels a bit less of a hack. Until
then this does the job exactly as expected.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit b7f7ec7843)
2016-06-07 12:35:53 +01:00
Emil Velikov
998e503592 mesa: automake: ensure that git_sha1.h.tmp has the right attributes
... when copied from git_sha1.h.

As the latter file can be lacking the write attribute, one should set it
explicitly. Otherwise we'll get a warning/failure at the cleanup stage.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 2c424e00c3)
2016-06-07 12:35:50 +01:00
Emil Velikov
5e3e292502 mesa: automake: add directory prefix for git_sha1.h
Otherwise the build will assume that we're talking about builddir, which
is not the case in the else branch.

Here the file is already generated and is part of the tarball.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 359d9dfec3)
2016-06-07 12:35:46 +01:00
Emil Velikov
3be5c6a9ec egl: android: don't add the image loader extension for !render_node
With an earlier commit we introduced support for render_node devices, which
was coupled with the use of the image loader extension.

As the work was inspired by egl/wayland we (erroneously) added the
extension for the !render_node path as well.

That works for wayland, as the implementations of the DRI2 and IMAGE
loader extensions converge behind the scenes. As that is not yet
the case for Android we shouldn't expose the extension.

Fixes: 34ddef39ce ("egl: android: add dma-buf fd support")

Cc: <mesa-stable@lists.freedesktop.org>
Reported-by: Mauro Rossi <issor.oruam@gmail.com>
Tested-by: Mauro Rossi <issor.oruam@gmail.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 1816c837c1)
2016-06-07 12:35:40 +01:00
Emil Velikov
a26ca04fe3 anv: let anv_entrypoints_gen.py generate proper Wayland/Xcb guards
The generated sources should follow the example set by the vulkan
headers and our non-generated code. Namely: the code for all supported
platforms should be available, each one guarded by its respective
VK_USE_PLATFORM_*_KHR macro.

v2: Reword commit message.

Cc: Mark Janes <mark.a.janes@intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96285
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> (v1 over IRC)
(cherry picked from commit b8e1f59d62)
2016-06-03 01:44:56 +01:00
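The guard pattern the commit above describes can be sketched as follows. This is a hypothetical hand-written illustration of the convention, not actual anv_entrypoints_gen.py output; the stub names are invented for the example.

```c
/* Each platform-specific entrypoint is compiled in only when its
 * VK_USE_PLATFORM_*_KHR macro is defined, mirroring the Vulkan
 * headers.  Stub bodies stand in for the generated code. */
#include <stddef.h>

#ifdef VK_USE_PLATFORM_WAYLAND_KHR
static void *anv_CreateWaylandSurfaceKHR_stub(void) { return NULL; }
#endif

#ifdef VK_USE_PLATFORM_XCB_KHR
static void *anv_CreateXcbSurfaceKHR_stub(void) { return NULL; }
#endif

/* Count how many platform entrypoints this build compiled in. */
int anv_platform_entrypoint_count(void)
{
    int n = 0;
#ifdef VK_USE_PLATFORM_WAYLAND_KHR
    n++;
#endif
#ifdef VK_USE_PLATFORM_XCB_KHR
    n++;
#endif
    return n;
}
```

With neither macro defined on the command line, no platform code is compiled and the count is zero; defining `-DVK_USE_PLATFORM_XCB_KHR` pulls the XCB path in.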
Mauro Rossi
1a5d6a232f isl: add support for Android libmesa_isl static library
The isl library is needed to build i965; the libmesa_isl static library is
added to fix the related Android build errors.

Any attempt to build libmesa_genxml as a phony package module failed to deliver
the generated gen{7,75,8,9}_pack.h headers, needed for libmesa_isl_gen{7,75,8,9}.

Due to constraints in the Android build system, libmesa_genxml is built as
static; at least one source file is needed, so dummy.c is autogenerated for
this purpose. The libmesa_genxml dependency is declared using
LOCAL_WHOLE_STATIC_LIBRARIES to avoid build errors due to missing
genxml/gen{7,75,8,9}_pack.h headers.

Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 278c2212ac)
2016-06-02 22:35:29 +01:00
Mauro Rossi
702a1121c9 android: libmesa_glsl: add a dependency on libmesa_nir static
Fixes the following build error:

target  C++: libmesa_glsl <= external/mesa/src/compiler/glsl/glsl_to_nir.cpp
In file included from external/mesa/src/compiler/glsl/glsl_to_nir.h:28:0,
                 from external/mesa/src/compiler/glsl/glsl_to_nir.cpp:28:
external/mesa/src/compiler/nir/nir.h:42:25: fatal error: nir_opcodes.h: No such file or directory
compilation terminated.
build/core/binary.mk:432: recipe for target 'out/target/product/x86/obj/STATIC_LIBRARIES/libmesa_glsl_intermediates/glsl/glsl_to_nir.o' failed
make: *** [out/target/product/x86/obj/STATIC_LIBRARIES/libmesa_glsl_intermediates/glsl/glsl_to_nir.o] Error 1
make: *** Waiting for unfinished jobs....

Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 4143245c23)
2016-06-02 22:35:29 +01:00
Emil Velikov
9a21315ea9 isl: automake: don't include isl_format_layout.c in two lists.
Including the file in both ISL_FILES and ISL_GENERATED_FILES makes
the actual dependency list less obvious.

v2: Drop unrelated vulkan hunk (Jason).

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit af1a0ae8ce)
2016-06-02 22:35:29 +01:00
Emil Velikov
94630ce0c7 automake: bring back the .PHONY git_sha1.h.tmp rule
With earlier commit 3689ef32af ("automake: rework the git_sha1.h rule,
include in tarball") we/I erroneously removed the PHONY rule and the
temporary file.

The former is used to ensure that the header is regenerated on each
make invocation, while the latter helps us avoid unneeded rebuilds
when the SHA1 hasn't changed.

Reported-by: Grazvydas Ignotas <notasas@gmail.com>
Tested-by: Grazvydas Ignotas <notasas@gmail.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit af2637aa32)
2016-06-02 22:35:29 +01:00
Christian König
6ad61d90ea radeon/uvd: fix the H264 level for Tonga v2
We have supported 5.2 for a while now.

v2: we even support 5.2 for H264, 5.1 is for HEVC.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Cc: <mesa-stable@lists.freedesktop.org>
(cherry picked from commit b3e75c3997)
2016-06-02 14:04:14 +01:00
Jordan Justen
a136b8bfe2 i965: Remove old CS local ID handling
The old method pushed each channel's uvec3 data of
gl_LocalInvocationID.

The new method pushes 1 dword of data that is a 'thread local ID'
value. Based on that value, we can generate gl_LocalInvocationIndex
and gl_LocalInvocationID with some calculations.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 0a3acff5b5)
2016-06-02 14:02:05 +01:00
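The calculation the commit above alludes to can be illustrated standalone. This follows the GLSL-defined relation `index = id.z*sx*sy + id.y*sx + id.x`; it is a sketch of the arithmetic, not the driver's generated code.

```c
#include <stdint.h>

struct uvec3 { uint32_t x, y, z; };

/* Recover gl_LocalInvocationID from the linear thread-local index,
 * given the local workgroup size (sx, sy).  Inverts the GLSL rule:
 *   index = id.z * sx * sy + id.y * sx + id.x
 */
static struct uvec3
local_id_from_index(uint32_t index, uint32_t sx, uint32_t sy)
{
    struct uvec3 id;
    id.x = index % sx;             /* fastest-varying dimension */
    id.y = (index / sx) % sy;
    id.z = index / (sx * sy);      /* slowest-varying dimension */
    return id;
}
```

So pushing one dword per thread suffices: both gl_LocalInvocationIndex and gl_LocalInvocationID fall out of a few div/mod operations.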
Jordan Justen
52ba7abe1e i965: Enable cross-thread constants and compact local IDs for hsw+
The cross thread constant support appears on Haswell. It allows us to
upload a set of uniform data for all threads without duplicating it
per thread.

One complication is that cross-thread constants are loaded into
registers before per-thread constants. Previously, our local IDs were
loaded before the uniform data and treated as 'payload' data, even
though they were actually pushed into the registers like the other
uniform data.

Therefore, in this patch we simultaneously enable a newer layout where
each thread now uses a single uniform slot for a unique local ID for
the thread. This uniform is handled specially to make sure it is added
last into the uniform push constant registers. This minimizes our
usage of push constant registers, and maximizes our ability to use
cross-thread constants for registers.

To swap from the old to the new layout, we also need to flip some
lowering pass switches to let our driver handle the lowering instead.
We also no longer force thread_local_id_index to -1.

v4:
 * Minimize size of patch that switches from the old local ID layout
   to the new layout (Jason)

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit b1f22c6317)
2016-06-02 14:01:31 +01:00
Jordan Justen
28ecf2b90e anv: Support new local ID generation & cross-thread constants
The cross thread constant support appears on Haswell. It allows us to
upload a set of uniform data for all threads without duplicating it
per thread.

We also support per-thread data which allows us to store a per-thread
ID in one of the uniforms that can be used to calculate the
gl_LocalInvocationIndex and gl_LocalInvocationID variables.

v4:
 * Support the old local ID push constant layout as well (Jason)

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 3ba9594f32)
2016-06-02 14:01:04 +01:00
Jordan Justen
ead833a395 i965: Support new local ID push constant & cross-thread constants
The cross thread constant support appears on Haswell. It allows us to
upload a set of uniform data for all threads without duplicating it
per thread.

We also support per-thread data which allows us to store a per-thread
ID in one of the uniforms that can be used to calculate the
gl_LocalInvocationIndex and gl_LocalInvocationID variables.

v4:
 * Support the old local ID push constant layout as well (Jason)

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 30685392e0)
2016-06-02 13:59:04 +01:00
Jordan Justen
ee77c4a099 i965: Add CS push constant info to brw_cs_prog_data
We need information about push constants in a few places for the GL
driver, and another couple places for the vulkan driver.

When we add support for uploading both a common (cross-thread) set of
push constants, combined with the previous per-thread push constant
data, things are going to get even more complicated. To simplify
things, we add push constant info into the cs prog_data struct.

The cross-thread constant support is added as of Haswell. To support
it we need to make sure all push constants with uniform values are
added to earlier registers. The register that varies per thread and
holds the thread invocation's unique local ID needs to be added last.

For now we add the code that would calculate cross-thread constant
information for hsw+, but we force it (cross_thread_supported) off
until the other parts of the driver support it.

v4:
 * Support older local ID push constant layout as well. (Jason)

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit d437798ace)
2016-06-02 13:56:54 +01:00
Jordan Justen
a94be40ecc i965: Store number of threads in brw_cs_prog_data
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 1b79e7ebbd)
2016-06-02 13:54:44 +01:00
Jordan Justen
632d7ef148 i965: Add nir based intrinsic lowering and thread ID uniform
We add a lowering pass for nir intrinsics. This pass can replace nir
intrinsics with driver specific nir lower code.

We lower the gl_LocalInvocationIndex intrinsic based on a uniform
which is loaded with a thread specific ID.

We also lower the gl_LocalInvocationID based on
gl_LocalInvocationIndex.

v2:
 * Create variable during lowering pass. (Ken)

v3:
 * Don't create a variable, but instead just insert an intrisic call
   to load a uniform from the allocated location. (Jason)

v4:
 * Don't run this pass if thread_local_id_index < 0

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 3ef0957dac)
2016-06-02 13:53:58 +01:00
Jordan Justen
5513300f59 i965: Put CS local thread ID uniform in last push register
This thread ID uniform will be used to compute the
gl_LocalInvocationIndex and gl_LocalInvocationID values.

It is important for this uniform to be added in the last push constant
register. fs_visitor::assign_constant_locations is updated to make
sure this happens.

The reason this is important is that the cross-thread push constant
registers are loaded first, and the per-thread push constant registers
are loaded after that. (Broadwell adds another push constant upload
mechanism which reverses this order, but we are ignoring this for
now.)

v2:
 * Add variable in intrinsics lowering pass
 * Make sure the ID is pushed last in assign_constant_locations, and
   that we save a spot for the ID in the push constants

v3:
 * Simplify code based on Jason's suggestions.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 04fc72501a)
2016-06-02 13:53:24 +01:00
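The ordering constraint described above can be sketched as a slot-assignment rule: every ordinary uniform keeps its relative order, and the thread-local ID uniform is forced into the final slot. This is an illustrative function with invented names, not the actual fs_visitor::assign_constant_locations code.

```c
/* Push-constant slot for uniform i out of nr_uniforms, when the
 * uniform at thread_local_id_index must land in the last slot
 * (cross-thread registers are loaded before per-thread ones).
 * thread_local_id_index < 0 means no per-thread ID uniform. */
static int
push_slot_of(int i, int nr_uniforms, int thread_local_id_index)
{
    if (i == thread_local_id_index)
        return nr_uniforms - 1;                 /* always last */
    /* other uniforms stay in order, closing the gap left by the ID */
    if (thread_local_id_index >= 0 && i > thread_local_id_index)
        return i - 1;
    return i;
}
```

For four uniforms with the ID at index 1, the slots come out as 0, 3, 1, 2: everything except the ID is packed first and can be uploaded cross-thread.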
Jordan Justen
33d0016836 i965: Add uniform for a CS thread local base ID
v4:
 * Force thread_local_id_index to -1 for now, and have
   fs_visitor::setup_cs_payload look at thread_local_id_index. This
   enables us to more easily cut over from the old local ID layout to
   the new layout, as suggested by Jason.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit fa279dfbf0)
2016-06-02 13:51:11 +01:00
Jordan Justen
169b700dfd i965: Add nir channel_num system value
v2:
 * simd16/32 fixes (curro)

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 8f48d23e0f)
2016-06-02 13:48:20 +01:00
Jordan Justen
33e985f8b9 nir: Make lowering gl_LocalInvocationIndex optional
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 6f316c9d86)
2016-06-02 13:45:29 +01:00
Jordan Justen
c9de6190a0 glsl: Add glsl LowerCsDerivedVariables option
v2:
 * Move lower flag to context constants. (Ken)

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> (v1)
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 7b9def3583)
2016-06-02 13:38:06 +01:00
Jason Ekstrand
05d88165d9 i965/fs: Copy the offset when lowering logical pull constant sends
This fixes 64 Vulkan CTS tests per gen

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96299
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 1205999c22)
2016-06-02 13:37:30 +01:00
Dave Airlie
d1cf18497a glsl/distance: make sure we use clip dist varying slot for lowered var.
When lowering, we always want to use the clip dist varying.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 8d4f4adfbd)
2016-06-02 13:36:45 +01:00
Kenneth Graunke
5a44d36b46 i965: Fix isoline reads in scalar TES.
Isolines aren't reversed.  commit 5b2d8c2273 fixed this for the vec4
TES backend, but not the scalar one.

Found while debugging GL45-CTS.tessellation_shader.
tessellation_control_to_tessellation_evaluation.gl_tessLevel.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 25e1b8d366)
2016-06-02 13:36:14 +01:00
Ian Romanick
0e54eebeed glsl: Use Geom.VerticesOut == -1 to specify unset
Because apparently layout(max_vertices=0) is a thing.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit a428c955ce)
2016-06-02 13:35:18 +01:00
Ian Romanick
0ab1a3957a i965: If control_data_header_size_bits is zero, don't do EndPrimitive
This can occur when max_vertices=0 is explicitly specified.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit b27dfa5403)
2016-06-02 13:34:43 +01:00
Ian Romanick
1398a9510f mesa: Fix bogus strncmp
The string "[0]\0" is the same as "[0]" as far as the C string datatype
is concerned.  That string has length 3.  strncmp(s, length_3_string, 4)
is the same as strcmp(s, length_3_string), so make it be strcmp.

v2: Not the same as strncmp(..., 3).  Noticed by Ilia.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 049bb94d2e)
2016-06-02 13:33:53 +01:00
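The equivalence the commit above relies on can be checked directly. This is a standalone demonstration of the C library behavior, not the Mesa code; the helper names are invented for the example.

```c
#include <string.h>

/* "[0]" has length 3.  strncmp(s, "[0]", 4) compares through the
 * terminating NUL of "[0]", so it behaves exactly like strcmp --
 * whereas strncmp(s, "[0]", 3) would also match "[0]junk". */
static int same_as_strcmp(const char *s)
{
    return (strncmp(s, "[0]", 4) == 0) == (strcmp(s, "[0]") == 0);
}

/* Returns 1 when the length-3 comparison matches but strcmp does
 * not, i.e. where strncmp(..., 3) would differ. */
static int n3_differs_for(const char *s)
{
    return strncmp(s, "[0]", 3) == 0 && strcmp(s, "[0]") != 0;
}
```

The length-4 form agrees with strcmp for every input, which is why the replacement is safe; the length-3 form is the one that would have changed behavior.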
Ilia Mirkin
b265796c79 nir: allow sat on all float destination types
With the introduction of fp64 and fp16 to nir, there are now a bunch of
float types running around. An F1 2015 shader ends up with an i2f.sat
operation, which has a nir_type_float32 destination. Allow sat on all
the float destination types.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit ca135a2612)
2016-06-02 13:32:52 +01:00
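The sat modifier discussed above is just a clamp to [0, 1], and that rule is independent of the float width, which is why allowing it on every float destination type is safe. An illustrative clamp, not nir code:

```c
/* Saturate: clamp to [0.0, 1.0].  The same semantics apply to
 * 16-, 32- and 64-bit float destinations in nir. */
static double fsat(double v)
{
    return v < 0.0 ? 0.0 : (v > 1.0 ? 1.0 : v);
}
```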
Alex Deucher
4a00da1662 radeonsi: fix the raster config setup for 1 RB iceland chips
I didn't realize there were 1 and 2 RB variants when this code
was originally added.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: 11.1 11.2 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit bd85e4a041)
2016-06-02 13:32:05 +01:00
Dave Airlie
e817522728 mesa/sampler: fix error codes for sampler parameters.
The initial ARB_sampler_objects spec had GL_INVALID_VALUE in it,
however version 8 of it fixed this, and the GL specs also have
the fixed value in them.

Fixes:
GL45-CTS.texture_border_clamp.samplerparameteri_non_gen_sampler_error

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "12.0 11.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 6400144041)
2016-06-02 13:31:18 +01:00
Dave Airlie
915cc490d7 glsl: define some GLES3 constants in GLSL 4.1
The GLSL 4.1 spec adds:
gl_MaxVertexUniformVectors
gl_MaxFragmentUniformVectors
gl_MaxVaryingVectors

This fixes:
GL45-CTS.gtf31.GL3Tests.uniform_buffer_object.uniform_buffer_object_build_in_constants

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "12.0 11.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 0ebf4257a3)
2016-06-02 13:30:24 +01:00
Topi Pohjolainen
683c6940d8 i965: Add norbc debug option
This INTEL_DEBUG option disables lossless compression (also known
as render buffer compression).

v2: (Matt) Use likely(!lossless_compression_disabled) instead of
           !likely(lossless_compression_disabled)
    (Grazvydas) Update docs/envvars.html

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 6ca118d2f4)
2016-06-02 13:28:22 +01:00
Topi Pohjolainen
2d483256d5 i965/gen9: Configure rbc buffers as plain for non-rbc tex views
Fixes rendering in Shadow of Mordor with rbc. The application writes an
RGBA_UNORM texture, filling it with values it wants to later treat as
SRGB_ALPHA.
The Intel driver enables lossless compression for the buffer at the time
of writing. However, the driver fails to make sure the buffer can be
sampled as something else later on, and unfortunately there is a
restriction in the hardware against using lossless compression for srgb
formats which appears to extend to the sampling engine as well.
Requesting srgb-to-linear conversion on top of a compressed buffer
results in color values that are pretty much garbage.

Fortunately none of the tracked benchmarks showed a regression with
this.

v2 (Matt): Add missing space

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 30e9e6bd07)
2016-06-02 13:27:53 +01:00
Kenneth Graunke
8c627af1f0 i965: Fix the passthrough TCS for isolines.
We weren't setting up several of the uniform values for the patch
header, so we'd crash when uploading push constants.  We at least
need to initialize them to zero.  We also had the isoline parameters
reversed, so it would also render incorrectly (if it didn't crash).

Fixes a new Piglit test(*) (isoline-no-tcs), as well as crashes in
GL44-CTS.tessellation_shader.single.max_patch_vertices.

(*) https://lists.freedesktop.org/archives/piglit/2016-May/019866.html

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit a3dc99f3d4)
2016-06-02 13:27:23 +01:00
Dave Airlie
86e367a572 i965/xfb: skip components in correct buffer.
The driver was adding the skip components but always for buffer 0.

This fixes:
GL45-CTS.gtf40.GL3Tests.transform_feedback3.transform_feedback3_skip_multiple_buffers

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "12.0 11.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit ebb81cd683)
2016-06-02 13:26:50 +01:00
Dave Airlie
64015c03bb glsl/linker: fix multiple streams transform feedback.
Commit e2791b38b4 ("mesa/program_interface_query: fix transform
feedback varyings.") caused a regression in
GL45-CTS.gtf40.GL3Tests.transform_feedback3.transform_feedback3_multiple_streams
on radeonsi.

The problem was that it was using the skip-components varying to set
the stream id, when it should wait until a varying is written.
This just adds the varying checks in the right place.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit 1fe7bbb911)
2016-06-02 13:25:59 +01:00
Dave Airlie
99fcfd985e mesa/bufferobj: use mapping range in BufferSubData.
According to GL4.5 spec:
An INVALID_OPERATION error is generated if any part of the specified
buffer range is mapped with MapBufferRange or MapBuffer (see
section 6.3), unless it was mapped with MAP_PERSISTENT_BIT set in the
MapBufferRange access flags.

So we should use the "if range is mapped" path.

This fixes:
GL45-CTS.buffer_storage.map_persistent_buffer_sub_data

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Cc: "12.0, 11.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit e891f7cf55)
2016-06-02 13:25:08 +01:00
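The check implied by the spec language above is a range-overlap test against the mapped region, not merely "is the buffer mapped". A minimal sketch, assuming invented names; this is not the Mesa bufferobj code.

```c
#include <stdbool.h>
#include <stdint.h>

/* BufferSubData must raise INVALID_OPERATION only when
 * [offset, offset+size) overlaps a non-persistent mapping
 * [map_offset, map_offset+map_length).  MAP_PERSISTENT_BIT
 * permits updates while mapped. */
static bool
range_overlaps_mapping(intptr_t offset, intptr_t size,
                       intptr_t map_offset, intptr_t map_length,
                       bool map_persistent)
{
    if (map_persistent)
        return false;
    /* standard half-open interval overlap test */
    return offset < map_offset + map_length &&
           map_offset < offset + size;
}
```

A write that lands entirely outside the mapped range, or anywhere within a persistent mapping, is allowed; only a non-persistent overlap is an error.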
Ilia Mirkin
7bc29c784a nv50/ir: fix error finding free element in bitset in some situations
This really only hits for bitsets with a size of a multiple of 32. We
can end up with pos = -1 as a result of the ffs, which we in turn decide
is a valid position (since we fall through the loop and i == 1, we end
up adding 32 to it, so end up returning 31 again).

Up until recently this was largely unreachable, as the register file
sizes were all 63 or 255. However with the advent of compute shaders
which can restrict the number of registers, this can now happen.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 18d11c9989)
2016-06-02 13:24:08 +01:00
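The failure mode above comes from ffs semantics: ffs(0) returns 0, so `pos = ffs(word) - 1` becomes -1 for a fully-set word and must be skipped rather than treated as a hit. A standalone sketch of a corrected search (not the nv50/ir code):

```c
#include <stdint.h>

/* Find a clear bit in an array of 32-bit words; return -1 when every
 * bit is set.  __builtin_ffs(x) returns 1 + the index of the least
 * significant set bit, or 0 when x == 0 -- so a fully-set word yields
 * ffs(~w) == 0 and is skipped instead of being misread as a match. */
static int
find_free_bit(const uint32_t *words, int nwords)
{
    for (int i = 0; i < nwords; i++) {
        int pos = __builtin_ffs(~words[i]);   /* 0 => word is full */
        if (pos != 0)
            return i * 32 + (pos - 1);
    }
    return -1;
}
```

With a bitset whose size is a multiple of 32 and whose words are all ones, the unguarded version would fall out of the loop with pos = -1 and report a bogus position; here it correctly returns -1.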
Timothy Arceri
b2b7f05da6 Revert "glsl: fix xfb_offset unsized array validation"
This reverts commit aac90ba292.

The commit caused a regression in:
piglit.spec.glsl-1_50.compiler.gs-input-nonarray-named-block.geom

Also the CTS test it was meant to fix seems like it may be bogus.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 98d40b4d11)
2016-06-02 13:21:36 +01:00
Francisco Jerez
eb56a2f250 i965/fs: Allow scalar source regions on SNB math instructions.
I haven't found any evidence that this isn't supported by the
hardware, in fact according to the SNB hardware spec:

 "The supported regioning modes for math instructions are align16,
  align1 with the following restrictions:
   - Scalar source is supported.
  [...]
   - Source and destination offset must be the same, except the case of
     scalar source."

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit c1107cec44)
2016-06-02 13:20:45 +01:00
Francisco Jerez
c1269825cf i965/fs: Fix constant combining for instructions that cannot accept source mods.
This is the case for SNB math instructions so we need to be careful
and insert the literal value of the immediate into the table (rather
than its absolute value) if the instruction is unable to invert the
sign of the constant on the fly.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 06d8765bc0)
2016-06-02 13:20:15 +01:00
Francisco Jerez
f651a4bb2e i965/fs: Extend remove_duplicate_mrf_writes() to handle non-VGRF to MRF copies.
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 303ec22ed6)
2016-06-02 13:19:41 +01:00
Francisco Jerez
44029d4237 i965/fs: Fix compute_to_mrf() to coalesce VGRFs initialized by multiple single-GRF writes.
Which requires using a bitset instead of a boolean flag to keep track
of the GRFs we've seen a generating instruction for already.  The
search loop continues until all instructions initializing the value of
the source VGRF have been found, or it is determined that coalescing
is not possible.

Fixes a few piglit test cases on Gen4-6 which were regressed by
6956015aa5 due to the different (yet
perfectly valid) ordering in which copy instructions are emitted now
by the simd lowering pass, which had the side effect of causing this
optimization pass to start corrupting the program in cases where a
VGRF-to-MRF copy instruction would be eliminated but only the last
instruction writing to the source VGRF region would be rewritten to
point to the target MRF.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 4fe4f6e8a7)
2016-06-02 13:19:07 +01:00
Francisco Jerez
910fa7a824 i965/fs: Teach compute_to_mrf() about the COMPR4 address transformation.
This will be required to correctly transform the destination of 8-wide
instructions that write a single GRF of a VGRF to MRF copy marked
COMPR4.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 1898673f58)
2016-06-02 13:18:33 +01:00
Francisco Jerez
3b78304025 i965/fs: Refactor compute_to_mrf() to split search and rewrite into separate loops.
This will allow compute_to_mrf to handle cases where the source of the
VGRF-to-MRF copy is initialized by more than one instruction.  In such
cases we cannot rewrite the destination of any of the generating
instructions until it's known whether the whole VGRF source region can
be coalesced into the destination MRF, which will imply continuing the
search until all generating instructions have been found or it has
been determined that the VGRF and MRF registers cannot be coalesced.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 485fbaff03)
2016-06-02 13:18:00 +01:00
Francisco Jerez
dd96daa55e i965/fs: Fix compute-to-mrf VGRF region coverage condition.
Compute-to-mrf was checking whether the destination of scan_inst is
more than one component (making assumptions about the instruction data
type) in order to find out whether the result is being fully copied
into the MRF destination, which is rather inaccurate in cases where a
single-component instruction is only partially contained in the source
region, or when the execution size of the copy and scan_inst
instructions differ.  Instead check whether the destination region of
the instruction is really contained within the bounds of the source
region of the copy.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 4b0ec9f475)
2016-06-02 13:17:26 +01:00
Francisco Jerez
a6011c6fc6 i965/fs: Simplify and improve accuracy of compute_to_mrf() by using regions_overlap().
Compute-to-mrf was being rather heavy-handed about checking whether
instruction source or destination regions interfere with the copy
instruction, which could conceivably lead to program miscompilation.
Fix it by using regions_overlap() instead of the open-coded and
dubiously correct overlap checks.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit bb61e24787)
2016-06-02 13:16:52 +01:00
Francisco Jerez
2d83aad693 i965/fs: Teach regions_overlap() about COMPR4 MRF regions.
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 88f380a2dd)
2016-06-02 13:16:04 +01:00
Dylan Baker
665f57c513 Don't use python 3
Now there are no files that require python 3, so for now just remove
the python 3 dependency and use python 2. I think the right plan is to
get all of the python ready for python 3, and then use whatever
python is available.

Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 604010a7ed)
2016-06-02 13:15:38 +01:00
Dylan Baker
7e62585ee8 genxml: change shebang to python 2
Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit ab31817fed)
2016-06-02 13:15:08 +01:00
Dylan Baker
4dd70617a1 genxml: use the isalpha method rather than str.isalpha.
This fixes gen_pack_header to work on python 2, where name[0] is unicode
not str.

Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 12c1a01c72)
2016-06-02 13:14:38 +01:00
Dylan Baker
9ed6965749 genxml: require future imports for python2 compatibility.
Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit a45a25418b)
2016-06-02 13:14:08 +01:00
Dylan Baker
aed6230269 genxml: mark re strings as raw
This is a correctness issue.

Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit e5681e4d70)
2016-06-02 13:13:39 +01:00
Dylan Baker
f73a68ec37 genxml: Make classes descendants of object
This is the default in python3, but in python2 you get old style
classes. No one likes old-style classes.

Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit de2e9da2e9)
2016-06-02 13:13:09 +01:00
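On Python 2 a bare `class Foo:` creates an old-style class, which lacks `super()`, descriptors, and the unified type model; deriving from `object` opts into new-style classes on both interpreters. A minimal illustration (the field names are just an example):

```python
class Value(object):  # new-style on Python 2, identical on Python 3
    def __init__(self, name, value):
        self.name = name
        self.value = value

v = Value("GEN7", 70)
# New-style classes unify types and classes: type(v) is the class itself
# (old-style Python 2 classes would report the generic 'instance' type).
assert type(v) is Value
print(v.name, v.value)
```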
Dylan Baker
0c12887764 genxml: mark gen_pack_header.py as encoded in utf-8
There is unicode in this file, and I'm actually surprised that the
python interpreter hasn't gotten grumpy.

Signed-off-by: Dylan Baker <dylanx.c.baker@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
cc: 12.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 9f50e3572c)
2016-06-02 13:12:35 +01:00
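Python 2 assumes ASCII source unless the first or second line carries a PEP 263 coding declaration; Python 3 already defaults to UTF-8. A sketch of the header the commit adds (the non-ASCII literal below is only an example):

```python
#!/usr/bin/env python2
# encoding=utf-8
# Without the cookie above, Python 2 raises SyntaxError on the
# non-ASCII bytes in this literal; Python 3 defaults to UTF-8.
AUTHOR = u"Marek Olšák"
print(AUTHOR)
```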
Marek Olšák
145705e49c mesa: fix crash in driver_RenderTexture_is_safe
This just fixes the crash with the apitrace in the bug report.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=95246

Cc: 11.1 11.2 12.0 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
(cherry picked from commit 8a10192b4b)
2016-06-02 13:11:43 +01:00
Dave Airlie
d3c92267e0 glsl/images: bounds check image unit assignment
The CTS test:
GL45-CTS.multi_bind.dispatch_bind_image_textures
binds 192 image uniforms; we reject this later,
but not until after we have trashed the contents
of struct gl_shader.

Error now reads:
Too many compute shader image uniforms (192 > 16)
instead of
Too many compute shader image uniforms (2745344416 > 16)

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
(cherry picked from commit f87352d769)
2016-06-02 13:10:50 +01:00
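The fix itself is in C, but the shape of the guard can be sketched in Python (names mirror `set_opaque_binding`; the unit count is illustrative, the real limit is `ARRAY_SIZE(shader->ImageUnits)`):

```python
MAX_IMAGE_UNITS = 16  # illustrative limit

def set_opaque_binding(image_units, base_index, bindings):
    # Stop before writing past the fixed-size unit array instead of
    # trashing whatever follows it in the struct.
    for i, binding in enumerate(bindings):
        index = base_index + i
        if index >= len(image_units):
            break
        image_units[index] = binding

units = [0] * MAX_IMAGE_UNITS
# Four bindings starting at unit 14: only units 14 and 15 are written.
set_opaque_binding(units, 14, [101, 102, 103, 104])
print(units[14:])  # [101, 102]
```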
Ilia Mirkin
36e26f2ee2 nvc0/ir: fix spilling predicates to registers
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Cc: "11.1 11.2 12.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 4b1a167a2b)
2016-06-02 12:54:52 +01:00
Emil Velikov
9a56e7d25b Update version to 12.0.0-rc1
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2016-05-30 19:20:34 +01:00
Emil Velikov
7ad2cb6f08 docs: rename release notes to 12.0.0
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2016-05-30 19:20:34 +01:00
Emil Velikov
a43a368457 nir: add the SConscript.nir to the tarball
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 922b471777)
2016-05-30 19:20:33 +01:00
73 changed files with 1096 additions and 486 deletions

View File

@@ -95,6 +95,8 @@ SUBDIRS := \
src/mesa \
src/util \
src/egl \
src/intel/genxml \
src/intel/isl \
src/mesa/drivers/dri
INC_DIRS := $(call all-named-subdir-makefiles,$(SUBDIRS))

View File

@@ -1 +1 @@
11.3.0-devel
12.0.0-rc2

View File

@@ -99,7 +99,6 @@ AM_PROG_CC_C_O
AM_PROG_AS
AX_CHECK_GNU_MAKE
AC_CHECK_PROGS([PYTHON2], [python2.7 python2 python])
AC_CHECK_PROGS([PYTHON3], [python3.5 python3.4 python3])
AC_PROG_SED
AC_PROG_MKDIR_P
@@ -142,12 +141,6 @@ else
fi
fi
if test -z "$PYTHON3"; then
if test ! -f "$srcdir/src/intel/genxml/gen9_pack.h"; then
AC_MSG_ERROR([Python3 not found - unable to generate sources])
fi
fi
AC_PROG_INSTALL
dnl We need a POSIX shell for parts of the build. Assume we have one
@@ -2874,7 +2867,6 @@ if test "x$MESA_LLVM" = x1; then
echo ""
fi
echo " PYTHON2: $PYTHON2"
echo " PYTHON3: $PYTHON3"
echo ""
echo " Run '${MAKE-make}' to build Mesa"

View File

@@ -166,6 +166,7 @@ See the <a href="xlibdriver.html">Xlib software driver page</a> for details.
<li>vec4 - force vec4 mode in vertex shader</li>
<li>spill_fs - force spilling of all registers in the scalar backend (useful to debug spilling code)</li>
<li>spill_vec4 - force spilling of all registers in the vec4 backend (useful to debug spilling code)</li>
<li>norbc - disable single sampled render buffer compression</li>
</ul>
</ul>

View File

@@ -14,15 +14,15 @@
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 11.3.0 Release Notes / TBD</h1>
<h1>Mesa 12.0.0 Release Notes / TBD</h1>
<p>
Mesa 11.3.0 is a new development release.
Mesa 12.0.0 is a new development release.
People who are concerned with stability and reliability should stick
with a previous release or wait for Mesa 11.3.1.
with a previous release or wait for Mesa 12.0.1.
</p>
<p>
Mesa 11.3.0 implements the OpenGL 4.3 API, but the version reported by
Mesa 12.0.0 implements the OpenGL 4.3 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.3. OpenGL

View File

@@ -19,17 +19,39 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
git_sha1.h:
.PHONY: git_sha1.h.tmp
git_sha1.h.tmp:
@# Don't assume that $(top_srcdir)/.git is a directory. It may be
@# a gitlink file if $(top_srcdir) is a submodule checkout or a linked
@# worktree.
@# If we are building from a release tarball copy the bundled header.
@if test -e $(top_srcdir)/.git; then \
if which git > /dev/null; then \
git --git-dir=$(top_srcdir)/.git log -n 1 --oneline | \
sed 's/^\([^ ]*\) .*/#define MESA_GIT_SHA1 "git-\1"/' \
> git_sha1.h ; \
> git_sha1.h.tmp ; \
fi \
else \
cp $(srcdir)/git_sha1.h git_sha1.h.tmp ;\
chmod u+w git_sha1.h.tmp; \
fi
git_sha1.h: git_sha1.h.tmp
@echo "updating git_sha1.h"
@if ! cmp -s git_sha1.h.tmp git_sha1.h; then \
mv git_sha1.h.tmp git_sha1.h ;\
else \
rm git_sha1.h.tmp ;\
fi
BUILT_SOURCES = git_sha1.h
# We want to keep the srcdir file since we need it on rebuild from tarball.
# At the same time `make distclean' gets angry at us if we don't cleanup the
# builddir one.
distclean-local:
test $(top_srcdir) != $(top_builddir) && rm $(builddir)/git_sha1.h
SUBDIRS = . gtest util mapi/glapi/gen mapi
# include only conditionally ?

View File

@@ -38,13 +38,14 @@ LOCAL_SRC_FILES := \
$(LIBGLSL_FILES) \
LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/compiler/nir \
$(MESA_TOP)/src/mapi \
$(MESA_TOP)/src/mesa \
$(MESA_TOP)/src/gallium/include \
$(MESA_TOP)/src/gallium/auxiliary
LOCAL_STATIC_LIBRARIES := libmesa_compiler
LOCAL_STATIC_LIBRARIES := \
libmesa_compiler \
libmesa_nir
LOCAL_MODULE := libmesa_glsl

View File

@@ -86,4 +86,5 @@ EXTRA_DIST += \
nir/nir_opcodes_c.py \
nir/nir_opcodes_h.py \
nir/nir_opt_algebraic.py \
nir/tests
nir/tests \
SConscript.nir

View File

@@ -3442,11 +3442,11 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
if (qual->flags.q.explicit_xfb_offset) {
unsigned qual_xfb_offset;
unsigned component_size = var->type->contains_double() ? 8 : 4;
const glsl_type *t = get_varying_type(var, state->stage);
if (process_qualifier_constant(state, loc, "xfb_offset",
qual->offset, &qual_xfb_offset) &&
validate_xfb_offset_qualifier(loc, state, (int) qual_xfb_offset,
t, component_size)) {
var->type, component_size)) {
var->data.offset = qual_xfb_offset;
var->data.explicit_xfb_offset = true;
}
@@ -7336,6 +7336,12 @@ ast_interface_block::hir(exec_list *instructions,
packing,
this->block_name);
unsigned component_size = block_type->contains_double() ? 8 : 4;
int xfb_offset =
layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type,
component_size);
if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
YYLTYPE loc = this->get_location();
_mesa_glsl_error(&loc, state, "interface block `%s' with type `%s' "
@@ -7474,13 +7480,6 @@ ast_interface_block::hir(exec_list *instructions,
var_mode);
}
unsigned component_size = block_type->contains_double() ? 8 : 4;
int xfb_offset =
layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
const glsl_type *t = get_varying_type(var, state->stage);
validate_xfb_offset_qualifier(&loc, state, xfb_offset, t,
component_size);
var->data.matrix_layout = matrix_layout == GLSL_MATRIX_LAYOUT_INHERITED
? GLSL_MATRIX_LAYOUT_COLUMN_MAJOR : matrix_layout;
@@ -7531,12 +7530,6 @@ ast_interface_block::hir(exec_list *instructions,
*/
assert(this->array_specifier == NULL);
unsigned component_size = block_type->contains_double() ? 8 : 4;
int xfb_offset =
layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type,
component_size);
for (unsigned i = 0; i < num_variables; i++) {
ir_variable *var =
new(state) ir_variable(fields[i].type,

View File

@@ -622,7 +622,7 @@ builtin_variable_generator::generate_constants()
/* Max uniforms/varyings: GLSL ES counts these in units of vectors; desktop
* GL counts them in units of "components" or "floats".
*/
if (state->es_shader) {
if (state->is_version(410, 100)) {
add_const("gl_MaxVertexUniformVectors",
state->Const.MaxVertexUniformComponents / 4);
add_const("gl_MaxFragmentUniformVectors",
@@ -1201,8 +1201,15 @@ builtin_variable_generator::generate_cs_special_vars()
"gl_LocalInvocationID");
add_system_value(SYSTEM_VALUE_WORK_GROUP_ID, uvec3_t, "gl_WorkGroupID");
add_system_value(SYSTEM_VALUE_NUM_WORK_GROUPS, uvec3_t, "gl_NumWorkGroups");
add_variable("gl_GlobalInvocationID", uvec3_t, ir_var_auto, 0);
add_variable("gl_LocalInvocationIndex", uint_t, ir_var_auto, 0);
if (state->ctx->Const.LowerCsDerivedVariables) {
add_variable("gl_GlobalInvocationID", uvec3_t, ir_var_auto, 0);
add_variable("gl_LocalInvocationIndex", uint_t, ir_var_auto, 0);
} else {
add_system_value(SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
uvec3_t, "gl_GlobalInvocationID");
add_system_value(SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
uint_t, "gl_LocalInvocationIndex");
}
}
@@ -1431,16 +1438,16 @@ initialize_cs_derived_variables(gl_shader *shader,
* These are initialized in the main function.
*/
void
_mesa_glsl_initialize_derived_variables(gl_shader *shader)
_mesa_glsl_initialize_derived_variables(struct gl_context *ctx,
gl_shader *shader)
{
/* We only need to set CS variables currently. */
if (shader->Stage != MESA_SHADER_COMPUTE)
return;
if (shader->Stage == MESA_SHADER_COMPUTE &&
ctx->Const.LowerCsDerivedVariables) {
ir_function_signature *const main_sig =
_mesa_get_main_function_signature(shader);
ir_function_signature *const main_sig =
_mesa_get_main_function_signature(shader);
if (main_sig == NULL)
return;
initialize_cs_derived_variables(shader, main_sig);
if (main_sig != NULL)
initialize_cs_derived_variables(shader, main_sig);
}
}

View File

@@ -1687,7 +1687,7 @@ set_shader_inout_layout(struct gl_shader *shader,
shader->TessEval.PointMode = state->in_qualifier->point_mode;
break;
case MESA_SHADER_GEOMETRY:
shader->Geom.VerticesOut = 0;
shader->Geom.VerticesOut = -1;
if (state->out_qualifier->flags.q.max_vertices) {
unsigned qual_max_vertices;
if (state->out_qualifier->max_vertices->
@@ -1907,7 +1907,7 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
}
}
_mesa_glsl_initialize_derived_variables(shader);
_mesa_glsl_initialize_derived_variables(ctx, shader);
delete state->symbols;
ralloc_free(state);

View File

@@ -2021,26 +2021,3 @@ mode_string(const ir_variable *var)
assert(!"Should not get here.");
return "invalid variable";
}
/**
* Get the varying type stripped of the outermost array if we're processing
* a stage whose varyings are arrays indexed by a vertex number (such as
* geometry shader inputs).
*/
const glsl_type *
get_varying_type(const ir_variable *var, gl_shader_stage stage)
{
const glsl_type *type = var->type;
if (!var->data.patch &&
((var->data.mode == ir_var_shader_out &&
stage == MESA_SHADER_TESS_CTRL) ||
(var->data.mode == ir_var_shader_in &&
(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_TESS_EVAL ||
stage == MESA_SHADER_GEOMETRY)))) {
assert(type->is_array());
type = type->fields.array;
}
return type;
}

View File

@@ -2562,7 +2562,8 @@ _mesa_glsl_initialize_variables(exec_list *instructions,
struct _mesa_glsl_parse_state *state);
extern void
_mesa_glsl_initialize_derived_variables(gl_shader *shader);
_mesa_glsl_initialize_derived_variables(struct gl_context *ctx,
gl_shader *shader);
extern void
_mesa_glsl_initialize_functions(_mesa_glsl_parse_state *state);
@@ -2621,9 +2622,6 @@ is_gl_identifier(const char *s)
return s && s[0] == 'g' && s[1] == 'l' && s[2] == '_';
}
const glsl_type *
get_varying_type(const ir_variable *var, gl_shader_stage stage);
extern "C" {
#endif /* __cplusplus */

View File

@@ -145,6 +145,8 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
storage->opaque[sh].active) {
for (unsigned i = 0; i < elements; i++) {
const unsigned index = storage->opaque[sh].index + i;
if (index >= ARRAY_SIZE(shader->ImageUnits))
break;
shader->ImageUnits[index] = storage->storage[i].i;
}
}

View File

@@ -40,6 +40,29 @@
#include "program.h"
/**
* Get the varying type stripped of the outermost array if we're processing
* a stage whose varyings are arrays indexed by a vertex number (such as
* geometry shader inputs).
*/
static const glsl_type *
get_varying_type(const ir_variable *var, gl_shader_stage stage)
{
const glsl_type *type = var->type;
if (!var->data.patch &&
((var->data.mode == ir_var_shader_out &&
stage == MESA_SHADER_TESS_CTRL) ||
(var->data.mode == ir_var_shader_in &&
(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_TESS_EVAL ||
stage == MESA_SHADER_GEOMETRY)))) {
assert(type->is_array());
type = type->fields.array;
}
return type;
}
static void
create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name,
size_t name_length, unsigned *count,
@@ -1094,21 +1117,23 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
num_buffers++;
buffer_stream_id = -1;
continue;
} else if (buffer_stream_id == -1) {
/* First varying writing to this buffer: remember its stream */
buffer_stream_id = (int) tfeedback_decls[i].get_stream_id();
} else if (buffer_stream_id !=
(int) tfeedback_decls[i].get_stream_id()) {
/* Varying writes to the same buffer from a different stream */
linker_error(prog,
"Transform feedback can't capture varyings belonging "
"to different vertex streams in a single buffer. "
"Varying %s writes to buffer from stream %u, other "
"varyings in the same buffer write from stream %u.",
tfeedback_decls[i].name(),
tfeedback_decls[i].get_stream_id(),
buffer_stream_id);
return false;
} else if (tfeedback_decls[i].is_varying()) {
if (buffer_stream_id == -1) {
/* First varying writing to this buffer: remember its stream */
buffer_stream_id = (int) tfeedback_decls[i].get_stream_id();
} else if (buffer_stream_id !=
(int) tfeedback_decls[i].get_stream_id()) {
/* Varying writes to the same buffer from a different stream */
linker_error(prog,
"Transform feedback can't capture varyings belonging "
"to different vertex streams in a single buffer. "
"Varying %s writes to buffer from stream %u, other "
"varyings in the same buffer write from stream %u.",
tfeedback_decls[i].name(),
tfeedback_decls[i].get_stream_id(),
buffer_stream_id);
return false;
}
}
if (has_xfb_qualifiers) {

View File

@@ -1980,7 +1980,7 @@ link_gs_inout_layout_qualifiers(struct gl_shader_program *prog,
struct gl_shader **shader_list,
unsigned num_shaders)
{
linked_shader->Geom.VerticesOut = 0;
linked_shader->Geom.VerticesOut = -1;
linked_shader->Geom.Invocations = 0;
linked_shader->Geom.InputType = PRIM_UNKNOWN;
linked_shader->Geom.OutputType = PRIM_UNKNOWN;
@@ -2024,8 +2024,8 @@ link_gs_inout_layout_qualifiers(struct gl_shader_program *prog,
linked_shader->Geom.OutputType = shader->Geom.OutputType;
}
if (shader->Geom.VerticesOut != 0) {
if (linked_shader->Geom.VerticesOut != 0 &&
if (shader->Geom.VerticesOut != -1) {
if (linked_shader->Geom.VerticesOut != -1 &&
linked_shader->Geom.VerticesOut != shader->Geom.VerticesOut) {
linker_error(prog, "geometry shader defined with conflicting "
"output vertex count (%d and %d)\n",
@@ -2067,7 +2067,7 @@ link_gs_inout_layout_qualifiers(struct gl_shader_program *prog,
}
prog->Geom.OutputType = linked_shader->Geom.OutputType;
if (linked_shader->Geom.VerticesOut == 0) {
if (linked_shader->Geom.VerticesOut == -1) {
linker_error(prog,
"geometry shader didn't declare max_vertices\n");
return;

View File

@@ -168,6 +168,7 @@ lower_distance_visitor::visit(ir_variable *ir)
*new_var = ir->clone(ralloc_parent(ir), NULL);
(*new_var)->name = ralloc_strdup(*new_var, GLSL_CLIP_VAR_NAME);
(*new_var)->data.max_array_access = new_size - 1;
(*new_var)->data.location = VARYING_SLOT_CLIP_DIST0;
if (!ir->type->fields.array->is_array()) {
/* gl_ClipDistance (used for vertex, tessellation evaluation and

View File

@@ -1752,6 +1752,8 @@ nir_intrinsic_from_system_value(gl_system_value val)
return nir_intrinsic_load_sample_mask_in;
case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
return nir_intrinsic_load_local_invocation_id;
case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX:
return nir_intrinsic_load_local_invocation_index;
case SYSTEM_VALUE_WORK_GROUP_ID:
return nir_intrinsic_load_work_group_id;
case SYSTEM_VALUE_NUM_WORK_GROUPS:
@@ -1801,6 +1803,8 @@ nir_system_value_from_intrinsic(nir_intrinsic_op intrin)
return SYSTEM_VALUE_SAMPLE_MASK_IN;
case nir_intrinsic_load_local_invocation_id:
return SYSTEM_VALUE_LOCAL_INVOCATION_ID;
case nir_intrinsic_load_local_invocation_index:
return SYSTEM_VALUE_LOCAL_INVOCATION_INDEX;
case nir_intrinsic_load_num_work_groups:
return SYSTEM_VALUE_NUM_WORK_GROUPS;
case nir_intrinsic_load_work_group_id:

View File

@@ -1682,6 +1682,8 @@ typedef struct nir_shader_compiler_options {
/* Indicates that the driver only has zero-based vertex id */
bool vertex_id_zero_based;
bool lower_cs_local_index_from_id;
} nir_shader_compiler_options;
typedef struct nir_shader_info {

View File

@@ -44,6 +44,7 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
case nir_intrinsic_load_primitive_id:
case nir_intrinsic_load_invocation_id:
case nir_intrinsic_load_local_invocation_id:
case nir_intrinsic_load_local_invocation_index:
case nir_intrinsic_load_work_group_id:
case nir_intrinsic_load_num_work_groups:
shader->info.system_values_read |=

View File

@@ -299,10 +299,12 @@ SYSTEM_VALUE(tess_level_outer, 4, 0, xx, xx, xx)
SYSTEM_VALUE(tess_level_inner, 2, 0, xx, xx, xx)
SYSTEM_VALUE(patch_vertices_in, 1, 0, xx, xx, xx)
SYSTEM_VALUE(local_invocation_id, 3, 0, xx, xx, xx)
SYSTEM_VALUE(local_invocation_index, 1, 0, xx, xx, xx)
SYSTEM_VALUE(work_group_id, 3, 0, xx, xx, xx)
SYSTEM_VALUE(user_clip_plane, 4, 1, UCP_ID, xx, xx)
SYSTEM_VALUE(num_work_groups, 3, 0, xx, xx, xx)
SYSTEM_VALUE(helper_invocation, 1, 0, xx, xx, xx)
SYSTEM_VALUE(channel_num, 1, 0, xx, xx, xx)
/*
* Load operations pull data from some piece of GPU memory. All load

View File

@@ -48,7 +48,7 @@ convert_block(nir_block *block, nir_builder *b)
b->cursor = nir_after_instr(&load_var->instr);
nir_ssa_def *sysval;
nir_ssa_def *sysval = NULL;
switch (var->data.location) {
case SYSTEM_VALUE_GLOBAL_INVOCATION_ID: {
/* From the GLSL man page for gl_GlobalInvocationID:
@@ -74,6 +74,12 @@ convert_block(nir_block *block, nir_builder *b)
}
case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX: {
/* If lower_cs_local_index_from_id is true, then we derive the local
* index from the local id.
*/
if (!b->shader->options->lower_cs_local_index_from_id)
break;
/* From the GLSL man page for gl_LocalInvocationIndex:
*
* "The value of gl_LocalInvocationIndex is equal to
@@ -111,12 +117,14 @@ convert_block(nir_block *block, nir_builder *b)
nir_load_system_value(b, nir_intrinsic_load_base_instance, 0));
break;
default: {
default:
break;
}
if (sysval == NULL) {
nir_intrinsic_op sysval_op =
nir_intrinsic_from_system_value(var->data.location);
sysval = nir_load_system_value(b, sysval_op, 0);
break;
} /* default */
}
nir_ssa_def_rewrite_uses(&load_var->dest.ssa, nir_src_for_ssa(sysval));

View File

@@ -331,7 +331,9 @@ validate_alu_dest(nir_alu_instr *instr, validate_state *state)
* destinations of type float
*/
nir_alu_instr *alu = nir_instr_as_alu(state->instr);
validate_assert(state, nir_op_infos[alu->op].output_type == nir_type_float ||
validate_assert(state,
(nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) ==
nir_type_float) ||
!dest->saturate);
unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size

View File

@@ -814,10 +814,6 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
dri2_dpy->extensions[0] = &droid_image_loader_extension.base;
dri2_dpy->extensions[1] = &use_invalidate.base;
dri2_dpy->extensions[2] = &image_lookup_extension.base;
/* render nodes cannot use Gem names, and thus do not support
* the __DRI_DRI2_LOADER extension */
if (!dri2_dpy->is_render_node) {
@@ -827,10 +823,13 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
dri2_dpy->dri2_loader_extension.flushFrontBuffer = droid_flush_front_buffer;
dri2_dpy->dri2_loader_extension.getBuffersWithFormat =
droid_get_buffers_with_format;
dri2_dpy->extensions[3] = &dri2_dpy->dri2_loader_extension.base;
dri2_dpy->extensions[4] = NULL;
} else
dri2_dpy->extensions[3] = NULL;
dri2_dpy->extensions[0] = &dri2_dpy->dri2_loader_extension.base;
} else {
dri2_dpy->extensions[0] = &droid_image_loader_extension.base;
}
dri2_dpy->extensions[1] = &use_invalidate.base;
dri2_dpy->extensions[2] = &image_lookup_extension.base;
dri2_dpy->extensions[3] = NULL;
if (!dri2_create_screen(dpy)) {

View File

@@ -1985,6 +1985,10 @@ CodeEmitterNVC0::emitMOV(const Instruction *i)
opc |= i->lanes << 5;
emitForm_B(i, opc);
// Explicitly emit the predicate source as emitForm_B skips it.
if (i->src(0).getFile() == FILE_PREDICATE)
srcId(i->src(0), 20);
} else {
uint32_t imm;

View File

@@ -365,6 +365,12 @@ int BitSet::findFreeRange(unsigned int count) const
}
}
}
// If we couldn't find a position, we can have a left-over -1 in pos. Make
// sure to abort in such a case.
if (pos < 0)
return -1;
pos += i * 32;
return ((pos + count) <= size) ? pos : -1;

View File

@@ -298,7 +298,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
return 41;
return (rscreen->family < CHIP_TONGA) ? 41 : 52;
case PIPE_VIDEO_PROFILE_HEVC_MAIN:
case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
return 186;

View File

@@ -3692,7 +3692,10 @@ static void si_init_config(struct si_context *sctx)
raster_config_1 = 0x0000002a;
break;
case CHIP_ICELAND:
raster_config = 0x00000002;
if (num_rb == 1)
raster_config = 0x00000000;
else
raster_config = 0x00000002;
raster_config_1 = 0x00000000;
break;
case CHIP_CARRIZO:

View File

@@ -0,0 +1,82 @@
# Copyright © 2016 Intel Corporation
# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
LOCAL_PATH := $(call my-dir)
# Import variable GENERATED_FILES.
include $(LOCAL_PATH)/Makefile.sources
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_genxml
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
intermediates := $(call local-generated-sources-dir)
# dummy.c source file is generated to meet the build system's rules.
LOCAL_GENERATED_SOURCES += $(intermediates)/dummy.c
$(intermediates)/dummy.c:
@mkdir -p $(dir $@)
@echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))"
$(hide) touch $@
# This is the list of auto-generated files headers
LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/genxml/, $(GENXML_GENERATED_FILES))
define header-gen
@mkdir -p $(dir $@)
@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
$(hide) $(PRIVATE_SCRIPT) $(PRIVATE_XML) > $@
endef
$(intermediates)/genxml/gen6_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/gen_pack_header.py
$(intermediates)/genxml/gen6_pack.h: PRIVATE_XML := $(LOCAL_PATH)/gen6.xml
$(intermediates)/genxml/gen6_pack.h: $(LOCAL_PATH)/gen6.xml $(LOCAL_PATH)/gen_pack_header.py
$(call header-gen)
$(intermediates)/genxml/gen7_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/gen_pack_header.py
$(intermediates)/genxml/gen7_pack.h: PRIVATE_XML := $(LOCAL_PATH)/gen7.xml
$(intermediates)/genxml/gen7_pack.h: $(LOCAL_PATH)/gen7.xml $(LOCAL_PATH)/gen_pack_header.py
$(call header-gen)
$(intermediates)/genxml/gen75_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/gen_pack_header.py
$(intermediates)/genxml/gen75_pack.h: PRIVATE_XML := $(LOCAL_PATH)/gen75.xml
$(intermediates)/genxml/gen75_pack.h: $(LOCAL_PATH)/gen75.xml $(LOCAL_PATH)/gen_pack_header.py
$(call header-gen)
$(intermediates)/genxml/gen8_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/gen_pack_header.py
$(intermediates)/genxml/gen8_pack.h: PRIVATE_XML := $(LOCAL_PATH)/gen8.xml
$(intermediates)/genxml/gen8_pack.h: $(LOCAL_PATH)/gen8.xml $(LOCAL_PATH)/gen_pack_header.py
$(call header-gen)
$(intermediates)/genxml/gen9_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/gen_pack_header.py
$(intermediates)/genxml/gen9_pack.h: PRIVATE_XML := $(LOCAL_PATH)/gen9.xml
$(intermediates)/genxml/gen9_pack.h: $(LOCAL_PATH)/gen9.xml $(LOCAL_PATH)/gen_pack_header.py
$(call header-gen)
LOCAL_EXPORT_C_INCLUDE_DIRS := \
$(MESA_TOP)/src/intel \
$(intermediates)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)

View File

@@ -23,14 +23,14 @@ include Makefile.sources
BUILT_SOURCES = $(GENXML_GENERATED_FILES)
PYTHON3_GEN = $(AM_V_GEN)$(PYTHON3) $(PYTHON_FLAGS)
PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
SUFFIXES = _pack.h .xml
$(BUILT_SOURCES): gen_pack_header.py
.xml_pack.h:
$(PYTHON3_GEN) $(srcdir)/gen_pack_header.py $< > $@
$(PYTHON_GEN) $(srcdir)/gen_pack_header.py $< > $@
CLEANFILES = $(BUILT_SOURCES)

View File

@@ -1,5 +1,9 @@
#!/usr/bin/env python3
#!/usr/bin/env python2
#encoding=utf-8
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import xml.parsers.expat
import re
import sys
@@ -197,7 +201,7 @@ def to_alphanum(name):
def safe_name(name):
name = to_alphanum(name)
if not str.isalpha(name[0]):
if not name[0].isalpha():
name = '_' + name
return name
@@ -209,9 +213,9 @@ def num_from_str(num_str):
assert(not num_str.startswith('0') and 'octals numbers not allowed')
return int(num_str)
class Field:
ufixed_pattern = re.compile("u(\d+)\.(\d+)")
sfixed_pattern = re.compile("s(\d+)\.(\d+)")
class Field(object):
ufixed_pattern = re.compile(r"u(\d+)\.(\d+)")
sfixed_pattern = re.compile(r"s(\d+)\.(\d+)")
def __init__(self, parser, attrs):
self.parser = parser
@@ -278,7 +282,7 @@ class Field:
for value in self.values:
print("#define %-40s %d" % (prefix + value.name, value.value))
class Group:
class Group(object):
def __init__(self, parser, parent, start, count, size):
self.parser = parser
self.parent = parent
@@ -466,12 +470,12 @@ class Group:
print(" dw[%d] = %s;" % (index, v))
print(" dw[%d] = %s >> 32;" % (index + 1, v))
class Value:
class Value(object):
def __init__(self, attrs):
self.name = safe_name(attrs["name"])
self.value = int(attrs["value"])
class Parser:
class Parser(object):
def __init__(self):
self.parser = xml.parsers.expat.ParserCreate()
self.parser.StartElementHandler = self.start_element

src/intel/isl/Android.mk (new file, 155 lines)
View File

@@ -0,0 +1,155 @@
# Copyright © 2016 Intel Corporation
# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
LOCAL_PATH := $(call my-dir)
# Import variables LIBISL_FILES.
include $(LOCAL_PATH)/Makefile.sources
LIBISL_GENX_COMMON_INCLUDES := \
$(MESA_TOP)/src/ \
$(MESA_TOP)/src/mesa/drivers/dri/i965
# ---------------------------------------
# Build libisl_gen7
# ---------------------------------------
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_isl_gen7
LOCAL_SRC_FILES := $(ISL_GEN7_FILES)
LOCAL_CFLAGS := -DGEN_VERSIONx10=70
LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
# ---------------------------------------
# Build libisl_gen75
# ---------------------------------------
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_isl_gen75
LOCAL_SRC_FILES := $(ISL_GEN75_FILES)
LOCAL_CFLAGS := -DGEN_VERSIONx10=75
LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
# ---------------------------------------
# Build libisl_gen8
# ---------------------------------------
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_isl_gen8
LOCAL_SRC_FILES := $(ISL_GEN8_FILES)
LOCAL_CFLAGS := -DGEN_VERSIONx10=80
LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
# ---------------------------------------
# Build libisl_gen9
# ---------------------------------------
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_isl_gen9
LOCAL_SRC_FILES := $(ISL_GEN9_FILES)
LOCAL_CFLAGS := -DGEN_VERSIONx10=90
LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
# ---------------------------------------
# Build libisl
# ---------------------------------------
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_isl
LOCAL_SRC_FILES := $(ISL_FILES)
LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/mapi \
$(MESA_TOP)/src/mesa \
$(MESA_TOP)/src/mesa/drivers/dri/i965 \
LOCAL_EXPORT_C_INCLUDE_DIRS := $(MESA_TOP)/src/intel
LOCAL_WHOLE_STATIC_LIBRARIES := \
libmesa_isl_gen7 \
libmesa_isl_gen75 \
libmesa_isl_gen8 \
libmesa_isl_gen9
# Autogenerated sources
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
intermediates := $(call local-generated-sources-dir)
LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, $(ISL_GENERATED_FILES))
define bash-gen
@mkdir -p $(dir $@)
@echo "Gen Bash: $(PRIVATE_MODULE) <= $(notdir $(@))"
$(hide) $(PRIVATE_SCRIPT) < $(PRIVATE_CSV) > $@
endef
isl_format_layout_deps := \
$(LOCAL_PATH)/isl_format_layout_gen.bash \
$(LOCAL_PATH)/isl_format_layout.csv
$(intermediates)/isl_format_layout.c: PRIVATE_SCRIPT := bash -c $(LOCAL_PATH)/isl_format_layout_gen.bash
$(intermediates)/isl_format_layout.c: PRIVATE_CSV := $(LOCAL_PATH)/isl_format_layout.csv
$(intermediates)/isl_format_layout.c: $(isl_format_layout_deps)
$(call bash-gen)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)

View File

@@ -50,7 +50,7 @@ libisl_la_CFLAGS = $(CFLAGS) -Wno-override-init
libisl_la_LIBADD = $(ISL_GEN_LIBS)
libisl_la_SOURCES = $(ISL_FILES)
libisl_la_SOURCES = $(ISL_FILES) $(ISL_GENERATED_FILES)
libisl_gen7_la_SOURCES = $(ISL_GEN7_FILES)
libisl_gen7_la_CFLAGS = $(libisl_la_CFLAGS) -DGEN_VERSIONx10=70


@@ -2,7 +2,6 @@ ISL_FILES = \
isl.c \
isl.h \
isl_format.c \
isl_format_layout.c \
isl_gen4.c \
isl_gen4.h \
isl_gen6.c \


@@ -1065,24 +1065,14 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
const unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
const unsigned push_constant_data_size =
(local_id_dwords + prog_data->nr_params) * 4;
const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
const unsigned param_aligned_count =
reg_aligned_constant_size / sizeof(uint32_t);
/* If we don't actually have any push constants, bail. */
if (reg_aligned_constant_size == 0)
if (cs_prog_data->push.total.size == 0)
return (struct anv_state) { .offset = 0 };
const unsigned threads = pipeline->cs_thread_width_max;
const unsigned total_push_constants_size =
reg_aligned_constant_size * threads;
const unsigned push_constant_alignment =
cmd_buffer->device->info.gen < 8 ? 32 : 64;
const unsigned aligned_total_push_constants_size =
ALIGN(total_push_constants_size, push_constant_alignment);
ALIGN(cs_prog_data->push.total.size, push_constant_alignment);
struct anv_state state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
aligned_total_push_constants_size,
@@ -1091,21 +1081,33 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
/* Walk through the param array and fill the buffer with data */
uint32_t *u32_map = state.map;
brw_cs_fill_local_id_payload(cs_prog_data, u32_map, threads,
reg_aligned_constant_size);
/* Setup uniform data for the first thread */
for (unsigned i = 0; i < prog_data->nr_params; i++) {
uint32_t offset = (uintptr_t)prog_data->param[i];
u32_map[local_id_dwords + i] = *(uint32_t *)((uint8_t *)data + offset);
if (cs_prog_data->push.cross_thread.size > 0) {
assert(cs_prog_data->thread_local_id_index < 0 ||
cs_prog_data->thread_local_id_index >=
cs_prog_data->push.cross_thread.dwords);
for (unsigned i = 0;
i < cs_prog_data->push.cross_thread.dwords;
i++) {
uint32_t offset = (uintptr_t)prog_data->param[i];
u32_map[i] = *(uint32_t *)((uint8_t *)data + offset);
}
}
/* Copy uniform data from the first thread to every other thread */
const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t);
for (unsigned t = 1; t < threads; t++) {
memcpy(&u32_map[t * param_aligned_count + local_id_dwords],
&u32_map[local_id_dwords],
uniform_data_size);
if (cs_prog_data->push.per_thread.size > 0) {
for (unsigned t = 0; t < cs_prog_data->threads; t++) {
unsigned dst =
8 * (cs_prog_data->push.per_thread.regs * t +
cs_prog_data->push.cross_thread.regs);
unsigned src = cs_prog_data->push.cross_thread.dwords;
for ( ; src < prog_data->nr_params; src++, dst++) {
if (src != cs_prog_data->thread_local_id_index) {
uint32_t offset = (uintptr_t)prog_data->param[src];
u32_map[dst] = *(uint32_t *)((uint8_t *)data + offset);
} else {
u32_map[dst] = t * cs_prog_data->simd_size;
}
}
}
}
if (!cmd_buffer->device->info.has_llc)


@@ -51,6 +51,20 @@ def hash(name):
return h
def print_guard_start(name):
if "Wayland" in name:
print "#ifdef VK_USE_PLATFORM_WAYLAND_KHR"
if "Xcb" in name:
print "#ifdef VK_USE_PLATFORM_XCB_KHR"
return
def print_guard_end(name):
if "Wayland" in name:
print "#endif // VK_USE_PLATFORM_WAYLAND_KHR"
if "Xcb" in name:
print "#endif // VK_USE_PLATFORM_XCB_KHR"
return
opt_header = False
opt_code = False
@@ -86,7 +100,9 @@ if opt_header:
print " struct {"
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print " %s (*%s)%s;" % (type, name, args)
print_guard_end(name)
print " };\n"
print " };\n"
print "};\n"
@@ -94,12 +110,14 @@ if opt_header:
print "void anv_set_dispatch_devinfo(const struct brw_device_info *info);\n"
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print "%s anv_%s%s;" % (type, name, args)
print "%s gen7_%s%s;" % (type, name, args)
print "%s gen75_%s%s;" % (type, name, args)
print "%s gen8_%s%s;" % (type, name, args)
print "%s gen9_%s%s;" % (type, name, args)
print "%s anv_validate_%s%s;" % (type, name, args)
print_guard_end(name)
exit()
@@ -146,9 +164,11 @@ static const char strings[] ="""
offsets = []
i = 0;
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print " \"vk%s\\0\"" % name
offsets.append(i)
i += 2 + len(name) + 1
print_guard_end(name)
print """ ;
/* Weak aliases for all potential validate functions. These will resolve to
@@ -162,15 +182,21 @@ print """ ;
print "\nstatic const struct anv_entrypoint entrypoints[] = {"
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print " { %5d, 0x%08x }," % (offsets[num], h)
print_guard_end(name)
print "};\n"
for layer in [ "anv", "validate", "gen7", "gen75", "gen8", "gen9" ]:
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print "%s %s_%s%s __attribute__ ((weak));" % (type, layer, name, args)
print_guard_end(name)
print "\nconst struct anv_dispatch_table %s_layer = {" % layer
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print " .%s = %s_%s," % (name, layer, name)
print_guard_end(name)
print "};\n"
print """
@@ -242,8 +268,10 @@ anv_resolve_entrypoint(uint32_t index)
# lets the resolver look it up in the table.
for type, name, args, num, h in entrypoints:
print_guard_start(name)
print "static void *resolve_%s(void) { return anv_resolve_entrypoint(%d); }" % (name, num)
print "%s vk%s%s\n __attribute__ ((ifunc (\"resolve_%s\"), visibility (\"default\")));\n" % (type, name, args, name)
print_guard_end(name)
# Now generate the hash table used for entry point look up. This is a


@@ -338,6 +338,10 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
pipeline->needs_data_cache = true;
}
if (stage == MESA_SHADER_COMPUTE)
((struct brw_cs_prog_data *)prog_data)->thread_local_id_index =
prog_data->nr_params++; /* The CS Thread ID uniform */
if (nir->info.num_ssbos > 0)
pipeline->needs_data_cache = true;


@@ -1474,7 +1474,6 @@ struct anv_pipeline {
bool primitive_restart;
uint32_t topology;
uint32_t cs_thread_width_max;
uint32_t cs_right_mask;
struct {


@@ -234,12 +234,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
unsigned push_constant_data_size =
(prog_data->nr_params + local_id_dwords) * 4;
unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
unsigned push_constant_regs = reg_aligned_constant_size / 32;
if (push_state.alloc_size) {
anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
curbe.CURBETotalDataLength = push_state.alloc_size;
@@ -264,14 +258,17 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
.BindingTablePointer = surfaces.offset,
.SamplerStatePointer = samplers.offset,
.ConstantURBEntryReadLength =
push_constant_regs,
#if !GEN_IS_HASWELL
cs_prog_data->push.per_thread.regs,
#if GEN_IS_HASWELL
.CrossThreadConstantDataReadLength =
cs_prog_data->push.cross_thread.regs,
#else
.ConstantURBEntryReadOffset = 0,
#endif
.BarrierEnable = cs_prog_data->uses_barrier,
.SharedLocalMemorySize = slm_size,
.NumberofThreadsinGPGPUThreadGroup =
pipeline->cs_thread_width_max);
cs_prog_data->threads);
const uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
anv_batch_emit(&cmd_buffer->batch,


@@ -319,12 +319,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
unsigned push_constant_data_size =
(prog_data->nr_params + local_id_dwords) * 4;
unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
unsigned push_constant_regs = reg_aligned_constant_size / 32;
if (push_state.alloc_size) {
anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
curbe.CURBETotalDataLength = push_state.alloc_size;
@@ -351,12 +345,15 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
.BindingTableEntryCount = 0,
.SamplerStatePointer = samplers.offset,
.SamplerCount = 0,
.ConstantIndirectURBEntryReadLength = push_constant_regs,
.ConstantIndirectURBEntryReadLength =
cs_prog_data->push.per_thread.regs,
.ConstantURBEntryReadOffset = 0,
.BarrierEnable = cs_prog_data->uses_barrier,
.SharedLocalMemorySize = slm_size,
.NumberofThreadsinGPGPUThreadGroup =
pipeline->cs_thread_width_max);
cs_prog_data->threads,
.CrossThreadConstantDataReadLength =
cs_prog_data->push.cross_thread.regs);
uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
anv_batch_emit(&cmd_buffer->batch,


@@ -773,7 +773,7 @@ void genX(CmdDispatch)(
ggw.SIMDSize = prog_data->simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;
ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1;
ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
ggw.ThreadGroupIDXDimension = x;
ggw.ThreadGroupIDYDimension = y;
ggw.ThreadGroupIDZDimension = z;
@@ -874,7 +874,7 @@ void genX(CmdDispatchIndirect)(
ggw.SIMDSize = prog_data->simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;
ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1;
ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
ggw.RightExecutionMask = pipeline->cs_right_mask;
ggw.BottomExecutionMask = 0xffffffff;
}


@@ -87,18 +87,9 @@ genX(compute_pipeline_create)(
anv_setup_pipeline_l3_config(pipeline);
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
unsigned push_constant_data_size =
(prog_data->nr_params + local_id_dwords) * 4;
unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
unsigned push_constant_regs = reg_aligned_constant_size / 32;
uint32_t group_size = cs_prog_data->local_size[0] *
cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
pipeline->cs_thread_width_max =
DIV_ROUND_UP(group_size, cs_prog_data->simd_size);
uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);
if (remainder > 0)
@@ -107,7 +98,8 @@ genX(compute_pipeline_create)(
pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size);
const uint32_t vfe_curbe_allocation =
push_constant_regs * pipeline->cs_thread_width_max;
ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
cs_prog_data->push.cross_thread.regs, 2);
anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) {
vfe.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_COMPUTE];


@@ -52,7 +52,8 @@ LOCAL_SRC_FILES := \
$(i965_FILES)
LOCAL_WHOLE_STATIC_LIBRARIES := \
$(MESA_DRI_WHOLE_STATIC_LIBRARIES)
$(MESA_DRI_WHOLE_STATIC_LIBRARIES) \
libmesa_isl
LOCAL_SHARED_LIBRARIES := \
$(MESA_DRI_SHARED_LIBRARIES) \


@@ -46,6 +46,7 @@ i965_compiler_FILES = \
brw_nir.c \
brw_nir_analyze_boolean_resolves.c \
brw_nir_attribute_workarounds.c \
brw_nir_intrinsics.c \
brw_nir_opt_peephole_ffma.c \
brw_packed_float.c \
brw_predicated_break.cpp \


@@ -424,15 +424,28 @@ struct brw_wm_prog_data {
int urb_setup[VARYING_SLOT_MAX];
};
struct brw_push_const_block {
unsigned dwords; /* Dword count, not reg aligned */
unsigned regs;
unsigned size; /* Bytes, register aligned */
};
struct brw_cs_prog_data {
struct brw_stage_prog_data base;
GLuint dispatch_grf_start_reg_16;
unsigned local_size[3];
unsigned simd_size;
unsigned threads;
bool uses_barrier;
bool uses_num_work_groups;
unsigned local_invocation_id_regs;
int thread_local_id_index;
struct {
struct brw_push_const_block cross_thread;
struct brw_push_const_block per_thread;
struct brw_push_const_block total;
} push;
struct {
/** @{
@@ -817,13 +830,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
unsigned *final_assembly_size,
char **error_str);
/**
* Fill out local id payload for compute shader according to cs_prog_data.
*/
void
brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
void *buffer, uint32_t threads, uint32_t stride);
#ifdef __cplusplus
} /* extern "C" */
#endif


@@ -70,6 +70,7 @@
#include "tnl/t_pipeline.h"
#include "util/ralloc.h"
#include "util/debug.h"
#include "isl/isl.h"
/***************************************
* Mesa's Driver Functions
@@ -166,6 +167,38 @@ intel_update_framebuffer(struct gl_context *ctx,
fb->DefaultGeometry.NumSamples);
}
/* On Gen9 color buffers may be compressed by the hardware (lossless
* compression). There are, however, format restrictions and care needs to be
* taken that the sampler engine is capable of re-interpreting a buffer with
* a format different from the one it was originally written with.
*
* For example, SRGB formats are not compressible and the sampler engine isn't
* capable of treating RGBA_UNORM as SRGB_ALPHA. In such a case the underlying
* color buffer needs to be resolved so that the sampling surface can be
* sampled as non-compressed (i.e., without the auxiliary MCS buffer being
* set).
*/
static bool
intel_texture_view_requires_resolve(struct brw_context *brw,
struct intel_texture_object *intel_tex)
{
if (brw->gen < 9 ||
!intel_miptree_is_lossless_compressed(brw, intel_tex->mt))
return false;
const uint32_t brw_format = brw_format_for_mesa_format(intel_tex->_Format);
if (isl_format_supports_lossless_compression(brw->intelScreen->devinfo,
brw_format))
return false;
perf_debug("Incompatible sampling format (%s) for rbc (%s)\n",
_mesa_get_format_name(intel_tex->_Format),
_mesa_get_format_name(intel_tex->mt->format));
return true;
}
static void
intel_update_state(struct gl_context * ctx, GLuint new_state)
{
@@ -198,8 +231,9 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
/* Sampling engine understands lossless compression and resolving
* those surfaces should be skipped for performance reasons.
*/
intel_miptree_resolve_color(brw, tex_obj->mt,
INTEL_MIPTREE_IGNORE_CCS_E);
const int flags = intel_texture_view_requires_resolve(brw, tex_obj) ?
0 : INTEL_MIPTREE_IGNORE_CCS_E;
intel_miptree_resolve_color(brw, tex_obj->mt, flags);
brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
}


@@ -93,6 +93,9 @@ brw_codegen_cs_prog(struct brw_context *brw,
*/
int param_count = cp->program.Base.nir->num_uniforms / 4;
/* The backend also sometimes adds a param for the thread local id. */
prog_data.thread_local_id_index = param_count++;
/* The backend also sometimes adds params for texture size. */
param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
prog_data.base.param =


@@ -2943,6 +2943,9 @@ enum brw_wm_barycentric_interp_mode {
# define MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(7, 0)
# define GEN8_MEDIA_GPGPU_THREAD_COUNT_SHIFT 0
# define GEN8_MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(9, 0)
/* GEN7 DW6, GEN8+ DW7 */
# define CROSS_THREAD_READ_LENGTH_SHIFT 0
# define CROSS_THREAD_READ_LENGTH_MASK INTEL_MASK(7, 0)
#define MEDIA_STATE_FLUSH 0x7004
#define GPGPU_WALKER 0x7105
/* GEN7 DW0 */


@@ -2000,8 +2000,10 @@ void gen6_math(struct brw_codegen *p,
assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
if (devinfo->gen == 6) {
assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
assert(has_scalar_region(src0) ||
src0.hstride == BRW_HORIZONTAL_STRIDE_1);
assert(has_scalar_region(src1) ||
src1.hstride == BRW_HORIZONTAL_STRIDE_1);
}
if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||


@@ -2097,6 +2097,10 @@ fs_visitor::assign_constant_locations()
bool contiguous[uniforms];
memset(contiguous, 0, sizeof(contiguous));
int thread_local_id_index =
(stage == MESA_SHADER_COMPUTE) ?
((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1;
/* First, we walk through the instructions and do two things:
*
* 1) Figure out which uniforms are live.
@@ -2141,6 +2145,9 @@ fs_visitor::assign_constant_locations()
}
}
if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
thread_local_id_index = -1;
/* Only allow 16 registers (128 uniform components) as push constants.
*
* Just demote the end of the list. We could probably do better
@@ -2149,7 +2156,9 @@ fs_visitor::assign_constant_locations()
* If changing this value, note the limitation about total_regs in
* brw_curbe.c.
*/
const unsigned int max_push_components = 16 * 8;
unsigned int max_push_components = 16 * 8;
if (thread_local_id_index >= 0)
max_push_components--; /* Save a slot for the thread ID */
/* We push small arrays, but no bigger than 16 floats. This is big enough
* for a vec4 but hopefully not large enough to push out other stuff. We
@@ -2187,6 +2196,10 @@ fs_visitor::assign_constant_locations()
if (!is_live[u] || is_live_64bit[u])
continue;
/* Skip thread_local_id_index to put it in the last push register. */
if (thread_local_id_index == (int)u)
continue;
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
@@ -2194,6 +2207,10 @@ fs_visitor::assign_constant_locations()
stage_prog_data);
}
/* Add the CS local thread ID uniform at the end of the push constants */
if (thread_local_id_index >= 0)
push_constant_loc[thread_local_id_index] = num_push_constants++;
/* As the uniforms are going to be reordered, take the data from a temporary
* copy of the original param[].
*/
@@ -2212,6 +2229,7 @@ fs_visitor::assign_constant_locations()
* push_constant_loc[i] <= i and we can do it in one smooth loop without
* having to make a copy.
*/
int new_thread_local_id_index = -1;
for (unsigned int i = 0; i < uniforms; i++) {
const gl_constant_value *value = param[i];
@@ -2219,9 +2237,15 @@ fs_visitor::assign_constant_locations()
stage_prog_data->pull_param[pull_constant_loc[i]] = value;
} else if (push_constant_loc[i] != -1) {
stage_prog_data->param[push_constant_loc[i]] = value;
if (thread_local_id_index == (int)i)
new_thread_local_id_index = push_constant_loc[i];
}
}
ralloc_free(param);
if (stage == MESA_SHADER_COMPUTE)
((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index =
new_thread_local_id_index;
}
/**
@@ -2767,6 +2791,20 @@ fs_visitor::opt_redundant_discard_jumps()
return progress;
}
/**
* Compute a bitmask with GRF granularity with a bit set for each GRF starting
* from \p r which overlaps the region starting at \p s and spanning \p n GRF
* units.
*/
static inline unsigned
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned n)
{
const int rel_offset = (reg_offset(s) - reg_offset(r)) / REG_SIZE;
assert(reg_space(r) == reg_space(s) &&
rel_offset >= 0 && rel_offset < int(8 * sizeof(unsigned)));
return ((1 << n) - 1) << rel_offset;
}
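The new mask_relative_to() helper above returns a per-GRF bitmask telling which registers of the region at `r` a span starting at `s` covers. The same arithmetic as a rough Python sketch (hypothetical names; byte offsets; REG_SIZE assumed to be 32, one GRF):

```python
REG_SIZE = 32  # bytes per GRF register

def grf_mask_relative_to(r_offset, s_offset, n):
    # Bit i is set when GRF i of the region starting at r_offset is covered
    # by the n-register span starting at s_offset.
    rel_offset = (s_offset - r_offset) // REG_SIZE
    assert 0 <= rel_offset < 32  # mirrors the C assert on rel_offset
    return ((1 << n) - 1) << rel_offset
```

For example a 3-register write two GRFs into the region yields mask 0b11100.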
bool
fs_visitor::compute_to_mrf()
{
@@ -2792,31 +2830,22 @@ fs_visitor::compute_to_mrf()
inst->src[0].subreg_offset)
continue;
/* Work out which hardware MRF registers are written by this
* instruction.
*/
int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4;
int mrf_high;
if (inst->dst.nr & BRW_MRF_COMPR4) {
mrf_high = mrf_low + 4;
} else if (inst->exec_size == 16) {
mrf_high = mrf_low + 1;
} else {
mrf_high = mrf_low;
}
/* Can't compute-to-MRF this GRF if someone else was going to
* read it later.
*/
if (this->virtual_grf_end[inst->src[0].nr] > ip)
continue;
/* Found a move of a GRF to a MRF. Let's see if we can go
* rewrite the thing that made this GRF to write into the MRF.
/* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
* things that computed the value of all GRFs of the source region. The
* regs_left bitset keeps track of the registers we haven't yet found a
* generating instruction for.
*/
unsigned regs_left = (1 << inst->regs_read(0)) - 1;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->dst.file == VGRF &&
scan_inst->dst.nr == inst->src[0].nr) {
if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE,
inst->src[0], inst->regs_read(0) * REG_SIZE)) {
/* Found the last thing to write our reg we want to turn
* into a compute-to-MRF.
*/
@@ -2824,15 +2853,18 @@ fs_visitor::compute_to_mrf()
/* If this one instruction didn't populate all the
* channels, bail. We might be able to rewrite everything
* that writes that reg, but it would require smarter
* tracking to delay the rewriting until complete success.
* tracking.
*/
if (scan_inst->is_partial_write())
break;
/* Things returning more than one register would need us to
* understand coalescing out more than one MOV at a time.
/* Handling things not fully contained in the source of the copy
* would need us to understand coalescing out more than one MOV at
* a time.
*/
if (scan_inst->regs_written > scan_inst->exec_size / 8)
if (scan_inst->dst.reg_offset < inst->src[0].reg_offset ||
scan_inst->dst.reg_offset + scan_inst->regs_written >
inst->src[0].reg_offset + inst->regs_read(0))
break;
/* SEND instructions can't have MRF as a destination. */
@@ -2848,16 +2880,11 @@ fs_visitor::compute_to_mrf()
}
}
if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
/* Found the creator of our MRF's source value. */
scan_inst->dst.file = MRF;
scan_inst->dst.nr = inst->dst.nr;
scan_inst->dst.reg_offset = 0;
scan_inst->saturate |= inst->saturate;
inst->remove(block);
progress = true;
}
break;
/* Clear the bits for any registers this instruction overwrites. */
regs_left &= ~mask_relative_to(
inst->src[0], scan_inst->dst, scan_inst->regs_written);
if (!regs_left)
break;
}
/* We don't handle control flow here. Most computation of
@@ -2872,54 +2899,83 @@ fs_visitor::compute_to_mrf()
*/
bool interfered = false;
for (int i = 0; i < scan_inst->sources; i++) {
if (scan_inst->src[i].file == VGRF &&
scan_inst->src[i].nr == inst->src[0].nr &&
scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
if (regions_overlap(scan_inst->src[i], scan_inst->regs_read(i) * REG_SIZE,
inst->src[0], inst->regs_read(0) * REG_SIZE)) {
interfered = true;
}
}
if (interfered)
break;
if (scan_inst->dst.file == MRF) {
if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE,
inst->dst, inst->regs_written * REG_SIZE)) {
/* If somebody else writes our MRF here, we can't
* compute-to-MRF before that.
*/
int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4;
int scan_mrf_high;
break;
}
if (scan_inst->dst.nr & BRW_MRF_COMPR4) {
scan_mrf_high = scan_mrf_low + 4;
} else if (scan_inst->exec_size == 16) {
scan_mrf_high = scan_mrf_low + 1;
} else {
scan_mrf_high = scan_mrf_low;
}
if (mrf_low == scan_mrf_low ||
mrf_low == scan_mrf_high ||
mrf_high == scan_mrf_low ||
mrf_high == scan_mrf_high) {
break;
}
}
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
inst->dst, inst->regs_written * REG_SIZE)) {
/* Found a SEND instruction, which means that there are
* live values in MRFs from base_mrf to base_mrf +
* scan_inst->mlen - 1. Don't go pushing our MRF write up
* above it.
*/
if (mrf_low >= scan_inst->base_mrf &&
mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
break;
}
if (mrf_high >= scan_inst->base_mrf &&
mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
break;
}
}
break;
}
}
if (regs_left)
continue;
/* Found all generating instructions of our MRF's source value, so it
* should be safe to rewrite them to point to the MRF directly.
*/
regs_left = (1 << inst->regs_read(0)) - 1;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE,
inst->src[0], inst->regs_read(0) * REG_SIZE)) {
/* Clear the bits for any registers this instruction overwrites. */
regs_left &= ~mask_relative_to(
inst->src[0], scan_inst->dst, scan_inst->regs_written);
const unsigned rel_offset = (reg_offset(scan_inst->dst) -
reg_offset(inst->src[0])) / REG_SIZE;
if (inst->dst.nr & BRW_MRF_COMPR4) {
/* Apply the same address transformation done by the hardware
* for COMPR4 MRF writes.
*/
assert(rel_offset < 2);
scan_inst->dst.nr = inst->dst.nr + rel_offset * 4;
/* Clear the COMPR4 bit if the generating instruction is not
* compressed.
*/
if (scan_inst->regs_written < 2)
scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
} else {
/* Calculate the MRF number the result of this instruction is
* ultimately written to.
*/
scan_inst->dst.nr = inst->dst.nr + rel_offset;
}
scan_inst->dst.file = MRF;
scan_inst->dst.reg_offset = 0;
scan_inst->saturate |= inst->saturate;
if (!regs_left)
break;
}
}
assert(!regs_left);
inst->remove(block);
progress = true;
}
if (progress)
@@ -3080,18 +3136,18 @@ fs_visitor::remove_duplicate_mrf_writes()
}
/* Clear out any MRF move records whose sources got overwritten. */
if (inst->dst.file == VGRF) {
for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
if (last_mrf_move[i] &&
last_mrf_move[i]->src[0].nr == inst->dst.nr) {
last_mrf_move[i] = NULL;
}
}
for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
if (last_mrf_move[i] &&
regions_overlap(inst->dst, inst->regs_written * REG_SIZE,
last_mrf_move[i]->src[0],
last_mrf_move[i]->regs_read(0) * REG_SIZE)) {
last_mrf_move[i] = NULL;
}
}
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF &&
inst->src[0].file == VGRF &&
inst->src[0].file != ARF &&
!inst->is_partial_write()) {
last_mrf_move[inst->dst.nr] = inst;
}
@@ -4416,6 +4472,14 @@ lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
const brw_device_info *devinfo = bld.shader->devinfo;
if (devinfo->gen >= 7) {
/* We are switching the instruction from an ALU-like instruction to a
* send-from-grf instruction. Since sends can't handle strides or
* source modifiers, we have to make a copy of the offset source.
*/
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(tmp, inst->src[1]);
inst->src[1] = tmp;
inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
} else {
@@ -5517,31 +5581,6 @@ fs_visitor::setup_vs_payload()
payload.num_regs = 2;
}
/**
* We are building the local ID push constant data using the simplest possible
* method. We simply push the local IDs directly as they should appear in the
* registers for the uvec3 gl_LocalInvocationID variable.
*
* Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
* registers worth of push constant space.
*
* Note: Any updates to brw_cs_prog_local_id_payload_dwords,
* fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
* to be coordinated.
*
* FINISHME: There are a few easy optimizations to consider.
*
* 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
* no need for using push constant space for that dimension.
*
* 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
* easily use 16-bit words rather than 32-bit dwords in the push constant
* data.
*
* 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
* conveying the data, and thereby reduce push constant usage.
*
*/
void
fs_visitor::setup_gs_payload()
{
@@ -5585,15 +5624,7 @@ void
fs_visitor::setup_cs_payload()
{
assert(devinfo->gen >= 7);
brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
payload.num_regs = 1;
if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
payload.local_invocation_id_reg = payload.num_regs;
payload.num_regs += prog_data->local_invocation_id_regs;
}
}
void
@@ -6467,25 +6498,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
return g.get_assembly(final_assembly_size);
}
fs_reg *
fs_visitor::emit_cs_local_invocation_id_setup()
{
assert(stage == MESA_SHADER_COMPUTE);
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
struct brw_reg src =
brw_vec8_grf(payload.local_invocation_id_reg, 0);
src = retype(src, BRW_REGISTER_TYPE_UD);
bld.MOV(*reg, src);
src.nr += dispatch_width / 8;
bld.MOV(offset(*reg, bld, 1), src);
src.nr += dispatch_width / 8;
bld.MOV(offset(*reg, bld, 2), src);
return reg;
}
fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
@@ -6504,6 +6516,70 @@ fs_visitor::emit_cs_work_group_id_setup()
return reg;
}
static void
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
{
block->dwords = dwords;
block->regs = DIV_ROUND_UP(dwords, 8);
block->size = block->regs * 32;
}
static void
cs_fill_push_const_info(const struct brw_device_info *devinfo,
struct brw_cs_prog_data *cs_prog_data)
{
const struct brw_stage_prog_data *prog_data =
(struct brw_stage_prog_data*) cs_prog_data;
bool fill_thread_id =
cs_prog_data->thread_local_id_index >= 0 &&
cs_prog_data->thread_local_id_index < (int)prog_data->nr_params;
bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;
/* The thread ID should be stored in the last param dword */
assert(prog_data->nr_params > 0 || !fill_thread_id);
assert(!fill_thread_id ||
cs_prog_data->thread_local_id_index ==
(int)prog_data->nr_params - 1);
unsigned cross_thread_dwords, per_thread_dwords;
if (!cross_thread_supported) {
cross_thread_dwords = 0u;
per_thread_dwords = prog_data->nr_params;
} else if (fill_thread_id) {
/* Fill all but the last register with cross-thread payload */
cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8);
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
} else {
/* Fill all data using cross-thread payload */
cross_thread_dwords = prog_data->nr_params;
per_thread_dwords = 0u;
}
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
unsigned total_dwords =
(cs_prog_data->push.per_thread.size * cs_prog_data->threads +
cs_prog_data->push.cross_thread.size) / 4;
fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
cs_prog_data->push.per_thread.size == 0);
assert(cs_prog_data->push.cross_thread.dwords +
cs_prog_data->push.per_thread.dwords ==
prog_data->nr_params);
}
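In rough Python (hypothetical names), the split that cs_fill_push_const_info() computes looks like this: when a thread-local ID uniform is present it stays in the last, per-thread register, and only whole registers before it are pushed once for the entire workgroup.

```python
def split_push_constants(nr_params, thread_local_id_index, cross_thread_supported):
    # Returns (cross_thread_dwords, per_thread_dwords). On hardware without
    # cross-thread push constants (gen7 except Haswell), everything is
    # per-thread.
    if not cross_thread_supported:
        return 0, nr_params
    if 0 <= thread_local_id_index < nr_params:
        # Whole 8-dword registers before the thread ID go cross-thread; the
        # partial last register, which holds the ID, remains per-thread.
        cross = 8 * (thread_local_id_index // 8)
        return cross, nr_params - cross
    # No thread ID uniform: all params can be cross-thread.
    return nr_params, 0
```

So 17 params with the thread ID at index 16 split into 16 cross-thread dwords plus 1 per-thread dword.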
static void
cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
{
cs_prog_data->simd_size = size;
unsigned group_size = cs_prog_data->local_size[0] *
cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
cs_prog_data->threads = (group_size + size - 1) / size;
}
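cs_set_simd_size() derives the per-workgroup thread count as a ceiling division of the invocation count by the SIMD width; a minimal sketch (hypothetical name):

```python
def cs_threads(local_size, simd_size):
    # Hardware threads needed to cover one workgroup: total invocations
    # divided by SIMD width, rounded up.
    group_size = local_size[0] * local_size[1] * local_size[2]
    return (group_size + simd_size - 1) // simd_size
```

For example an 8x8x1 workgroup at SIMD16 takes 4 threads, and a 5x1x1 group still occupies one full SIMD8 thread.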
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
@@ -6519,6 +6595,16 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
true);
brw_nir_lower_cs_shared(shader);
prog_data->base.total_shared += shader->num_shared;
/* Now that we cloned the nir_shader, we can update num_uniforms based on
* the thread_local_id_index.
*/
assert(prog_data->thread_local_id_index >= 0);
shader->num_uniforms =
MAX2(shader->num_uniforms,
(unsigned)4 * (prog_data->thread_local_id_index + 1));
brw_nir_lower_intrinsics(shader, &prog_data->base);
shader = brw_postprocess_nir(shader, compiler->devinfo, true);
prog_data->local_size[0] = shader->info.cs.local_size[0];
@@ -6544,7 +6630,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
fail_msg = v8.fail_msg;
} else {
cfg = v8.cfg;
prog_data->simd_size = 8;
cs_set_simd_size(prog_data, 8);
cs_fill_push_const_info(compiler->devinfo, prog_data);
prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
}
}
@@ -6569,7 +6656,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
}
} else {
cfg = v16.cfg;
prog_data->simd_size = 16;
cs_set_simd_size(prog_data, 16);
cs_fill_push_const_info(compiler->devinfo, prog_data);
prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
}
}
@@ -6596,7 +6684,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
}
} else {
cfg = v32.cfg;
prog_data->simd_size = 32;
cs_set_simd_size(prog_data, 32);
cs_fill_push_const_info(compiler->devinfo, prog_data);
}
}
@@ -6623,39 +6712,3 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
return g.get_assembly(final_assembly_size);
}
void
brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
void *buffer, uint32_t threads, uint32_t stride)
{
if (prog_data->local_invocation_id_regs == 0)
return;
/* 'stride' should be an integer number of registers, that is, a multiple
* of 32 bytes.
*/
assert(stride % 32 == 0);
unsigned x = 0, y = 0, z = 0;
for (unsigned t = 0; t < threads; t++) {
uint32_t *param = (uint32_t *) buffer + stride * t / 4;
for (unsigned i = 0; i < prog_data->simd_size; i++) {
param[0 * prog_data->simd_size + i] = x;
param[1 * prog_data->simd_size + i] = y;
param[2 * prog_data->simd_size + i] = z;
x++;
if (x == prog_data->local_size[0]) {
x = 0;
y++;
if (y == prog_data->local_size[1]) {
y = 0;
z++;
if (z == prog_data->local_size[2])
z = 0;
}
}
}
}
}

View File

@@ -267,7 +267,6 @@ public:
unsigned base_offset, const nir_src &offset_src,
unsigned num_components);
void emit_cs_terminate();
fs_reg *emit_cs_local_invocation_id_setup();
fs_reg *emit_cs_work_group_id_setup();
void emit_barrier();

View File

@@ -621,20 +621,14 @@ namespace brw {
src_reg
fix_math_operand(const src_reg &src) const
{
/* Can't do hstride == 0 args on gen6 math, so expand it out. We
* might be able to do better by doing execsize = 1 math and then
* expanding that result out, but we would need to be careful with
* masking.
*
* Gen6 hardware ignores source modifiers (negate and abs) on math
/* Gen6 hardware ignores source modifiers (negate and abs) on math
* instructions, so we also move to a temp to set those up.
*
* Gen7 relaxes most of the above restrictions, but still can't use IMM
* operands to math
*/
if ((shader->devinfo->gen == 6 &&
(src.file == IMM || src.file == UNIFORM ||
src.abs || src.negate)) ||
(src.file == IMM || src.abs || src.negate)) ||
(shader->devinfo->gen == 7 && src.file == IMM)) {
const dst_reg tmp = vgrf(src.type);
MOV(tmp, src);

View File

@@ -147,8 +147,6 @@ struct table {
static struct imm *
find_imm(struct table *table, float val)
{
assert(signbit(val) == 0);
for (int i = 0; i < table->len; i++) {
if (table->imm[i].val == val) {
return &table->imm[i];
@@ -220,7 +218,8 @@ fs_visitor::opt_combine_constants()
inst->src[i].type != BRW_REGISTER_TYPE_F)
continue;
float val = fabsf(inst->src[i].f);
float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
fabs(inst->src[i].f);
struct imm *imm = find_imm(&table, val);
if (imm) {
@@ -301,7 +300,7 @@ fs_visitor::opt_combine_constants()
reg->stride = 0;
reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
fabsf(reg->f) == table.imm[i].val);
fabsf(reg->f) == fabs(table.imm[i].val));
}
}

View File

@@ -578,14 +578,9 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
break;
/* fallthrough */
case SHADER_OPCODE_POW:
/* Allow constant propagation into src1 (except on Gen 6), and let
* constant combining promote the constant on Gen < 8.
*
* While Gen 6 MATH can take a scalar source, its source and
* destination offsets must be equal and we cannot ensure that.
/* Allow constant propagation into src1, and let constant combining
* promote the constant on Gen < 8.
*/
if (devinfo->gen == 6)
break;
/* fallthrough */
case BRW_OPCODE_BFI1:
case BRW_OPCODE_ASR:

View File

@@ -272,13 +272,6 @@ emit_system_values_block(nir_block *block, fs_visitor *v)
*reg = *v->emit_samplemaskin_setup();
break;
case nir_intrinsic_load_local_invocation_id:
assert(v->stage == MESA_SHADER_COMPUTE);
reg = &v->nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID];
if (reg->file == BAD_FILE)
*reg = *v->emit_cs_local_invocation_id_setup();
break;
case nir_intrinsic_load_work_group_id:
assert(v->stage == MESA_SHADER_COMPUTE);
reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
@@ -1668,6 +1661,9 @@ fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
struct brw_gs_prog_data *gs_prog_data =
(struct brw_gs_prog_data *) prog_data;
if (gs_compile->control_data_header_size_bits == 0)
return;
/* We can only do EndPrimitive() functionality when the control data
* consists of cut bits. Fortunately, the only time it isn't is when the
* output type is points, in which case EndPrimitive() is a no-op.
@@ -2746,7 +2742,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
break;
case BRW_TESS_DOMAIN_ISOLINE:
for (unsigned i = 0; i < 2; i++)
bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 6 + i));
break;
}
break;
@@ -3873,6 +3869,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
case nir_intrinsic_load_channel_num: {
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
dest = retype(dest, BRW_REGISTER_TYPE_UD);
const fs_builder allbld8 = bld.group(8, 0).exec_all();
allbld8.MOV(tmp, brw_imm_v(0x76543210));
if (dispatch_width > 8)
allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
if (dispatch_width > 16) {
const fs_builder allbld16 = bld.group(16, 0).exec_all();
allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
}
bld.MOV(dest, tmp);
break;
}
default:
unreachable("unknown intrinsic");
}

View File

@@ -204,9 +204,23 @@ reg_offset(const fs_reg &r)
static inline bool
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
return reg_space(r) == reg_space(s) &&
!(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
fs_reg t = r;
t.nr &= ~BRW_MRF_COMPR4;
/* COMPR4 regions are translated by the hardware during decompression
* into two separate half-regions 4 MRFs apart from each other.
*/
return regions_overlap(t, dr / 2, s, ds) ||
regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
return regions_overlap(s, ds, r, dr);
} else {
return reg_space(r) == reg_space(s) &&
!(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
}
}
/**

View File

@@ -91,6 +91,8 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir);
nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
nir_shader *nir);
bool brw_nir_lower_intrinsics(nir_shader *nir,
struct brw_stage_prog_data *prog_data);
void brw_nir_lower_vs_inputs(nir_shader *nir,
const struct brw_device_info *devinfo,
bool is_scalar,

View File

@@ -0,0 +1,179 @@
/*
* Copyright (c) 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
struct lower_intrinsics_state {
nir_shader *nir;
union {
struct brw_stage_prog_data *prog_data;
struct brw_cs_prog_data *cs_prog_data;
};
nir_function_impl *impl;
bool progress;
nir_builder builder;
bool cs_thread_id_used;
};
static nir_ssa_def *
read_thread_local_id(struct lower_intrinsics_state *state)
{
assert(state->cs_prog_data->thread_local_id_index >= 0);
state->cs_thread_id_used = true;
const int id_index = state->cs_prog_data->thread_local_id_index;
nir_builder *b = &state->builder;
nir_shader *nir = state->nir;
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = 1;
load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
nir_intrinsic_set_base(load, id_index * sizeof(uint32_t));
nir_intrinsic_set_range(load, sizeof(uint32_t));
nir_builder_instr_insert(b, &load->instr);
return &load->dest.ssa;
}
static bool
lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
nir_block *block)
{
bool progress = false;
nir_builder *b = &state->builder;
nir_shader *nir = state->nir;
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
b->cursor = nir_after_instr(&intrinsic->instr);
nir_ssa_def *sysval;
switch (intrinsic->intrinsic) {
case nir_intrinsic_load_local_invocation_index: {
assert(nir->stage == MESA_SHADER_COMPUTE);
/* We construct the local invocation index from:
*
* gl_LocalInvocationIndex =
* cs_thread_local_id + channel_num;
*/
nir_ssa_def *thread_local_id = read_thread_local_id(state);
nir_ssa_def *channel =
nir_load_system_value(b, nir_intrinsic_load_channel_num, 0);
sysval = nir_iadd(b, channel, thread_local_id);
break;
}
case nir_intrinsic_load_local_invocation_id: {
assert(nir->stage == MESA_SHADER_COMPUTE);
/* We lower gl_LocalInvocationID from gl_LocalInvocationIndex based
* on this formula:
*
* gl_LocalInvocationID.x =
* gl_LocalInvocationIndex % gl_WorkGroupSize.x;
* gl_LocalInvocationID.y =
* (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
* gl_WorkGroupSize.y;
* gl_LocalInvocationID.z =
* (gl_LocalInvocationIndex /
* (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
* gl_WorkGroupSize.z;
*/
unsigned *size = nir->info.cs.local_size;
nir_ssa_def *local_index =
nir_load_system_value(b, nir_intrinsic_load_local_invocation_index, 0);
nir_const_value uvec3;
uvec3.u32[0] = 1;
uvec3.u32[1] = size[0];
uvec3.u32[2] = size[0] * size[1];
nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
uvec3.u32[0] = size[0];
uvec3.u32[1] = size[1];
uvec3.u32[2] = size[2];
nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
sysval = nir_imod(b, nir_idiv(b, local_index, div_val), mod_val);
break;
}
default:
continue;
}
nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(sysval));
nir_instr_remove(&intrinsic->instr);
state->progress = true;
}
return progress;
}
static void
lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
{
nir_builder_init(&state->builder, state->impl);
nir_foreach_block(block, state->impl) {
lower_cs_intrinsics_convert_block(state, block);
}
nir_metadata_preserve(state->impl,
nir_metadata_block_index | nir_metadata_dominance);
}
bool
brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data)
{
/* Currently we only lower intrinsics for compute shaders */
if (nir->stage != MESA_SHADER_COMPUTE)
return false;
bool progress = false;
struct lower_intrinsics_state state;
memset(&state, 0, sizeof(state));
state.nir = nir;
state.prog_data = prog_data;
do {
state.progress = false;
nir_foreach_function(function, nir) {
if (function->impl) {
state.impl = function->impl;
lower_cs_intrinsics_convert_impl(&state);
}
}
progress |= state.progress;
} while (state.progress);
if (nir->stage == MESA_SHADER_COMPUTE && !state.cs_thread_id_used)
state.cs_prog_data->thread_local_id_index = -1;
return progress;
}

View File

@@ -225,19 +225,24 @@ brw_codegen_tcs_prog(struct brw_context *brw,
*/
const float **param = (const float **) prog_data.base.base.param;
static float zero = 0.0f;
for (int i = 0; i < 4; i++) {
param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i];
}
for (int i = 0; i < 8; i++)
param[i] = &zero;
if (key->tes_primitive_mode == GL_QUADS) {
for (int i = 0; i < 4; i++)
param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i];
param[3] = &ctx->TessCtrlProgram.patch_default_inner_level[0];
param[2] = &ctx->TessCtrlProgram.patch_default_inner_level[1];
param[1] = &zero;
param[0] = &zero;
} else if (key->tes_primitive_mode == GL_TRIANGLES) {
for (int i = 0; i < 3; i++)
param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i];
param[4] = &ctx->TessCtrlProgram.patch_default_inner_level[0];
for (int i = 0; i < 4; i++)
param[i] = &zero;
} else {
assert(key->tes_primitive_mode == GL_ISOLINES);
param[7] = &ctx->TessCtrlProgram.patch_default_outer_level[1];
param[6] = &ctx->TessCtrlProgram.patch_default_outer_level[0];
}
}

View File

@@ -539,6 +539,9 @@ vec4_gs_visitor::gs_end_primitive()
return;
}
if (c->control_data_header_size_bits == 0)
return;
/* Cut bits use one bit per vertex. */
assert(c->control_data_bits_per_vertex == 1);

View File

@@ -33,17 +33,6 @@
#include "program/prog_statevars.h"
#include "compiler/glsl/ir_uniform.h"
static unsigned
get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data)
{
const unsigned simd_size = cs_prog_data->simd_size;
unsigned group_size = cs_prog_data->local_size[0] *
cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
return (group_size + simd_size - 1) / simd_size;
}
static void
brw_upload_cs_state(struct brw_context *brw)
{
@@ -53,7 +42,6 @@ brw_upload_cs_state(struct brw_context *brw)
uint32_t offset;
uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
8 * 4, 64, &offset);
struct gl_program *prog = (struct gl_program *) brw->compute_program;
struct brw_stage_state *stage_state = &brw->cs.base;
struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
@@ -70,17 +58,6 @@ brw_upload_cs_state(struct brw_context *brw)
prog_data->binding_table.size_bytes,
32, &stage_state->bind_bo_offset);
unsigned local_id_dwords = 0;
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
unsigned push_constant_data_size =
(prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value);
unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
unsigned push_constant_regs = reg_aligned_constant_size / 32;
unsigned threads = get_cs_thread_count(cs_prog_data);
uint32_t dwords = brw->gen < 8 ? 8 : 9;
BEGIN_BATCH(dwords);
OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));
@@ -129,7 +106,9 @@ brw_upload_cs_state(struct brw_context *brw)
*
* Note: The constant data is built in brw_upload_cs_push_constants below.
*/
const uint32_t vfe_curbe_allocation = push_constant_regs * threads;
const uint32_t vfe_curbe_allocation =
ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
cs_prog_data->push.cross_thread.regs, 2);
OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
OUT_BATCH(0);
@@ -137,11 +116,11 @@ brw_upload_cs_state(struct brw_context *brw)
OUT_BATCH(0);
ADVANCE_BATCH();
if (reg_aligned_constant_size > 0) {
if (cs_prog_data->push.total.size > 0) {
BEGIN_BATCH(4);
OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
OUT_BATCH(0);
OUT_BATCH(ALIGN(reg_aligned_constant_size * threads, 64));
OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));
OUT_BATCH(stage_state->push_const_offset);
ADVANCE_BATCH();
}
@@ -160,12 +139,13 @@ brw_upload_cs_state(struct brw_context *brw)
desc[dw++] = stage_state->sampler_offset |
((stage_state->sampler_count + 3) / 4);
desc[dw++] = stage_state->bind_bo_offset;
desc[dw++] = SET_FIELD(push_constant_regs, MEDIA_CURBE_READ_LENGTH);
desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,
MEDIA_CURBE_READ_LENGTH);
const uint32_t media_threads =
brw->gen >= 8 ?
SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT);
assert(threads <= brw->max_cs_threads);
SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT);
assert(cs_prog_data->threads <= brw->max_cs_threads);
assert(prog_data->total_shared <= 64 * 1024);
uint32_t slm_size = 0;
@@ -182,6 +162,9 @@ brw_upload_cs_state(struct brw_context *brw)
SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |
media_threads;
desc[dw++] =
SET_FIELD(cs_prog_data->push.cross_thread.regs, CROSS_THREAD_READ_LENGTH);
BEGIN_BATCH(4);
OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
OUT_BATCH(0);
@@ -224,10 +207,6 @@ brw_upload_cs_push_constants(struct brw_context *brw,
struct gl_context *ctx = &brw->ctx;
const struct brw_stage_prog_data *prog_data =
(struct brw_stage_prog_data*) cs_prog_data;
unsigned local_id_dwords = 0;
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
/* Updates the ParameterValues[i] pointers for all parameters of the
* basic type of PROGRAM_STATE_VAR.
@@ -235,42 +214,52 @@ brw_upload_cs_push_constants(struct brw_context *brw,
/* XXX: Should this happen somewhere before to get our state flag set? */
_mesa_load_state_parameters(ctx, prog->Parameters);
if (prog_data->nr_params == 0 && local_id_dwords == 0) {
if (cs_prog_data->push.total.size == 0) {
stage_state->push_const_size = 0;
} else {
gl_constant_value *param;
unsigned i, t;
return;
}
const unsigned push_constant_data_size =
(local_id_dwords + prog_data->nr_params) * sizeof(gl_constant_value);
const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
const unsigned param_aligned_count =
reg_aligned_constant_size / sizeof(*param);
unsigned threads = get_cs_thread_count(cs_prog_data);
gl_constant_value *param = (gl_constant_value*)
brw_state_batch(brw, type, ALIGN(cs_prog_data->push.total.size, 64),
64, &stage_state->push_const_offset);
assert(param);
param = (gl_constant_value*)
brw_state_batch(brw, type,
ALIGN(reg_aligned_constant_size * threads, 64),
64, &stage_state->push_const_offset);
assert(param);
STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
if (cs_prog_data->push.cross_thread.size > 0) {
gl_constant_value *param_copy = param;
assert(cs_prog_data->thread_local_id_index < 0 ||
cs_prog_data->thread_local_id_index >=
cs_prog_data->push.cross_thread.dwords);
for (unsigned i = 0;
i < cs_prog_data->push.cross_thread.dwords;
i++) {
param_copy[i] = *prog_data->param[i];
}
}
brw_cs_fill_local_id_payload(cs_prog_data, param, threads,
reg_aligned_constant_size);
/* _NEW_PROGRAM_CONSTANTS */
for (t = 0; t < threads; t++) {
gl_constant_value *next_param =
&param[t * param_aligned_count + local_id_dwords];
for (i = 0; i < prog_data->nr_params; i++) {
next_param[i] = *prog_data->param[i];
gl_constant_value thread_id;
if (cs_prog_data->push.per_thread.size > 0) {
for (unsigned t = 0; t < cs_prog_data->threads; t++) {
unsigned dst =
8 * (cs_prog_data->push.per_thread.regs * t +
cs_prog_data->push.cross_thread.regs);
unsigned src = cs_prog_data->push.cross_thread.dwords;
for ( ; src < prog_data->nr_params; src++, dst++) {
if (src != cs_prog_data->thread_local_id_index)
param[dst] = *prog_data->param[src];
else {
thread_id.u = t * cs_prog_data->simd_size;
param[dst] = thread_id;
}
}
}
stage_state->push_const_size = ALIGN(prog_data->nr_params, 8) / 8;
}
stage_state->push_const_size =
cs_prog_data->push.cross_thread.regs +
cs_prog_data->push.per_thread.regs;
}

View File

@@ -123,7 +123,7 @@ gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
unsigned component_mask = (1 << components) - 1;
unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
assert(stream_id < MAX_VERTEX_STREAMS);
/* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
@@ -145,7 +145,7 @@ gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
buffer_mask[stream_id] |= 1 << buffer;
decl |= buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
decl |= decl_buffer_slot;
if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] <<
SO_DECL_REGISTER_INDEX_SHIFT;
@@ -172,12 +172,14 @@ gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
next_offset[buffer] += skip_components;
while (skip_components >= 4) {
so_decl[stream_id][decls[stream_id]++] = SO_DECL_HOLE_FLAG | 0xf;
so_decl[stream_id][decls[stream_id]++] =
SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot;
skip_components -= 4;
}
if (skip_components > 0)
so_decl[stream_id][decls[stream_id]++] =
SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1);
SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) |
decl_buffer_slot;
assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);

View File

@@ -40,6 +40,7 @@
#include "brw_state.h"
#include "brw_defines.h"
#include "brw_wm.h"
#include "isl/isl.h"
/**
* Convert an swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
@@ -254,8 +255,18 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
* the color buffer should always have been resolved before it is used as
* a texture so there is no need for it. On Gen9 it will be uploaded when
* the surface is losslessly compressed (CCS_E).
* However, the sampling engine is not capable of re-interpreting the
* underlying color buffer in non-compressible formats when the surface
* is configured as compressed. Therefore, state upload has made sure the
* buffer is in the resolved state, allowing the surface to be configured
* as non-compressed.
*/
if (mt->num_samples <= 1 && aux_mode != GEN9_SURFACE_AUX_MODE_CCS_E) {
if (mt->num_samples <= 1 &&
(aux_mode != GEN9_SURFACE_AUX_MODE_CCS_E ||
!isl_format_supports_lossless_compression(
brw->intelScreen->devinfo, format))) {
assert(!mt->mcs_mt ||
mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_RESOLVED);
aux_mt = NULL;
aux_mode = GEN8_SURFACE_AUX_MODE_NONE;
}

View File

@@ -80,6 +80,7 @@ static const struct debug_control debug_control[] = {
{ "tes", DEBUG_TES },
{ "l3", DEBUG_L3 },
{ "do32", DEBUG_DO32 },
{ "norbc", DEBUG_NO_RBC },
{ NULL, 0 }
};

View File

@@ -73,6 +73,7 @@ extern uint64_t INTEL_DEBUG;
#define DEBUG_TES (1ull << 37)
#define DEBUG_L3 (1ull << 38)
#define DEBUG_DO32 (1ull << 39)
#define DEBUG_NO_RBC (1ull << 40)
#ifdef HAVE_ANDROID_PLATFORM
#define LOG_TAG "INTEL-MESA"

View File

@@ -1620,7 +1620,9 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
* single-sampled buffers. Disabling compression allows us to skip
* resolves.
*/
const bool lossless_compression_disabled = INTEL_DEBUG & DEBUG_NO_RBC;
const bool is_lossless_compressed =
unlikely(!lossless_compression_disabled) &&
brw->gen >= 9 && !mt->is_scanout &&
intel_miptree_supports_lossless_compressed(brw, mt);

View File

@@ -1765,7 +1765,7 @@ _mesa_buffer_sub_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
const char *func)
{
if (!buffer_object_subdata_range_good(ctx, bufObj, offset, size,
false, func)) {
true, func)) {
/* error already recorded */
return;
}

View File

@@ -389,7 +389,8 @@ driver_RenderTexture_is_safe(const struct gl_renderbuffer_attachment *att)
const struct gl_texture_image *const texImage =
att->Texture->Image[att->CubeMapFace][att->TextureLevel];
if (texImage->Width == 0 || texImage->Height == 0 || texImage->Depth == 0)
if (!texImage ||
texImage->Width == 0 || texImage->Height == 0 || texImage->Depth == 0)
return false;
if ((texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY

View File

@@ -3763,6 +3763,9 @@ struct gl_constants
GLuint MaxTessControlTotalOutputComponents;
bool LowerTessLevel; /**< Lower gl_TessLevel* from float[n] to vecn? */
bool PrimitiveRestartForPatches;
bool LowerCsDerivedVariables; /**< Lower gl_GlobalInvocationID and
* gl_LocalInvocationIndex based on
* other builtin variables. */
};

View File

@@ -811,10 +811,8 @@ _mesa_SamplerParameteri(GLuint sampler, GLenum pname, GLint param)
* "An INVALID_OPERATION error is generated if sampler is not the name
* of a sampler object previously returned from a call to GenSamplers."
*
* In desktop GL, an GL_INVALID_VALUE is returned instead.
*/
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glSamplerParameteri(sampler %u)", sampler);
return;
}
@@ -904,10 +902,8 @@ _mesa_SamplerParameterf(GLuint sampler, GLenum pname, GLfloat param)
* "An INVALID_OPERATION error is generated if sampler is not the name
* of a sampler object previously returned from a call to GenSamplers."
*
* In desktop GL, an GL_INVALID_VALUE is returned instead.
*/
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glSamplerParameterf(sampler %u)", sampler);
return;
}
@@ -995,11 +991,8 @@ _mesa_SamplerParameteriv(GLuint sampler, GLenum pname, const GLint *params)
*
* "An INVALID_OPERATION error is generated if sampler is not the name
* of a sampler object previously returned from a call to GenSamplers."
*
* In desktop GL, an GL_INVALID_VALUE is returned instead.
*/
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glSamplerParameteriv(sampler %u)", sampler);
return;
}
@@ -1096,10 +1089,8 @@ _mesa_SamplerParameterfv(GLuint sampler, GLenum pname, const GLfloat *params)
* "An INVALID_OPERATION error is generated if sampler is not the name
* of a sampler object previously returned from a call to GenSamplers."
*
* In desktop GL, an GL_INVALID_VALUE is returned instead.
*/
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glSamplerParameterfv(sampler %u)", sampler);
return;
}
@@ -1184,8 +1175,7 @@ _mesa_SamplerParameterIiv(GLuint sampler, GLenum pname, const GLint *params)
sampObj = _mesa_lookup_samplerobj(ctx, sampler);
if (!sampObj) {
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glSamplerParameterIiv(sampler %u)", sampler);
return;
}
@@ -1271,8 +1261,7 @@ _mesa_SamplerParameterIuiv(GLuint sampler, GLenum pname, const GLuint *params)
sampObj = _mesa_lookup_samplerobj(ctx, sampler);
if (!sampObj) {
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glSamplerParameterIuiv(sampler %u)", sampler);
return;
}
@@ -1362,10 +1351,8 @@ _mesa_GetSamplerParameteriv(GLuint sampler, GLenum pname, GLint *params)
* "An INVALID_OPERATION error is generated if sampler is not the name
* of a sampler object previously returned from a call to GenSamplers."
*
* In desktop GL, an GL_INVALID_VALUE is returned instead.
*/
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glGetSamplerParameteriv(sampler %u)", sampler);
return;
}
@@ -1456,10 +1443,8 @@ _mesa_GetSamplerParameterfv(GLuint sampler, GLenum pname, GLfloat *params)
* "An INVALID_OPERATION error is generated if sampler is not the name
* of a sampler object previously returned from a call to GenSamplers."
*
* In desktop GL, an GL_INVALID_VALUE is returned instead.
*/
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glGetSamplerParameterfv(sampler %u)", sampler);
return;
}
@@ -1533,8 +1518,7 @@ _mesa_GetSamplerParameterIiv(GLuint sampler, GLenum pname, GLint *params)
sampObj = _mesa_lookup_samplerobj(ctx, sampler);
if (!sampObj) {
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glGetSamplerParameterIiv(sampler %u)",
sampler);
return;
@@ -1609,8 +1593,7 @@ _mesa_GetSamplerParameterIuiv(GLuint sampler, GLenum pname, GLuint *params)
sampObj = _mesa_lookup_samplerobj(ctx, sampler);
if (!sampObj) {
_mesa_error(ctx, (_mesa_is_gles(ctx) ?
GL_INVALID_OPERATION : GL_INVALID_VALUE),
_mesa_error(ctx, GL_INVALID_OPERATION,
"glGetSamplerParameterIuiv(sampler %u)",
sampler);
return;

View File

@@ -505,7 +505,7 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
if (rname_last_square_bracket) {
baselen_without_array_index -= strlen(rname_last_square_bracket);
rname_has_array_index_zero =
(strncmp(rname_last_square_bracket, "[0]\0", 4) == 0) &&
(strcmp(rname_last_square_bracket, "[0]") == 0) &&
(baselen_without_array_index == strlen(name));
}

View File

@@ -232,7 +232,7 @@ init_shader_program(struct gl_shader_program *prog)
prog->FragDataBindings = string_to_uint_map_ctor();
prog->FragDataIndexBindings = string_to_uint_map_ctor();
prog->Geom.VerticesOut = 0;
prog->Geom.VerticesOut = -1;
prog->Geom.InputType = GL_TRIANGLES;
prog->Geom.OutputType = GL_TRIANGLE_STRIP;
prog->Geom.UsesEndPrimitive = false;

View File

@@ -314,6 +314,7 @@ void st_init_limits(struct pipe_screen *screen,
}
c->LowerTessLevel = true;
c->LowerCsDerivedVariables = true;
c->PrimitiveRestartForPatches =
screen->get_param(screen, PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES);