Update version to 17.0.0-rc3

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
etnaviv: force vertex buffers through the MMU
2017-02-06 13:18:13 +00:00 · 2017-02-03 11:18:53 +00:00 · 2017-02-03 11:12:16 +00:00 · 2017-02-03 11:09:00 +00:00 · 2017-02-03 11:08:59 +00:00 · 2017-02-03 11:08:59 +00:00
94 changed files with 994 additions and 356 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -43,6 +43,7 @@ LOCAL_CFLAGS += \
 	-DANDROID_VERSION=0x0$(MESA_ANDROID_MAJOR_VERSION)0$(MESA_ANDROID_MINOR_VERSION)

 LOCAL_CFLAGS += \
+	-DENABLE_SHADER_CACHE \
 	-D__STDC_LIMIT_MACROS \
 	-DHAVE___BUILTIN_EXPECT \
 	-DHAVE___BUILTIN_FFS \
@@ -77,10 +78,22 @@ endif

 ifeq ($(MESA_ENABLE_LLVM),true)
 LOCAL_CFLAGS += \
-	-DHAVE_LLVM=0x0305 -DMESA_LLVM_VERSION_PATCH=2 \
 	-D__STDC_CONSTANT_MACROS \
 	-D__STDC_FORMAT_MACROS \
 	-D__STDC_LIMIT_MACROS
+
+  ifeq ($(MESA_ANDROID_MAJOR_VERSION),5)
+    LOCAL_CFLAGS += -DHAVE_LLVM=0x0305 -DMESA_LLVM_VERSION_PATCH=2
+    ELF_INCLUDES := external/elfutils/0.153/libelf
+  endif
+  ifeq ($(MESA_ANDROID_MAJOR_VERSION),6)
+    LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_PATCH=0
+    ELF_INCLUDES := external/elfutils/src/libelf
+  endif
+  ifeq ($(MESA_ANDROID_MAJOR_VERSION),7)
+    LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_PATCH=0
+    ELF_INCLUDES := external/elfutils/libelf
+  endif
 endif

 ifneq ($(LOCAL_IS_HOST_MODULE),true)
--- a/2
+++ b/2
@@ -1 +1 @@
-17.0.0-devel
+17.0.0-rc3
--- a/configure.ac
+++ b/configure.ac
@@ -1436,6 +1436,22 @@ if test "x$enable_gallium_osmesa" = xyes; then
    fi
 fi

+require_dri_shared_libs_and_glapi() {
+    if test "x$enable_static" = xyes; then
+        AC_MSG_ERROR([$1 cannot be build as static library])
+    fi
+
+    if test "x$enable_dri" != xyes; then
+        # There is only a single backend which won't be build/used otherwise.
+        # XXX: Revisit this as the egl/haiku is a thing.
+        AC_MSG_ERROR([$1 requires --enable-dri])
+    fi
+
+    if test "x$enable_shared_glapi" != xyes; then
+        AC_MSG_ERROR([$1 requires --enable-shared-glapi])
+    fi
+}
+
 if test "x$enable_dri" = xyes; then
    require_dri_shared_libs_and_glapi "DRI"

@@ -1722,7 +1738,7 @@ fi
 AC_ARG_WITH([vulkan-drivers],
    [AS_HELP_STRING([--with-vulkan-drivers@<:@=DIRS...@:>@],
        [comma delimited Vulkan drivers list, e.g.
-        "intel"
+        "intel,radeon"
        @<:@default=no@:>@])],
    [with_vulkan_drivers="$withval"],
    [with_vulkan_drivers="no"])
@@ -1766,6 +1782,7 @@ if test -n "$with_vulkan_drivers"; then
 fi


+DEFINES="$DEFINES -DENABLE_SHADER_CACHE"
 AM_CONDITIONAL(NEED_MEGADRIVER, test -n "$DRI_DIRS")
 AM_CONDITIONAL(NEED_LIBMESA, test "x$enable_glx" = xxlib -o \
                                  "x$enable_osmesa" = xyes -o \
@@ -1814,22 +1831,6 @@ AC_SUBST([OSMESA_LIB_DEPS])
 AC_SUBST([OSMESA_PC_REQ])
 AC_SUBST([OSMESA_PC_LIB_PRIV])

-require_dri_shared_libs_and_glapi() {
-    if test "x$enable_static" = xyes; then
-        AC_MSG_ERROR([$1 cannot be build as static library])
-    fi
-
-    if test "x$enable_dri" != xyes; then
-        # There is only a single backend which won't be build/used otherwise.
-        # XXX: Revisit this as the egl/haiku is a thing.
-        AC_MSG_ERROR([$1 requires --enable-dri])
-    fi
-
-    if test "x$enable_shared_glapi" != xyes; then
-        AC_MSG_ERROR([$1 requires --enable-shared-glapi])
-    fi
-}
-
 dnl
 dnl gbm configuration
 dnl
@@ -2211,6 +2212,19 @@ gallium_require_llvm() {
    fi
 }

+dnl
+dnl r300 doesn't strictly require LLVM, but for performance reasons we
+dnl highly recommend LLVM usage. So require it at least on x86 and x86_64
+dnl architectures.
+dnl
+r300_require_llvm() {
+    case "$host" in *gnux32) return;; esac
+    case "$host_cpu" in
+    i*86|x86_64|amd64) gallium_require_llvm $1
+        ;;
+    esac
+}
+
 dnl
 dnl DRM is needed by X, Wayland, and offscreen rendering.
 dnl Surfaceless is an alternative for the last one.
@@ -2297,7 +2311,7 @@ if test -n "$with_gallium_drivers"; then
            HAVE_GALLIUM_R300=yes
            PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
            require_libdrm "r300"
-            gallium_require_llvm "r300"
+            r300_require_llvm "r300"
            ;;
        xr600)
            HAVE_GALLIUM_R600=yes
--- a/src/amd/Android.common.mk
+++ b/src/amd/Android.common.mk
@@ -55,7 +55,7 @@ LOCAL_C_INCLUDES := \
 	external/llvm/include \
 	external/llvm/device/include \
 	external/libcxx/include \
-	external/elfutils/$(if $(filter 5,$(MESA_ANDROID_MAJOR_VERSION)),0.153/,$(if $(filter 6,$(MESA_ANDROID_MAJOR_VERSION)),src/))libelf
+	$(ELF_INCLUDES)

 LOCAL_STATIC_LIBRARIES := libLLVMCore

--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1267,6 +1267,9 @@ static void visit_alu(struct nir_to_llvm_context *ctx, nir_alu_instr *instr)
 		src[1] = to_float(ctx, src[1]);
 		result = LLVMBuildFRem(ctx->builder, src[0], src[1], "");
 		break;
+	case nir_op_irem:
+		result = LLVMBuildSRem(ctx->builder, src[0], src[1], "");
+		break;
 	case nir_op_idiv:
 		result = LLVMBuildSDiv(ctx->builder, src[0], src[1], "");
 		break;
@@ -1745,9 +1748,12 @@ static LLVMValueRef visit_vulkan_resource_index(struct nir_to_llvm_context *ctx,
 static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
                                             nir_intrinsic_instr *instr)
 {
-	LLVMValueRef ptr;
+	LLVMValueRef ptr, addr;

-	ptr = build_gep0(ctx, ctx->push_constants, get_src(ctx, instr->src[0]));
+	addr = LLVMConstInt(ctx->i32, nir_intrinsic_base(instr), 0);
+	addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx, instr->src[0]), "");
+
+	ptr = build_gep0(ctx, ctx->push_constants, addr);
 	ptr = cast_ptr(ctx, ptr, get_def_type(ctx, &instr->dest.ssa));

 	return LLVMBuildLoad(ctx->builder, ptr, "");
@@ -2238,7 +2244,7 @@ static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
 }

 static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx,
-				     nir_intrinsic_instr *instr, bool add_frag_pos)
+				     nir_intrinsic_instr *instr)
 {
 	const struct glsl_type *type = instr->variables[0]->var->type;
 	if(instr->variables[0]->deref.child)
@@ -2253,6 +2259,8 @@ static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx,
 	LLVMValueRef res;
 	int count;
 	enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+	bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS ||
+			     dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
 	bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
 		      dim == GLSL_SAMPLER_DIM_SUBPASS_MS);

@@ -2378,12 +2386,11 @@ static LLVMValueRef visit_image_load(struct nir_to_llvm_context *ctx,
 	} else {
 		bool is_da = glsl_sampler_type_is_array(type) ||
 			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
-		bool add_frag_pos = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS;
 		LLVMValueRef da = is_da ? ctx->i32one : ctx->i32zero;
 		LLVMValueRef glc = LLVMConstInt(ctx->i1, 0, false);
 		LLVMValueRef slc = LLVMConstInt(ctx->i1, 0, false);

-		params[0] = get_image_coords(ctx, instr, add_frag_pos);
+		params[0] = get_image_coords(ctx, instr);
 		params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[2] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
 		if (HAVE_LLVM <= 0x0309) {
@@ -2442,7 +2449,7 @@ static void visit_image_store(struct nir_to_llvm_context *ctx,
 		LLVMValueRef slc = i1false;

 		params[0] = to_float(ctx, get_src(ctx, instr->src[2]));
-		params[1] = get_image_coords(ctx, instr, false); /* coords */
+		params[1] = get_image_coords(ctx, instr); /* coords */
 		params[2] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[3] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
 		if (HAVE_LLVM <= 0x0309) {
@@ -2502,7 +2509,7 @@ static LLVMValueRef visit_image_atomic(struct nir_to_llvm_context *ctx,
 		bool da = glsl_sampler_type_is_array(type) ||
 		          glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;

-		coords = params[param_count++] = get_image_coords(ctx, instr, false);
+		coords = params[param_count++] = get_image_coords(ctx, instr);
 		params[param_count++] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[param_count++] = i1false; /* r128 */
 		params[param_count++] = da ? i1true : i1false;      /* da */
@@ -3154,6 +3161,15 @@ static void tex_fetch_ptrs(struct nir_to_llvm_context *ctx,
 		*fmask_ptr = get_sampler_desc(ctx, instr->texture, DESC_FMASK);
 }

+static LLVMValueRef apply_round_slice(struct nir_to_llvm_context *ctx,
+				      LLVMValueRef coord)
+{
+	coord = to_float(ctx, coord);
+	coord = ac_emit_llvm_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32, &coord, 1, 0);
+	coord = to_integer(ctx, coord);
+	return coord;
+}
+
 static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
 {
 	LLVMValueRef result = NULL;
@@ -3211,6 +3227,11 @@ static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
 		}
 	}

+	if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
+		result = get_buffer_size(ctx, res_ptr, false);
+		goto write_result;
+	}
+
 	if (instr->op == nir_texop_texture_samples) {
 		LLVMValueRef res, samples, is_msaa;
 		res = LLVMBuildBitCast(ctx->builder, res_ptr, ctx->v8i32, "");
@@ -3310,15 +3331,16 @@ static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
 	/* Pack texture coordinates */
 	if (coord) {
 		address[count++] = coords[0];
-		if (instr->coord_components > 1)
+		if (instr->coord_components > 1) {
+			if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array && instr->op != nir_texop_txf) {
+				coords[1] = apply_round_slice(ctx, coords[1]);
+			}
 			address[count++] = coords[1];
+		}
 		if (instr->coord_components > 2) {
 			/* This seems like a bit of a hack - but it passes Vulkan CTS with it */
 			if (instr->sampler_dim != GLSL_SAMPLER_DIM_3D && instr->op != nir_texop_txf) {
-				coords[2] = to_float(ctx, coords[2]);
-				coords[2] = ac_emit_llvm_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32, &coords[2],
-								1, 0);
-				coords[2] = to_integer(ctx, coords[2]);
+				coords[2] = apply_round_slice(ctx, coords[2]);
 			}
 			address[count++] = coords[2];
 		}
--- a/src/amd/vulkan/Makefile.am
+++ b/src/amd/vulkan/Makefile.am
@@ -21,9 +21,7 @@

 include Makefile.sources

-vulkan_includedir = $(includedir)/vulkan
-
-vulkan_include_HEADERS = \
+noinst_HEADERS = \
 	$(top_srcdir)/include/vulkan/vk_platform.h \
 	$(top_srcdir)/include/vulkan/vulkan.h

@@ -32,9 +30,6 @@ lib_LTLIBRARIES = libvulkan_radeon.la
 # The gallium includes are for the util/u_math.h include from main/macros.h

 AM_CPPFLAGS = \
-	$(AMDGPU_CFLAGS) \
-	$(VALGRIND_CFLAGS) \
-	$(DEFINES) \
 	-I$(top_srcdir)/include \
 	-I$(top_builddir)/src \
 	-I$(top_srcdir)/src \
@@ -48,7 +43,10 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/gallium/include
+	-I$(top_srcdir)/src/gallium/include \
+	$(AMDGPU_CFLAGS) \
+	$(VALGRIND_CFLAGS) \
+	$(DEFINES)

 AM_CFLAGS = \
 	$(VISIBILITY_CFLAGS) \
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -438,7 +438,8 @@ radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer,
 			       raster->spi_interp_control);

 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028A00_PA_SU_POINT_SIZE, 2);
-	radeon_emit(cmd_buffer->cs, 0);
+	unsigned tmp = (unsigned)(1.0 * 8.0);
+	radeon_emit(cmd_buffer->cs, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
 	radeon_emit(cmd_buffer->cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
 		    S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2))); /* R_028A04_PA_SU_POINT_MINMAX */

@@ -2605,6 +2606,7 @@ void radv_CmdPipelineBarrier(
 			break;
 		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 		case VK_ACCESS_TRANSFER_READ_BIT:
+		case VK_ACCESS_TRANSFER_WRITE_BIT:
 		case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
 			flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | RADV_CMD_FLAG_INV_GLOBAL_L2;
 		default:
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -989,8 +989,7 @@ VkResult radv_QueueSubmit(
 			if (queue->device->trace_bo)
 				*queue->device->trace_id_ptr = 0;

-			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array,
-							pSubmits[i].commandBufferCount,
+			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, advance,
 							(struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
 							b ? pSubmits[i].waitSemaphoreCount : 0,
 							(struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -30,7 +30,7 @@

 /* Operations for lower_instructions() */
 #define SUB_TO_ADD_NEG     0x01
-#define DIV_TO_MUL_RCP     0x02
+#define FDIV_TO_MUL_RCP    0x02
 #define EXP_TO_EXP2        0x04
 #define POW_TO_EXP2        0x08
 #define LOG_TO_LOG2        0x10
@@ -49,6 +49,8 @@
 #define FIND_LSB_TO_FLOAT_CAST    0x20000
 #define FIND_MSB_TO_FLOAT_CAST    0x40000
 #define IMUL_HIGH_TO_MUL          0x80000
+#define DDIV_TO_MUL_RCP           0x100000
+#define DIV_TO_MUL_RCP            (FDIV_TO_MUL_RCP | DDIV_TO_MUL_RCP)

 /**
 * \see class lower_packing_builtins_visitor
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -535,7 +535,7 @@ private:
            const char *str_end;
            while((str_start = strchr(name_copy, '[')) &&
                  (str_end = strchr(name_copy, ']'))) {
-               memmove(str_start, str_end + 1, 1 + strlen(str_end));
+               memmove(str_start, str_end + 1, 1 + strlen(str_end + 1));
            }

            unsigned index = 0;
--- a/src/compiler/glsl/lower_instructions.cpp
+++ b/src/compiler/glsl/lower_instructions.cpp
@@ -54,8 +54,8 @@
 * want to recognize add(op0, neg(op1)) or the other way around to
 * produce a subtract anyway.
 *
- * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
- * --------------------------------------
+ * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
+ * ---------------------------------------------------------
 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
 *
 * Many GPUs don't have a divide instruction (945 and 965 included),
@@ -63,9 +63,11 @@
 * reciprocal.  By breaking the operation down, constant reciprocals
 * can get constant folded.
 *
- * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
- * handles the integer case, converting to and from floating point so that
- * RCP is possible.
+ * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
+ * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
+ * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
+ * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
+ * point so that RCP is possible.
 *
 * EXP_TO_EXP2 and LOG_TO_LOG2:
 * ----------------------------
@@ -326,7 +328,8 @@ lower_instructions_visitor::mod_to_floor(ir_expression *ir)
   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */
-   if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
+   if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) ||
+       (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double()))
      div_to_mul_rcp(div_expr);

   ir_expression *const floor_expr =
@@ -1599,8 +1602,8 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
   case ir_binop_div:
      if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
 	 int_div_to_mul_rcp(ir);
-      else if ((ir->operands[1]->type->is_float() ||
-                ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
+      else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) ||
+               (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP)))
 	 div_to_mul_rcp(ir);
      break;

--- a/src/compiler/glsl/tests/cache_test.c
+++ b/src/compiler/glsl/tests/cache_test.c
@@ -37,6 +37,8 @@

 bool error = false;

+#ifdef ENABLE_SHADER_CACHE
+
 static void
 expect_equal(uint64_t actual, uint64_t expected, const char *test)
 {
@@ -378,10 +380,12 @@ test_put_key_and_get_key(void)

   disk_cache_destroy(cache);
 }
+#endif /* ENABLE_SHADER_CACHE */

 int
 main(void)
 {
+#ifdef ENABLE_SHADER_CACHE
   int err;

   test_disk_cache_create();
@@ -392,6 +396,7 @@ main(void)

   err = rmrf_local(CACHE_TEST_TMP);
   expect_equal(err, 0, "Removing " CACHE_TEST_TMP " again");
+#endif /* ENABLE_SHADER_CACHE */

   return error ? 1 : 0;
 }
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -210,43 +210,27 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
         return true;

      case nir_type_int:
-         for (unsigned i = 0; i < num_components; ++i) {
-            int64_t val;
-            switch (load->def.bit_size) {
-            case 32:
-               val = load->value.i32[new_swizzle[i]];
-               break;
-            case 64:
-               val = load->value.i64[new_swizzle[i]];
-               break;
-            default:
-               unreachable("unknown bit size");
-            }
-
-            if (val != const_val->data.i)
-               return false;
-         }
-         return true;
-
      case nir_type_uint:
      case nir_type_bool32:
-         for (unsigned i = 0; i < num_components; ++i) {
-            uint64_t val;
-            switch (load->def.bit_size) {
-            case 32:
-               val = load->value.u32[new_swizzle[i]];
-               break;
-            case 64:
-               val = load->value.u64[new_swizzle[i]];
-               break;
-            default:
-               unreachable("unknown bit size");
+         switch (load->def.bit_size) {
+         case 32:
+            for (unsigned i = 0; i < num_components; ++i) {
+               if (load->value.u32[new_swizzle[i]] !=
+                   (uint32_t)const_val->data.u)
+                  return false;
            }
+            return true;

-            if (val != const_val->data.u)
-               return false;
+         case 64:
+            for (unsigned i = 0; i < num_components; ++i) {
+               if (load->value.u64[new_swizzle[i]] != const_val->data.u)
+                  return false;
+            }
+            return true;
+
+         default:
+            unreachable("unknown bit size");
         }
-         return true;

      default:
         unreachable("Invalid alu source type");
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -1102,23 +1102,43 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
      SpvOp opcode = get_specialization(b, val, w[3]);
      switch (opcode) {
      case SpvOpVectorShuffle: {
-         struct vtn_value *v0 = vtn_value(b, w[4], vtn_value_type_constant);
-         struct vtn_value *v1 = vtn_value(b, w[5], vtn_value_type_constant);
-         unsigned len0 = glsl_get_vector_elements(v0->const_type);
-         unsigned len1 = glsl_get_vector_elements(v1->const_type);
+         struct vtn_value *v0 = &b->values[w[4]];
+         struct vtn_value *v1 = &b->values[w[5]];
+
+         assert(v0->value_type == vtn_value_type_constant ||
+                v0->value_type == vtn_value_type_undef);
+         assert(v1->value_type == vtn_value_type_constant ||
+                v1->value_type == vtn_value_type_undef);
+
+         unsigned len0 = v0->value_type == vtn_value_type_constant ?
+                         glsl_get_vector_elements(v0->const_type) :
+                         glsl_get_vector_elements(v0->type->type);
+         unsigned len1 = v1->value_type == vtn_value_type_constant ?
+                         glsl_get_vector_elements(v1->const_type) :
+                         glsl_get_vector_elements(v1->type->type);

         assert(len0 + len1 < 16);

         unsigned bit_size = glsl_get_bit_size(val->const_type);
-         assert(bit_size == glsl_get_bit_size(v0->const_type) &&
-                bit_size == glsl_get_bit_size(v1->const_type));
+         unsigned bit_size0 = v0->value_type == vtn_value_type_constant ?
+                              glsl_get_bit_size(v0->const_type) :
+                              glsl_get_bit_size(v0->type->type);
+         unsigned bit_size1 = v1->value_type == vtn_value_type_constant ?
+                              glsl_get_bit_size(v1->const_type) :
+                              glsl_get_bit_size(v1->type->type);
+
+         assert(bit_size == bit_size0 && bit_size == bit_size1);

         if (bit_size == 64) {
            uint64_t u64[8];
-            for (unsigned i = 0; i < len0; i++)
-               u64[i] = v0->constant->values[0].u64[i];
-            for (unsigned i = 0; i < len1; i++)
-               u64[len0 + i] = v1->constant->values[0].u64[i];
+            if (v0->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len0; i++)
+                  u64[i] = v0->constant->values[0].u64[i];
+            }
+            if (v1->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len1; i++)
+                  u64[len0 + i] = v1->constant->values[0].u64[i];
+            }

            for (unsigned i = 0, j = 0; i < count - 6; i++, j++) {
               uint32_t comp = w[i + 6];
@@ -1132,11 +1152,14 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
            }
         } else {
            uint32_t u32[8];
-            for (unsigned i = 0; i < len0; i++)
-               u32[i] = v0->constant->values[0].u32[i];
-
-            for (unsigned i = 0; i < len1; i++)
-               u32[len0 + i] = v1->constant->values[0].u32[i];
+            if (v0->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len0; i++)
+                  u32[i] = v0->constant->values[0].u32[i];
+            }
+            if (v1->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len1; i++)
+                  u32[len0 + i] = v1->constant->values[0].u32[i];
+            }

            for (unsigned i = 0, j = 0; i < count - 6; i++, j++) {
               uint32_t comp = w[i + 6];
@@ -2902,6 +2925,7 @@ vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode,
      vtn_handle_constant(b, opcode, w, count);
      break;

+   case SpvOpUndef:
   case SpvOpVariable:
      vtn_handle_variables(b, opcode, w, count);
      break;
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -1199,7 +1199,8 @@ var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member,
         is_vertex_input = false;
         location += vtn_var->patch ? VARYING_SLOT_PATCH0 : VARYING_SLOT_VAR0;
      } else {
-         unreachable("Location must be on input or output variable");
+         vtn_warn("Location must be on input or output variable");
+         return;
      }

      if (vtn_var->var) {
@@ -1267,6 +1268,12 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
                     const uint32_t *w, unsigned count)
 {
   switch (opcode) {
+   case SpvOpUndef: {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_undef);
+      val->type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      break;
+   }
+
   case SpvOpVariable: {
      struct vtn_variable *var = rzalloc(b, struct vtn_variable);
      var->type = vtn_value(b, w[1], vtn_value_type_type)->type;
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -96,8 +96,8 @@ AM_CFLAGS += \
 	-I$(top_srcdir)/src/egl/drivers/dri2 \
 	-I$(top_srcdir)/src/gbm/backends/dri \
 	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
-	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
 	-I$(top_builddir)/src/egl/wayland/wayland-drm \
+	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
 	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
 	-D_EGL_BUILT_IN_DRIVER_DRI2

--- a/src/gallium/Android.common.mk
+++ b/src/gallium/Android.common.mk
@@ -34,7 +34,7 @@ LOCAL_C_INCLUDES += \
 	external/llvm/include \
 	external/llvm/device/include \
 	external/libcxx/include \
-	external/elfutils/$(if $(filter true,$(MESA_LOLLIPOP_BUILD)),0.153/)libelf
+	$(ELF_INCLUDES)
 endif

 include $(MESA_COMMON_MK)
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -527,7 +527,7 @@ lp_build_gather(struct gallivm_state *gallivm,
      if (vec_zext) {
         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
         if (vector_justify) {
-#if PIPE_ARCH_BIG_ENDIAN
+#ifdef PIPE_ARCH_BIG_ENDIAN
            unsigned sv = dst_type.width - src_width;
            res = LLVMBuildShl(gallivm->builder, res,
                               lp_build_const_int_vec(gallivm, res_type, sv), "");
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -2624,7 +2624,6 @@ lp_set_default_actions_cpu(
   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;

-   bld_base->op_actions[TGSI_OPCODE_DDIV].emit = div_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;

--- a/src/gallium/auxiliary/hud/hud_cpufreq.c
+++ b/src/gallium/auxiliary/hud/hud_cpufreq.c
@@ -149,6 +149,7 @@ hud_cpufreq_graph_install(struct hud_pane *pane, int cpu_index,
      break;
   case CPUFREQ_MAXIMUM:
      snprintf(gr->name, sizeof(gr->name), "%s-Max", cfi->name);
+      break;
   default:
      return;
   }
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -209,6 +209,16 @@ micro_dadd(union tgsi_double_channel *dst,
   dst->d[3] = src[0].d[3] + src[1].d[3];
 }

+static void
+micro_ddiv(union tgsi_double_channel *dst,
+          const union tgsi_double_channel *src)
+{
+   dst->d[0] = src[0].d[0] / src[1].d[0];
+   dst->d[1] = src[0].d[1] / src[1].d[1];
+   dst->d[2] = src[0].d[2] / src[1].d[2];
+   dst->d[3] = src[0].d[3] / src[1].d[3];
+}
+
 static void
 micro_ddx(union tgsi_exec_channel *dst,
          const union tgsi_exec_channel *src)
@@ -5995,6 +6005,10 @@ exec_instruction(
      exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
      break;

+   case TGSI_OPCODE_DDIV:
+      exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
+      break;
+
   case TGSI_OPCODE_DMUL:
      exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
      break;
--- a/src/gallium/drivers/etnaviv/etnaviv_compiler.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
@@ -1021,7 +1021,7 @@ label_mark_use(struct etna_compile *c, struct etna_compile_label *label)
 static struct etna_compile_frame *
 find_frame(struct etna_compile *c, enum etna_compile_frame_type type)
 {
-   for (unsigned sp = c->frame_sp; sp >= 0; sp--)
+   for (int sp = c->frame_sp; sp >= 0; sp--)
      if (c->frame_stack[sp].type == type)
         return &c->frame_stack[sp];

@@ -1444,7 +1444,42 @@ static void
 trans_trig(const struct instr_translater *t, struct etna_compile *c,
           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
 {
-   if (c->specs->has_sin_cos_sqrt) {
+   if (c->specs->has_new_sin_cos) { /* Alternative SIN/COS */
+      /* On newer chips alternative SIN/COS instructions are implemented,
+       * which:
+       * - Need their input scaled by 1/pi instead of 2/pi
+       * - Output an x and y component, which need to be multiplied to
+       *   get the result
+       */
+      /* TGSI lowering should deal with SCS */
+      assert(inst->Instruction.Opcode != TGSI_OPCODE_SCS);
+
+      struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xyz */
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_MUL,
+         .sat = 0,
+         .dst = etna_native_to_dst(temp, INST_COMPS_Z),
+         .src[0] = src[0], /* any swizzling happens here */
+         .src[1] = alloc_imm_f32(c, 1.0f / M_PI),
+      });
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
+                    ? INST_OPCODE_COS
+                    : INST_OPCODE_SIN,
+         .sat = 0,
+         .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
+         .src[2] = etna_native_to_src(temp, SWIZZLE(Z, Z, Z, Z)),
+         .tex = { .amode=1 }, /* Unknown bit needs to be set */
+      });
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_MUL,
+         .sat = inst->Instruction.Saturate,
+         .dst = convert_dst(c, &inst->Dst[0]),
+         .src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
+         .src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
+      });
+
+   } else if (c->specs->has_sin_cos_sqrt) {
      /* TGSI lowering should deal with SCS */
      assert(inst->Instruction.Opcode != TGSI_OPCODE_SCS);

--- a/src/gallium/drivers/etnaviv/etnaviv_emit.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_emit.c
@@ -491,6 +491,23 @@ etna_emit_state(struct etna_context *ctx)
      /*00C14*/ EMIT_STATE(SE_DEPTH_BIAS, rasterizer->SE_DEPTH_BIAS);
      /*00C18*/ EMIT_STATE(SE_CONFIG, rasterizer->SE_CONFIG);
   }
+   if (unlikely(dirty & (ETNA_DIRTY_SCISSOR | ETNA_DIRTY_FRAMEBUFFER |
+                         ETNA_DIRTY_RASTERIZER | ETNA_DIRTY_VIEWPORT))) {
+      struct etna_rasterizer_state *rasterizer = etna_rasterizer_state(ctx->rasterizer);
+
+      uint32_t clip_right =
+         MIN2(ctx->framebuffer.SE_CLIP_RIGHT, ctx->viewport.SE_CLIP_RIGHT);
+      uint32_t clip_bottom =
+         MIN2(ctx->framebuffer.SE_CLIP_BOTTOM, ctx->viewport.SE_CLIP_BOTTOM);
+
+      if (rasterizer->scissor) {
+         clip_right = MIN2(ctx->scissor.SE_CLIP_RIGHT, clip_right);
+         clip_bottom = MIN2(ctx->scissor.SE_CLIP_BOTTOM, clip_bottom);
+      }
+
+      /*00C20*/ EMIT_STATE_FIXP(SE_CLIP_RIGHT, clip_right);
+      /*00C24*/ EMIT_STATE_FIXP(SE_CLIP_BOTTOM, clip_bottom);
+   }
   if (unlikely(dirty & (ETNA_DIRTY_SHADER))) {
      /*00E00*/ EMIT_STATE(RA_CONTROL, ctx->shader_state.RA_CONTROL);
   }
--- a/src/gallium/drivers/etnaviv/etnaviv_internal.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_internal.h
@@ -47,6 +47,17 @@
 /* PE render targets must be aligned to 64 bytes */
 #define ETNA_PE_ALIGNMENT (64)

+/* These demarcate the margin (fixp16) between the computed sizes and the
+  value sent to the chip. These have been set to the numbers used by the
+  Vivante driver on gc2000. They used to be -1 for scissor right and bottom. I
+  am not sure whether older hardware was relying on these or they were just a
+  guess. But if so, these need to be moved to the _specs structure.
+*/
+#define ETNA_SE_SCISSOR_MARGIN_RIGHT (0x1119)
+#define ETNA_SE_SCISSOR_MARGIN_BOTTOM (0x1111)
+#define ETNA_SE_CLIP_MARGIN_RIGHT (0xffff)
+#define ETNA_SE_CLIP_MARGIN_BOTTOM (0xffff)
+
 /* GPU chip 3D specs */
 struct etna_specs {
   /* supports SUPERTILE (64x64) tiling? */
@@ -59,6 +70,8 @@ struct etna_specs {
   unsigned has_sign_floor_ceil : 1;
   /* can use VS_RANGE, PS_RANGE registers*/
   unsigned has_shader_range_registers : 1;
+   /* has the new sin/cos functions */
+   unsigned has_new_sin_cos : 1;
   /* can use any kind of wrapping mode on npot textures */
   unsigned npot_tex_any_wrap;
   /* number of bits per TS tile */
@@ -126,6 +139,8 @@ struct compiled_scissor_state {
   uint32_t SE_SCISSOR_TOP;
   uint32_t SE_SCISSOR_RIGHT;
   uint32_t SE_SCISSOR_BOTTOM;
+   uint32_t SE_CLIP_RIGHT;
+   uint32_t SE_CLIP_BOTTOM;
 };

 /* Compiled pipe_viewport_state */
@@ -140,6 +155,8 @@ struct compiled_viewport_state {
   uint32_t SE_SCISSOR_TOP;
   uint32_t SE_SCISSOR_RIGHT;
   uint32_t SE_SCISSOR_BOTTOM;
+   uint32_t SE_CLIP_RIGHT;
+   uint32_t SE_CLIP_BOTTOM;
   uint32_t PE_DEPTH_NEAR;
   uint32_t PE_DEPTH_FAR;
 };
@@ -162,6 +179,8 @@ struct compiled_framebuffer_state {
   uint32_t SE_SCISSOR_TOP;
   uint32_t SE_SCISSOR_RIGHT;
   uint32_t SE_SCISSOR_BOTTOM;
+   uint32_t SE_CLIP_RIGHT;
+   uint32_t SE_CLIP_BOTTOM;
   uint32_t RA_MULTISAMPLE_UNK00E04;
   uint32_t RA_MULTISAMPLE_UNK00E10[VIVS_RA_MULTISAMPLE_UNK00E10__LEN];
   uint32_t RA_CENTROID_TABLE[VIVS_RA_CENTROID_TABLE__LEN];
--- a/src/gallium/drivers/etnaviv/etnaviv_resource.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_resource.c
@@ -201,7 +201,10 @@ etna_resource_alloc(struct pipe_screen *pscreen, unsigned layout,

   size = setup_miptree(rsc, paddingX, paddingY, msaa_xscale, msaa_yscale);

-   struct etna_bo *bo = etna_bo_new(screen->dev, size, DRM_ETNA_GEM_CACHE_WC);
+   uint32_t flags = DRM_ETNA_GEM_CACHE_WC;
+   if (templat->bind & PIPE_BIND_VERTEX_BUFFER)
+      flags |= DRM_ETNA_GEM_FORCE_MMU;
+   struct etna_bo *bo = etna_bo_new(screen->dev, size, flags);
   if (unlikely(bo == NULL)) {
      BUG("Problem allocating video memory for resource");
      return NULL;
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -469,8 +469,11 @@ etna_screen_is_format_supported(struct pipe_screen *pscreen,
      return FALSE;

   if (usage & PIPE_BIND_RENDER_TARGET) {
-      /* if render target, must be RS-supported format */
-      if (translate_rs_format(format) != ETNA_NO_MATCH) {
+      /* If render target, must be RS-supported format that is not rb swapped.
+       * Exposing rb swapped (or other swizzled) formats for rendering would
+       * involve swizzing in the pixel shader.
+       */
+      if (translate_rs_format(format) != ETNA_NO_MATCH && !translate_rs_format_rb_swap(format)) {
         /* Validate MSAA; number of samples must be allowed, and render target
          * must have MSAA'able format. */
         if (sample_count > 1) {
@@ -617,6 +620,8 @@ etna_get_specs(struct etna_screen *screen)
      screen->model >= 0x1000 || screen->model == 0x880;
   screen->specs.npot_tex_any_wrap =
      VIV_FEATURE(screen, chipMinorFeatures1, NON_POWER_OF_TWO);
+   screen->specs.has_new_sin_cos =
+      VIV_FEATURE(screen, chipMinorFeatures3, HAS_FAST_TRANSCENDENTALS);

   if (instruction_count > 256) { /* unified instruction memory? */
      screen->specs.vs_offset = 0xC000;
--- a/src/gallium/drivers/etnaviv/etnaviv_state.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_state.c
@@ -323,8 +323,10 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
   /* Scissor setup */
   cs->SE_SCISSOR_LEFT = 0; /* affected by rasterizer and scissor state as well */
   cs->SE_SCISSOR_TOP = 0;
-   cs->SE_SCISSOR_RIGHT = (sv->width << 16) - 1;
-   cs->SE_SCISSOR_BOTTOM = (sv->height << 16) - 1;
+   cs->SE_SCISSOR_RIGHT = (sv->width << 16) + ETNA_SE_SCISSOR_MARGIN_RIGHT;
+   cs->SE_SCISSOR_BOTTOM = (sv->height << 16) + ETNA_SE_SCISSOR_MARGIN_BOTTOM;
+   cs->SE_CLIP_RIGHT = (sv->width << 16) + ETNA_SE_CLIP_MARGIN_RIGHT;
+   cs->SE_CLIP_BOTTOM = (sv->height << 16) + ETNA_SE_CLIP_MARGIN_BOTTOM;

   cs->TS_MEM_CONFIG = ts_mem_config;

@@ -345,13 +347,17 @@ etna_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
 {
   struct etna_context *ctx = etna_context(pctx);
   struct compiled_scissor_state *cs = &ctx->scissor;
+   assert(ss->minx <= ss->maxx);
+   assert(ss->miny <= ss->maxy);

   /* note that this state is only used when rasterizer_state->scissor is on */
   ctx->scissor_s = *ss;
   cs->SE_SCISSOR_LEFT = (ss->minx << 16);
   cs->SE_SCISSOR_TOP = (ss->miny << 16);
-   cs->SE_SCISSOR_RIGHT = (ss->maxx << 16) - 1;
-   cs->SE_SCISSOR_BOTTOM = (ss->maxy << 16) - 1;
+   cs->SE_SCISSOR_RIGHT = (ss->maxx << 16) + ETNA_SE_SCISSOR_MARGIN_RIGHT;
+   cs->SE_SCISSOR_BOTTOM = (ss->maxy << 16) + ETNA_SE_SCISSOR_MARGIN_BOTTOM;
+   cs->SE_CLIP_RIGHT = (ss->maxx << 16) + ETNA_SE_CLIP_MARGIN_RIGHT;
+   cs->SE_CLIP_BOTTOM = (ss->maxy << 16) + ETNA_SE_CLIP_MARGIN_BOTTOM;

   ctx->dirty |= ETNA_DIRTY_SCISSOR;
 }
@@ -387,22 +393,14 @@ etna_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
   /* Compute scissor rectangle (fixp) from viewport.
    * Make sure left is always < right and top always < bottom.
    */
-   cs->SE_SCISSOR_LEFT = etna_f32_to_fixp16(MAX2(vs->translate[0] - vs->scale[0], 0.0f));
-   cs->SE_SCISSOR_TOP = etna_f32_to_fixp16(MAX2(vs->translate[1] - vs->scale[1], 0.0f));
-   cs->SE_SCISSOR_RIGHT = etna_f32_to_fixp16(MAX2(vs->translate[0] + vs->scale[0], 0.0f));
-   cs->SE_SCISSOR_BOTTOM = etna_f32_to_fixp16(MAX2(vs->translate[1] + vs->scale[1], 0.0f));
-
-   if (cs->SE_SCISSOR_LEFT > cs->SE_SCISSOR_RIGHT) {
-      uint32_t tmp = cs->SE_SCISSOR_RIGHT;
-      cs->SE_SCISSOR_RIGHT = cs->SE_SCISSOR_LEFT;
-      cs->SE_SCISSOR_LEFT = tmp;
-   }
-
-   if (cs->SE_SCISSOR_TOP > cs->SE_SCISSOR_BOTTOM) {
-      uint32_t tmp = cs->SE_SCISSOR_BOTTOM;
-      cs->SE_SCISSOR_BOTTOM = cs->SE_SCISSOR_TOP;
-      cs->SE_SCISSOR_TOP = tmp;
-   }
+   cs->SE_SCISSOR_LEFT = etna_f32_to_fixp16(MAX2(vs->translate[0] - fabsf(vs->scale[0]), 0.0f));
+   cs->SE_SCISSOR_TOP = etna_f32_to_fixp16(MAX2(vs->translate[1] - fabsf(vs->scale[1]), 0.0f));
+   uint32_t right_fixp = etna_f32_to_fixp16(MAX2(vs->translate[0] + fabsf(vs->scale[0]), 0.0f));
+   uint32_t bottom_fixp = etna_f32_to_fixp16(MAX2(vs->translate[1] + fabsf(vs->scale[1]), 0.0f));
+   cs->SE_SCISSOR_RIGHT = right_fixp + ETNA_SE_SCISSOR_MARGIN_RIGHT;
+   cs->SE_SCISSOR_BOTTOM = bottom_fixp + ETNA_SE_SCISSOR_MARGIN_BOTTOM;
+   cs->SE_CLIP_RIGHT = right_fixp + ETNA_SE_CLIP_MARGIN_RIGHT;
+   cs->SE_CLIP_BOTTOM = bottom_fixp + ETNA_SE_CLIP_MARGIN_BOTTOM;

   cs->PE_DEPTH_NEAR = fui(0.0); /* not affected if depth mode is Z (as in GL) */
   cs->PE_DEPTH_FAR = fui(1.0);
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -9,6 +9,7 @@ AM_CFLAGS = \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(FREEDRENO_CFLAGS)

+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
 	$(MKDIR_GEN)
 	$(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py > $@ || ($(RM) $@; false)
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
+++ b/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2017 by the following authors:
@@ -2028,6 +2028,8 @@ static inline uint32_t A5XX_GRAS_CL_VPORT_ZSCALE_0(float val)
 }

 #define REG_A5XX_GRAS_SU_CNTL					0x0000e090
+#define A5XX_GRAS_SU_CNTL_CULL_FRONT				0x00000001
+#define A5XX_GRAS_SU_CNTL_CULL_BACK				0x00000002
 #define A5XX_GRAS_SU_CNTL_FRONT_CW				0x00000004
 #define A5XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK			0x000007f8
 #define A5XX_GRAS_SU_CNTL_LINEHALFWIDTH__SHIFT			3
@@ -2909,6 +2911,12 @@ static inline uint32_t A5XX_VPC_PACK_NUMNONPOSVAR(uint32_t val)
 {
 	return ((val) << A5XX_VPC_PACK_NUMNONPOSVAR__SHIFT) & A5XX_VPC_PACK_NUMNONPOSVAR__MASK;
 }
+#define A5XX_VPC_PACK_PSIZELOC__MASK				0x0000ff00
+#define A5XX_VPC_PACK_PSIZELOC__SHIFT				8
+static inline uint32_t A5XX_VPC_PACK_PSIZELOC(uint32_t val)
+{
+	return ((val) << A5XX_VPC_PACK_PSIZELOC__SHIFT) & A5XX_VPC_PACK_PSIZELOC__MASK;
+}

 #define REG_A5XX_VPC_FS_PRIMITIVEID_CNTL			0x0000e2a0

@@ -3049,19 +3057,15 @@ static inline uint32_t A5XX_VFD_DECODE_INSTR_IDX(uint32_t val)
 {
 	return ((val) << A5XX_VFD_DECODE_INSTR_IDX__SHIFT) & A5XX_VFD_DECODE_INSTR_IDX__MASK;
 }
+#define A5XX_VFD_DECODE_INSTR_INSTANCED				0x00020000
 #define A5XX_VFD_DECODE_INSTR_FORMAT__MASK			0x3ff00000
 #define A5XX_VFD_DECODE_INSTR_FORMAT__SHIFT			20
 static inline uint32_t A5XX_VFD_DECODE_INSTR_FORMAT(enum a5xx_vtx_fmt val)
 {
 	return ((val) << A5XX_VFD_DECODE_INSTR_FORMAT__SHIFT) & A5XX_VFD_DECODE_INSTR_FORMAT__MASK;
 }
-#define A5XX_VFD_DECODE_INSTR_SWAP__MASK			0xc0000000
-#define A5XX_VFD_DECODE_INSTR_SWAP__SHIFT			30
-static inline uint32_t A5XX_VFD_DECODE_INSTR_SWAP(enum a3xx_color_swap val)
-{
-	return ((val) << A5XX_VFD_DECODE_INSTR_SWAP__SHIFT) & A5XX_VFD_DECODE_INSTR_SWAP__MASK;
-}
-#define A5XX_VFD_DECODE_INSTR_INSTANCED				0x00020000
+#define A5XX_VFD_DECODE_INSTR_UNK30				0x40000000
+#define A5XX_VFD_DECODE_INSTR_FLOAT				0x80000000

 static inline uint32_t REG_A5XX_VFD_DECODE_STEP_RATE(uint32_t i0) { return 0x0000e48b + 0x2*i0; }

@@ -3167,6 +3171,12 @@ static inline uint32_t A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
 #define REG_A5XX_SP_FS_CONFIG_MAX_CONST				0x0000e58b

 #define REG_A5XX_SP_VS_CTRL_REG0				0x0000e590
+#define A5XX_SP_VS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_VS_CTRL_REG0_THREADSIZE__MASK;
+}
 #define A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
 #define A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
 static inline uint32_t A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
@@ -3259,6 +3269,12 @@ static inline uint32_t A5XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val)
 #define REG_A5XX_SP_VS_OBJ_START_HI				0x0000e5ad

 #define REG_A5XX_SP_FS_CTRL_REG0				0x0000e5c0
+#define A5XX_SP_FS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_FS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_FS_CTRL_REG0_THREADSIZE__MASK;
+}
 #define A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
 #define A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
 static inline uint32_t A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
@@ -3328,6 +3344,7 @@ static inline uint32_t A5XX_SP_FS_MRT_REG_COLOR_FORMAT(enum a5xx_color_fmt val)
 {
 	return ((val) << A5XX_SP_FS_MRT_REG_COLOR_FORMAT__SHIFT) & A5XX_SP_FS_MRT_REG_COLOR_FORMAT__MASK;
 }
+#define A5XX_SP_FS_MRT_REG_COLOR_SRGB				0x00000400

 #define REG_A5XX_UNKNOWN_E5DB					0x0000e5db

@@ -3381,6 +3398,12 @@ static inline uint32_t A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(enum a3xx_msaa_sample
 #define REG_A5XX_TPL1_TP_FS_ROTATION_CNTL			0x0000e764

 #define REG_A5XX_HLSQ_CONTROL_0_REG				0x0000e784
+#define A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK		0x00000001
+#define A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT) & A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK;
+}

 #define REG_A5XX_HLSQ_CONTROL_1_REG				0x0000e785
 #define A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD__MASK	0x0000003f
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
@@ -60,12 +60,6 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
 			info->restart_index : 0xffffffff);

-	/* points + psize -> spritelist: */
-	if (ctx->rasterizer->point_size_per_vertex &&
-			fd5_emit_get_vp(emit)->writes_psize &&
-			(info->mode == PIPE_PRIM_POINTS))
-		primtype = DI_PT_POINTLIST_PSIZE;
-
 	fd5_emit_render_cntl(ctx, false);
 	fd5_draw_emit(ctx->batch, ring, primtype,
 			emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
@@ -214,35 +208,44 @@ fd5_clear(struct fd_context *ctx, unsigned buffers,
 			if (!(buffers & (PIPE_CLEAR_COLOR0 << i)))
 				continue;

+			enum pipe_format pfmt = pfb->cbufs[i]->format;
+
 			// XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP??
-			float f[4];
-			switch (fd5_pipe2swap(pfb->cbufs[i]->format)) {
+			union pipe_color_union swapped;
+			switch (fd5_pipe2swap(pfmt)) {
 			case WZYX:
-				f[0] = color->f[0];
-				f[1] = color->f[1];
-				f[2] = color->f[2];
-				f[3] = color->f[3];
+				swapped.ui[0] = color->ui[0];
+				swapped.ui[1] = color->ui[1];
+				swapped.ui[2] = color->ui[2];
+				swapped.ui[3] = color->ui[3];
 				break;
 			case WXYZ:
-				f[2] = color->f[0];
-				f[1] = color->f[1];
-				f[0] = color->f[2];
-				f[3] = color->f[3];
+				swapped.ui[2] = color->ui[0];
+				swapped.ui[1] = color->ui[1];
+				swapped.ui[0] = color->ui[2];
+				swapped.ui[3] = color->ui[3];
 				break;
 			case ZYXW:
-				f[3] = color->f[0];
-				f[0] = color->f[1];
-				f[1] = color->f[2];
-				f[2] = color->f[3];
+				swapped.ui[3] = color->ui[0];
+				swapped.ui[0] = color->ui[1];
+				swapped.ui[1] = color->ui[2];
+				swapped.ui[2] = color->ui[3];
 				break;
 			case XYZW:
-				f[3] = color->f[0];
-				f[2] = color->f[1];
-				f[1] = color->f[2];
-				f[0] = color->f[3];
+				swapped.ui[3] = color->ui[0];
+				swapped.ui[2] = color->ui[1];
+				swapped.ui[1] = color->ui[2];
+				swapped.ui[0] = color->ui[3];
 				break;
 			}
-			util_pack_color(f, pfb->cbufs[i]->format, &uc);
+
+			if (util_format_is_pure_uint(pfmt)) {
+				util_format_write_4ui(pfmt, swapped.ui, 0, &uc, 0, 0, 0, 1, 1);
+			} else if (util_format_is_pure_sint(pfmt)) {
+				util_format_write_4i(pfmt, swapped.i, 0, &uc, 0, 0, 0, 1, 1);
+			} else {
+				util_pack_color(swapped.f, pfmt, &uc);
+			}

 			OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1);
 			OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0 + i));
--- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
@@ -366,6 +366,7 @@ fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 			struct fd_resource *rsc = fd_resource(vb->buffer);
 			enum pipe_format pfmt = elem->src_format;
 			enum a5xx_vtx_fmt fmt = fd5_pipe2vtx(pfmt);
+			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t off = vb->buffer_offset + elem->src_offset;
 			uint32_t size = fd_bo_size(rsc->bo) - off;
 			debug_assert(fmt != ~0);
@@ -379,7 +380,8 @@ fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 			OUT_RING(ring, A5XX_VFD_DECODE_INSTR_IDX(j) |
 					A5XX_VFD_DECODE_INSTR_FORMAT(fmt) |
 					COND(elem->instance_divisor, A5XX_VFD_DECODE_INSTR_INSTANCED) |
-					0xc0000000);  // XXX
+					A5XX_VFD_DECODE_INSTR_UNK30 |
+					COND(!isint, A5XX_VFD_DECODE_INSTR_FLOAT));
 			OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */

 			OUT_PKT4(ring, REG_A5XX_VFD_DEST_CNTL(j), 1);
--- a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
@@ -109,7 +109,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		}

 		OUT_PKT4(ring, REG_A5XX_SP_FS_MRT_REG(i), 1);
-		OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format));
+		OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format) |
+				COND(srgb, A5XX_SP_FS_MRT_REG_COLOR_SRGB));

 		/* when we support UBWC, these would be the system memory
 		 * addr/pitch/etc:
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
@@ -336,10 +336,14 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	uint32_t pos_regid, psize_regid, color_regid[8];
 	uint32_t face_regid, coord_regid, zwcoord_regid;
 	uint32_t vcoord_regid, vertex_regid, instance_regid;
+	enum a3xx_threadsize fssz;
+	uint8_t psize_loc = ~0;
 	int i, j;

 	setup_stages(emit, s);

+	fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
+
 	pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
 	psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
 	vertex_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID);
@@ -364,7 +368,7 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
 	zwcoord_regid = s[FS].v->frag_coord ? regid(0,2) : regid(63,0);
-	vcoord_regid = (s[FS].v->total_in > 0) ? regid(0,0) : regid(63,0);
+	vcoord_regid = (s[FS].v->total_in > 0) ? s[FS].v->pos_regid : regid(63,0);

 	/* we could probably divide this up into things that need to be
 	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
@@ -472,8 +476,10 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	if (pos_regid != regid(63,0))
 		ir3_link_add(&l, pos_regid, 0xf, l.max_loc);

-	if (psize_regid != regid(63,0))
+	if (psize_regid != regid(63,0)) {
+		psize_loc = l.max_loc;
 		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
+	}

 	if ((s[VS].v->shader->stream_output.num_outputs > 0) &&
 			!emit->key.binning_pass) {
@@ -551,7 +557,8 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	}

 	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 5);
-	OUT_RING(ring, 0x00000881);        /* XXX HLSQ_CONTROL_0 */
+	OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
+			0x00000880);               /* XXX HLSQ_CONTROL_0 */
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63));
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
 			0xfcfcfc00);               /* XXX */
@@ -564,7 +571,8 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	OUT_PKT4(ring, REG_A5XX_SP_FS_CTRL_REG0, 1);
 	OUT_RING(ring, COND(s[FS].v->total_in > 0, A5XX_SP_FS_CTRL_REG0_VARYING) |
 			COND(s[FS].v->frag_coord, A5XX_SP_FS_CTRL_REG0_VARYING) |
-			0x4000e | /* XXX set pretty much everywhere */
+			0x40006 | /* XXX set pretty much everywhere */
+			A5XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
 			A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
 			A5XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
 			A5XX_SP_FS_CTRL_REG0_BRANCHSTACK(0x3) |  // XXX need to figure this out somehow..
@@ -692,7 +700,7 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)

 		OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1);
 		OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) |
-				(s[VS].v->writes_psize ? 0x0c00 : 0xff00)); // XXX
+				A5XX_VPC_PACK_PSIZELOC(psize_loc));

 		OUT_PKT4(ring, REG_A5XX_VPC_VARYING_INTERP_MODE(0), 8);
 		for (i = 0; i < 8; i++)
--- a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
@@ -76,11 +76,11 @@ fd5_rasterizer_state_create(struct pipe_context *pctx,
 //	if (cso->fill_front != PIPE_POLYGON_MODE_FILL ||
 //		cso->fill_back != PIPE_POLYGON_MODE_FILL)
 //		so->pc_prim_vtx_cntl2 |= A5XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE;
-//
-//	if (cso->cull_face & PIPE_FACE_FRONT)
-//		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT;
-//	if (cso->cull_face & PIPE_FACE_BACK)
-//		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK;
+
+	if (cso->cull_face & PIPE_FACE_FRONT)
+		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT;
+	if (cso->cull_face & PIPE_FACE_BACK)
+		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK;
 	if (!cso->front_ccw)
 		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_FRONT_CW;
 //	if (!cso->flatshade_first)
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2924,7 +2924,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	struct pipe_stream_output_info so = pipeshader->selector->so;
 	struct tgsi_full_immediate *immediate;
 	struct r600_shader_ctx ctx;
-	struct r600_bytecode_output output[32];
+	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
 	unsigned output_done, noutput;
 	unsigned opcode;
 	int i, j, k, r = 0;
@@ -4185,41 +4185,63 @@ static int egcm_double_to_int(struct r600_shader_ctx *ctx)
 	return 0;
 }

+static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
+					unsigned op,
+					int dst_reg,
+					struct r600_shader_src *src,
+					bool abs)
+{
+	struct r600_bytecode_alu alu;
+	const int last_slot = 3;
+	int r;
+
+	/* these have to write the result to X/Y by the looks of it */
+	for (int i = 0 ; i < last_slot; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = op;
+
+		r600_bytecode_src(&alu.src[0], src, 1);
+		r600_bytecode_src(&alu.src[1], src, 0);
+
+		if (abs)
+			r600_bytecode_src_set_abs(&alu.src[1]);
+
+		alu.dst.sel = dst_reg;
+		alu.dst.chan = i;
+		alu.dst.write = (i == 0 || i == 1);
+
+		if (bc->chip_class != CAYMAN || i == last_slot - 1)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(bc, &alu);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	int i, r;
 	struct r600_bytecode_alu alu;
-	int last_slot = 3;
 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
 	int t1 = ctx->temp_reg;

-	/* these have to write the result to X/Y by the looks of it */
-	for (i = 0 ; i < last_slot; i++) {
-		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-		alu.op = ctx->inst_info->op;
+	/* should only be one src regs */
+	assert(inst->Instruction.NumSrcRegs == 1);

-		/* should only be one src regs */
-		assert (inst->Instruction.NumSrcRegs == 1);
+	/* only support one double at a time */
+	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
+	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

-		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
-		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
-
-		/* RSQ should take the absolute value of src */
-		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
-		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
-			r600_bytecode_src_set_abs(&alu.src[1]);
-		}
-		alu.dst.sel = t1;
-		alu.dst.chan = i;
-		alu.dst.write = (i == 0 || i == 1);
-
-		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
-			alu.last = 1;
-		r = r600_bytecode_add_alu(ctx->bc, &alu);
-		if (r)
-			return r;
-	}
+	r = cayman_emit_unary_double_raw(
+		ctx->bc, ctx->inst_info->op, t1,
+		&ctx->src[0],
+		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
+		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
+	if (r)
+		return r;

 	for (i = 0 ; i <= lasti; i++) {
 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
@@ -4326,25 +4348,27 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
 	int t1 = ctx->temp_reg;

-	for (k = 0; k < 2; k++) {
-		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
-			continue;
+	/* t1 would get overwritten below if we actually tried to
+	 * multiply two pairs of doubles at a time. */
+	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
+	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

-		for (i = 0; i < 4; i++) {
-			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-			alu.op = ctx->inst_info->op;
-			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
-			}
-			alu.dst.sel = t1;
-			alu.dst.chan = i;
-			alu.dst.write = 1;
-			if (i == 3)
-				alu.last = 1;
-			r = r600_bytecode_add_alu(ctx->bc, &alu);
-			if (r)
-				return r;
+	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
+
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ctx->inst_info->op;
+		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
+			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
 		}
+		alu.dst.sel = t1;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
 	}

 	for (i = 0; i <= lasti; i++) {
@@ -4366,6 +4390,63 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 	return 0;
 }

+/*
+ * Emit RECIP_64 + MUL_64 to implement division.
+ */
+static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int r;
+	struct r600_bytecode_alu alu;
+	int t1 = ctx->temp_reg;
+	int k;
+
+	/* Only support one double at a time. This is the same constraint as
+	 * in DMUL lowering. */
+	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
+	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
+
+	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
+
+	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
+	if (r)
+		return r;
+
+	for (int i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP2_MUL_64;
+
+		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
+
+		alu.src[1].sel = t1;
+		alu.src[1].chan = (i == 3) ? 0 : 1;
+
+		alu.dst.sel = t1;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	for (int i = 0; i < 2; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP1_MOV;
+		alu.src[0].sel = t1;
+		alu.src[0].chan = i;
+		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
+		alu.dst.write = 1;
+		if (i == 1)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
 /*
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
@@ -9376,6 +9457,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
+	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
@@ -9598,6 +9680,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
+	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -320,14 +320,21 @@ static void si_sampler_view_add_buffer(struct si_context *sctx,
 	if (resource->target == PIPE_BUFFER)
 		return;

-	/* Now add separate DCC if it's present. */
+	/* Now add separate DCC or HTILE. */
 	rtex = (struct r600_texture*)resource;
-	if (!rtex->dcc_separate_buffer)
-		return;
+	if (rtex->dcc_separate_buffer) {
+		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
+						    rtex->dcc_separate_buffer, usage,
+						    RADEON_PRIO_DCC, check_mem);
+	}

-	radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
-					    rtex->dcc_separate_buffer, usage,
-					    RADEON_PRIO_DCC, check_mem);
+	if (rtex->htile_buffer &&
+	    rtex->tc_compatible_htile &&
+	    !is_stencil_sampler) {
+		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
+						    rtex->htile_buffer, usage,
+						    RADEON_PRIO_HTILE, check_mem);
+	}
 }

 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
@@ -653,7 +660,8 @@ si_mark_image_range_valid(const struct pipe_image_view *view)

 static void si_set_shader_image(struct si_context *ctx,
 				unsigned shader,
-				unsigned slot, const struct pipe_image_view *view)
+				unsigned slot, const struct pipe_image_view *view,
+				bool skip_decompress)
 {
 	struct si_screen *screen = ctx->screen;
 	struct si_images_info *images = &ctx->images[shader];
@@ -695,7 +703,7 @@ static void si_set_shader_image(struct si_context *ctx,
 		assert(!tex->is_depth);
 		assert(tex->fmask.size == 0);

-		if (uses_dcc &&
+		if (uses_dcc && !skip_decompress &&
 		    (view->access & PIPE_IMAGE_ACCESS_WRITE ||
 		     !vi_dcc_formats_compatible(res->b.b.format, view->format))) {
 			/* If DCC can't be disabled, at least decompress it.
@@ -769,10 +777,10 @@ si_set_shader_images(struct pipe_context *pipe,

 	if (views) {
 		for (i = 0, slot = start_slot; i < count; ++i, ++slot)
-			si_set_shader_image(ctx, shader, slot, &views[i]);
+			si_set_shader_image(ctx, shader, slot, &views[i], false);
 	} else {
 		for (i = 0, slot = start_slot; i < count; ++i, ++slot)
-			si_set_shader_image(ctx, shader, slot, NULL);
+			si_set_shader_image(ctx, shader, slot, NULL, false);
 	}

 	si_update_compressed_tex_shader_mask(ctx, shader);
@@ -1703,7 +1711,7 @@ void si_update_all_texture_descriptors(struct si_context *sctx)
 			    view->resource->target == PIPE_BUFFER)
 				continue;

-			si_set_shader_image(sctx, shader, i, view);
+			si_set_shader_image(sctx, shader, i, view, true);
 		}

 		/* Sampler views. */
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -717,8 +717,10 @@ static void si_update_poly_offset_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

-	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
+	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
+		si_pm4_bind_state(sctx, poly_offset, NULL);
 		return;
+	}

 	/* Use the user format, not db_render_format, so that the polygon
 	 * offset behaves as expected by applications.
@@ -1363,11 +1365,17 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 		case PIPE_FORMAT_Z16_UNORM:
 			return V_008F14_IMG_DATA_FORMAT_16;
 		case PIPE_FORMAT_X24S8_UINT:
+		case PIPE_FORMAT_S8X24_UINT:
+			/*
+			 * Implemented as an 8_8_8_8 data format to fix texture
+			 * gathers in stencil sampling. This affects at least
+			 * GL45-CTS.texture_cube_map_array.sampling on VI.
+			 */
+			return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
 		case PIPE_FORMAT_Z24X8_UNORM:
 		case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 			return V_008F14_IMG_DATA_FORMAT_8_24;
 		case PIPE_FORMAT_X8Z24_UNORM:
-		case PIPE_FORMAT_S8X24_UINT:
 		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 			return V_008F14_IMG_DATA_FORMAT_24_8;
 		case PIPE_FORMAT_S8_UINT:
@@ -2794,14 +2802,22 @@ si_make_texture_descriptor(struct si_screen *screen,
 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
 		const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
 		const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+		const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};

 		switch (pipe_format) {
 		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-		case PIPE_FORMAT_X24S8_UINT:
 		case PIPE_FORMAT_X32_S8X24_UINT:
 		case PIPE_FORMAT_X8Z24_UNORM:
 			util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
 			break;
+		case PIPE_FORMAT_X24S8_UINT:
+			/*
+			 * X24S8 is implemented as an 8_8_8_8 data format, to
+			 * fix texture gathers. This affects at least
+			 * GL45-CTS.texture_cube_map_array.sampling on VI.
+			 */
+			util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+			break;
 		default:
 			util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
 		}
@@ -3352,7 +3368,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 		first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
 		data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
 		num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
-		channel = &desc->channel[first_non_void];
+		channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;

 		v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
 				   S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
@@ -3374,12 +3390,12 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 				/* This isn't actually used in OpenGL. */
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_A2_SINT << (4 * i);
 			}
-		} else if (channel->type == UTIL_FORMAT_TYPE_FIXED) {
+		} else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
 			if (desc->swizzle[3] == PIPE_SWIZZLE_1)
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBX_32_FIXED << (4 * i);
 			else
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBA_32_FIXED << (4 * i);
-		} else if (channel->size == 32 && !channel->pure_integer) {
+		} else if (channel && channel->size == 32 && !channel->pure_integer) {
 			if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
 				if (channel->normalized) {
 					if (desc->swizzle[3] == PIPE_SWIZZLE_1)
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -850,11 +850,12 @@ void si_emit_cache_flush(struct si_context *sctx)
 	if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2 ||
 	    (rctx->chip_class <= CIK &&
 	     (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
-		/* Invalidate L1 & L2. (L1 is always invalidated)
+		/* Invalidate L1 & L2. (L1 is always invalidated on SI)
 		 * WB must be set on VI+ when TC_ACTION is set.
 		 */
 		si_emit_surface_sync(rctx, cp_coher_cntl |
 				     S_0085F0_TC_ACTION_ENA(1) |
+				     S_0085F0_TCL1_ACTION_ENA(1) |
 				     S_0301F0_TC_WB_ACTION_ENA(rctx->chip_class >= VI));
 		cp_coher_cntl = 0;
 		sctx->b.num_L2_invalidates++;
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -217,6 +217,15 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
        out_numThreadsPerProcGroup++;
    }

+    /* Prune empty numa nodes */
+    for (auto it = out_nodes.begin(); it != out_nodes.end(); ) {
+       if ((*it).cores.size() == 0)
+          it = out_nodes.erase(it);
+       else
+          ++it;
+    }
+
+    /* Prune empty core nodes */
    for (uint32_t node = 0; node < out_nodes.size(); node++) {
        auto& numaNode = out_nodes[node];
        auto it = numaNode.cores.begin();
--- a/src/gallium/drivers/swr/swr_query.cpp
+++ b/src/gallium/drivers/swr/swr_query.cpp
@@ -29,7 +29,7 @@
 #include "swr_query.h"
 #include "swr_screen.h"
 #include "swr_state.h"
-
+#include "common/os.h"

 static struct swr_query *
 swr_query(struct pipe_query *p)
@@ -45,7 +45,8 @@ swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
   assert(type < PIPE_QUERY_TYPES);
   assert(index < MAX_SO_STREAMS);

-   pq = CALLOC_STRUCT(swr_query);
+   pq = (struct swr_query *) AlignedMalloc(sizeof(struct swr_query), 64);
+   memset(pq, 0, sizeof(*pq));

   if (pq) {
      pq->type = type;
@@ -67,7 +68,7 @@ swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
      swr_fence_reference(pipe->screen, &pq->fence, NULL);
   }

-   FREE(pq);
+   AlignedFree(pq);
 }


--- a/src/gallium/drivers/swr/swr_query.h
+++ b/src/gallium/drivers/swr/swr_query.h
@@ -34,7 +34,7 @@ struct swr_query_result {
   uint64_t timestamp_end;
 };

-struct swr_query {
+OSALIGNLINE(struct) swr_query {
   unsigned type; /* PIPE_QUERY_* */
   unsigned index;

--- a/src/gallium/state_trackers/clover/Makefile.am
+++ b/src/gallium/state_trackers/clover/Makefile.am
@@ -2,12 +2,12 @@ include Makefile.sources

 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
+	-I$(top_builddir)/src \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/drivers \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/winsys \
-	-I$(top_builddir)/src \
 	-I$(srcdir)

 if HAVE_CLOVER_ICD
--- a/src/gallium/state_trackers/dri/Makefile.am
+++ b/src/gallium/state_trackers/dri/Makefile.am
@@ -28,8 +28,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(GALLIUM_CFLAGS) \
 	$(LIBDRM_CFLAGS) \
 	$(VISIBILITY_CFLAGS)
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -81,7 +81,7 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
   }

   if (context->decoder->entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE)
-      context->decoder->begin_frame(context->decoder, context->target, &context->desc.base);
+      context->needs_begin_frame = true;

   return VA_STATUS_SUCCESS;
 }
@@ -178,6 +178,8 @@ handlePictureParameterBuffer(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *

      if (!context->decoder)
         return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+      context->needs_begin_frame = true;
   }

   return vaStatus;
@@ -308,8 +310,11 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
   sizes[num_buffers] = buf->size;
   ++num_buffers;

-   context->decoder->begin_frame(context->decoder, context->target,
-      &context->desc.base);
+   if (context->needs_begin_frame) {
+      context->decoder->begin_frame(context->decoder, context->target,
+         &context->desc.base);
+      context->needs_begin_frame = false;
+   }
   context->decoder->decode_bitstream(context->decoder, context->target, &context->desc.base,
      num_buffers, (const void * const*)buffers, sizes);
 }
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -261,6 +261,7 @@ typedef struct {
   int target_id;
   bool first_single_submitted;
   int gop_coeff;
+   bool needs_begin_frame;
 } vlVaContext;

 typedef struct {
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -75,6 +75,13 @@ vlVdpOutputSurfaceCreate(VdpDevice device,

   memset(&res_tmpl, 0, sizeof(res_tmpl));

+   /*
+    * The output won't look correctly when this buffer is send to X,
+    * if the VDPAU RGB component order doesn't match the X11 one so
+    * we only allow the X11 format
+    */
+   vlsurface->send_to_X = rgba_format == VDP_RGBA_FORMAT_B8G8R8A8;
+
   res_tmpl.target = PIPE_TEXTURE_2D;
   res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
   res_tmpl.width0 = width;
--- a/src/gallium/state_trackers/vdpau/presentation.c
+++ b/src/gallium/state_trackers/vdpau/presentation.c
@@ -231,7 +231,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
   vscreen = pq->device->vscreen;

   pipe_mutex_lock(pq->device->mutex);
-   if (vscreen->set_back_texture_from_output)
+   if (vscreen->set_back_texture_from_output && surf->send_to_X)
      vscreen->set_back_texture_from_output(vscreen, surf->surface->texture, clip_width, clip_height);
   tex = vscreen->texture_from_drawable(vscreen, (void *)pq->drawable);
   if (!tex) {
@@ -239,7 +239,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
      return VDP_STATUS_INVALID_HANDLE;
   }

-   if (!vscreen->set_back_texture_from_output) {
+   if (!vscreen->set_back_texture_from_output || !surf->send_to_X) {
      dirty_area = vscreen->get_dirty_area(vscreen);

      memset(&surf_templ, 0, sizeof(surf_templ));
@@ -289,7 +289,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
      framenum++;
   }

-   if (!vscreen->set_back_texture_from_output) {
+   if (!vscreen->set_back_texture_from_output || !surf->send_to_X) {
      pipe_resource_reference(&tex, NULL);
      pipe_surface_reference(&surf_draw, NULL);
   }
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -415,6 +415,7 @@ typedef struct
   struct pipe_fence_handle *fence;
   struct vl_compositor_state cstate;
   struct u_rect dirty_area;
+   bool send_to_X;
 } vlVdpOutputSurface;

 typedef struct
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -27,8 +27,8 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/loader \
 	-I$(top_srcdir)/src/mapi/ \
 	-I$(top_srcdir)/src/mesa/ \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_builddir)/src/mesa/drivers/dri/common/ \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_srcdir)/src/gallium/winsys \
 	-I$(top_srcdir)/src/gallium/state_trackers/nine \
 	$(GALLIUM_TARGET_CFLAGS) \
--- a/src/glx/Makefile.am
+++ b/src/glx/Makefile.am
@@ -37,10 +37,10 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/include/GL/internal \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/loader \
-	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_builddir)/src/mapi \
+	-I$(top_srcdir)/src/mapi \
 	-I$(top_builddir)/src/mapi/glapi \
+	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
 	$(SHARED_GLAPI_CFLAGS) \
 	$(EXTRA_DEFINES_XF86VIDMODE) \
--- a/src/glx/apple/Makefile.am
+++ b/src/glx/apple/Makefile.am
@@ -6,11 +6,11 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/glx \
-	-I$(top_srcdir)/src/mesa \
 	-I$(top_builddir)/src/mesa \
+	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_builddir)/src/mapi/glapi \
+	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
 	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES) \
--- a/src/glx/windows/Makefile.am
+++ b/src/glx/windows/Makefile.am
@@ -24,8 +24,8 @@ libwindowsglx_la_CFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/glx \
 	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_builddir)/src/mapi/glapi \
+	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
 	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES) \
--- a/src/intel/blorp/blorp_blit.c
+++ b/src/intel/blorp/blorp_blit.c
@@ -26,6 +26,9 @@
 #include "blorp_priv.h"
 #include "brw_meta_util.h"

+/* header-only include needed for _mesa_unorm_to_float and friends. */
+#include "mesa/main/format_utils.h"
+
 #define FILE_DEBUG_FLAG DEBUG_BLORP

 static const bool split_blorp_blit_debug = false;
@@ -2204,6 +2207,75 @@ get_ccs_compatible_uint_format(const struct isl_format_layout *fmtl)
   }
 }

+/* Takes an isl_color_value and returns a color value that is the original
+ * color value only bit-casted to a UINT format.  This value, together with
+ * the format from get_ccs_compatible_uint_format, will yield the same bit
+ * value as the original color and format.
+ */
+static union isl_color_value
+bitcast_color_value_to_uint(union isl_color_value color,
+                            const struct isl_format_layout *fmtl)
+{
+   /* All CCS formats have the same number of bits in each channel */
+   const struct isl_channel_layout *chan = &fmtl->channels.r;
+
+   union isl_color_value bits;
+   switch (chan->type) {
+   case ISL_UINT:
+   case ISL_SINT:
+      /* Hardware will ignore the high bits so there's no need to cast */
+      bits = color;
+      break;
+
+   case ISL_UNORM:
+      for (unsigned i = 0; i < 4; i++)
+         bits.u32[i] = _mesa_float_to_unorm(color.f32[i], chan->bits);
+      break;
+
+   case ISL_SNORM:
+      for (unsigned i = 0; i < 4; i++)
+         bits.i32[i] = _mesa_float_to_snorm(color.f32[i], chan->bits);
+      break;
+
+   case ISL_SFLOAT:
+      switch (chan->bits) {
+      case 16:
+         for (unsigned i = 0; i < 4; i++)
+            bits.u32[i] = _mesa_float_to_half(color.f32[i]);
+         break;
+
+      case 32:
+         bits = color;
+         break;
+
+      default:
+         unreachable("Invalid float format size");
+      }
+      break;
+
+   default:
+      unreachable("Invalid channel type");
+   }
+
+   switch (fmtl->format) {
+   case ISL_FORMAT_B8G8R8A8_UNORM:
+   case ISL_FORMAT_B8G8R8A8_UNORM_SRGB:
+   case ISL_FORMAT_B8G8R8X8_UNORM:
+   case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: {
+      /* If it's a BGRA format, we need to swap blue and red */
+      uint32_t tmp = bits.u32[0];
+      bits.u32[0] = bits.u32[2];
+      bits.u32[2] = tmp;
+      break;
+   }
+
+   default:
+      break; /* Nothing to do */
+   }
+
+   return bits;
+}
+
 static void
 surf_convert_to_uncompressed(const struct isl_device *isl_dev,
                             struct brw_blorp_surface_info *info,
@@ -2320,6 +2392,16 @@ blorp_copy(struct blorp_batch *batch,
      params.src.view.format = get_copy_format_for_bpb(isl_dev, src_fmtl->bpb);
   }

+   if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) {
+      params.src.clear_color =
+         bitcast_color_value_to_uint(params.src.clear_color, src_fmtl);
+   }
+
+   if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) {
+      params.dst.clear_color =
+         bitcast_color_value_to_uint(params.dst.clear_color, dst_fmtl);
+   }
+
   wm_prog_key.src_bpc =
      isl_format_get_layout(params.src.view.format)->channels.r.bits;
   wm_prog_key.dst_bpc =
--- a/src/intel/blorp/blorp_clear.c
+++ b/src/intel/blorp/blorp_clear.c
@@ -349,6 +349,29 @@ blorp_clear(struct blorp_batch *batch,
   if (format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
      clear_color.u32[0] = float3_to_rgb9e5(clear_color.f32);
      format = ISL_FORMAT_R32_UINT;
+   } else if (format == ISL_FORMAT_A4B4G4R4_UNORM) {
+      /* Broadwell and earlier cannot render to this format so we need to work
+       * around it by swapping the colors around and using B4G4R4A4 instead.
+       */
+
+      /* First, we apply the swizzle. */
+      union isl_color_value old;
+      assert((unsigned)(swizzle.r - ISL_CHANNEL_SELECT_RED) < 4);
+      assert((unsigned)(swizzle.g - ISL_CHANNEL_SELECT_RED) < 4);
+      assert((unsigned)(swizzle.b - ISL_CHANNEL_SELECT_RED) < 4);
+      assert((unsigned)(swizzle.a - ISL_CHANNEL_SELECT_RED) < 4);
+      old.u32[swizzle.r - ISL_CHANNEL_SELECT_RED] = clear_color.u32[0];
+      old.u32[swizzle.g - ISL_CHANNEL_SELECT_RED] = clear_color.u32[1];
+      old.u32[swizzle.b - ISL_CHANNEL_SELECT_RED] = clear_color.u32[2];
+      old.u32[swizzle.a - ISL_CHANNEL_SELECT_RED] = clear_color.u32[3];
+      swizzle = ISL_SWIZZLE_IDENTITY;
+
+      /* Now we re-order for the new format */
+      clear_color.u32[0] = old.u32[1];
+      clear_color.u32[1] = old.u32[2];
+      clear_color.u32[2] = old.u32[3];
+      clear_color.u32[3] = old.u32[0];
+      format = ISL_FORMAT_B4G4R4A4_UNORM;
   }

   memcpy(&params.wm_inputs.clear_color, clear_color.f32, sizeof(float) * 4);
--- a/src/intel/isl/isl_format.c
+++ b/src/intel/isl/isl_format.c
@@ -218,9 +218,10 @@ static const struct surface_format_info format_info[] = {
   SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,   P8A8_UNORM_PALETTE1)
   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   A1B5G5R5_UNORM)
   /* According to the PRM, A4B4G4R4_UNORM isn't supported until Sky Lake
-    * but empirical testing indicates that it works just fine on Broadwell.
+    * but empirical testing indicates that at least sampling works just fine
+    * on Broadwell.
    */
-   SF(80, 80,  x,  x, 80,  x,  x,  x,  x,    x,   A4B4G4R4_UNORM)
+   SF(80, 80,  x,  x, 90,  x,  x,  x,  x,    x,   A4B4G4R4_UNORM)
   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,   L8A8_UINT)
   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,   L8A8_SINT)
   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x,    x,   R8_UNORM)
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -232,9 +232,12 @@ VkResult anv_AllocateCommandBuffers(
         break;
   }

-   if (result != VK_SUCCESS)
+   if (result != VK_SUCCESS) {
      anv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
                             i, pCommandBuffers);
+      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
+         pCommandBuffers[i] = VK_NULL_HANDLE;
+   }

   return result;
 }
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -329,18 +329,18 @@ VkResult anv_CreateDescriptorPool(
      }
   }

-   const size_t size =
-      sizeof(*pool) +
+   const size_t pool_size =
      pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
      descriptor_count * sizeof(struct anv_descriptor) +
      buffer_count * sizeof(struct anv_buffer_view);
+   const size_t total_size = sizeof(*pool) + pool_size;

-   pool = vk_alloc2(&device->alloc, pAllocator, size, 8,
+   pool = vk_alloc2(&device->alloc, pAllocator, total_size, 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!pool)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

-   pool->size = size;
+   pool->size = pool_size;
   pool->next = 0;
   pool->free_list = EMPTY;

--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -75,8 +75,11 @@ choose_isl_surf_usage(VkImageUsageFlags vk_usage,
      isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
   }

-   if (vk_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
-      /* blorp implements transfers by rendering into the destination image. */
+   if (vk_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT &&
+       aspect == VK_IMAGE_ASPECT_COLOR_BIT) {
+      /* blorp implements transfers by rendering into the destination image.
+       * Only request this with color images, as we deal with depth/stencil
+       * formats differently. */
      isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
   }

--- a/src/intel/vulkan/anv_nir_lower_input_attachments.c
+++ b/src/intel/vulkan/anv_nir_lower_input_attachments.c
@@ -100,11 +100,8 @@ try_lower_input_load(nir_function_impl *impl, nir_intrinsic_instr *load)

   if (image_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) {
      tex->op = nir_texop_txf_ms;
-
-      nir_ssa_def *sample_id =
-         nir_load_system_value(&b, nir_intrinsic_load_sample_id, 0);
      tex->src[2].src_type = nir_tex_src_ms_index;
-      tex->src[2].src = nir_src_for_ssa(sample_id);
+      tex->src[2].src = load->src[1];
   }

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -55,8 +55,6 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
 {
   struct anv_device *device = cmd_buffer->device;

-/* XXX: Do we need this on more than just BDW? */
-#if (GEN_GEN >= 8)
   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
@@ -65,9 +63,10 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.DCFlushEnable = true;
      pc.RenderTargetCacheFlushEnable = true;
+      pc.CommandStreamerStallEnable = true;
   }
-#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
@@ -148,6 +147,8 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
+      pc.ConstantCacheInvalidationEnable = true;
+      pc.StateCacheInvalidationEnable = true;
   }
 }

@@ -1177,9 +1178,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,

      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         assert(stage == MESA_SHADER_FRAGMENT);
-         if (desc->image_view->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
-            /* For stencil input attachments, we treat it like any old texture
-             * that a user may have bound.
+         if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) {
+            /* For depth and stencil input attachments, we treat it like any
+             * old texture that a user may have bound.
             */
            surface_state = desc->image_view->sampler_surface_state;
            assert(surface_state.alloc_size);
@@ -1187,9 +1188,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                  desc->image_view->image->aux_usage,
                                  surface_state);
         } else {
-            /* For depth and color input attachments, we create the surface
-             * state at vkBeginRenderPass time so that we can include aux
-             * and clear color information.
+            /* For color input attachments, we create the surface state at
+             * vkBeginRenderPass time so that we can include aux and clear
+             * color information.
             */
            assert(binding->input_attachment_index < subpass->input_count);
            const unsigned subpass_att = binding->input_attachment_index;
--- a/src/loader/Makefile.am
+++ b/src/loader/Makefile.am
@@ -39,8 +39,8 @@ libloader_la_LIBADD =

 if HAVE_DRICOMMON
 libloader_la_CPPFLAGS += \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_builddir)/src/mesa/drivers/dri/common/ \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/mapi/ \
 	-DUSE_DRICONF
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -46,8 +46,8 @@ AM_CPPFLAGS =							\
 	$(SELINUX_CFLAGS)					\
 	-I$(top_srcdir)/include					\
 	-I$(top_srcdir)/src					\
-	-I$(top_srcdir)/src/mapi				\
-	-I$(top_builddir)/src/mapi
+	-I$(top_builddir)/src/mapi				\
+	-I$(top_srcdir)/src/mapi

 include Makefile.sources

--- a/src/mesa/drivers/dri/i915/Makefile.am
+++ b/src/mesa/drivers/dri/i915/Makefile.am
@@ -30,9 +30,9 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(INTEL_CFLAGS)
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -30,21 +30,22 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
 	-I$(top_srcdir)/src/gtest/include \
-	-I$(top_srcdir)/src/compiler/nir \
-	-I$(top_srcdir)/src/intel \
 	-I$(top_builddir)/src/compiler/glsl \
 	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/src/compiler/nir \
 	-I$(top_builddir)/src/intel \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	-I$(top_srcdir)/src/intel \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(INTEL_CFLAGS)

 AM_CXXFLAGS = $(AM_CFLAGS)

+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
 	$(MKDIR_GEN)
 	$(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@; false)
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -284,8 +284,10 @@ brw_blorp_to_isl_format(struct brw_context *brw, mesa_format format,
   case MESA_FORMAT_S_UINT8:
      return ISL_FORMAT_R8_UINT;
   case MESA_FORMAT_Z24_UNORM_X8_UINT:
+   case MESA_FORMAT_Z24_UNORM_S8_UINT:
      return ISL_FORMAT_R24_UNORM_X8_TYPELESS;
   case MESA_FORMAT_Z_FLOAT32:
+   case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
      return ISL_FORMAT_R32_FLOAT;
   case MESA_FORMAT_Z_UNORM16:
      return ISL_FORMAT_R16_UNORM;
@@ -908,6 +910,17 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
      blorp_batch_finish(&batch);
   }

+   /*
+    * Ivybrigde PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+    *
+    *  Any transition from any value in {Clear, Render, Resolve} to a
+    *  different value in {Clear, Render, Resolve} requires end of pipe
+    *  synchronization.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_CS_STALL);
+
   return true;
 }

@@ -975,6 +988,17 @@ brw_blorp_resolve_color(struct brw_context *brw, struct intel_mipmap_tree *mt,
                     brw_blorp_to_isl_format(brw, format, true),
                     resolve_op);
   blorp_batch_finish(&batch);
+
+   /*
+    * Ivybrigde PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+    *
+    *  Any transition from any value in {Clear, Render, Resolve} to a
+    *  different value in {Clear, Render, Resolve} requires end of pipe
+    *  synchronization.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_CS_STALL);
 }

 static void
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -36,6 +36,7 @@

 #include "brw_context.h"
 #include "brw_blorp.h"
+#include "brw_defines.h"

 #define FILE_DEBUG_FLAG DEBUG_BLIT

@@ -174,14 +175,46 @@ brw_fast_clear_depth(struct gl_context *ctx)
      mt->depth_clear_value = depth_clear_value;
   }

-   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
-    *
-    *     "If other rendering operations have preceded this clear, a
-    *      PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
-    *      must be issued before the rectangle primitive used for the depth
-    *      buffer clear operation.
-    */
-   brw_emit_mi_flush(brw);
+   if (brw->gen == 6) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+       *
+       *   "If other rendering operations have preceded this clear, a
+       *    PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
+       *    must be issued before the rectangle primitive used for the depth
+       *    buffer clear operation.
+       */
+       brw_emit_pipe_control_flush(brw,
+                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
+   } else if (brw->gen >= 7) {
+      /*
+       * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+       *
+       *   If other rendering operations have preceded this clear, a
+       *   PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+       *   enabled must be issued before the rectangle primitive used for the
+       *   depth buffer clear operation.
+       *
+       * Same applies for Gen8 and Gen9.
+       *
+       * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1 PIPE_CONTROL,
+       * Depth Cache Flush Enable:
+       *
+       *   This bit must not be set when Depth Stall Enable bit is set in
+       *   this packet.
+       *
+       * This is confirmed to hold for real, HSW gets immediate gpu hangs.
+       *
+       * Therefore issue two pipe control flushes, one for cache flush and
+       * another for depth stall.
+       */
+       brw_emit_pipe_control_flush(brw,
+                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
+
+       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+   }

   if (fb->MaxNumLayers > 0) {
      for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
@@ -201,7 +234,12 @@ brw_fast_clear_depth(struct gl_context *ctx)
       *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
       *      followed by Depth FLUSH'
      */
-      brw_emit_mi_flush(brw);
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_STALL);
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
   }

   /* Now, the HiZ buffer contains data that needs to be resolved to the depth
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -910,6 +910,9 @@ brw_process_driconf_options(struct brw_context *brw)
   ctx->Const.ForceGLSLExtensionsWarn =
      driQueryOptionb(options, "force_glsl_extensions_warn");

+   ctx->Const.ForceGLSLVersion =
+      driQueryOptioni(options, "force_glsl_version");
+
   ctx->Const.DisableGLSLLineContinuations =
      driQueryOptionb(options, "disable_glsl_line_continuations");

--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -508,7 +508,7 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
-   brw_set_src0(p, insn, payload);
+   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -177,6 +177,49 @@ static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
 static void brwDeleteProgram( struct gl_context *ctx,
 			      struct gl_program *prog )
 {
+   struct brw_context *brw = brw_context(ctx);
+
+   /* Beware!  prog's refcount has reached zero, and it's about to be freed.
+    *
+    * In brw_upload_pipeline_state(), we compare brw->foo_program to
+    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
+    * pointer has changed.
+    *
+    * We cannot leave brw->foo_program as a dangling pointer to the dead
+    * program.  malloc() may allocate the same memory for a new gl_program,
+    * causing us to see matching pointers...but totally different programs.
+    *
+    * We cannot set brw->foo_program to NULL, either.  If we've deleted the
+    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
+    * would cause us to see matching pointers (NULL == NULL), and fail to
+    * detect that a program has changed since our last draw.
+    *
+    * So, set it to a bogus gl_program pointer that will never match,
+    * causing us to properly reevaluate the state on our next draw.
+    *
+    * Getting this wrong causes heisenbugs which are very hard to catch,
+    * as you need a very specific allocation pattern to hit the problem.
+    */
+   static const struct gl_program deleted_program;
+
+   if (brw->vertex_program == prog)
+      brw->vertex_program = &deleted_program;
+
+   if (brw->tess_ctrl_program == prog)
+      brw->tess_ctrl_program = &deleted_program;
+
+   if (brw->tess_eval_program == prog)
+      brw->tess_eval_program = &deleted_program;
+
+   if (brw->geometry_program == prog)
+      brw->geometry_program = &deleted_program;
+
+   if (brw->fragment_program == prog)
+      brw->fragment_program = &deleted_program;
+
+   if (brw->compute_program == prog)
+      brw->compute_program = &deleted_program;
+
   _mesa_delete_program( ctx, prog );
 }

--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -477,6 +477,18 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
      break;
   case BLORP_HIZ_OP_DEPTH_CLEAR:
      dw1 |= GEN8_WM_HZ_DEPTH_CLEAR;
+
+      /* The "Clear Rectangle X Max" (and Y Max) fields are exclusive,
+       * rather than inclusive, and limited to 16383.  This means that
+       * for a 16384x16384 render target, we would miss the last row
+       * or column of pixels along the edge.
+       *
+       * To work around this, we have to set the "Full Surface Depth
+       * and Stencil Clear" bit.  We can do this in all cases because
+       * we always clear the full rectangle anyway.  We'll need to
+       * change this if we ever add scissored clear support.
+       */
+      dw1 |= GEN8_WM_HZ_FULL_SURFACE_DEPTH_CLEAR;
      break;
   case BLORP_HIZ_OP_NONE:
      unreachable("Should not get here.");
@@ -511,6 +523,22 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
   OUT_BATCH(0);
   ADVANCE_BATCH();

+   /*
+    * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
+    *
+    *  Depth buffer clear pass using any of the methods (WM_STATE, 3DSTATE_WM
+    *  or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL command with
+    *  DEPTH_STALL bit and Depth FLUSH bits "set" before starting to render.
+    *  DepthStall and DepthFlush are not needed between consecutive depth
+    *  clear passes nor is it required if th e depth clear pass was done with
+    *  "full_surf_clear" bit set in the 3DSTATE_WM_HZ_OP.
+    *
+    *  TODO: Such as the spec says, this could be conditional.
+    */
+   brw_emit_pipe_control_flush(brw, 
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               PIPE_CONTROL_DEPTH_STALL);
+
   /* Mark this buffer as needing a TC flush, as we've rendered to it. */
   brw_render_cache_set_add_bo(brw, mt->bo);

--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -25,6 +25,7 @@

 #include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
+#include "intel_fbo.h"

 #include "brw_context.h"
 #include "brw_state.h"
@@ -179,7 +180,9 @@ genX(blorp_exec)(struct blorp_batch *batch,
    * data with different formats, which blorp does for stencil and depth
    * data.
    */
-   brw_emit_mi_flush(brw);
+   if (params->src.enabled)
+      brw_render_cache_set_check_flush(brw, params->src.addr.buffer);
+   brw_render_cache_set_check_flush(brw, params->dst.addr.buffer);

   brw_select_pipeline(brw, BRW_RENDER_PIPELINE);

@@ -256,8 +259,10 @@ retry:
   brw->no_depth_or_stencil = false;
   brw->ib.type = -1;

-   /* Flush the sampler cache so any texturing from the destination is
-    * coherent.
-    */
-   brw_emit_mi_flush(brw);
+   if (params->dst.enabled)
+      brw_render_cache_set_add_bo(brw, params->dst.addr.buffer);
+   if (params->depth.enabled)
+      brw_render_cache_set_add_bo(brw, params->depth.addr.buffer);
+   if (params->stencil.enabled)
+      brw_render_cache_set_add_bo(brw, params->stencil.addr.buffer);
 }
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -235,13 +235,9 @@ emit_miptree_blit(struct brw_context *brw,
    *    represented per scan line’s worth of graphics data depends on the
    *    color depth.
    *
-    * Furthermore, intelEmitCopyBlit (which is called below) uses a signed
-    * 16-bit integer to represent buffer pitch, so it can only handle buffer
-    * pitches < 32k. However, the pitch is measured in bytes for linear buffers
-    * and dwords for tiled buffers.
-    *
-    * As a result of these two limitations, we can only use the blitter to do
-    * this copy when the miptree's pitch is less than 32k linear or 128k tiled.
+    * The blitter's pitch is a signed 16-bit integer, but measured in bytes
+    * for linear surfaces and DWords for tiled surfaces.  So the maximum
+    * pitch is 32k linear and 128k tiled.
    */
   if (blt_pitch(src_mt) >= 32768 || blt_pitch(dst_mt) >= 32768) {
      perf_debug("Falling back due to >= 32k/128k pitch\n");
@@ -480,11 +476,11 @@ static bool
 can_fast_copy_blit(struct brw_context *brw,
 		   drm_intel_bo *src_buffer,
                   int16_t src_x, int16_t src_y,
-                   uintptr_t src_offset, uint32_t src_pitch,
+                   uintptr_t src_offset, int32_t src_pitch,
                   uint32_t src_tiling, uint32_t src_tr_mode,
 		   drm_intel_bo *dst_buffer,
                   int16_t dst_x, int16_t dst_y,
-                   uintptr_t dst_offset, uint32_t dst_pitch,
+                   uintptr_t dst_offset, int32_t dst_pitch,
                   uint32_t dst_tiling, uint32_t dst_tr_mode,
                   int16_t w, int16_t h, uint32_t cpp,
                   GLenum logic_op)
@@ -520,10 +516,8 @@ can_fast_copy_blit(struct brw_context *brw,
   if (!_mesa_is_pow_two(cpp) || cpp > 16)
      return false;

-   /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15
-    * of the destination pitch must be zero.
-    */
-   if ((src_pitch >> 15 & 1) != 0 || (dst_pitch >> 15 & 1) != 0)
+   /* For Fast Copy Blits the pitch cannot be a negative number. */
+   if (src_pitch < 0 || dst_pitch < 0)
      return false;

   /* For Linear surfaces, the pitch has to be an OWord (16byte) multiple. */
@@ -577,12 +571,12 @@ xy_blit_cmd(uint32_t src_tiling, uint32_t src_tr_mode,
 bool
 intelEmitCopyBlit(struct brw_context *brw,
 		  GLuint cpp,
-		  GLshort src_pitch,
+		  int32_t src_pitch,
 		  drm_intel_bo *src_buffer,
 		  GLuint src_offset,
 		  uint32_t src_tiling,
 		  uint32_t src_tr_mode,
-		  GLshort dst_pitch,
+		  int32_t dst_pitch,
 		  drm_intel_bo *dst_buffer,
 		  GLuint dst_offset,
 		  uint32_t dst_tiling,
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -31,12 +31,12 @@
 bool
 intelEmitCopyBlit(struct brw_context *brw,
                  GLuint cpp,
-                  GLshort src_pitch,
+                  int32_t src_pitch,
                  drm_intel_bo *src_buffer,
                  GLuint src_offset,
                  uint32_t src_tiling,
                  uint32_t src_tr_mode,
-                  GLshort dst_pitch,
+                  int32_t dst_pitch,
                  drm_intel_bo *dst_buffer,
                  GLuint dst_offset,
                  uint32_t dst_tiling,
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -79,6 +79,7 @@ DRI_CONF_BEGIN
      DRI_CONF_ALWAYS_FLUSH_CACHE("false")
      DRI_CONF_DISABLE_THROTTLING("false")
      DRI_CONF_FORCE_GLSL_EXTENSIONS_WARN("false")
+      DRI_CONF_FORCE_GLSL_VERSION(0)
      DRI_CONF_DISABLE_GLSL_LINE_CONTINUATIONS("false")
      DRI_CONF_DISABLE_BLEND_FUNC_EXTENDED("false")
      DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION("false")
--- a/src/mesa/drivers/dri/r200/Makefile.am
+++ b/src/mesa/drivers/dri/r200/Makefile.am
@@ -34,9 +34,9 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/r200/server \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(RADEON_CFLAGS)
--- a/src/mesa/drivers/dri/radeon/Makefile.am
+++ b/src/mesa/drivers/dri/radeon/Makefile.am
@@ -35,9 +35,9 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/radeon/server \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(RADEON_CFLAGS)
--- a/src/mesa/drivers/dri/swrast/Makefile.am
+++ b/src/mesa/drivers/dri/swrast/Makefile.am
@@ -30,8 +30,8 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(LIBDRM_CFLAGS) \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS)
--- a/src/mesa/drivers/osmesa/Makefile.am
+++ b/src/mesa/drivers/osmesa/Makefile.am
@@ -28,8 +28,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/mapi \
 	-I$(top_builddir)/src/mapi \
+	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa/ \
 	$(DEFINES)
 AM_CFLAGS = $(PTHREAD_CFLAGS) \
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -1071,7 +1071,8 @@ _mesa_PopAttrib(void)
               if (ctx->Extensions.ARB_color_buffer_float)
                  _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
                                   color->ClampFragmentColor);
-               _mesa_ClampColor(GL_CLAMP_READ_COLOR_ARB, color->ClampReadColor);
+               if (ctx->Extensions.ARB_color_buffer_float || ctx->Version >= 30)
+                  _mesa_ClampColor(GL_CLAMP_READ_COLOR_ARB, color->ClampReadColor);

               /* GL_ARB_framebuffer_sRGB / GL_EXT_framebuffer_sRGB */
               if (ctx->Extensions.EXT_framebuffer_sRGB)
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -363,7 +363,7 @@ EXT(OES_point_size_array                    , dummy_true
 EXT(OES_point_sprite                        , ARB_point_sprite                       ,  x ,  x , ES1,  x , 2004)
 EXT(OES_primitive_bounding_box              , OES_primitive_bounding_box             ,  x ,  x ,  x ,  31, 2014)
 EXT(OES_query_matrix                        , dummy_true                             ,  x ,  x , ES1,  x , 2003)
-EXT(OES_read_format                         , dummy_true                             , GLL, GLC, ES1,  x , 2003)
+EXT(OES_read_format                         , dummy_true                             , GLL,  x , ES1,  x , 2003)
 EXT(OES_rgb8_rgba8                          , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_sample_shading                      , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_sample_variables                    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1612,6 +1612,7 @@ _mesa_LinkProgram(GLuint programObj)
                                                           "glLinkProgram"));
 }

+#ifdef ENABLE_SHADER_CACHE
 /**
 * Generate a SHA-1 hash value string for given source string.
 */
@@ -1723,6 +1724,8 @@ read_shader(const gl_shader_stage stage, const char *source)
   return buffer;
 }

+#endif /* ENABLE_SHADER_CACHE */
+
 /**
 * Called via glShaderSource() and glShaderSourceARB() API functions.
 * Basically, concatenate the source code strings into one long string
@@ -1738,8 +1741,6 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
   GLcharARB *source;
   struct gl_shader *sh;

-   GLcharARB *replacement;
-
   sh = _mesa_lookup_shader_err(ctx, shaderObj, "glShaderSourceARB");
   if (!sh)
      return;
@@ -1795,6 +1796,9 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
   source[totalLength - 1] = '\0';
   source[totalLength - 2] = '\0';

+#ifdef ENABLE_SHADER_CACHE
+   GLcharARB *replacement;
+
   /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace
    * if corresponding entry found from MESA_SHADER_READ_PATH.
    */
@@ -1805,6 +1809,7 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
      free(source);
      source = replacement;
   }
+#endif /* ENABLE_SHADER_CACHE */

   shader_source(sh, source);

--- a/src/mesa/main/tests/Makefile.am
+++ b/src/mesa/main/tests/Makefile.am
@@ -4,8 +4,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/gtest/include \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mesa \
 	-I$(top_builddir)/src/mesa \
+	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/include \
 	$(DEFINES) $(INCLUDE_DIRS)

--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -278,7 +278,7 @@ void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state)


 static void
-st_destroy_context_priv(struct st_context *st)
+st_destroy_context_priv(struct st_context *st, bool destroy_pipe)
 {
   uint shader, i;

@@ -314,6 +314,10 @@ st_destroy_context_priv(struct st_context *st)
   st_invalidate_readpix_cache(st);

   cso_destroy_context(st->cso_context);
+
+   if (st->pipe && destroy_pipe)
+      st->pipe->destroy(st->pipe);
+
   free( st );
 }

@@ -503,7 +507,7 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
      /* This can happen when a core profile was requested, but the driver
       * does not support some features of GL 3.1 or later.
       */
-      st_destroy_context_priv(st);
+      st_destroy_context_priv(st, false);
      return NULL;
   }

@@ -579,7 +583,6 @@ destroy_tex_sampler_cb(GLuint id, void *data, void *userData)
 
 void st_destroy_context( struct st_context *st )
 {
-   struct pipe_context *pipe = st->pipe;
   struct gl_context *ctx = st->ctx;
   GLuint i;

@@ -608,11 +611,9 @@ void st_destroy_context( struct st_context *st )

   /* This will free the st_context too, so 'st' must not be accessed
    * afterwards. */
-   st_destroy_context_priv(st);
+   st_destroy_context_priv(st, true);
   st = NULL;

-   pipe->destroy( pipe );
-
   free(ctx);
 }

--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -955,7 +955,7 @@ glsl_to_tgsi_visitor::get_opcode(unsigned op,
      case3fid(MUL, UMUL, DMUL);
      case3fid(MAD, UMAD, DMAD);
      case3fid(FMA, UMAD, DFMA);
-      case3(DIV, IDIV, UDIV);
+      case4d(DIV, IDIV, UDIV, DDIV);
      case4d(MAX, IMAX, UMAX, DMAX);
      case4d(MIN, IMIN, UMIN, DMIN);
      case2iu(MOD, UMOD);
@@ -1710,10 +1710,7 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
      emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
      break;
   case ir_binop_div:
-      if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
-         assert(!"not reached: should be handled by ir_div_to_mul_rcp");
-      else
-         emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
      break;
   case ir_binop_mod:
      if (result_dst.type == GLSL_TYPE_FLOAT)
@@ -6918,7 +6915,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)

      lower_instructions(ir,
                         MOD_TO_FLOOR |
-                         DIV_TO_MUL_RCP |
+                         FDIV_TO_MUL_RCP |
                         EXP_TO_EXP2 |
                         LOG_TO_LOG2 |
                         LDEXP_TO_ARITH |
--- a/src/util/disk_cache.c
+++ b/src/util/disk_cache.c
@@ -21,6 +21,8 @@
 * IN THE SOFTWARE.
 */

+#ifdef ENABLE_SHADER_CACHE
+
 #include <ctype.h>
 #include <string.h>
 #include <stdlib.h>
@@ -705,3 +707,5 @@ disk_cache_has_key(struct disk_cache *cache, cache_key key)

   return memcmp(entry, key, CACHE_KEY_SIZE) == 0;
 }
+
+#endif /* ENABLE_SHADER_CACHE */
--- a/src/util/disk_cache.h
+++ b/src/util/disk_cache.h
@@ -40,6 +40,8 @@ struct disk_cache;

 /* Provide inlined stub functions if the shader cache is disabled. */

+#ifdef ENABLE_SHADER_CACHE
+
 /**
 * Create a new cache object.
 *
@@ -129,6 +131,46 @@ disk_cache_put_key(struct disk_cache *cache, cache_key key);
 bool
 disk_cache_has_key(struct disk_cache *cache, cache_key key);

+#else
+
+static inline struct disk_cache *
+disk_cache_create(void)
+{
+   return NULL;
+}
+
+static inline void
+disk_cache_destroy(struct disk_cache *cache) {
+   return;
+}
+
+static inline void
+disk_cache_put(struct disk_cache *cache, cache_key key,
+          const void *data, size_t size)
+{
+   return;
+}
+
+static inline uint8_t *
+disk_cache_get(struct disk_cache *cache, cache_key key, size_t *size)
+{
+   return NULL;
+}
+
+static inline void
+disk_cache_put_key(struct disk_cache *cache, cache_key key)
+{
+   return;
+}
+
+static inline bool
+disk_cache_has_key(struct disk_cache *cache, cache_key key)
+{
+   return false;
+}
+
+#endif /* ENABLE_SHADER_CACHE */
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/util/sha1/sha1.h
+++ b/src/util/sha1/sha1.h
@@ -31,7 +31,6 @@ void SHA1Pad(SHA1_CTX *);
 void SHA1Transform(uint32_t [5], const uint8_t [SHA1_BLOCK_LENGTH]);
 void SHA1Update(SHA1_CTX *, const uint8_t *, size_t);
 void SHA1Final(uint8_t [SHA1_DIGEST_LENGTH], SHA1_CTX *);
-__END_DECLS

 #define HTONDIGEST(x) do {                                              \
        x[0] = htonl(x[0]);                                             \
--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@@ -379,7 +379,8 @@ wsi_wl_surface_get_capabilities(VkIcdSurfaceBase *surface,

   caps->currentExtent = (VkExtent2D) { -1, -1 };
   caps->minImageExtent = (VkExtent2D) { 1, 1 };
-   caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX };
+   /* This is the maximum supported size on Intel */
+   caps->maxImageExtent = (VkExtent2D) { 1 << 14, 1 << 14 };
   caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
   caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
   caps->maxImageArrayLayers = 1;
@@ -409,25 +410,27 @@ wsi_wl_surface_get_formats(VkIcdSurfaceBase *icd_surface,
   if (!display)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

-   uint32_t count = u_vector_length(&display->formats);
-
   if (pSurfaceFormats == NULL) {
-      *pSurfaceFormatCount = count;
+      *pSurfaceFormatCount = u_vector_length(&display->formats);
      return VK_SUCCESS;
   }

-   assert(*pSurfaceFormatCount >= count);
-   *pSurfaceFormatCount = count;
-
+   uint32_t count = 0;
   VkFormat *f;
   u_vector_foreach(f, &display->formats) {
-      *(pSurfaceFormats++) = (VkSurfaceFormatKHR) {
+      if (count == *pSurfaceFormatCount)
+         return VK_INCOMPLETE;
+
+      pSurfaceFormats[count++] = (VkSurfaceFormatKHR) {
         .format = *f,
         /* TODO: We should get this from the compositor somehow */
         .colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR,
      };
   }

+   assert(*pSurfaceFormatCount <= count);
+   *pSurfaceFormatCount = count;
+
   return VK_SUCCESS;
 }

@@ -441,11 +444,13 @@ wsi_wl_surface_get_present_modes(VkIcdSurfaceBase *surface,
      return VK_SUCCESS;
   }

-   assert(*pPresentModeCount >= ARRAY_SIZE(present_modes));
+   *pPresentModeCount = MIN2(*pPresentModeCount, ARRAY_SIZE(present_modes));
   typed_memcpy(pPresentModes, present_modes, *pPresentModeCount);
-   *pPresentModeCount = ARRAY_SIZE(present_modes);

-   return VK_SUCCESS;
+   if (*pPresentModeCount < ARRAY_SIZE(present_modes))
+      return VK_INCOMPLETE;
+   else
+      return VK_SUCCESS;
 }

 VkResult wsi_create_wl_surface(const VkAllocationCallbacks *pAllocator,
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@@ -265,7 +265,8 @@ VkBool32 wsi_get_physical_device_xcb_presentation_support(
      return false;

   if (!wsi_conn->has_dri3) {
-      fprintf(stderr, "vulkan: No DRI3 support\n");
+      fprintf(stderr, "vulkan: No DRI3 support detected - required for presentation\n");
+      fprintf(stderr, "Note: Buggy applications may crash, if they do please report to vendor\n");
      return false;
   }

@@ -313,7 +314,8 @@ x11_surface_get_support(VkIcdSurfaceBase *icd_surface,
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   if (!wsi_conn->has_dri3) {
-      fprintf(stderr, "vulkan: No DRI3 support\n");
+      fprintf(stderr, "vulkan: No DRI3 support detected - required for presentation\n");
+      fprintf(stderr, "Note: Buggy applications may crash, if they do please report to vendor\n");
      *pSupported = false;
      return VK_SUCCESS;
   }
@@ -368,7 +370,8 @@ x11_surface_get_capabilities(VkIcdSurfaceBase *icd_surface,
       */
      caps->currentExtent = (VkExtent2D) { -1, -1 };
      caps->minImageExtent = (VkExtent2D) { 1, 1 };
-      caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX };
+      /* This is the maximum supported size on Intel */
+      caps->maxImageExtent = (VkExtent2D) { 1 << 14, 1 << 14 };
   }
   free(err);
   free(geom);