Update version to 17.0.0-rc3

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
etnaviv: force vertex buffers through the MMU
2017-02-06 13:18:13 +00:00 · 2017-02-03 11:18:53 +00:00 · 2017-02-03 11:12:16 +00:00 · 2017-02-03 11:09:00 +00:00 · 2017-02-03 11:08:59 +00:00 · 2017-02-03 11:08:59 +00:00
64 changed files with 450 additions and 175 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -78,10 +78,22 @@ endif

 ifeq ($(MESA_ENABLE_LLVM),true)
 LOCAL_CFLAGS += \
-	-DHAVE_LLVM=0x0305 -DMESA_LLVM_VERSION_PATCH=2 \
 	-D__STDC_CONSTANT_MACROS \
 	-D__STDC_FORMAT_MACROS \
 	-D__STDC_LIMIT_MACROS
+
+  ifeq ($(MESA_ANDROID_MAJOR_VERSION),5)
+    LOCAL_CFLAGS += -DHAVE_LLVM=0x0305 -DMESA_LLVM_VERSION_PATCH=2
+    ELF_INCLUDES := external/elfutils/0.153/libelf
+  endif
+  ifeq ($(MESA_ANDROID_MAJOR_VERSION),6)
+    LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_PATCH=0
+    ELF_INCLUDES := external/elfutils/src/libelf
+  endif
+  ifeq ($(MESA_ANDROID_MAJOR_VERSION),7)
+    LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_PATCH=0
+    ELF_INCLUDES := external/elfutils/libelf
+  endif
 endif

 ifneq ($(LOCAL_IS_HOST_MODULE),true)
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-17.0.0-rc2
+17.0.0-rc3
--- a/configure.ac
+++ b/configure.ac
@@ -1436,6 +1436,22 @@ if test "x$enable_gallium_osmesa" = xyes; then
    fi
 fi

+require_dri_shared_libs_and_glapi() {
+    # Abort configure unless the component named by $1 can be built: it needs
+    # a non-static build, --enable-dri and --enable-shared-glapi.
+    if test "x$enable_static" = xyes; then
+        AC_MSG_ERROR([$1 cannot be built as a static library])
+    fi
+    if test "x$enable_dri" != xyes; then
+        # There is only a single backend which won't be built/used otherwise.
+        # XXX: Revisit this as the egl/haiku is a thing.
+        AC_MSG_ERROR([$1 requires --enable-dri])
+    fi
+    if test "x$enable_shared_glapi" != xyes; then
+        AC_MSG_ERROR([$1 requires --enable-shared-glapi])
+    fi
+}
+
 if test "x$enable_dri" = xyes; then
    require_dri_shared_libs_and_glapi "DRI"

@@ -1722,7 +1738,7 @@ fi
 AC_ARG_WITH([vulkan-drivers],
    [AS_HELP_STRING([--with-vulkan-drivers@<:@=DIRS...@:>@],
        [comma delimited Vulkan drivers list, e.g.
-        "intel"
+        "intel,radeon"
        @<:@default=no@:>@])],
    [with_vulkan_drivers="$withval"],
    [with_vulkan_drivers="no"])
@@ -1815,22 +1831,6 @@ AC_SUBST([OSMESA_LIB_DEPS])
 AC_SUBST([OSMESA_PC_REQ])
 AC_SUBST([OSMESA_PC_LIB_PRIV])

-require_dri_shared_libs_and_glapi() {
-    if test "x$enable_static" = xyes; then
-        AC_MSG_ERROR([$1 cannot be build as static library])
-    fi
-
-    if test "x$enable_dri" != xyes; then
-        # There is only a single backend which won't be build/used otherwise.
-        # XXX: Revisit this as the egl/haiku is a thing.
-        AC_MSG_ERROR([$1 requires --enable-dri])
-    fi
-
-    if test "x$enable_shared_glapi" != xyes; then
-        AC_MSG_ERROR([$1 requires --enable-shared-glapi])
-    fi
-}
-
 dnl
 dnl gbm configuration
 dnl
@@ -2212,6 +2212,19 @@ gallium_require_llvm() {
    fi
 }

+dnl
+dnl r300 doesn't strictly require LLVM, but for performance reasons we
+dnl highly recommend LLVM usage. So require it at least on x86 and x86_64
+dnl architectures.
+dnl
+r300_require_llvm() {
+    # Hosts matching the gnux32 suffix are exempt from the LLVM requirement.
+    case "$host" in *gnux32) return ;; esac
+    case "$host_cpu" in
+        i*86|x86_64|amd64) gallium_require_llvm $1 ;;
+    esac
+}
+
 dnl
 dnl DRM is needed by X, Wayland, and offscreen rendering.
 dnl Surfaceless is an alternative for the last one.
@@ -2298,7 +2311,7 @@ if test -n "$with_gallium_drivers"; then
            HAVE_GALLIUM_R300=yes
            PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
            require_libdrm "r300"
-            gallium_require_llvm "r300"
+            r300_require_llvm "r300"
            ;;
        xr600)
            HAVE_GALLIUM_R600=yes
--- a/src/amd/Android.common.mk
+++ b/src/amd/Android.common.mk
@@ -55,7 +55,7 @@ LOCAL_C_INCLUDES := \
 	external/llvm/include \
 	external/llvm/device/include \
 	external/libcxx/include \
-	external/elfutils/$(if $(filter 5,$(MESA_ANDROID_MAJOR_VERSION)),0.153/,$(if $(filter 6,$(MESA_ANDROID_MAJOR_VERSION)),src/))libelf
+	$(ELF_INCLUDES)

 LOCAL_STATIC_LIBRARIES := libLLVMCore

--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1267,6 +1267,9 @@ static void visit_alu(struct nir_to_llvm_context *ctx, nir_alu_instr *instr)
 		src[1] = to_float(ctx, src[1]);
 		result = LLVMBuildFRem(ctx->builder, src[0], src[1], "");
 		break;
+	case nir_op_irem:
+		result = LLVMBuildSRem(ctx->builder, src[0], src[1], "");
+		break;
 	case nir_op_idiv:
 		result = LLVMBuildSDiv(ctx->builder, src[0], src[1], "");
 		break;
@@ -1745,9 +1748,12 @@ static LLVMValueRef visit_vulkan_resource_index(struct nir_to_llvm_context *ctx,
 static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
                                             nir_intrinsic_instr *instr)
 {
-	LLVMValueRef ptr;
+	LLVMValueRef ptr, addr;

-	ptr = build_gep0(ctx, ctx->push_constants, get_src(ctx, instr->src[0]));
+	addr = LLVMConstInt(ctx->i32, nir_intrinsic_base(instr), 0);
+	addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx, instr->src[0]), "");
+
+	ptr = build_gep0(ctx, ctx->push_constants, addr);
 	ptr = cast_ptr(ctx, ptr, get_def_type(ctx, &instr->dest.ssa));

 	return LLVMBuildLoad(ctx->builder, ptr, "");
@@ -2238,7 +2244,7 @@ static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
 }

 static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx,
-				     nir_intrinsic_instr *instr, bool add_frag_pos)
+				     nir_intrinsic_instr *instr)
 {
 	const struct glsl_type *type = instr->variables[0]->var->type;
 	if(instr->variables[0]->deref.child)
@@ -2253,6 +2259,8 @@ static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx,
 	LLVMValueRef res;
 	int count;
 	enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+	bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS ||
+			     dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
 	bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
 		      dim == GLSL_SAMPLER_DIM_SUBPASS_MS);

@@ -2378,12 +2386,11 @@ static LLVMValueRef visit_image_load(struct nir_to_llvm_context *ctx,
 	} else {
 		bool is_da = glsl_sampler_type_is_array(type) ||
 			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
-		bool add_frag_pos = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS;
 		LLVMValueRef da = is_da ? ctx->i32one : ctx->i32zero;
 		LLVMValueRef glc = LLVMConstInt(ctx->i1, 0, false);
 		LLVMValueRef slc = LLVMConstInt(ctx->i1, 0, false);

-		params[0] = get_image_coords(ctx, instr, add_frag_pos);
+		params[0] = get_image_coords(ctx, instr);
 		params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[2] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
 		if (HAVE_LLVM <= 0x0309) {
@@ -2442,7 +2449,7 @@ static void visit_image_store(struct nir_to_llvm_context *ctx,
 		LLVMValueRef slc = i1false;

 		params[0] = to_float(ctx, get_src(ctx, instr->src[2]));
-		params[1] = get_image_coords(ctx, instr, false); /* coords */
+		params[1] = get_image_coords(ctx, instr); /* coords */
 		params[2] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[3] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
 		if (HAVE_LLVM <= 0x0309) {
@@ -2502,7 +2509,7 @@ static LLVMValueRef visit_image_atomic(struct nir_to_llvm_context *ctx,
 		bool da = glsl_sampler_type_is_array(type) ||
 		          glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;

-		coords = params[param_count++] = get_image_coords(ctx, instr, false);
+		coords = params[param_count++] = get_image_coords(ctx, instr);
 		params[param_count++] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
 		params[param_count++] = i1false; /* r128 */
 		params[param_count++] = da ? i1true : i1false;      /* da */
@@ -3154,6 +3161,15 @@ static void tex_fetch_ptrs(struct nir_to_llvm_context *ctx,
 		*fmask_ptr = get_sampler_desc(ctx, instr->texture, DESC_FMASK);
 }

+/* Feed coord through to_float, llvm.rint.f32 and to_integer; return the result. */
+static LLVMValueRef apply_round_slice(struct nir_to_llvm_context *ctx,
+				      LLVMValueRef coord)
+{
+	LLVMValueRef as_float = to_float(ctx, coord);
+	LLVMValueRef rounded = ac_emit_llvm_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32, &as_float, 1, 0);
+	return to_integer(ctx, rounded);
+}
+
 static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
 {
 	LLVMValueRef result = NULL;
@@ -3211,6 +3227,11 @@ static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
 		}
 	}

+	if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
+		result = get_buffer_size(ctx, res_ptr, false);
+		goto write_result;
+	}
+
 	if (instr->op == nir_texop_texture_samples) {
 		LLVMValueRef res, samples, is_msaa;
 		res = LLVMBuildBitCast(ctx->builder, res_ptr, ctx->v8i32, "");
@@ -3310,15 +3331,16 @@ static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
 	/* Pack texture coordinates */
 	if (coord) {
 		address[count++] = coords[0];
-		if (instr->coord_components > 1)
+		if (instr->coord_components > 1) {
+			if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array && instr->op != nir_texop_txf) {
+				coords[1] = apply_round_slice(ctx, coords[1]);
+			}
 			address[count++] = coords[1];
+		}
 		if (instr->coord_components > 2) {
 			/* This seems like a bit of a hack - but it passes Vulkan CTS with it */
 			if (instr->sampler_dim != GLSL_SAMPLER_DIM_3D && instr->op != nir_texop_txf) {
-				coords[2] = to_float(ctx, coords[2]);
-				coords[2] = ac_emit_llvm_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32, &coords[2],
-								1, 0);
-				coords[2] = to_integer(ctx, coords[2]);
+				coords[2] = apply_round_slice(ctx, coords[2]);
 			}
 			address[count++] = coords[2];
 		}
--- a/src/amd/vulkan/Makefile.am
+++ b/src/amd/vulkan/Makefile.am
@@ -21,9 +21,7 @@

 include Makefile.sources

-vulkan_includedir = $(includedir)/vulkan
-
-vulkan_include_HEADERS = \
+noinst_HEADERS = \
 	$(top_srcdir)/include/vulkan/vk_platform.h \
 	$(top_srcdir)/include/vulkan/vulkan.h

--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -438,7 +438,8 @@ radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer,
 			       raster->spi_interp_control);

 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028A00_PA_SU_POINT_SIZE, 2);
-	radeon_emit(cmd_buffer->cs, 0);
+	unsigned tmp = (unsigned)(1.0 * 8.0);
+	radeon_emit(cmd_buffer->cs, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
 	radeon_emit(cmd_buffer->cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
 		    S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2))); /* R_028A04_PA_SU_POINT_MINMAX */

@@ -2605,6 +2606,7 @@ void radv_CmdPipelineBarrier(
 			break;
 		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 		case VK_ACCESS_TRANSFER_READ_BIT:
+		case VK_ACCESS_TRANSFER_WRITE_BIT:
 		case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
 			flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | RADV_CMD_FLAG_INV_GLOBAL_L2;
 		default:
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -535,7 +535,7 @@ private:
            const char *str_end;
            while((str_start = strchr(name_copy, '[')) &&
                  (str_end = strchr(name_copy, ']'))) {
-               memmove(str_start, str_end + 1, 1 + strlen(str_end));
+               memmove(str_start, str_end + 1, 1 + strlen(str_end + 1));
            }

            unsigned index = 0;
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -1102,23 +1102,43 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
      SpvOp opcode = get_specialization(b, val, w[3]);
      switch (opcode) {
      case SpvOpVectorShuffle: {
-         struct vtn_value *v0 = vtn_value(b, w[4], vtn_value_type_constant);
-         struct vtn_value *v1 = vtn_value(b, w[5], vtn_value_type_constant);
-         unsigned len0 = glsl_get_vector_elements(v0->const_type);
-         unsigned len1 = glsl_get_vector_elements(v1->const_type);
+         struct vtn_value *v0 = &b->values[w[4]];
+         struct vtn_value *v1 = &b->values[w[5]];
+
+         assert(v0->value_type == vtn_value_type_constant ||
+                v0->value_type == vtn_value_type_undef);
+         assert(v1->value_type == vtn_value_type_constant ||
+                v1->value_type == vtn_value_type_undef);
+
+         unsigned len0 = v0->value_type == vtn_value_type_constant ?
+                         glsl_get_vector_elements(v0->const_type) :
+                         glsl_get_vector_elements(v0->type->type);
+         unsigned len1 = v1->value_type == vtn_value_type_constant ?
+                         glsl_get_vector_elements(v1->const_type) :
+                         glsl_get_vector_elements(v1->type->type);

         assert(len0 + len1 < 16);

         unsigned bit_size = glsl_get_bit_size(val->const_type);
-         assert(bit_size == glsl_get_bit_size(v0->const_type) &&
-                bit_size == glsl_get_bit_size(v1->const_type));
+         unsigned bit_size0 = v0->value_type == vtn_value_type_constant ?
+                              glsl_get_bit_size(v0->const_type) :
+                              glsl_get_bit_size(v0->type->type);
+         unsigned bit_size1 = v1->value_type == vtn_value_type_constant ?
+                              glsl_get_bit_size(v1->const_type) :
+                              glsl_get_bit_size(v1->type->type);
+
+         assert(bit_size == bit_size0 && bit_size == bit_size1);

         if (bit_size == 64) {
            uint64_t u64[8];
-            for (unsigned i = 0; i < len0; i++)
-               u64[i] = v0->constant->values[0].u64[i];
-            for (unsigned i = 0; i < len1; i++)
-               u64[len0 + i] = v1->constant->values[0].u64[i];
+            if (v0->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len0; i++)
+                  u64[i] = v0->constant->values[0].u64[i];
+            }
+            if (v1->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len1; i++)
+                  u64[len0 + i] = v1->constant->values[0].u64[i];
+            }

            for (unsigned i = 0, j = 0; i < count - 6; i++, j++) {
               uint32_t comp = w[i + 6];
@@ -1132,11 +1152,14 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
            }
         } else {
            uint32_t u32[8];
-            for (unsigned i = 0; i < len0; i++)
-               u32[i] = v0->constant->values[0].u32[i];
-
-            for (unsigned i = 0; i < len1; i++)
-               u32[len0 + i] = v1->constant->values[0].u32[i];
+            if (v0->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len0; i++)
+                  u32[i] = v0->constant->values[0].u32[i];
+            }
+            if (v1->value_type == vtn_value_type_constant) {
+               for (unsigned i = 0; i < len1; i++)
+                  u32[len0 + i] = v1->constant->values[0].u32[i];
+            }

            for (unsigned i = 0, j = 0; i < count - 6; i++, j++) {
               uint32_t comp = w[i + 6];
@@ -2902,6 +2925,7 @@ vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode,
      vtn_handle_constant(b, opcode, w, count);
      break;

+   case SpvOpUndef:
   case SpvOpVariable:
      vtn_handle_variables(b, opcode, w, count);
      break;
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -1268,6 +1268,12 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
                     const uint32_t *w, unsigned count)
 {
   switch (opcode) {
+   case SpvOpUndef: {
+      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_undef);
+      val->type = vtn_value(b, w[1], vtn_value_type_type)->type;
+      break;
+   }
+
   case SpvOpVariable: {
      struct vtn_variable *var = rzalloc(b, struct vtn_variable);
      var->type = vtn_value(b, w[1], vtn_value_type_type)->type;
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -96,8 +96,8 @@ AM_CFLAGS += \
 	-I$(top_srcdir)/src/egl/drivers/dri2 \
 	-I$(top_srcdir)/src/gbm/backends/dri \
 	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
-	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
 	-I$(top_builddir)/src/egl/wayland/wayland-drm \
+	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
 	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
 	-D_EGL_BUILT_IN_DRIVER_DRI2

--- a/src/gallium/Android.common.mk
+++ b/src/gallium/Android.common.mk
@@ -34,7 +34,7 @@ LOCAL_C_INCLUDES += \
 	external/llvm/include \
 	external/llvm/device/include \
 	external/libcxx/include \
-	external/elfutils/$(if $(filter true,$(MESA_LOLLIPOP_BUILD)),0.153/)libelf
+	$(ELF_INCLUDES)
 endif

 include $(MESA_COMMON_MK)
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -2624,7 +2624,6 @@ lp_set_default_actions_cpu(
   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;

-   bld_base->op_actions[TGSI_OPCODE_DDIV].emit = div_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;

--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -209,6 +209,16 @@ micro_dadd(union tgsi_double_channel *dst,
   dst->d[3] = src[0].d[3] + src[1].d[3];
 }

+static void
+micro_ddiv(union tgsi_double_channel *dst,
+          const union tgsi_double_channel *src)
+{
+   /* Channel-wise double division: dst->d[c] = src[0].d[c] / src[1].d[c]. */
+   for (unsigned chan = 0; chan < 4; chan++) {
+      dst->d[chan] = src[0].d[chan] / src[1].d[chan];
+   }
+}
+
 static void
 micro_ddx(union tgsi_exec_channel *dst,
          const union tgsi_exec_channel *src)
@@ -5995,6 +6005,10 @@ exec_instruction(
      exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
      break;

+   case TGSI_OPCODE_DDIV:
+      exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
+      break;
+
   case TGSI_OPCODE_DMUL:
      exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
      break;
--- a/src/gallium/drivers/etnaviv/etnaviv_compiler.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
@@ -1021,7 +1021,7 @@ label_mark_use(struct etna_compile *c, struct etna_compile_label *label)
 static struct etna_compile_frame *
 find_frame(struct etna_compile *c, enum etna_compile_frame_type type)
 {
-   for (unsigned sp = c->frame_sp; sp >= 0; sp--)
+   for (int sp = c->frame_sp; sp >= 0; sp--)
      if (c->frame_stack[sp].type == type)
         return &c->frame_stack[sp];

@@ -1444,7 +1444,42 @@ static void
 trans_trig(const struct instr_translater *t, struct etna_compile *c,
           const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
 {
-   if (c->specs->has_sin_cos_sqrt) {
+   if (c->specs->has_new_sin_cos) { /* Alternative SIN/COS */
+      /* On newer chips alternative SIN/COS instructions are implemented,
+       * which:
+       * - Need their input scaled by 1/pi instead of 2/pi
+       * - Output an x and y component, which need to be multiplied to
+       *   get the result
+       */
+      /* TGSI lowering should deal with SCS */
+      assert(inst->Instruction.Opcode != TGSI_OPCODE_SCS);
+
+      struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xyz */
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_MUL,
+         .sat = 0,
+         .dst = etna_native_to_dst(temp, INST_COMPS_Z),
+         .src[0] = src[0], /* any swizzling happens here */
+         .src[1] = alloc_imm_f32(c, 1.0f / M_PI),
+      });
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
+                    ? INST_OPCODE_COS
+                    : INST_OPCODE_SIN,
+         .sat = 0,
+         .dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
+         .src[2] = etna_native_to_src(temp, SWIZZLE(Z, Z, Z, Z)),
+         .tex = { .amode=1 }, /* Unknown bit needs to be set */
+      });
+      emit_inst(c, &(struct etna_inst) {
+         .opcode = INST_OPCODE_MUL,
+         .sat = inst->Instruction.Saturate,
+         .dst = convert_dst(c, &inst->Dst[0]),
+         .src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
+         .src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
+      });
+
+   } else if (c->specs->has_sin_cos_sqrt) {
      /* TGSI lowering should deal with SCS */
      assert(inst->Instruction.Opcode != TGSI_OPCODE_SCS);

--- a/src/gallium/drivers/etnaviv/etnaviv_emit.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_emit.c
@@ -491,6 +491,23 @@ etna_emit_state(struct etna_context *ctx)
      /*00C14*/ EMIT_STATE(SE_DEPTH_BIAS, rasterizer->SE_DEPTH_BIAS);
      /*00C18*/ EMIT_STATE(SE_CONFIG, rasterizer->SE_CONFIG);
   }
+   if (unlikely(dirty & (ETNA_DIRTY_SCISSOR | ETNA_DIRTY_FRAMEBUFFER |
+                         ETNA_DIRTY_RASTERIZER | ETNA_DIRTY_VIEWPORT))) {
+      struct etna_rasterizer_state *rasterizer = etna_rasterizer_state(ctx->rasterizer);
+
+      uint32_t clip_right =
+         MIN2(ctx->framebuffer.SE_CLIP_RIGHT, ctx->viewport.SE_CLIP_RIGHT);
+      uint32_t clip_bottom =
+         MIN2(ctx->framebuffer.SE_CLIP_BOTTOM, ctx->viewport.SE_CLIP_BOTTOM);
+
+      if (rasterizer->scissor) {
+         clip_right = MIN2(ctx->scissor.SE_CLIP_RIGHT, clip_right);
+         clip_bottom = MIN2(ctx->scissor.SE_CLIP_BOTTOM, clip_bottom);
+      }
+
+      /*00C20*/ EMIT_STATE_FIXP(SE_CLIP_RIGHT, clip_right);
+      /*00C24*/ EMIT_STATE_FIXP(SE_CLIP_BOTTOM, clip_bottom);
+   }
   if (unlikely(dirty & (ETNA_DIRTY_SHADER))) {
      /*00E00*/ EMIT_STATE(RA_CONTROL, ctx->shader_state.RA_CONTROL);
   }
--- a/src/gallium/drivers/etnaviv/etnaviv_internal.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_internal.h
@@ -47,6 +47,17 @@
 /* PE render targets must be aligned to 64 bytes */
 #define ETNA_PE_ALIGNMENT (64)

+/* These demarcate the margin (fixp16) between the computed sizes and the
+  value sent to the chip. These have been set to the numbers used by the
+  Vivante driver on gc2000. They used to be -1 for scissor right and bottom. I
+  am not sure whether older hardware was relying on these or they were just a
+  guess. But if so, these need to be moved to the _specs structure.
+*/
+#define ETNA_SE_SCISSOR_MARGIN_RIGHT (0x1119)
+#define ETNA_SE_SCISSOR_MARGIN_BOTTOM (0x1111)
+#define ETNA_SE_CLIP_MARGIN_RIGHT (0xffff)
+#define ETNA_SE_CLIP_MARGIN_BOTTOM (0xffff)
+
 /* GPU chip 3D specs */
 struct etna_specs {
   /* supports SUPERTILE (64x64) tiling? */
@@ -59,6 +70,8 @@ struct etna_specs {
   unsigned has_sign_floor_ceil : 1;
   /* can use VS_RANGE, PS_RANGE registers*/
   unsigned has_shader_range_registers : 1;
+   /* has the new sin/cos functions */
+   unsigned has_new_sin_cos : 1;
   /* can use any kind of wrapping mode on npot textures */
   unsigned npot_tex_any_wrap;
   /* number of bits per TS tile */
@@ -126,6 +139,8 @@ struct compiled_scissor_state {
   uint32_t SE_SCISSOR_TOP;
   uint32_t SE_SCISSOR_RIGHT;
   uint32_t SE_SCISSOR_BOTTOM;
+   uint32_t SE_CLIP_RIGHT;
+   uint32_t SE_CLIP_BOTTOM;
 };

 /* Compiled pipe_viewport_state */
@@ -140,6 +155,8 @@ struct compiled_viewport_state {
   uint32_t SE_SCISSOR_TOP;
   uint32_t SE_SCISSOR_RIGHT;
   uint32_t SE_SCISSOR_BOTTOM;
+   uint32_t SE_CLIP_RIGHT;
+   uint32_t SE_CLIP_BOTTOM;
   uint32_t PE_DEPTH_NEAR;
   uint32_t PE_DEPTH_FAR;
 };
@@ -162,6 +179,8 @@ struct compiled_framebuffer_state {
   uint32_t SE_SCISSOR_TOP;
   uint32_t SE_SCISSOR_RIGHT;
   uint32_t SE_SCISSOR_BOTTOM;
+   uint32_t SE_CLIP_RIGHT;
+   uint32_t SE_CLIP_BOTTOM;
   uint32_t RA_MULTISAMPLE_UNK00E04;
   uint32_t RA_MULTISAMPLE_UNK00E10[VIVS_RA_MULTISAMPLE_UNK00E10__LEN];
   uint32_t RA_CENTROID_TABLE[VIVS_RA_CENTROID_TABLE__LEN];
--- a/src/gallium/drivers/etnaviv/etnaviv_resource.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_resource.c
@@ -201,7 +201,10 @@ etna_resource_alloc(struct pipe_screen *pscreen, unsigned layout,

   size = setup_miptree(rsc, paddingX, paddingY, msaa_xscale, msaa_yscale);

-   struct etna_bo *bo = etna_bo_new(screen->dev, size, DRM_ETNA_GEM_CACHE_WC);
+   uint32_t flags = DRM_ETNA_GEM_CACHE_WC;
+   if (templat->bind & PIPE_BIND_VERTEX_BUFFER)
+      flags |= DRM_ETNA_GEM_FORCE_MMU;
+   struct etna_bo *bo = etna_bo_new(screen->dev, size, flags);
   if (unlikely(bo == NULL)) {
      BUG("Problem allocating video memory for resource");
      return NULL;
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -469,8 +469,11 @@ etna_screen_is_format_supported(struct pipe_screen *pscreen,
      return FALSE;

   if (usage & PIPE_BIND_RENDER_TARGET) {
-      /* if render target, must be RS-supported format */
-      if (translate_rs_format(format) != ETNA_NO_MATCH) {
+      /* If render target, must be RS-supported format that is not rb swapped.
+       * Exposing rb swapped (or other swizzled) formats for rendering would
+       * involve swizzling in the pixel shader.
+       */
+      if (translate_rs_format(format) != ETNA_NO_MATCH && !translate_rs_format_rb_swap(format)) {
         /* Validate MSAA; number of samples must be allowed, and render target
          * must have MSAA'able format. */
         if (sample_count > 1) {
@@ -617,6 +620,8 @@ etna_get_specs(struct etna_screen *screen)
      screen->model >= 0x1000 || screen->model == 0x880;
   screen->specs.npot_tex_any_wrap =
      VIV_FEATURE(screen, chipMinorFeatures1, NON_POWER_OF_TWO);
+   screen->specs.has_new_sin_cos =
+      VIV_FEATURE(screen, chipMinorFeatures3, HAS_FAST_TRANSCENDENTALS);

   if (instruction_count > 256) { /* unified instruction memory? */
      screen->specs.vs_offset = 0xC000;
--- a/src/gallium/drivers/etnaviv/etnaviv_state.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_state.c
@@ -323,8 +323,10 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
   /* Scissor setup */
   cs->SE_SCISSOR_LEFT = 0; /* affected by rasterizer and scissor state as well */
   cs->SE_SCISSOR_TOP = 0;
-   cs->SE_SCISSOR_RIGHT = (sv->width << 16) - 1;
-   cs->SE_SCISSOR_BOTTOM = (sv->height << 16) - 1;
+   cs->SE_SCISSOR_RIGHT = (sv->width << 16) + ETNA_SE_SCISSOR_MARGIN_RIGHT;
+   cs->SE_SCISSOR_BOTTOM = (sv->height << 16) + ETNA_SE_SCISSOR_MARGIN_BOTTOM;
+   cs->SE_CLIP_RIGHT = (sv->width << 16) + ETNA_SE_CLIP_MARGIN_RIGHT;
+   cs->SE_CLIP_BOTTOM = (sv->height << 16) + ETNA_SE_CLIP_MARGIN_BOTTOM;

   cs->TS_MEM_CONFIG = ts_mem_config;

@@ -345,13 +347,17 @@ etna_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
 {
   struct etna_context *ctx = etna_context(pctx);
   struct compiled_scissor_state *cs = &ctx->scissor;
+   assert(ss->minx <= ss->maxx);
+   assert(ss->miny <= ss->maxy);

   /* note that this state is only used when rasterizer_state->scissor is on */
   ctx->scissor_s = *ss;
   cs->SE_SCISSOR_LEFT = (ss->minx << 16);
   cs->SE_SCISSOR_TOP = (ss->miny << 16);
-   cs->SE_SCISSOR_RIGHT = (ss->maxx << 16) - 1;
-   cs->SE_SCISSOR_BOTTOM = (ss->maxy << 16) - 1;
+   cs->SE_SCISSOR_RIGHT = (ss->maxx << 16) + ETNA_SE_SCISSOR_MARGIN_RIGHT;
+   cs->SE_SCISSOR_BOTTOM = (ss->maxy << 16) + ETNA_SE_SCISSOR_MARGIN_BOTTOM;
+   cs->SE_CLIP_RIGHT = (ss->maxx << 16) + ETNA_SE_CLIP_MARGIN_RIGHT;
+   cs->SE_CLIP_BOTTOM = (ss->maxy << 16) + ETNA_SE_CLIP_MARGIN_BOTTOM;

   ctx->dirty |= ETNA_DIRTY_SCISSOR;
 }
@@ -387,22 +393,14 @@ etna_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
   /* Compute scissor rectangle (fixp) from viewport.
    * Make sure left is always < right and top always < bottom.
    */
-   cs->SE_SCISSOR_LEFT = etna_f32_to_fixp16(MAX2(vs->translate[0] - vs->scale[0], 0.0f));
-   cs->SE_SCISSOR_TOP = etna_f32_to_fixp16(MAX2(vs->translate[1] - vs->scale[1], 0.0f));
-   cs->SE_SCISSOR_RIGHT = etna_f32_to_fixp16(MAX2(vs->translate[0] + vs->scale[0], 0.0f));
-   cs->SE_SCISSOR_BOTTOM = etna_f32_to_fixp16(MAX2(vs->translate[1] + vs->scale[1], 0.0f));
-
-   if (cs->SE_SCISSOR_LEFT > cs->SE_SCISSOR_RIGHT) {
-      uint32_t tmp = cs->SE_SCISSOR_RIGHT;
-      cs->SE_SCISSOR_RIGHT = cs->SE_SCISSOR_LEFT;
-      cs->SE_SCISSOR_LEFT = tmp;
-   }
-
-   if (cs->SE_SCISSOR_TOP > cs->SE_SCISSOR_BOTTOM) {
-      uint32_t tmp = cs->SE_SCISSOR_BOTTOM;
-      cs->SE_SCISSOR_BOTTOM = cs->SE_SCISSOR_TOP;
-      cs->SE_SCISSOR_TOP = tmp;
-   }
+   cs->SE_SCISSOR_LEFT = etna_f32_to_fixp16(MAX2(vs->translate[0] - fabsf(vs->scale[0]), 0.0f));
+   cs->SE_SCISSOR_TOP = etna_f32_to_fixp16(MAX2(vs->translate[1] - fabsf(vs->scale[1]), 0.0f));
+   uint32_t right_fixp = etna_f32_to_fixp16(MAX2(vs->translate[0] + fabsf(vs->scale[0]), 0.0f));
+   uint32_t bottom_fixp = etna_f32_to_fixp16(MAX2(vs->translate[1] + fabsf(vs->scale[1]), 0.0f));
+   cs->SE_SCISSOR_RIGHT = right_fixp + ETNA_SE_SCISSOR_MARGIN_RIGHT;
+   cs->SE_SCISSOR_BOTTOM = bottom_fixp + ETNA_SE_SCISSOR_MARGIN_BOTTOM;
+   cs->SE_CLIP_RIGHT = right_fixp + ETNA_SE_CLIP_MARGIN_RIGHT;
+   cs->SE_CLIP_BOTTOM = bottom_fixp + ETNA_SE_CLIP_MARGIN_BOTTOM;

   cs->PE_DEPTH_NEAR = fui(0.0); /* not affected if depth mode is Z (as in GL) */
   cs->PE_DEPTH_FAR = fui(1.0);
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -9,6 +9,7 @@ AM_CFLAGS = \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(FREEDRENO_CFLAGS)

+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
 	$(MKDIR_GEN)
 	$(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py > $@ || ($(RM) $@; false)
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2924,7 +2924,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	struct pipe_stream_output_info so = pipeshader->selector->so;
 	struct tgsi_full_immediate *immediate;
 	struct r600_shader_ctx ctx;
-	struct r600_bytecode_output output[32];
+	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
 	unsigned output_done, noutput;
 	unsigned opcode;
 	int i, j, k, r = 0;
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -660,7 +660,8 @@ si_mark_image_range_valid(const struct pipe_image_view *view)

 static void si_set_shader_image(struct si_context *ctx,
 				unsigned shader,
-				unsigned slot, const struct pipe_image_view *view)
+				unsigned slot, const struct pipe_image_view *view,
+				bool skip_decompress)
 {
 	struct si_screen *screen = ctx->screen;
 	struct si_images_info *images = &ctx->images[shader];
@@ -702,7 +703,7 @@ static void si_set_shader_image(struct si_context *ctx,
 		assert(!tex->is_depth);
 		assert(tex->fmask.size == 0);

-		if (uses_dcc &&
+		if (uses_dcc && !skip_decompress &&
 		    (view->access & PIPE_IMAGE_ACCESS_WRITE ||
 		     !vi_dcc_formats_compatible(res->b.b.format, view->format))) {
 			/* If DCC can't be disabled, at least decompress it.
@@ -776,10 +777,10 @@ si_set_shader_images(struct pipe_context *pipe,

 	if (views) {
 		for (i = 0, slot = start_slot; i < count; ++i, ++slot)
-			si_set_shader_image(ctx, shader, slot, &views[i]);
+			si_set_shader_image(ctx, shader, slot, &views[i], false);
 	} else {
 		for (i = 0, slot = start_slot; i < count; ++i, ++slot)
-			si_set_shader_image(ctx, shader, slot, NULL);
+			si_set_shader_image(ctx, shader, slot, NULL, false);
 	}

 	si_update_compressed_tex_shader_mask(ctx, shader);
@@ -1710,7 +1711,7 @@ void si_update_all_texture_descriptors(struct si_context *sctx)
 			    view->resource->target == PIPE_BUFFER)
 				continue;

-			si_set_shader_image(sctx, shader, i, view);
+			si_set_shader_image(sctx, shader, i, view, true);
 		}

 		/* Sampler views. */
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3368,7 +3368,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 		first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
 		data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
 		num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
-		channel = &desc->channel[first_non_void];
+		channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;

 		v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
 				   S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
@@ -3390,12 +3390,12 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 				/* This isn't actually used in OpenGL. */
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_A2_SINT << (4 * i);
 			}
-		} else if (channel->type == UTIL_FORMAT_TYPE_FIXED) {
+		} else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
 			if (desc->swizzle[3] == PIPE_SWIZZLE_1)
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBX_32_FIXED << (4 * i);
 			else
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBA_32_FIXED << (4 * i);
-		} else if (channel->size == 32 && !channel->pure_integer) {
+		} else if (channel && channel->size == 32 && !channel->pure_integer) {
 			if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
 				if (channel->normalized) {
 					if (desc->swizzle[3] == PIPE_SWIZZLE_1)
--- a/src/gallium/state_trackers/clover/Makefile.am
+++ b/src/gallium/state_trackers/clover/Makefile.am
@@ -2,12 +2,12 @@ include Makefile.sources

 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
+	-I$(top_builddir)/src \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/drivers \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/winsys \
-	-I$(top_builddir)/src \
 	-I$(srcdir)

 if HAVE_CLOVER_ICD
--- a/src/gallium/state_trackers/dri/Makefile.am
+++ b/src/gallium/state_trackers/dri/Makefile.am
@@ -28,8 +28,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(GALLIUM_CFLAGS) \
 	$(LIBDRM_CFLAGS) \
 	$(VISIBILITY_CFLAGS)
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -81,7 +81,7 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
   }

   if (context->decoder->entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE)
-      context->decoder->begin_frame(context->decoder, context->target, &context->desc.base);
+      context->needs_begin_frame = true;

   return VA_STATUS_SUCCESS;
 }
@@ -178,6 +178,8 @@ handlePictureParameterBuffer(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *

      if (!context->decoder)
         return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+      context->needs_begin_frame = true;
   }

   return vaStatus;
@@ -308,8 +310,11 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
   sizes[num_buffers] = buf->size;
   ++num_buffers;

-   context->decoder->begin_frame(context->decoder, context->target,
-      &context->desc.base);
+   if (context->needs_begin_frame) {
+      context->decoder->begin_frame(context->decoder, context->target,
+         &context->desc.base);
+      context->needs_begin_frame = false;
+   }
   context->decoder->decode_bitstream(context->decoder, context->target, &context->desc.base,
      num_buffers, (const void * const*)buffers, sizes);
 }
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -261,6 +261,7 @@ typedef struct {
   int target_id;
   bool first_single_submitted;
   int gop_coeff;
+   bool needs_begin_frame;
 } vlVaContext;

 typedef struct {
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -75,6 +75,13 @@ vlVdpOutputSurfaceCreate(VdpDevice device,

   memset(&res_tmpl, 0, sizeof(res_tmpl));

+   /*
+    * The output won't look correct when this buffer is sent to X
+    * if the VDPAU RGB component order doesn't match the X11 one, so
+    * we only allow the X11 format.
+    */
+   vlsurface->send_to_X = rgba_format == VDP_RGBA_FORMAT_B8G8R8A8;
+
   res_tmpl.target = PIPE_TEXTURE_2D;
   res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
   res_tmpl.width0 = width;
--- a/src/gallium/state_trackers/vdpau/presentation.c
+++ b/src/gallium/state_trackers/vdpau/presentation.c
@@ -231,7 +231,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
   vscreen = pq->device->vscreen;

   pipe_mutex_lock(pq->device->mutex);
-   if (vscreen->set_back_texture_from_output)
+   if (vscreen->set_back_texture_from_output && surf->send_to_X)
      vscreen->set_back_texture_from_output(vscreen, surf->surface->texture, clip_width, clip_height);
   tex = vscreen->texture_from_drawable(vscreen, (void *)pq->drawable);
   if (!tex) {
@@ -239,7 +239,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
      return VDP_STATUS_INVALID_HANDLE;
   }

-   if (!vscreen->set_back_texture_from_output) {
+   if (!vscreen->set_back_texture_from_output || !surf->send_to_X) {
      dirty_area = vscreen->get_dirty_area(vscreen);

      memset(&surf_templ, 0, sizeof(surf_templ));
@@ -289,7 +289,7 @@ vlVdpPresentationQueueDisplay(VdpPresentationQueue presentation_queue,
      framenum++;
   }

-   if (!vscreen->set_back_texture_from_output) {
+   if (!vscreen->set_back_texture_from_output || !surf->send_to_X) {
      pipe_resource_reference(&tex, NULL);
      pipe_surface_reference(&surf_draw, NULL);
   }
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -415,6 +415,7 @@ typedef struct
   struct pipe_fence_handle *fence;
   struct vl_compositor_state cstate;
   struct u_rect dirty_area;
+   bool send_to_X;
 } vlVdpOutputSurface;

 typedef struct
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -27,8 +27,8 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/loader \
 	-I$(top_srcdir)/src/mapi/ \
 	-I$(top_srcdir)/src/mesa/ \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_builddir)/src/mesa/drivers/dri/common/ \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_srcdir)/src/gallium/winsys \
 	-I$(top_srcdir)/src/gallium/state_trackers/nine \
 	$(GALLIUM_TARGET_CFLAGS) \
--- a/src/glx/Makefile.am
+++ b/src/glx/Makefile.am
@@ -37,10 +37,10 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/include/GL/internal \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/loader \
-	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_builddir)/src/mapi \
+	-I$(top_srcdir)/src/mapi \
 	-I$(top_builddir)/src/mapi/glapi \
+	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
 	$(SHARED_GLAPI_CFLAGS) \
 	$(EXTRA_DEFINES_XF86VIDMODE) \
--- a/src/glx/apple/Makefile.am
+++ b/src/glx/apple/Makefile.am
@@ -6,11 +6,11 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/glx \
-	-I$(top_srcdir)/src/mesa \
 	-I$(top_builddir)/src/mesa \
+	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_builddir)/src/mapi/glapi \
+	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
 	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES) \
--- a/src/glx/windows/Makefile.am
+++ b/src/glx/windows/Makefile.am
@@ -24,8 +24,8 @@ libwindowsglx_la_CFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/glx \
 	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_builddir)/src/mapi/glapi \
+	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
 	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES) \
--- a/src/intel/blorp/blorp_clear.c
+++ b/src/intel/blorp/blorp_clear.c
@@ -349,6 +349,29 @@ blorp_clear(struct blorp_batch *batch,
   if (format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
      clear_color.u32[0] = float3_to_rgb9e5(clear_color.f32);
      format = ISL_FORMAT_R32_UINT;
+   } else if (format == ISL_FORMAT_A4B4G4R4_UNORM) {
+      /* Broadwell and earlier cannot render to this format so we need to work
+       * around it by swapping the colors around and using B4G4R4A4 instead.
+       */
+
+      /* First, we apply the swizzle. */
+      union isl_color_value old;
+      assert((unsigned)(swizzle.r - ISL_CHANNEL_SELECT_RED) < 4);
+      assert((unsigned)(swizzle.g - ISL_CHANNEL_SELECT_RED) < 4);
+      assert((unsigned)(swizzle.b - ISL_CHANNEL_SELECT_RED) < 4);
+      assert((unsigned)(swizzle.a - ISL_CHANNEL_SELECT_RED) < 4);
+      old.u32[swizzle.r - ISL_CHANNEL_SELECT_RED] = clear_color.u32[0];
+      old.u32[swizzle.g - ISL_CHANNEL_SELECT_RED] = clear_color.u32[1];
+      old.u32[swizzle.b - ISL_CHANNEL_SELECT_RED] = clear_color.u32[2];
+      old.u32[swizzle.a - ISL_CHANNEL_SELECT_RED] = clear_color.u32[3];
+      swizzle = ISL_SWIZZLE_IDENTITY;
+
+      /* Now we re-order for the new format */
+      clear_color.u32[0] = old.u32[1];
+      clear_color.u32[1] = old.u32[2];
+      clear_color.u32[2] = old.u32[3];
+      clear_color.u32[3] = old.u32[0];
+      format = ISL_FORMAT_B4G4R4A4_UNORM;
   }

   memcpy(&params.wm_inputs.clear_color, clear_color.f32, sizeof(float) * 4);
--- a/src/intel/isl/isl_format.c
+++ b/src/intel/isl/isl_format.c
@@ -218,9 +218,10 @@ static const struct surface_format_info format_info[] = {
   SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,   P8A8_UNORM_PALETTE1)
   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   A1B5G5R5_UNORM)
   /* According to the PRM, A4B4G4R4_UNORM isn't supported until Sky Lake
-    * but empirical testing indicates that it works just fine on Broadwell.
+    * but empirical testing indicates that at least sampling works just fine
+    * on Broadwell.
    */
-   SF(80, 80,  x,  x, 80,  x,  x,  x,  x,    x,   A4B4G4R4_UNORM)
+   SF(80, 80,  x,  x, 90,  x,  x,  x,  x,    x,   A4B4G4R4_UNORM)
   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,   L8A8_UINT)
   SF(90,  x,  x,  x,  x,  x,  x,  x,  x,    x,   L8A8_SINT)
   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x,    x,   R8_UNORM)
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -232,9 +232,12 @@ VkResult anv_AllocateCommandBuffers(
         break;
   }

-   if (result != VK_SUCCESS)
+   if (result != VK_SUCCESS) {
      anv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
                             i, pCommandBuffers);
+      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
+         pCommandBuffers[i] = VK_NULL_HANDLE;
+   }

   return result;
 }
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -329,18 +329,18 @@ VkResult anv_CreateDescriptorPool(
      }
   }

-   const size_t size =
-      sizeof(*pool) +
+   const size_t pool_size =
      pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
      descriptor_count * sizeof(struct anv_descriptor) +
      buffer_count * sizeof(struct anv_buffer_view);
+   const size_t total_size = sizeof(*pool) + pool_size;

-   pool = vk_alloc2(&device->alloc, pAllocator, size, 8,
+   pool = vk_alloc2(&device->alloc, pAllocator, total_size, 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!pool)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

-   pool->size = size;
+   pool->size = pool_size;
   pool->next = 0;
   pool->free_list = EMPTY;

--- a/src/intel/vulkan/anv_nir_lower_input_attachments.c
+++ b/src/intel/vulkan/anv_nir_lower_input_attachments.c
@@ -100,11 +100,8 @@ try_lower_input_load(nir_function_impl *impl, nir_intrinsic_instr *load)

   if (image_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) {
      tex->op = nir_texop_txf_ms;
-
-      nir_ssa_def *sample_id =
-         nir_load_system_value(&b, nir_intrinsic_load_sample_id, 0);
      tex->src[2].src_type = nir_tex_src_ms_index;
-      tex->src[2].src = nir_src_for_ssa(sample_id);
+      tex->src[2].src = load->src[1];
   }

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -55,8 +55,6 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
 {
   struct anv_device *device = cmd_buffer->device;

-/* XXX: Do we need this on more than just BDW? */
-#if (GEN_GEN >= 8)
   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
@@ -65,9 +63,10 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.DCFlushEnable = true;
      pc.RenderTargetCacheFlushEnable = true;
+      pc.CommandStreamerStallEnable = true;
   }
-#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
@@ -148,6 +147,8 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
+      pc.ConstantCacheInvalidationEnable = true;
+      pc.StateCacheInvalidationEnable = true;
   }
 }

@@ -1177,9 +1178,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,

      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         assert(stage == MESA_SHADER_FRAGMENT);
-         if (desc->image_view->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
-            /* For stencil input attachments, we treat it like any old texture
-             * that a user may have bound.
+         if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) {
+            /* For depth and stencil input attachments, we treat them like
+             * any old texture that a user may have bound.
             */
            surface_state = desc->image_view->sampler_surface_state;
            assert(surface_state.alloc_size);
@@ -1187,9 +1188,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                  desc->image_view->image->aux_usage,
                                  surface_state);
         } else {
-            /* For depth and color input attachments, we create the surface
-             * state at vkBeginRenderPass time so that we can include aux
-             * and clear color information.
+            /* For color input attachments, we create the surface state at
+             * vkBeginRenderPass time so that we can include aux and clear
+             * color information.
             */
            assert(binding->input_attachment_index < subpass->input_count);
            const unsigned subpass_att = binding->input_attachment_index;
--- a/src/loader/Makefile.am
+++ b/src/loader/Makefile.am
@@ -39,8 +39,8 @@ libloader_la_LIBADD =

 if HAVE_DRICOMMON
 libloader_la_CPPFLAGS += \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_builddir)/src/mesa/drivers/dri/common/ \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common/ \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/mapi/ \
 	-DUSE_DRICONF
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -46,8 +46,8 @@ AM_CPPFLAGS =							\
 	$(SELINUX_CFLAGS)					\
 	-I$(top_srcdir)/include					\
 	-I$(top_srcdir)/src					\
-	-I$(top_srcdir)/src/mapi				\
-	-I$(top_builddir)/src/mapi
+	-I$(top_builddir)/src/mapi				\
+	-I$(top_srcdir)/src/mapi

 include Makefile.sources

--- a/src/mesa/drivers/dri/i915/Makefile.am
+++ b/src/mesa/drivers/dri/i915/Makefile.am
@@ -30,9 +30,9 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(INTEL_CFLAGS)
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -30,21 +30,22 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
 	-I$(top_srcdir)/src/gtest/include \
-	-I$(top_srcdir)/src/compiler/nir \
-	-I$(top_srcdir)/src/intel \
 	-I$(top_builddir)/src/compiler/glsl \
 	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/src/compiler/nir \
 	-I$(top_builddir)/src/intel \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	-I$(top_srcdir)/src/intel \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(INTEL_CFLAGS)

 AM_CXXFLAGS = $(AM_CFLAGS)

+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
 	$(MKDIR_GEN)
 	$(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@; false)
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -284,8 +284,10 @@ brw_blorp_to_isl_format(struct brw_context *brw, mesa_format format,
   case MESA_FORMAT_S_UINT8:
      return ISL_FORMAT_R8_UINT;
   case MESA_FORMAT_Z24_UNORM_X8_UINT:
+   case MESA_FORMAT_Z24_UNORM_S8_UINT:
      return ISL_FORMAT_R24_UNORM_X8_TYPELESS;
   case MESA_FORMAT_Z_FLOAT32:
+   case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
      return ISL_FORMAT_R32_FLOAT;
   case MESA_FORMAT_Z_UNORM16:
      return ISL_FORMAT_R16_UNORM;
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -910,6 +910,9 @@ brw_process_driconf_options(struct brw_context *brw)
   ctx->Const.ForceGLSLExtensionsWarn =
      driQueryOptionb(options, "force_glsl_extensions_warn");

+   ctx->Const.ForceGLSLVersion =
+      driQueryOptioni(options, "force_glsl_version");
+
   ctx->Const.DisableGLSLLineContinuations =
      driQueryOptionb(options, "disable_glsl_line_continuations");

--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -508,7 +508,7 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
-   brw_set_src0(p, insn, payload);
+   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -177,6 +177,49 @@ static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
 static void brwDeleteProgram( struct gl_context *ctx,
 			      struct gl_program *prog )
 {
+   struct brw_context *brw = brw_context(ctx);
+
+   /* Beware!  prog's refcount has reached zero, and it's about to be freed.
+    *
+    * In brw_upload_pipeline_state(), we compare brw->foo_program to
+    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
+    * pointer has changed.
+    *
+    * We cannot leave brw->foo_program as a dangling pointer to the dead
+    * program.  malloc() may allocate the same memory for a new gl_program,
+    * causing us to see matching pointers...but totally different programs.
+    *
+    * We cannot set brw->foo_program to NULL, either.  If we've deleted the
+    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
+    * would cause us to see matching pointers (NULL == NULL), and fail to
+    * detect that a program has changed since our last draw.
+    *
+    * So, set it to a bogus gl_program pointer that will never match,
+    * causing us to properly reevaluate the state on our next draw.
+    *
+    * Getting this wrong causes heisenbugs which are very hard to catch,
+    * as you need a very specific allocation pattern to hit the problem.
+    */
+   static const struct gl_program deleted_program;
+
+   if (brw->vertex_program == prog)
+      brw->vertex_program = &deleted_program;
+
+   if (brw->tess_ctrl_program == prog)
+      brw->tess_ctrl_program = &deleted_program;
+
+   if (brw->tess_eval_program == prog)
+      brw->tess_eval_program = &deleted_program;
+
+   if (brw->geometry_program == prog)
+      brw->geometry_program = &deleted_program;
+
+   if (brw->fragment_program == prog)
+      brw->fragment_program = &deleted_program;
+
+   if (brw->compute_program == prog)
+      brw->compute_program = &deleted_program;
+
   _mesa_delete_program( ctx, prog );
 }

--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -477,6 +477,18 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
      break;
   case BLORP_HIZ_OP_DEPTH_CLEAR:
      dw1 |= GEN8_WM_HZ_DEPTH_CLEAR;
+
+      /* The "Clear Rectangle X Max" (and Y Max) fields are exclusive,
+       * rather than inclusive, and limited to 16383.  This means that
+       * for a 16384x16384 render target, we would miss the last row
+       * or column of pixels along the edge.
+       *
+       * To work around this, we have to set the "Full Surface Depth
+       * and Stencil Clear" bit.  We can do this in all cases because
+       * we always clear the full rectangle anyway.  We'll need to
+       * change this if we ever add scissored clear support.
+       */
+      dw1 |= GEN8_WM_HZ_FULL_SURFACE_DEPTH_CLEAR;
      break;
   case BLORP_HIZ_OP_NONE:
      unreachable("Should not get here.");
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -261,4 +261,8 @@ retry:

   if (params->dst.enabled)
      brw_render_cache_set_add_bo(brw, params->dst.addr.buffer);
+   if (params->depth.enabled)
+      brw_render_cache_set_add_bo(brw, params->depth.addr.buffer);
+   if (params->stencil.enabled)
+      brw_render_cache_set_add_bo(brw, params->stencil.addr.buffer);
 }
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -235,13 +235,9 @@ emit_miptree_blit(struct brw_context *brw,
    *    represented per scan line’s worth of graphics data depends on the
    *    color depth.
    *
-    * Furthermore, intelEmitCopyBlit (which is called below) uses a signed
-    * 16-bit integer to represent buffer pitch, so it can only handle buffer
-    * pitches < 32k. However, the pitch is measured in bytes for linear buffers
-    * and dwords for tiled buffers.
-    *
-    * As a result of these two limitations, we can only use the blitter to do
-    * this copy when the miptree's pitch is less than 32k linear or 128k tiled.
+    * The blitter's pitch is a signed 16-bit integer, but measured in bytes
+    * for linear surfaces and DWords for tiled surfaces.  So the maximum
+    * pitch is 32k linear and 128k tiled.
    */
   if (blt_pitch(src_mt) >= 32768 || blt_pitch(dst_mt) >= 32768) {
      perf_debug("Falling back due to >= 32k/128k pitch\n");
@@ -480,11 +476,11 @@ static bool
 can_fast_copy_blit(struct brw_context *brw,
 		   drm_intel_bo *src_buffer,
                   int16_t src_x, int16_t src_y,
-                   uintptr_t src_offset, uint32_t src_pitch,
+                   uintptr_t src_offset, int32_t src_pitch,
                   uint32_t src_tiling, uint32_t src_tr_mode,
 		   drm_intel_bo *dst_buffer,
                   int16_t dst_x, int16_t dst_y,
-                   uintptr_t dst_offset, uint32_t dst_pitch,
+                   uintptr_t dst_offset, int32_t dst_pitch,
                   uint32_t dst_tiling, uint32_t dst_tr_mode,
                   int16_t w, int16_t h, uint32_t cpp,
                   GLenum logic_op)
@@ -520,10 +516,8 @@ can_fast_copy_blit(struct brw_context *brw,
   if (!_mesa_is_pow_two(cpp) || cpp > 16)
      return false;

-   /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15
-    * of the destination pitch must be zero.
-    */
-   if ((src_pitch >> 15 & 1) != 0 || (dst_pitch >> 15 & 1) != 0)
+   /* For Fast Copy Blits the pitch cannot be a negative number. */
+   if (src_pitch < 0 || dst_pitch < 0)
      return false;

   /* For Linear surfaces, the pitch has to be an OWord (16byte) multiple. */
@@ -577,12 +571,12 @@ xy_blit_cmd(uint32_t src_tiling, uint32_t src_tr_mode,
 bool
 intelEmitCopyBlit(struct brw_context *brw,
 		  GLuint cpp,
-		  GLshort src_pitch,
+		  int32_t src_pitch,
 		  drm_intel_bo *src_buffer,
 		  GLuint src_offset,
 		  uint32_t src_tiling,
 		  uint32_t src_tr_mode,
-		  GLshort dst_pitch,
+		  int32_t dst_pitch,
 		  drm_intel_bo *dst_buffer,
 		  GLuint dst_offset,
 		  uint32_t dst_tiling,
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -31,12 +31,12 @@
 bool
 intelEmitCopyBlit(struct brw_context *brw,
                  GLuint cpp,
-                  GLshort src_pitch,
+                  int32_t src_pitch,
                  drm_intel_bo *src_buffer,
                  GLuint src_offset,
                  uint32_t src_tiling,
                  uint32_t src_tr_mode,
-                  GLshort dst_pitch,
+                  int32_t dst_pitch,
                  drm_intel_bo *dst_buffer,
                  GLuint dst_offset,
                  uint32_t dst_tiling,
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -79,6 +79,7 @@ DRI_CONF_BEGIN
      DRI_CONF_ALWAYS_FLUSH_CACHE("false")
      DRI_CONF_DISABLE_THROTTLING("false")
      DRI_CONF_FORCE_GLSL_EXTENSIONS_WARN("false")
+      DRI_CONF_FORCE_GLSL_VERSION(0)
      DRI_CONF_DISABLE_GLSL_LINE_CONTINUATIONS("false")
      DRI_CONF_DISABLE_BLEND_FUNC_EXTENDED("false")
      DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION("false")
--- a/src/mesa/drivers/dri/r200/Makefile.am
+++ b/src/mesa/drivers/dri/r200/Makefile.am
@@ -34,9 +34,9 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/r200/server \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(RADEON_CFLAGS)
--- a/src/mesa/drivers/dri/radeon/Makefile.am
+++ b/src/mesa/drivers/dri/radeon/Makefile.am
@@ -35,9 +35,9 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/radeon/server \
-	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
 	$(RADEON_CFLAGS)
--- a/src/mesa/drivers/dri/swrast/Makefile.am
+++ b/src/mesa/drivers/dri/swrast/Makefile.am
@@ -30,8 +30,8 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(LIBDRM_CFLAGS) \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS)
--- a/src/mesa/drivers/osmesa/Makefile.am
+++ b/src/mesa/drivers/osmesa/Makefile.am
@@ -28,8 +28,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/mapi \
 	-I$(top_builddir)/src/mapi \
+	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa/ \
 	$(DEFINES)
 AM_CFLAGS = $(PTHREAD_CFLAGS) \
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -363,7 +363,7 @@ EXT(OES_point_size_array                    , dummy_true
 EXT(OES_point_sprite                        , ARB_point_sprite                       ,  x ,  x , ES1,  x , 2004)
 EXT(OES_primitive_bounding_box              , OES_primitive_bounding_box             ,  x ,  x ,  x ,  31, 2014)
 EXT(OES_query_matrix                        , dummy_true                             ,  x ,  x , ES1,  x , 2003)
-EXT(OES_read_format                         , dummy_true                             , GLL, GLC, ES1,  x , 2003)
+EXT(OES_read_format                         , dummy_true                             , GLL,  x , ES1,  x , 2003)
 EXT(OES_rgb8_rgba8                          , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_sample_shading                      , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_sample_variables                    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1741,8 +1741,6 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
   GLcharARB *source;
   struct gl_shader *sh;

-   GLcharARB *replacement;
-
   sh = _mesa_lookup_shader_err(ctx, shaderObj, "glShaderSourceARB");
   if (!sh)
      return;
@@ -1799,6 +1797,8 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
   source[totalLength - 2] = '\0';

 #ifdef ENABLE_SHADER_CACHE
+   GLcharARB *replacement;
+
   /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace
    * if corresponding entry found from MESA_SHADER_READ_PATH.
    */
--- a/src/mesa/main/tests/Makefile.am
+++ b/src/mesa/main/tests/Makefile.am
@@ -4,8 +4,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/gtest/include \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mesa \
 	-I$(top_builddir)/src/mesa \
+	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/include \
 	$(DEFINES) $(INCLUDE_DIRS)

--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -278,7 +278,7 @@ void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state)


 static void
-st_destroy_context_priv(struct st_context *st)
+st_destroy_context_priv(struct st_context *st, bool destroy_pipe)
 {
   uint shader, i;

@@ -314,6 +314,10 @@ st_destroy_context_priv(struct st_context *st)
   st_invalidate_readpix_cache(st);

   cso_destroy_context(st->cso_context);
+
+   if (st->pipe && destroy_pipe)
+      st->pipe->destroy(st->pipe);
+
   free( st );
 }

@@ -503,7 +507,7 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
      /* This can happen when a core profile was requested, but the driver
       * does not support some features of GL 3.1 or later.
       */
-      st_destroy_context_priv(st);
+      st_destroy_context_priv(st, false);
      return NULL;
   }

@@ -579,7 +583,6 @@ destroy_tex_sampler_cb(GLuint id, void *data, void *userData)
 
 void st_destroy_context( struct st_context *st )
 {
-   struct pipe_context *pipe = st->pipe;
   struct gl_context *ctx = st->ctx;
   GLuint i;

@@ -608,11 +611,9 @@ void st_destroy_context( struct st_context *st )

   /* This will free the st_context too, so 'st' must not be accessed
    * afterwards. */
-   st_destroy_context_priv(st);
+   st_destroy_context_priv(st, true);
   st = NULL;

-   pipe->destroy( pipe );
-
   free(ctx);
 }

--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@@ -379,7 +379,8 @@ wsi_wl_surface_get_capabilities(VkIcdSurfaceBase *surface,

   caps->currentExtent = (VkExtent2D) { -1, -1 };
   caps->minImageExtent = (VkExtent2D) { 1, 1 };
-   caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX };
+   /* This is the maximum supported size on Intel */
+   caps->maxImageExtent = (VkExtent2D) { 1 << 14, 1 << 14 };
   caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
   caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
   caps->maxImageArrayLayers = 1;
@@ -409,25 +410,27 @@ wsi_wl_surface_get_formats(VkIcdSurfaceBase *icd_surface,
   if (!display)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

-   uint32_t count = u_vector_length(&display->formats);
-
   if (pSurfaceFormats == NULL) {
-      *pSurfaceFormatCount = count;
+      *pSurfaceFormatCount = u_vector_length(&display->formats);
      return VK_SUCCESS;
   }

-   assert(*pSurfaceFormatCount >= count);
-   *pSurfaceFormatCount = count;
-
+   uint32_t count = 0;
   VkFormat *f;
   u_vector_foreach(f, &display->formats) {
-      *(pSurfaceFormats++) = (VkSurfaceFormatKHR) {
+      if (count == *pSurfaceFormatCount)
+         return VK_INCOMPLETE;
+
+      pSurfaceFormats[count++] = (VkSurfaceFormatKHR) {
         .format = *f,
         /* TODO: We should get this from the compositor somehow */
         .colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR,
      };
   }

+   assert(count <= *pSurfaceFormatCount); /* loop never writes past capacity */
+   *pSurfaceFormatCount = count;
+
   return VK_SUCCESS;
 }

@@ -441,11 +444,13 @@ wsi_wl_surface_get_present_modes(VkIcdSurfaceBase *surface,
      return VK_SUCCESS;
   }

-   assert(*pPresentModeCount >= ARRAY_SIZE(present_modes));
+   *pPresentModeCount = MIN2(*pPresentModeCount, ARRAY_SIZE(present_modes));
   typed_memcpy(pPresentModes, present_modes, *pPresentModeCount);
-   *pPresentModeCount = ARRAY_SIZE(present_modes);

-   return VK_SUCCESS;
+   if (*pPresentModeCount < ARRAY_SIZE(present_modes))
+      return VK_INCOMPLETE;
+   else
+      return VK_SUCCESS;
 }

 VkResult wsi_create_wl_surface(const VkAllocationCallbacks *pAllocator,
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@@ -370,7 +370,8 @@ x11_surface_get_capabilities(VkIcdSurfaceBase *icd_surface,
       */
      caps->currentExtent = (VkExtent2D) { -1, -1 };
      caps->minImageExtent = (VkExtent2D) { 1, 1 };
-      caps->maxImageExtent = (VkExtent2D) { INT16_MAX, INT16_MAX };
+      /* This is the maximum supported size on Intel */
+      caps->maxImageExtent = (VkExtent2D) { 1 << 14, 1 << 14 };
   }
   free(err);
   free(geom);