Update version to 17.0.0-rc2

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
i965/blorp: Make post draw flush more explicit
2017-01-25 13:24:27 +00:00 · 2017-01-24 17:18:18 +00:00 · 2017-01-24 17:17:53 +00:00 · 2017-01-24 17:17:22 +00:00 · 2017-01-24 17:17:02 +00:00 · 2017-01-24 02:28:55 +00:00
43 changed files with 545 additions and 182 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -43,6 +43,7 @@ LOCAL_CFLAGS += \
 	-DANDROID_VERSION=0x0$(MESA_ANDROID_MAJOR_VERSION)0$(MESA_ANDROID_MINOR_VERSION)

 LOCAL_CFLAGS += \
+	-DENABLE_SHADER_CACHE \
 	-D__STDC_LIMIT_MACROS \
 	-DHAVE___BUILTIN_EXPECT \
 	-DHAVE___BUILTIN_FFS \
--- a/2
+++ b/2
@@ -1 +1 @@
-17.0.0-devel
+17.0.0-rc2
--- a/configure.ac
+++ b/configure.ac
@@ -1766,6 +1766,7 @@ if test -n "$with_vulkan_drivers"; then
 fi


+DEFINES="$DEFINES -DENABLE_SHADER_CACHE"
 AM_CONDITIONAL(NEED_MEGADRIVER, test -n "$DRI_DIRS")
 AM_CONDITIONAL(NEED_LIBMESA, test "x$enable_glx" = xxlib -o \
                                  "x$enable_osmesa" = xyes -o \
--- a/src/amd/vulkan/Makefile.am
+++ b/src/amd/vulkan/Makefile.am
@@ -32,9 +32,6 @@ lib_LTLIBRARIES = libvulkan_radeon.la
 # The gallium includes are for the util/u_math.h include from main/macros.h

 AM_CPPFLAGS = \
-	$(AMDGPU_CFLAGS) \
-	$(VALGRIND_CFLAGS) \
-	$(DEFINES) \
 	-I$(top_srcdir)/include \
 	-I$(top_builddir)/src \
 	-I$(top_srcdir)/src \
@@ -48,7 +45,10 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/mesa \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/gallium/include
+	-I$(top_srcdir)/src/gallium/include \
+	$(AMDGPU_CFLAGS) \
+	$(VALGRIND_CFLAGS) \
+	$(DEFINES)

 AM_CFLAGS = \
 	$(VISIBILITY_CFLAGS) \
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -989,8 +989,7 @@ VkResult radv_QueueSubmit(
 			if (queue->device->trace_bo)
 				*queue->device->trace_id_ptr = 0;

-			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array,
-							pSubmits[i].commandBufferCount,
+			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, advance,
 							(struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
 							b ? pSubmits[i].waitSemaphoreCount : 0,
 							(struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -30,7 +30,7 @@

 /* Operations for lower_instructions() */
 #define SUB_TO_ADD_NEG     0x01
-#define DIV_TO_MUL_RCP     0x02
+#define FDIV_TO_MUL_RCP    0x02
 #define EXP_TO_EXP2        0x04
 #define POW_TO_EXP2        0x08
 #define LOG_TO_LOG2        0x10
@@ -49,6 +49,8 @@
 #define FIND_LSB_TO_FLOAT_CAST    0x20000
 #define FIND_MSB_TO_FLOAT_CAST    0x40000
 #define IMUL_HIGH_TO_MUL          0x80000
+#define DDIV_TO_MUL_RCP           0x100000
+#define DIV_TO_MUL_RCP            (FDIV_TO_MUL_RCP | DDIV_TO_MUL_RCP)

 /**
 * \see class lower_packing_builtins_visitor
--- a/src/compiler/glsl/lower_instructions.cpp
+++ b/src/compiler/glsl/lower_instructions.cpp
@@ -54,8 +54,8 @@
 * want to recognize add(op0, neg(op1)) or the other way around to
 * produce a subtract anyway.
 *
- * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
- * --------------------------------------
+ * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
+ * ---------------------------------------------------------
 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
 *
 * Many GPUs don't have a divide instruction (945 and 965 included),
@@ -63,9 +63,11 @@
 * reciprocal.  By breaking the operation down, constant reciprocals
 * can get constant folded.
 *
- * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
- * handles the integer case, converting to and from floating point so that
- * RCP is possible.
+ * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
+ * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
+ * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
+ * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
+ * point so that RCP is possible.
 *
 * EXP_TO_EXP2 and LOG_TO_LOG2:
 * ----------------------------
@@ -326,7 +328,8 @@ lower_instructions_visitor::mod_to_floor(ir_expression *ir)
   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */
-   if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
+   if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) ||
+       (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double()))
      div_to_mul_rcp(div_expr);

   ir_expression *const floor_expr =
@@ -1599,8 +1602,8 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
   case ir_binop_div:
      if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
 	 int_div_to_mul_rcp(ir);
-      else if ((ir->operands[1]->type->is_float() ||
-                ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
+      else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) ||
+               (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP)))
 	 div_to_mul_rcp(ir);
      break;

--- a/src/compiler/glsl/tests/cache_test.c
+++ b/src/compiler/glsl/tests/cache_test.c
@@ -37,6 +37,8 @@

 bool error = false;

+#ifdef ENABLE_SHADER_CACHE
+
 static void
 expect_equal(uint64_t actual, uint64_t expected, const char *test)
 {
@@ -378,10 +380,12 @@ test_put_key_and_get_key(void)

   disk_cache_destroy(cache);
 }
+#endif /* ENABLE_SHADER_CACHE */

 int
 main(void)
 {
+#ifdef ENABLE_SHADER_CACHE
   int err;

   test_disk_cache_create();
@@ -392,6 +396,7 @@ main(void)

   err = rmrf_local(CACHE_TEST_TMP);
   expect_equal(err, 0, "Removing " CACHE_TEST_TMP " again");
+#endif /* ENABLE_SHADER_CACHE */

   return error ? 1 : 0;
 }
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -210,43 +210,27 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
         return true;

      case nir_type_int:
-         for (unsigned i = 0; i < num_components; ++i) {
-            int64_t val;
-            switch (load->def.bit_size) {
-            case 32:
-               val = load->value.i32[new_swizzle[i]];
-               break;
-            case 64:
-               val = load->value.i64[new_swizzle[i]];
-               break;
-            default:
-               unreachable("unknown bit size");
-            }
-
-            if (val != const_val->data.i)
-               return false;
-         }
-         return true;
-
      case nir_type_uint:
      case nir_type_bool32:
-         for (unsigned i = 0; i < num_components; ++i) {
-            uint64_t val;
-            switch (load->def.bit_size) {
-            case 32:
-               val = load->value.u32[new_swizzle[i]];
-               break;
-            case 64:
-               val = load->value.u64[new_swizzle[i]];
-               break;
-            default:
-               unreachable("unknown bit size");
+         switch (load->def.bit_size) {
+         case 32:
+            for (unsigned i = 0; i < num_components; ++i) {
+               if (load->value.u32[new_swizzle[i]] !=
+                   (uint32_t)const_val->data.u)
+                  return false;
            }
+            return true;

-            if (val != const_val->data.u)
-               return false;
+         case 64:
+            for (unsigned i = 0; i < num_components; ++i) {
+               if (load->value.u64[new_swizzle[i]] != const_val->data.u)
+                  return false;
+            }
+            return true;
+
+         default:
+            unreachable("unknown bit size");
         }
-         return true;

      default:
         unreachable("Invalid alu source type");
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -1199,7 +1199,8 @@ var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member,
         is_vertex_input = false;
         location += vtn_var->patch ? VARYING_SLOT_PATCH0 : VARYING_SLOT_VAR0;
      } else {
-         unreachable("Location must be on input or output variable");
+         vtn_warn("Location must be on input or output variable");
+         return;
      }

      if (vtn_var->var) {
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -527,7 +527,7 @@ lp_build_gather(struct gallivm_state *gallivm,
      if (vec_zext) {
         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
         if (vector_justify) {
-#if PIPE_ARCH_BIG_ENDIAN
+#ifdef PIPE_ARCH_BIG_ENDIAN
            unsigned sv = dst_type.width - src_width;
            res = LLVMBuildShl(gallivm->builder, res,
                               lp_build_const_int_vec(gallivm, res_type, sv), "");
--- a/src/gallium/auxiliary/hud/hud_cpufreq.c
+++ b/src/gallium/auxiliary/hud/hud_cpufreq.c
@@ -149,6 +149,7 @@ hud_cpufreq_graph_install(struct hud_pane *pane, int cpu_index,
      break;
   case CPUFREQ_MAXIMUM:
      snprintf(gr->name, sizeof(gr->name), "%s-Max", cfi->name);
+      break;
   default:
      return;
   }
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
+++ b/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2017 by the following authors:
@@ -2028,6 +2028,8 @@ static inline uint32_t A5XX_GRAS_CL_VPORT_ZSCALE_0(float val)
 }

 #define REG_A5XX_GRAS_SU_CNTL					0x0000e090
+#define A5XX_GRAS_SU_CNTL_CULL_FRONT				0x00000001
+#define A5XX_GRAS_SU_CNTL_CULL_BACK				0x00000002
 #define A5XX_GRAS_SU_CNTL_FRONT_CW				0x00000004
 #define A5XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK			0x000007f8
 #define A5XX_GRAS_SU_CNTL_LINEHALFWIDTH__SHIFT			3
@@ -2909,6 +2911,12 @@ static inline uint32_t A5XX_VPC_PACK_NUMNONPOSVAR(uint32_t val)
 {
 	return ((val) << A5XX_VPC_PACK_NUMNONPOSVAR__SHIFT) & A5XX_VPC_PACK_NUMNONPOSVAR__MASK;
 }
+#define A5XX_VPC_PACK_PSIZELOC__MASK				0x0000ff00
+#define A5XX_VPC_PACK_PSIZELOC__SHIFT				8
+static inline uint32_t A5XX_VPC_PACK_PSIZELOC(uint32_t val)
+{
+	return ((val) << A5XX_VPC_PACK_PSIZELOC__SHIFT) & A5XX_VPC_PACK_PSIZELOC__MASK;
+}

 #define REG_A5XX_VPC_FS_PRIMITIVEID_CNTL			0x0000e2a0

@@ -3049,19 +3057,15 @@ static inline uint32_t A5XX_VFD_DECODE_INSTR_IDX(uint32_t val)
 {
 	return ((val) << A5XX_VFD_DECODE_INSTR_IDX__SHIFT) & A5XX_VFD_DECODE_INSTR_IDX__MASK;
 }
+#define A5XX_VFD_DECODE_INSTR_INSTANCED				0x00020000
 #define A5XX_VFD_DECODE_INSTR_FORMAT__MASK			0x3ff00000
 #define A5XX_VFD_DECODE_INSTR_FORMAT__SHIFT			20
 static inline uint32_t A5XX_VFD_DECODE_INSTR_FORMAT(enum a5xx_vtx_fmt val)
 {
 	return ((val) << A5XX_VFD_DECODE_INSTR_FORMAT__SHIFT) & A5XX_VFD_DECODE_INSTR_FORMAT__MASK;
 }
-#define A5XX_VFD_DECODE_INSTR_SWAP__MASK			0xc0000000
-#define A5XX_VFD_DECODE_INSTR_SWAP__SHIFT			30
-static inline uint32_t A5XX_VFD_DECODE_INSTR_SWAP(enum a3xx_color_swap val)
-{
-	return ((val) << A5XX_VFD_DECODE_INSTR_SWAP__SHIFT) & A5XX_VFD_DECODE_INSTR_SWAP__MASK;
-}
-#define A5XX_VFD_DECODE_INSTR_INSTANCED				0x00020000
+#define A5XX_VFD_DECODE_INSTR_UNK30				0x40000000
+#define A5XX_VFD_DECODE_INSTR_FLOAT				0x80000000

 static inline uint32_t REG_A5XX_VFD_DECODE_STEP_RATE(uint32_t i0) { return 0x0000e48b + 0x2*i0; }

@@ -3167,6 +3171,12 @@ static inline uint32_t A5XX_SP_GS_CONTROL_REG_SHADEROBJOFFSET(uint32_t val)
 #define REG_A5XX_SP_FS_CONFIG_MAX_CONST				0x0000e58b

 #define REG_A5XX_SP_VS_CTRL_REG0				0x0000e590
+#define A5XX_SP_VS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_VS_CTRL_REG0_THREADSIZE__MASK;
+}
 #define A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
 #define A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
 static inline uint32_t A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
@@ -3259,6 +3269,12 @@ static inline uint32_t A5XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val)
 #define REG_A5XX_SP_VS_OBJ_START_HI				0x0000e5ad

 #define REG_A5XX_SP_FS_CTRL_REG0				0x0000e5c0
+#define A5XX_SP_FS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_FS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_FS_CTRL_REG0_THREADSIZE__MASK;
+}
 #define A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
 #define A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
 static inline uint32_t A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
@@ -3328,6 +3344,7 @@ static inline uint32_t A5XX_SP_FS_MRT_REG_COLOR_FORMAT(enum a5xx_color_fmt val)
 {
 	return ((val) << A5XX_SP_FS_MRT_REG_COLOR_FORMAT__SHIFT) & A5XX_SP_FS_MRT_REG_COLOR_FORMAT__MASK;
 }
+#define A5XX_SP_FS_MRT_REG_COLOR_SRGB				0x00000400

 #define REG_A5XX_UNKNOWN_E5DB					0x0000e5db

@@ -3381,6 +3398,12 @@ static inline uint32_t A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(enum a3xx_msaa_sample
 #define REG_A5XX_TPL1_TP_FS_ROTATION_CNTL			0x0000e764

 #define REG_A5XX_HLSQ_CONTROL_0_REG				0x0000e784
+#define A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK		0x00000001
+#define A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT) & A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK;
+}

 #define REG_A5XX_HLSQ_CONTROL_1_REG				0x0000e785
 #define A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD__MASK	0x0000003f
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
@@ -60,12 +60,6 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
 			info->restart_index : 0xffffffff);

-	/* points + psize -> spritelist: */
-	if (ctx->rasterizer->point_size_per_vertex &&
-			fd5_emit_get_vp(emit)->writes_psize &&
-			(info->mode == PIPE_PRIM_POINTS))
-		primtype = DI_PT_POINTLIST_PSIZE;
-
 	fd5_emit_render_cntl(ctx, false);
 	fd5_draw_emit(ctx->batch, ring, primtype,
 			emit->key.binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY,
@@ -214,35 +208,44 @@ fd5_clear(struct fd_context *ctx, unsigned buffers,
 			if (!(buffers & (PIPE_CLEAR_COLOR0 << i)))
 				continue;

+			enum pipe_format pfmt = pfb->cbufs[i]->format;
+
 			// XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP??
-			float f[4];
-			switch (fd5_pipe2swap(pfb->cbufs[i]->format)) {
+			union pipe_color_union swapped;
+			switch (fd5_pipe2swap(pfmt)) {
 			case WZYX:
-				f[0] = color->f[0];
-				f[1] = color->f[1];
-				f[2] = color->f[2];
-				f[3] = color->f[3];
+				swapped.ui[0] = color->ui[0];
+				swapped.ui[1] = color->ui[1];
+				swapped.ui[2] = color->ui[2];
+				swapped.ui[3] = color->ui[3];
 				break;
 			case WXYZ:
-				f[2] = color->f[0];
-				f[1] = color->f[1];
-				f[0] = color->f[2];
-				f[3] = color->f[3];
+				swapped.ui[2] = color->ui[0];
+				swapped.ui[1] = color->ui[1];
+				swapped.ui[0] = color->ui[2];
+				swapped.ui[3] = color->ui[3];
 				break;
 			case ZYXW:
-				f[3] = color->f[0];
-				f[0] = color->f[1];
-				f[1] = color->f[2];
-				f[2] = color->f[3];
+				swapped.ui[3] = color->ui[0];
+				swapped.ui[0] = color->ui[1];
+				swapped.ui[1] = color->ui[2];
+				swapped.ui[2] = color->ui[3];
 				break;
 			case XYZW:
-				f[3] = color->f[0];
-				f[2] = color->f[1];
-				f[1] = color->f[2];
-				f[0] = color->f[3];
+				swapped.ui[3] = color->ui[0];
+				swapped.ui[2] = color->ui[1];
+				swapped.ui[1] = color->ui[2];
+				swapped.ui[0] = color->ui[3];
 				break;
 			}
-			util_pack_color(f, pfb->cbufs[i]->format, &uc);
+
+			if (util_format_is_pure_uint(pfmt)) {
+				util_format_write_4ui(pfmt, swapped.ui, 0, &uc, 0, 0, 0, 1, 1);
+			} else if (util_format_is_pure_sint(pfmt)) {
+				util_format_write_4i(pfmt, swapped.i, 0, &uc, 0, 0, 0, 1, 1);
+			} else {
+				util_pack_color(swapped.f, pfmt, &uc);
+			}

 			OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1);
 			OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0 + i));
--- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
@@ -366,6 +366,7 @@ fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 			struct fd_resource *rsc = fd_resource(vb->buffer);
 			enum pipe_format pfmt = elem->src_format;
 			enum a5xx_vtx_fmt fmt = fd5_pipe2vtx(pfmt);
+			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t off = vb->buffer_offset + elem->src_offset;
 			uint32_t size = fd_bo_size(rsc->bo) - off;
 			debug_assert(fmt != ~0);
@@ -379,7 +380,8 @@ fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 			OUT_RING(ring, A5XX_VFD_DECODE_INSTR_IDX(j) |
 					A5XX_VFD_DECODE_INSTR_FORMAT(fmt) |
 					COND(elem->instance_divisor, A5XX_VFD_DECODE_INSTR_INSTANCED) |
-					0xc0000000);  // XXX
+					A5XX_VFD_DECODE_INSTR_UNK30 |
+					COND(!isint, A5XX_VFD_DECODE_INSTR_FLOAT));
 			OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */

 			OUT_PKT4(ring, REG_A5XX_VFD_DEST_CNTL(j), 1);
--- a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
@@ -109,7 +109,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		}

 		OUT_PKT4(ring, REG_A5XX_SP_FS_MRT_REG(i), 1);
-		OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format));
+		OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format) |
+				COND(srgb, A5XX_SP_FS_MRT_REG_COLOR_SRGB));

 		/* when we support UBWC, these would be the system memory
 		 * addr/pitch/etc:
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
@@ -336,10 +336,14 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	uint32_t pos_regid, psize_regid, color_regid[8];
 	uint32_t face_regid, coord_regid, zwcoord_regid;
 	uint32_t vcoord_regid, vertex_regid, instance_regid;
+	enum a3xx_threadsize fssz;
+	uint8_t psize_loc = ~0;
 	int i, j;

 	setup_stages(emit, s);

+	fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
+
 	pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
 	psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
 	vertex_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID);
@@ -364,7 +368,7 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
 	zwcoord_regid = s[FS].v->frag_coord ? regid(0,2) : regid(63,0);
-	vcoord_regid = (s[FS].v->total_in > 0) ? regid(0,0) : regid(63,0);
+	vcoord_regid = (s[FS].v->total_in > 0) ? s[FS].v->pos_regid : regid(63,0);

 	/* we could probably divide this up into things that need to be
 	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
@@ -472,8 +476,10 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	if (pos_regid != regid(63,0))
 		ir3_link_add(&l, pos_regid, 0xf, l.max_loc);

-	if (psize_regid != regid(63,0))
+	if (psize_regid != regid(63,0)) {
+		psize_loc = l.max_loc;
 		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
+	}

 	if ((s[VS].v->shader->stream_output.num_outputs > 0) &&
 			!emit->key.binning_pass) {
@@ -551,7 +557,8 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	}

 	OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 5);
-	OUT_RING(ring, 0x00000881);        /* XXX HLSQ_CONTROL_0 */
+	OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
+			0x00000880);               /* XXX HLSQ_CONTROL_0 */
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63));
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
 			0xfcfcfc00);               /* XXX */
@@ -564,7 +571,8 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 	OUT_PKT4(ring, REG_A5XX_SP_FS_CTRL_REG0, 1);
 	OUT_RING(ring, COND(s[FS].v->total_in > 0, A5XX_SP_FS_CTRL_REG0_VARYING) |
 			COND(s[FS].v->frag_coord, A5XX_SP_FS_CTRL_REG0_VARYING) |
-			0x4000e | /* XXX set pretty much everywhere */
+			0x40006 | /* XXX set pretty much everywhere */
+			A5XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
 			A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
 			A5XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
 			A5XX_SP_FS_CTRL_REG0_BRANCHSTACK(0x3) |  // XXX need to figure this out somehow..
@@ -692,7 +700,7 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)

 		OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1);
 		OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) |
-				(s[VS].v->writes_psize ? 0x0c00 : 0xff00)); // XXX
+				A5XX_VPC_PACK_PSIZELOC(psize_loc));

 		OUT_PKT4(ring, REG_A5XX_VPC_VARYING_INTERP_MODE(0), 8);
 		for (i = 0; i < 8; i++)
--- a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c
@@ -76,11 +76,11 @@ fd5_rasterizer_state_create(struct pipe_context *pctx,
 //	if (cso->fill_front != PIPE_POLYGON_MODE_FILL ||
 //		cso->fill_back != PIPE_POLYGON_MODE_FILL)
 //		so->pc_prim_vtx_cntl2 |= A5XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE;
-//
-//	if (cso->cull_face & PIPE_FACE_FRONT)
-//		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT;
-//	if (cso->cull_face & PIPE_FACE_BACK)
-//		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK;
+
+	if (cso->cull_face & PIPE_FACE_FRONT)
+		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT;
+	if (cso->cull_face & PIPE_FACE_BACK)
+		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK;
 	if (!cso->front_ccw)
 		so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_FRONT_CW;
 //	if (!cso->flatshade_first)
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -15,7 +15,7 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  23277 bytes, from 2016-12-24 05:01:47)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2016-11-26 23:01:08)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 110757 bytes, from 2016-12-26 17:51:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          (  99436 bytes, from 2017-01-10 16:36:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 100594 bytes, from 2017-01-20 23:03:30)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)

 Copyright (C) 2013-2016 by the following authors:
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -4185,41 +4185,63 @@ static int egcm_double_to_int(struct r600_shader_ctx *ctx)
 	return 0;
 }

+static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
+					unsigned op,
+					int dst_reg,
+					struct r600_shader_src *src,
+					bool abs)
+{
+	struct r600_bytecode_alu alu;
+	const int last_slot = 3;
+	int r;
+
+	/* these have to write the result to X/Y by the looks of it */
+	for (int i = 0 ; i < last_slot; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = op;
+
+		r600_bytecode_src(&alu.src[0], src, 1);
+		r600_bytecode_src(&alu.src[1], src, 0);
+
+		if (abs)
+			r600_bytecode_src_set_abs(&alu.src[1]);
+
+		alu.dst.sel = dst_reg;
+		alu.dst.chan = i;
+		alu.dst.write = (i == 0 || i == 1);
+
+		if (bc->chip_class != CAYMAN || i == last_slot - 1)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(bc, &alu);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	int i, r;
 	struct r600_bytecode_alu alu;
-	int last_slot = 3;
 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
 	int t1 = ctx->temp_reg;

-	/* these have to write the result to X/Y by the looks of it */
-	for (i = 0 ; i < last_slot; i++) {
-		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-		alu.op = ctx->inst_info->op;
+	/* there should be only one source register */
+	assert(inst->Instruction.NumSrcRegs == 1);

-		/* should only be one src regs */
-		assert (inst->Instruction.NumSrcRegs == 1);
+	/* only support one double at a time */
+	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
+	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

-		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
-		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
-
-		/* RSQ should take the absolute value of src */
-		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
-		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
-			r600_bytecode_src_set_abs(&alu.src[1]);
-		}
-		alu.dst.sel = t1;
-		alu.dst.chan = i;
-		alu.dst.write = (i == 0 || i == 1);
-
-		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
-			alu.last = 1;
-		r = r600_bytecode_add_alu(ctx->bc, &alu);
-		if (r)
-			return r;
-	}
+	r = cayman_emit_unary_double_raw(
+		ctx->bc, ctx->inst_info->op, t1,
+		&ctx->src[0],
+		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
+		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
+	if (r)
+		return r;

 	for (i = 0 ; i <= lasti; i++) {
 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
@@ -4326,25 +4348,27 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
 	int t1 = ctx->temp_reg;

-	for (k = 0; k < 2; k++) {
-		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
-			continue;
+	/* t1 would get overwritten below if we actually tried to
+	 * multiply two pairs of doubles at a time. */
+	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
+	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

-		for (i = 0; i < 4; i++) {
-			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-			alu.op = ctx->inst_info->op;
-			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
-			}
-			alu.dst.sel = t1;
-			alu.dst.chan = i;
-			alu.dst.write = 1;
-			if (i == 3)
-				alu.last = 1;
-			r = r600_bytecode_add_alu(ctx->bc, &alu);
-			if (r)
-				return r;
+	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
+
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ctx->inst_info->op;
+		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
+			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
 		}
+		alu.dst.sel = t1;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
 	}

 	for (i = 0; i <= lasti; i++) {
@@ -4366,6 +4390,63 @@ static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
 	return 0;
 }

+/*
+ * Emit RECIP_64 + MUL_64 to implement division.
+ */
+static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int r;
+	struct r600_bytecode_alu alu;
+	int t1 = ctx->temp_reg;
+	int k;
+
+	/* Only support one double at a time. This is the same constraint as
+	 * in DMUL lowering. */
+	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
+	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
+
+	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
+
+	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
+	if (r)
+		return r;
+
+	for (int i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP2_MUL_64;
+
+		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
+
+		alu.src[1].sel = t1;
+		alu.src[1].chan = (i == 3) ? 0 : 1;
+
+		alu.dst.sel = t1;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+
+	for (int i = 0; i < 2; i++) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP1_MOV;
+		alu.src[0].sel = t1;
+		alu.src[0].chan = i;
+		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
+		alu.dst.write = 1;
+		if (i == 1)
+			alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
 /*
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
@@ -9376,6 +9457,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
+	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
@@ -9598,6 +9680,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
+	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -320,14 +320,21 @@ static void si_sampler_view_add_buffer(struct si_context *sctx,
 	if (resource->target == PIPE_BUFFER)
 		return;

-	/* Now add separate DCC if it's present. */
+	/* Now add separate DCC or HTILE. */
 	rtex = (struct r600_texture*)resource;
-	if (!rtex->dcc_separate_buffer)
-		return;
+	if (rtex->dcc_separate_buffer) {
+		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
+						    rtex->dcc_separate_buffer, usage,
+						    RADEON_PRIO_DCC, check_mem);
+	}

-	radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
-					    rtex->dcc_separate_buffer, usage,
-					    RADEON_PRIO_DCC, check_mem);
+	if (rtex->htile_buffer &&
+	    rtex->tc_compatible_htile &&
+	    !is_stencil_sampler) {
+		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
+						    rtex->htile_buffer, usage,
+						    RADEON_PRIO_HTILE, check_mem);
+	}
 }

 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -717,8 +717,10 @@ static void si_update_poly_offset_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

-	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
+	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
+		si_pm4_bind_state(sctx, poly_offset, NULL);
 		return;
+	}

 	/* Use the user format, not db_render_format, so that the polygon
 	 * offset behaves as expected by applications.
@@ -1363,11 +1365,17 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 		case PIPE_FORMAT_Z16_UNORM:
 			return V_008F14_IMG_DATA_FORMAT_16;
 		case PIPE_FORMAT_X24S8_UINT:
+		case PIPE_FORMAT_S8X24_UINT:
+			/*
+			 * Implemented as an 8_8_8_8 data format to fix texture
+			 * gathers in stencil sampling. This affects at least
+			 * GL45-CTS.texture_cube_map_array.sampling on VI.
+			 */
+			return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
 		case PIPE_FORMAT_Z24X8_UNORM:
 		case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 			return V_008F14_IMG_DATA_FORMAT_8_24;
 		case PIPE_FORMAT_X8Z24_UNORM:
-		case PIPE_FORMAT_S8X24_UINT:
 		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 			return V_008F14_IMG_DATA_FORMAT_24_8;
 		case PIPE_FORMAT_S8_UINT:
@@ -2794,14 +2802,22 @@ si_make_texture_descriptor(struct si_screen *screen,
 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
 		const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
 		const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+		const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};

 		switch (pipe_format) {
 		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-		case PIPE_FORMAT_X24S8_UINT:
 		case PIPE_FORMAT_X32_S8X24_UINT:
 		case PIPE_FORMAT_X8Z24_UNORM:
 			util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
 			break;
+		case PIPE_FORMAT_X24S8_UINT:
+			/*
+			 * X24S8 is implemented as an 8_8_8_8 data format, to
+			 * fix texture gathers. This affects at least
+			 * GL45-CTS.texture_cube_map_array.sampling on VI.
+			 */
+			util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+			break;
 		default:
 			util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
 		}
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -850,11 +850,12 @@ void si_emit_cache_flush(struct si_context *sctx)
 	if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2 ||
 	    (rctx->chip_class <= CIK &&
 	     (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
-		/* Invalidate L1 & L2. (L1 is always invalidated)
+		/* Invalidate L1 & L2. (L1 is always invalidated on SI)
 		 * WB must be set on VI+ when TC_ACTION is set.
 		 */
 		si_emit_surface_sync(rctx, cp_coher_cntl |
 				     S_0085F0_TC_ACTION_ENA(1) |
+				     S_0085F0_TCL1_ACTION_ENA(1) |
 				     S_0301F0_TC_WB_ACTION_ENA(rctx->chip_class >= VI));
 		cp_coher_cntl = 0;
 		sctx->b.num_L2_invalidates++;
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -217,6 +217,15 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
        out_numThreadsPerProcGroup++;
    }

+    /* Prune empty numa nodes */
+    for (auto it = out_nodes.begin(); it != out_nodes.end(); ) {
+       if ((*it).cores.size() == 0)
+          it = out_nodes.erase(it);
+       else
+          ++it;
+    }
+
+    /* Prune empty core nodes */
    for (uint32_t node = 0; node < out_nodes.size(); node++) {
        auto& numaNode = out_nodes[node];
        auto it = numaNode.cores.begin();
--- a/src/gallium/drivers/swr/swr_query.cpp
+++ b/src/gallium/drivers/swr/swr_query.cpp
@@ -29,7 +29,7 @@
 #include "swr_query.h"
 #include "swr_screen.h"
 #include "swr_state.h"
-
+#include "common/os.h"

 static struct swr_query *
 swr_query(struct pipe_query *p)
@@ -45,7 +45,8 @@ swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
   assert(type < PIPE_QUERY_TYPES);
   assert(index < MAX_SO_STREAMS);

-   pq = CALLOC_STRUCT(swr_query);
+   pq = (struct swr_query *) AlignedMalloc(sizeof(struct swr_query), 64);
+   memset(pq, 0, sizeof(*pq));

   if (pq) {
      pq->type = type;
@@ -67,7 +68,7 @@ swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
      swr_fence_reference(pipe->screen, &pq->fence, NULL);
   }

-   FREE(pq);
+   AlignedFree(pq);
 }


--- a/src/gallium/drivers/swr/swr_query.h
+++ b/src/gallium/drivers/swr/swr_query.h
@@ -34,7 +34,7 @@ struct swr_query_result {
   uint64_t timestamp_end;
 };

-struct swr_query {
+OSALIGNLINE(struct) swr_query {
   unsigned type; /* PIPE_QUERY_* */
   unsigned index;

--- a/src/intel/blorp/blorp_blit.c
+++ b/src/intel/blorp/blorp_blit.c
@@ -26,6 +26,9 @@
 #include "blorp_priv.h"
 #include "brw_meta_util.h"

+/* header-only include needed for _mesa_unorm_to_float and friends. */
+#include "mesa/main/format_utils.h"
+
 #define FILE_DEBUG_FLAG DEBUG_BLORP

 static const bool split_blorp_blit_debug = false;
@@ -2204,6 +2207,75 @@ get_ccs_compatible_uint_format(const struct isl_format_layout *fmtl)
   }
 }

+/* Takes an isl_color_value and returns a color value that is the original
+ * color value only bit-casted to a UINT format.  This value, together with
+ * the format from get_ccs_compatible_uint_format, will yield the same bit
+ * value as the original color and format.
+ */
+static union isl_color_value
+bitcast_color_value_to_uint(union isl_color_value color,
+                            const struct isl_format_layout *fmtl)
+{
+   /* All CCS formats have the same number of bits in each channel */
+   const struct isl_channel_layout *chan = &fmtl->channels.r;
+
+   union isl_color_value bits;
+   switch (chan->type) {
+   case ISL_UINT:
+   case ISL_SINT:
+      /* Hardware will ignore the high bits so there's no need to cast */
+      bits = color;
+      break;
+
+   case ISL_UNORM:
+      for (unsigned i = 0; i < 4; i++)
+         bits.u32[i] = _mesa_float_to_unorm(color.f32[i], chan->bits);
+      break;
+
+   case ISL_SNORM:
+      for (unsigned i = 0; i < 4; i++)
+         bits.i32[i] = _mesa_float_to_snorm(color.f32[i], chan->bits);
+      break;
+
+   case ISL_SFLOAT:
+      switch (chan->bits) {
+      case 16:
+         for (unsigned i = 0; i < 4; i++)
+            bits.u32[i] = _mesa_float_to_half(color.f32[i]);
+         break;
+
+      case 32:
+         bits = color;
+         break;
+
+      default:
+         unreachable("Invalid float format size");
+      }
+      break;
+
+   default:
+      unreachable("Invalid channel type");
+   }
+
+   switch (fmtl->format) {
+   case ISL_FORMAT_B8G8R8A8_UNORM:
+   case ISL_FORMAT_B8G8R8A8_UNORM_SRGB:
+   case ISL_FORMAT_B8G8R8X8_UNORM:
+   case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: {
+      /* If it's a BGRA format, we need to swap blue and red */
+      uint32_t tmp = bits.u32[0];
+      bits.u32[0] = bits.u32[2];
+      bits.u32[2] = tmp;
+      break;
+   }
+
+   default:
+      break; /* Nothing to do */
+   }
+
+   return bits;
+}
+
 static void
 surf_convert_to_uncompressed(const struct isl_device *isl_dev,
                             struct brw_blorp_surface_info *info,
@@ -2320,6 +2392,16 @@ blorp_copy(struct blorp_batch *batch,
      params.src.view.format = get_copy_format_for_bpb(isl_dev, src_fmtl->bpb);
   }

+   if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) {
+      params.src.clear_color =
+         bitcast_color_value_to_uint(params.src.clear_color, src_fmtl);
+   }
+
+   if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) {
+      params.dst.clear_color =
+         bitcast_color_value_to_uint(params.dst.clear_color, dst_fmtl);
+   }
+
   wm_prog_key.src_bpc =
      isl_format_get_layout(params.src.view.format)->channels.r.bits;
   wm_prog_key.dst_bpc =
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -75,8 +75,11 @@ choose_isl_surf_usage(VkImageUsageFlags vk_usage,
      isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
   }

-   if (vk_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
-      /* blorp implements transfers by rendering into the destination image. */
+   if (vk_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT &&
+       aspect == VK_IMAGE_ASPECT_COLOR_BIT) {
+      /* blorp implements transfers by rendering into the destination image.
+       * Only request this with color images, as we deal with depth/stencil
+       * formats differently. */
      isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
   }

--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -908,6 +908,17 @@ do_single_blorp_clear(struct brw_context *brw, struct gl_framebuffer *fb,
      blorp_batch_finish(&batch);
   }

+   /*
+    * Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+    *
+    *  Any transition from any value in {Clear, Render, Resolve} to a
+    *  different value in {Clear, Render, Resolve} requires end of pipe
+    *  synchronization.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_CS_STALL);
+
   return true;
 }

@@ -975,6 +986,17 @@ brw_blorp_resolve_color(struct brw_context *brw, struct intel_mipmap_tree *mt,
                     brw_blorp_to_isl_format(brw, format, true),
                     resolve_op);
   blorp_batch_finish(&batch);
+
+   /*
+    * Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+    *
+    *  Any transition from any value in {Clear, Render, Resolve} to a
+    *  different value in {Clear, Render, Resolve} requires end of pipe
+    *  synchronization.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_CS_STALL);
 }

 static void
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -36,6 +36,7 @@

 #include "brw_context.h"
 #include "brw_blorp.h"
+#include "brw_defines.h"

 #define FILE_DEBUG_FLAG DEBUG_BLIT

@@ -174,14 +175,46 @@ brw_fast_clear_depth(struct gl_context *ctx)
      mt->depth_clear_value = depth_clear_value;
   }

-   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
-    *
-    *     "If other rendering operations have preceded this clear, a
-    *      PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
-    *      must be issued before the rectangle primitive used for the depth
-    *      buffer clear operation.
-    */
-   brw_emit_mi_flush(brw);
+   if (brw->gen == 6) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+       *
+       *   "If other rendering operations have preceded this clear, a
+       *    PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
+       *    must be issued before the rectangle primitive used for the depth
+       *    buffer clear operation.
+       */
+       brw_emit_pipe_control_flush(brw,
+                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
+   } else if (brw->gen >= 7) {
+      /*
+       * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+       *
+       *   If other rendering operations have preceded this clear, a
+       *   PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+       *   enabled must be issued before the rectangle primitive used for the
+       *   depth buffer clear operation.
+       *
+       * Same applies for Gen8 and Gen9.
+       *
+       * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1 PIPE_CONTROL,
+       * Depth Cache Flush Enable:
+       *
+       *   This bit must not be set when Depth Stall Enable bit is set in
+       *   this packet.
+       *
+       * This is confirmed to hold for real, HSW gets immediate gpu hangs.
+       *
+       * Therefore issue two pipe control flushes, one for cache flush and
+       * another for depth stall.
+       */
+       brw_emit_pipe_control_flush(brw,
+                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                   PIPE_CONTROL_CS_STALL);
+
+       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+   }

   if (fb->MaxNumLayers > 0) {
      for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
@@ -201,7 +234,12 @@ brw_fast_clear_depth(struct gl_context *ctx)
       *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
       *      followed by Depth FLUSH'
      */
-      brw_emit_mi_flush(brw);
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_STALL);
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
   }

   /* Now, the HiZ buffer contains data that needs to be resolved to the depth
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -511,6 +511,22 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
   OUT_BATCH(0);
   ADVANCE_BATCH();

+   /*
+    * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
+    *
+    *  Depth buffer clear pass using any of the methods (WM_STATE, 3DSTATE_WM
+    *  or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL command with
+    *  DEPTH_STALL bit and Depth FLUSH bits "set" before starting to render.
+    *  DepthStall and DepthFlush are not needed between consecutive depth
+    *  clear passes nor is it required if the depth clear pass was done with
+    *  "full_surf_clear" bit set in the 3DSTATE_WM_HZ_OP.
+    *
+    *  TODO: As the spec says, this could be conditional.
+    */
+   brw_emit_pipe_control_flush(brw, 
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               PIPE_CONTROL_DEPTH_STALL);
+
   /* Mark this buffer as needing a TC flush, as we've rendered to it. */
   brw_render_cache_set_add_bo(brw, mt->bo);

--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -25,6 +25,7 @@

 #include "intel_batchbuffer.h"
 #include "intel_mipmap_tree.h"
+#include "intel_fbo.h"

 #include "brw_context.h"
 #include "brw_state.h"
@@ -179,7 +180,9 @@ genX(blorp_exec)(struct blorp_batch *batch,
    * data with different formats, which blorp does for stencil and depth
    * data.
    */
-   brw_emit_mi_flush(brw);
+   if (params->src.enabled)
+      brw_render_cache_set_check_flush(brw, params->src.addr.buffer);
+   brw_render_cache_set_check_flush(brw, params->dst.addr.buffer);

   brw_select_pipeline(brw, BRW_RENDER_PIPELINE);

@@ -256,8 +259,6 @@ retry:
   brw->no_depth_or_stencil = false;
   brw->ib.type = -1;

-   /* Flush the sampler cache so any texturing from the destination is
-    * coherent.
-    */
-   brw_emit_mi_flush(brw);
+   if (params->dst.enabled)
+      brw_render_cache_set_add_bo(brw, params->dst.addr.buffer);
 }
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -1071,7 +1071,8 @@ _mesa_PopAttrib(void)
               if (ctx->Extensions.ARB_color_buffer_float)
                  _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
                                   color->ClampFragmentColor);
-               _mesa_ClampColor(GL_CLAMP_READ_COLOR_ARB, color->ClampReadColor);
+               if (ctx->Extensions.ARB_color_buffer_float || ctx->Version >= 30)
+                  _mesa_ClampColor(GL_CLAMP_READ_COLOR_ARB, color->ClampReadColor);

               /* GL_ARB_framebuffer_sRGB / GL_EXT_framebuffer_sRGB */
               if (ctx->Extensions.EXT_framebuffer_sRGB)
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1612,6 +1612,7 @@ _mesa_LinkProgram(GLuint programObj)
                                                           "glLinkProgram"));
 }

+#ifdef ENABLE_SHADER_CACHE
 /**
 * Generate a SHA-1 hash value string for given source string.
 */
@@ -1723,6 +1724,8 @@ read_shader(const gl_shader_stage stage, const char *source)
   return buffer;
 }

+#endif /* ENABLE_SHADER_CACHE */
+
 /**
 * Called via glShaderSource() and glShaderSourceARB() API functions.
 * Basically, concatenate the source code strings into one long string
@@ -1795,6 +1798,7 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
   source[totalLength - 1] = '\0';
   source[totalLength - 2] = '\0';

+#ifdef ENABLE_SHADER_CACHE
   /* Dump original shader source to MESA_SHADER_DUMP_PATH and replace
    * if corresponding entry found from MESA_SHADER_READ_PATH.
    */
@@ -1805,6 +1809,7 @@ _mesa_ShaderSource(GLuint shaderObj, GLsizei count,
      free(source);
      source = replacement;
   }
+#endif /* ENABLE_SHADER_CACHE */

   shader_source(sh, source);

--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -955,7 +955,7 @@ glsl_to_tgsi_visitor::get_opcode(unsigned op,
      case3fid(MUL, UMUL, DMUL);
      case3fid(MAD, UMAD, DMAD);
      case3fid(FMA, UMAD, DFMA);
-      case3(DIV, IDIV, UDIV);
+      case4d(DIV, IDIV, UDIV, DDIV);
      case4d(MAX, IMAX, UMAX, DMAX);
      case4d(MIN, IMIN, UMIN, DMIN);
      case2iu(MOD, UMOD);
@@ -1710,10 +1710,7 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
      emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
      break;
   case ir_binop_div:
-      if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
-         assert(!"not reached: should be handled by ir_div_to_mul_rcp");
-      else
-         emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
      break;
   case ir_binop_mod:
      if (result_dst.type == GLSL_TYPE_FLOAT)
@@ -6918,7 +6915,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)

      lower_instructions(ir,
                         MOD_TO_FLOOR |
-                         DIV_TO_MUL_RCP |
+                         FDIV_TO_MUL_RCP |
                         EXP_TO_EXP2 |
                         LOG_TO_LOG2 |
                         LDEXP_TO_ARITH |
--- a/src/util/disk_cache.c
+++ b/src/util/disk_cache.c
@@ -21,6 +21,8 @@
 * IN THE SOFTWARE.
 */

+#ifdef ENABLE_SHADER_CACHE
+
 #include <ctype.h>
 #include <string.h>
 #include <stdlib.h>
@@ -705,3 +707,5 @@ disk_cache_has_key(struct disk_cache *cache, cache_key key)

   return memcmp(entry, key, CACHE_KEY_SIZE) == 0;
 }
+
+#endif /* ENABLE_SHADER_CACHE */
--- a/src/util/disk_cache.h
+++ b/src/util/disk_cache.h
@@ -40,6 +40,8 @@ struct disk_cache;

 /* Provide inlined stub functions if the shader cache is disabled. */

+#ifdef ENABLE_SHADER_CACHE
+
 /**
 * Create a new cache object.
 *
@@ -129,6 +131,46 @@ disk_cache_put_key(struct disk_cache *cache, cache_key key);
 bool
 disk_cache_has_key(struct disk_cache *cache, cache_key key);

+#else
+
+static inline struct disk_cache *
+disk_cache_create(void)
+{
+   return NULL;
+}
+
+static inline void
+disk_cache_destroy(struct disk_cache *cache) {
+   return;
+}
+
+static inline void
+disk_cache_put(struct disk_cache *cache, cache_key key,
+          const void *data, size_t size)
+{
+   return;
+}
+
+static inline uint8_t *
+disk_cache_get(struct disk_cache *cache, cache_key key, size_t *size)
+{
+   return NULL;
+}
+
+static inline void
+disk_cache_put_key(struct disk_cache *cache, cache_key key)
+{
+   return;
+}
+
+static inline bool
+disk_cache_has_key(struct disk_cache *cache, cache_key key)
+{
+   return false;
+}
+
+#endif /* ENABLE_SHADER_CACHE */
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/util/sha1/sha1.h
+++ b/src/util/sha1/sha1.h
@@ -31,7 +31,6 @@ void SHA1Pad(SHA1_CTX *);
 void SHA1Transform(uint32_t [5], const uint8_t [SHA1_BLOCK_LENGTH]);
 void SHA1Update(SHA1_CTX *, const uint8_t *, size_t);
 void SHA1Final(uint8_t [SHA1_DIGEST_LENGTH], SHA1_CTX *);
-__END_DECLS

 #define HTONDIGEST(x) do {                                              \
        x[0] = htonl(x[0]);                                             \
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@@ -265,7 +265,8 @@ VkBool32 wsi_get_physical_device_xcb_presentation_support(
      return false;

   if (!wsi_conn->has_dri3) {
-      fprintf(stderr, "vulkan: No DRI3 support\n");
+      fprintf(stderr, "vulkan: No DRI3 support detected - required for presentation\n");
+      fprintf(stderr, "Note: Buggy applications may crash, if they do please report to vendor\n");
      return false;
   }

@@ -313,7 +314,8 @@ x11_surface_get_support(VkIcdSurfaceBase *icd_surface,
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   if (!wsi_conn->has_dri3) {
-      fprintf(stderr, "vulkan: No DRI3 support\n");
+      fprintf(stderr, "vulkan: No DRI3 support detected - required for presentation\n");
+      fprintf(stderr, "Note: Buggy applications may crash, if they do please report to vendor\n");
      *pSupported = false;
      return VK_SUCCESS;
   }
Author	SHA1	Message	Date
Emil Velikov	d283ec0a7b	Update version to 17.0.0-rc2 Signed-off-by: Emil Velikov <emil.velikov@collabora.com>	2017-01-25 13:24:27 +00:00
Topi Pohjolainen	9577977266	i965/blorp: Make post draw flush more explicit Blits do not need any special treatment as the target buffer object is added to render cache just as one does for normal draw. Color clears and resolves in turn require explicit "end of pipe synchronization". It is not clear what this means exactly but the assumption is that render cache flush with command stream stall should be sufficient. Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> (cherry picked from commit `180653c357`)	2017-01-24 17:18:18 +00:00
Topi Pohjolainen	8621961d43	i965/gen6: Issue direct depth stall and flush after depth clear instead of calling unconditionally brw_emit_mi_flush() which does: brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH \| PIPE_CONTROL_RENDER_TARGET_FLUSH \| PIPE_CONTROL_CS_STALL); brw_emit_pipe_control_flush(brw, PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE \| PIPE_CONTROL_CONST_CACHE_INVALIDATE); Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> (cherry picked from commit `46b346899d`)	2017-01-24 17:17:53 +00:00
Topi Pohjolainen	7d5a98f106	i965: Make depth clear flushing more explicit Current blorp logic issues unconditional "flush everything" (see brw_emit_mi_flush()) after each render. For example, all blits issue this unconditionally which shouldn't be needed if they set render cache properly so that subsequent renders do necessary flushing before drawing. In case of piglit: ext_framebuffer_multisample-accuracy all_samples depth_draw small intel_hiz_exec() is always preceded by blorp blit and the unconditional flush looks to hide the lack of stall and flushes in depth clears. By removing the brw_emit_mi_flush() I get gpu hangs. This patch adds the stalls and flushes mandated by the spec and gets rid of those hangs. v2 (Jason, Ken): Document the rationale for separating depth cache flush and stall on Gen7. Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> (cherry picked from commit `e6da6943fe`)	2017-01-24 17:17:22 +00:00
Topi Pohjolainen	4e6445caa9	i965/blorp: Use the render cache mechanism instead of explicit flushing by replacing brw_emit_mi_flush() with brw_render_cache_set_check_flush(). The latter splits the flush in two: brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH \| PIPE_CONTROL_RENDER_TARGET_FLUSH \| PIPE_CONTROL_CS_STALL); brw_emit_pipe_control_flush(brw, PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE \| PIPE_CONTROL_CONST_CACHE_INVALIDATE); instead of int flags = PIPE_CONTROL_NO_WRITE \| PIPE_CONTROL_RENDER_TARGET_FLUSH; if (brw->gen >= 6) { flags \|= PIPE_CONTROL_INSTRUCTION_INVALIDATE \| PIPE_CONTROL_CONST_CACHE_INVALIDATE \| PIPE_CONTROL_DEPTH_CACHE_FLUSH \| PIPE_CONTROL_VF_CACHE_INVALIDATE \| PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE \| PIPE_CONTROL_CS_STALL; } brw_emit_pipe_control_flush(brw, flags); v2 (Jason): Check that destination exists before trying to add to render cache. Depth clears and resolves don't have it. Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> (cherry picked from commit `4840a53e90`)	2017-01-24 17:17:02 +00:00
Marek Olšák	e405d0d3c6	radeonsi: always set the TCL1_ACTION_ENA when invalidating L2 Some CIK-VI docs say this is the default behavior on SI. That doesn't answer whether it's also the default behavior on CIK-VI. Cc: 17.0 13.0 <mesa-stable@lists.freedesktop.org> Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> (cherry picked from commit `573bf0940a`)	2017-01-24 02:28:55 +00:00
Grazvydas Ignotas	0c4b8c75e2	radv: don't resubmit the same cs over and over while tracing Fixes: `97dfff54` ("radv: Dump command buffer on hang.") Signed-off-by: Grazvydas Ignotas <notasas@gmail.com> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> CC: <mesa-stable@lists.freedesktop.org> (cherry picked from commit `f65b3641c3`)	2017-01-24 02:26:52 +00:00
George Kyriazis	e35cfa15cf	swr: Align query results allocation Some query results struct contents are declared as cache line aligned. Use aligned malloc, and align the whole struct, to be safe. Fixes crash when compiling with clang. CC: <mesa-stable@lists.freedesktop.org> Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com> (cherry picked from commit `00847e4f14`)	2017-01-24 02:24:38 +00:00
Bruce Cherniak	34f902e17e	swr: Prune empty nodes in CalculateProcessorTopology. CalculateProcessorTopology tries to figure out system topology by parsing /proc/cpuinfo to determine the number of threads, cores, and NUMA nodes. There are some architectures where the "physical id" begins with 1 rather than 0, which was creating an empty "0" node and causing a crash in CreateThreadPool. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97102 Reviewed-By: George Kyriazis <george.kyriazis@intel.com> CC: <mesa-stable@lists.freedesktop.org> (cherry picked from commit `b829206b07`)	2017-01-24 02:22:31 +00:00
Nicolai Hähnle	e4cf4690d1	st/glsl_to_tgsi: use DDIV instead of DRCP + DMUL Fixes GL45-CTS.gpu_shader_fp64.built_in_functions. v2: use DDIV unconditionally (Roland) Reviewed-by: Roland Scheidegger <sroland@vmware.com> (v1) Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1) Tested-by: Glenn Kennard <glenn.kennard@gmail.com> Tested-by: James Harvey <lothmordor@gmail.com> Cc: 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `cfabbbcfd7`)	2017-01-24 02:20:31 +00:00
Nicolai Hähnle	7f6c6b9101	glsl: split DIV_TO_MUL_RCP into single- and double-precision flags Reviewed-by: Marek Olšák <marek.olsak@amd.com> Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Tested-by: Glenn Kennard <glenn.kennard@gmail.com> Tested-by: James Harvey <lothmordor@gmail.com> Cc: 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `b71c415c3d`)	2017-01-24 02:18:17 +00:00
Nicolai Hähnle	23ead4c7b2	r600: implement DDIV Tested-by: Glenn Kennard <glenn.kennard@gmail.com> Tested-by: James Harvey <lothmordor@gmail.com> Cc: 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `e4f8f9a638`)	2017-01-24 02:16:04 +00:00
Nicolai Hähnle	7856dfdbab	r600: factor out cayman_emit_unary_double_raw We will use it for DDIV. Tested-by: Glenn Kennard <glenn.kennard@gmail.com> Tested-by: James Harvey <lothmordor@gmail.com> Cc: 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `488560cfe6`)	2017-01-24 02:13:45 +00:00
Nicolai Hähnle	41b93b1fe0	r600: double multiply can handle only one multiply at a time It seems clear that trying to multiply two pairs of doubles would result in the temporary register getting overwritten by the second pair. So make the code more explicit. Tested-by: Glenn Kennard <glenn.kennard@gmail.com> Tested-by: James Harvey <lothmordor@gmail.com> Cc: 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `76b02d2fe1`)	2017-01-24 02:11:28 +00:00
Rob Clark	8857256214	freedreno/a5xx: set frag shader threadsize Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `31daeb5bf1`)	2017-01-24 02:09:27 +00:00
Rob Clark	516b34908d	freedreno/a5xx: set fragcoordxy properly What a3xx docs call IJPERSPCENTERREGID.. the xy coord passed into bary.f. We were incorrectly setting both this and gl_FragCoord.xy to the same register resulting in all sorts of hilarity. Fixes stk, vdrift, 0ad, probably a bunch others. Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `8d6af93e76`)	2017-01-24 02:07:25 +00:00
Rob Clark	0645c0e0d4	freedreno/a5xx: fix psize Note spritelist (POINTLIST_PSIZE) seems not to be a thing anymore on a5xx. Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `6cc93bedc1`)	2017-01-24 02:05:21 +00:00
Rob Clark	6aade42111	freedreno/a5xx: srgb fix Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `141a4f86d6`)	2017-01-24 02:03:15 +00:00
Rob Clark	4dc6ed53c1	freedreno/a5xx: fix int vbos Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `69fbb458cf`)	2017-01-24 02:01:11 +00:00
Rob Clark	cfe14ab39c	freedreno/a5xx: fix clear for uint/sint formats Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `16671e9704`)	2017-01-24 01:59:17 +00:00
Rob Clark	250b1cad3b	freedreno/a5xx: fix cull state Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `4d9aa4f67d`)	2017-01-24 01:57:10 +00:00
Rob Clark	9318d81574	freedreno: update generated headers Signed-off-by: Rob Clark <robdclark@gmail.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `4c39458460`)	2017-01-24 01:54:34 +00:00
Jason Ekstrand	00cdbfe6ef	nir/search: Use the correct bit size for integer comparisons The previous code always compared integers as 64-bit. Due to variations in sign-extension in the code generated by nir_opt_algebraic.py, this meant that nir_search doesn't always do what you want. Instead, 32-bit values should be matched as 32-bit and 64-bit values should be matched as 64-bit. While we're here we unify the unsigned and signed paths. Now that we're using the right bit size, they should be the same since the only difference we had before was sign extension. This gets the UE4 bitfield_extract optimization working again. It had stopped working due to the constant 0xff00ff00 getting sign-extended when it shouldn't have. Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Reviewed-by: Eric Anholt <eric@anholt.net> Cc: "17.0 13.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `bb96b03461`)	2017-01-24 01:52:20 +00:00
Jason Ekstrand	83deab2f6a	intel/blorp/copy: Properly handle clear colors for CCS_E images In order to handle CCS_E, we stomp the image format to a UINT format and then do some bitcasting logic in the shader. This works fine since SKL render compression only considers the channel layout of the format and not the format itself. In order for this to work on images that have been fast-cleared, we need to also convert the clear color so that, when interpreted as UINT, it provides the same bit value as it would have in the original format. This fixes a bunch of OpenGL ES CTS tests for copy_image when we start using CCS more aggressively. Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `817f9e3b17`)	2017-01-24 01:15:47 +00:00
Andres Rodriguez	de2dfa1dc3	radv: fix include order for installed headers v2 In situations where libdrm_amdgpu and mesa are installed to the same location, the mesa installed headers will take precedence over the git source headers. This is due to the AMDGPU_CFLAGS containing the install directory. This situation can cause build errors if the git version of a header is newer than the currently installed version of a header (e.g. git pull updates vulkan.h) Note: using the same install prefix for mesa and libdrm is probably a common occurrence since it is described in the radeonBuildHowTo wiki: https://www.x.org/wiki/radeonBuildHowTo/ v2: added sign-off Signed-off-by: Andres Rodriguez <andresx7@gmail.com> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Reviewed-by: Emil Velikov <emil.velikov@collabora.com> (cherry picked from commit `a3ad6a34c6`)	2017-01-20 23:45:57 +00:00
Andres Rodriguez	5c2951c7f9	vulkan/wsi: clarify the severity of lack of DRI3 v2 The current message sounds like a small warning, clarify that it can result in lack of presentation support and application crashes. v2: add "if they do" (Bas) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98263 Signed-off-by: Andres Rodriguez <andresx7@gmail.com> Acked-by: Jason ekstrand <jason@jlekstrand.net> Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Reviewed-by: Emil Velikov <emil.velikov@collabora.com> (cherry picked from commit `e0674e740b`)	2017-01-20 23:45:50 +00:00
Lionel Landwerlin	e3bfa959a8	anv: don't require render target isl bit for depth/stencil surfaces Blorp can deal with depth/stencil surfaces blits/copies without the render target requirement. Also having both render target and depth/stencil requirement is incompatible from isl's point of view. This fixes an image creation issue in the high level quality settings of the Unity3D player, which requires a depth texture with src/dst transfer & 4x multisampling. v2: Simplify aspect checking condition (Jason) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Cc: 13.0 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `74c23bde5b`)	2017-01-20 23:31:40 +00:00
Lionel Landwerlin	a259b800df	spirv: don't assert with location decorations on non i/o variables Some applications might add location decoration to samplers. Rather than raising an error it seems it would make more sense to just discard these decorations. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Cc: 17.0 <mesa-stable@lists.freedesktop.org> (cherry picked from commit `8a28e764d0`)	2017-01-20 23:29:48 +00:00
Samuel Pitoiset	45f13c2be0	gallium/hud: add missing break in hud_cpufreq_graph_install() Fixes: `e99b9395be` "gallium/hud: Add support for CPU frequency monitoring" Cc: mesa-stable@lists.freedesktop.org Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com> (cherry picked from commit `383fc8e9f3`)	2017-01-20 23:27:54 +00:00
Marek Olšák	b72f8de873	radeonsi: don't forget to add HTILE to the buffer list for texturing This fixes VM faults. Discovered by Samuel Pitoiset. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98975 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99450 Cc: 17.0 13.0 <mesa-stable@lists.freedesktop.org> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net> (cherry picked from commit `e490b7812c`)	2017-01-20 23:26:03 +00:00
Nicolai Hähnle	1cc5774e5e	radeonsi: fix texture gather on stencil textures At least on VI, texture gather doesn't work with a 24_8 data format, so use 8_8_8_8 and a modified swizzle instead. A bit of background: When creating a GL_STENCIL_INDEX8 texture, we select the X24S8 pipe format because we don't support stencil-only render targets properly. With mip-mapping this can lead to a setup where the tiling is incompatible with stencil texturing, and a flushed stencil texture is used. For the flushed stencil, a literal X24S8 is used because there were issues with an 8bpp DB->CB copy. Longer term, it would be good if we could get away from these workarounds, i.e. properly support an S8 format for stencil-only rendering and flushed stencil. Since stencil texturing is somewhat rare, it's not a high priority. Fixes GL45-CTS.texture_cube_map_array.sampling. Cc: 17.0 <mesa-stable@lists.freedesktop.org> Reviewed-by: Marek Olšák <marek.olsak@amd.com> Acked-by: Edward O'Callaghan <funfunctor@folklore1984.net> (cherry picked from commit `3cd092c415`)	2017-01-20 23:24:17 +00:00
Zachary Michaels	50a607cf70	radeonsi: Always leave poly_offset in a valid state This commit makes si_update_poly_offset set poly_offset to NULL if uses_poly_offset is false. This way poly_offset either points into the currently queued rasterizer, or it is NULL. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99451 Cc: "13.0 17.0" <mesa-stable@lists.freedesktop.org> Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> (cherry picked from commit `d7d32b3bfe`)	2017-01-20 23:22:41 +00:00
Nicolai Hähnle	613154fc8f	mesa/main: fix meta caller of _mesa_ClampColor Since _mesa_ClampColor properly checks for support of the API function now, its meta callers need to check support as well. Fixes: `963311b71f` ("mesa/main: fix version/extension checks in _mesa_ClampColor") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99401 Tested-by: Mark Janes <mark.a.janes@intel.com> Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `a7c635ec65`)	2017-01-20 23:21:06 +00:00
Dave Airlie	ff81869f0d	gallivm: use #ifdef not #if for PIPE_ARCH_BIG_ENDIAN This fixes the build on ppc/s390. Reviewed-by: Roland Scheidegger <sroland@vmware.com> Cc: "17.0" <mesa-stable@lists.freedesktop.org> Signed-off-by: Dave Airlie <airlied@redhat.com> (cherry picked from commit `ef71b867ee`)	2017-01-20 23:19:04 +00:00
Emil Velikov	9cb066601c	Update version to 17.0.0-rc1 Signed-off-by: Emil Velikov <emil.velikov@collabora.com>	2017-01-18 20:12:04 +00:00
Emil Velikov	45297f7e4a	utils: really remove the __END_DECLS macro Fixes: `d1efa09d34` "util: import sha1 implementation from OpenBSD" Signed-off-by: Emil Velikov <emil.velikov@collabora.com> (cherry picked from commit `ea8b2624c8`)	2017-01-18 20:11:22 +00:00
Emil Velikov	acc7837799	utils: build sha1/disk cache only with Android/Autoconf Earlier commit imported a SHA1 implementation and relaxed the SHA1 and disk cache handling, breaking the Windows builds. Restrict things for now until we get to a proper fix. Fixes: `d1efa09d34` "util: import sha1 implementation from OpenBSD" Signed-off-by: Emil Velikov <emil.velikov@collabora.com> (cherry picked from commit `9f8dc3bf03`)	2017-01-18 20:11:20 +00:00