Update version to 17.2.0-rc5

Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
cherry-ignore: ignore storage offset fixes
2017-08-21 11:49:19 +01:00 · 2017-08-19 02:57:19 +01:00 · 2017-08-19 02:57:19 +01:00 · 2017-08-19 02:57:19 +01:00 · 2017-08-19 02:57:19 +01:00 · 2017-08-19 02:57:19 +01:00
54 changed files with 441 additions and 331 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -88,6 +88,10 @@ LOCAL_CFLAGS += \

 endif
 endif
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_CFLAGS_arm += -DUSE_ARM_ASM
+endif
+LOCAL_CFLAGS_arm64 += -DUSE_AARCH64_ASM

 ifneq ($(LOCAL_IS_HOST_MODULE),true)
 LOCAL_CFLAGS += -DHAVE_LIBDRM
--- a/2
+++ b/2
@@ -1 +1 @@
-17.2.0-rc3
+17.2.0-rc5
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,4 @@
+# fixes:  The commits are too invasive for stable. Instead the offending patches
+#         causing regressions have been reverted.
+365d34540f331df57780dddf8da87235be0a6bcb mesa: correctly calculate the storage offset for i915
+de0e62e1065e2d9172acf3ab7c70bba0160125c8 st/mesa: correctly calculate the storage offset
--- a/configure.ac
+++ b/configure.ac
@@ -773,6 +773,20 @@ if test "x$enable_asm" = xyes; then
            ;;
        esac
        ;;
+    aarch64)
+        case "$host_os" in
+        linux*)
+            asm_arch=aarch64
+            ;;
+        esac
+        ;;
+    arm)
+        case "$host_os" in
+        linux*)
+            asm_arch=arm
+            ;;
+        esac
+        ;;
    esac

    case "$asm_arch" in
@@ -792,6 +806,14 @@ if test "x$enable_asm" = xyes; then
        DEFINES="$DEFINES -DUSE_PPC64LE_ASM"
        AC_MSG_RESULT([yes, ppc64le])
        ;;
+    aarch64)
+        DEFINES="$DEFINES -DUSE_AARCH64_ASM"
+        AC_MSG_RESULT([yes, aarch64])
+        ;;
+    arm)
+        DEFINES="$DEFINES -DUSE_ARM_ASM"
+        AC_MSG_RESULT([yes, arm])
+        ;;
    *)
        AC_MSG_RESULT([no, platform not supported])
        ;;
@@ -2551,7 +2573,7 @@ if test -n "$with_gallium_drivers"; then
            if test "x$HAVE_SWR_AVX" != xyes -a \
                    "x$HAVE_SWR_AVX2" != xyes -a \
                    "x$HAVE_SWR_KNL" != xyes -a \
-                    "x$HAVE_SWR_SKX" != xyes -a; then
+                    "x$HAVE_SWR_SKX" != xyes; then
               AC_MSG_ERROR([swr enabled but no swr architectures selected])
            fi

@@ -2735,6 +2757,8 @@ AM_CONDITIONAL(HAVE_X86_ASM, test "x$asm_arch" = xx86 -o "x$asm_arch" = xx86_64)
 AM_CONDITIONAL(HAVE_X86_64_ASM, test "x$asm_arch" = xx86_64)
 AM_CONDITIONAL(HAVE_SPARC_ASM, test "x$asm_arch" = xsparc)
 AM_CONDITIONAL(HAVE_PPC64LE_ASM, test "x$asm_arch" = xppc64le)
+AM_CONDITIONAL(HAVE_AARCH64_ASM, test "x$asm_arch" = xaarch64)
+AM_CONDITIONAL(HAVE_ARM_ASM, test "x$asm_arch" = xarm)

 AC_SUBST([NINE_MAJOR], 1)
 AC_SUBST([NINE_MINOR], 0)
--- a/src/amd/common/ac_binary.c
+++ b/src/amd/common/ac_binary.c
@@ -109,7 +109,7 @@ static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
 	}
 }

-void ac_elf_read(const char *elf_data, unsigned elf_size,
+bool ac_elf_read(const char *elf_data, unsigned elf_size,
 		 struct ac_shader_binary *binary)
 {
 	char *elf_buffer;
@@ -118,6 +118,7 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
 	Elf_Data *symbols = NULL, *relocs = NULL;
 	size_t section_str_index;
 	unsigned symbol_sh_link = 0;
+	bool success = true;

 	/* One of the libelf implementations
 	 * (http://www.mr511.de/software/english.htm) requires calling
@@ -137,7 +138,8 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
 		GElf_Shdr section_header;
 		if (gelf_getshdr(section, &section_header) != &section_header) {
 			fprintf(stderr, "Failed to read ELF section header\n");
-			return;
+			success = false;
+			break;
 		}
 		name = elf_strptr(elf, section_str_index, section_header.sh_name);
 		if (!strcmp(name, ".text")) {
@@ -148,6 +150,11 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
 		} else if (!strcmp(name, ".AMDGPU.config")) {
 			section_data = elf_getdata(section, section_data);
 			binary->config_size = section_data->d_size;
+			if (!binary->config_size) {
+				fprintf(stderr, ".AMDGPU.config is empty!\n");
+				success = false;
+				break;
+			}
 			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
 			memcpy(binary->config, section_data->d_buf, binary->config_size);
 		} else if (!strcmp(name, ".AMDGPU.disasm")) {
@@ -186,6 +193,7 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
 		binary->global_symbol_count = 1;
 		binary->config_size_per_symbol = binary->config_size;
 	}
+	return success;
 }

 const unsigned char *ac_shader_binary_config_start(
--- a/src/amd/common/ac_binary.h
+++ b/src/amd/common/ac_binary.h
@@ -83,7 +83,7 @@ struct ac_shader_config {
 * Parse the elf binary stored in \p elf_data and create a
 * ac_shader_binary object.
 */
-void ac_elf_read(const char *elf_data, unsigned elf_size,
+bool ac_elf_read(const char *elf_data, unsigned elf_size,
 		 struct ac_shader_binary *binary);

 /**
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1315,7 +1315,6 @@ static LLVMValueRef emit_f2f16(struct nir_to_llvm_context *ctx,
 	src0 = to_float(&ctx->ac, src0);
 	result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, "");

-	/* TODO SI/CIK options here */
 	if (ctx->options->chip_class >= VI) {
 		LLVMValueRef args[2];
 		/* Check if the result is a denormal - and flush to 0 if so. */
@@ -1329,7 +1328,22 @@ static LLVMValueRef emit_f2f16(struct nir_to_llvm_context *ctx,

 	if (ctx->options->chip_class >= VI)
 		result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, result, "");
-
+	else {
+		/* for SI/CIK */
+		/* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
+		 * so compare the result and flush to 0 if it's smaller.
+		 */
+		LLVMValueRef temp, cond2;
+		temp = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
+					    ctx->f32, result);
+		cond = LLVMBuildFCmp(ctx->builder, LLVMRealUGT,
+				     LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""),
+				     temp, "");
+		cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
+				      temp, ctx->f32zero, "");
+		cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
+		result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, result, "");
+	}
 	return result;
 }

--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1007,6 +1007,8 @@ radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
 	}

 	radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
+	radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
+

 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
@@ -1043,7 +1045,6 @@ radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
 		radeon_emit(cmd_buffer->cs, ds->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
 		radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */

-		radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
 	}

 	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
@@ -2233,8 +2234,11 @@ VkResult radv_EndCommandBuffer(
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

-	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
+	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
+		if (cmd_buffer->device->physical_device->rad_info.chip_class == SI)
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
 		si_emit_cache_flush(cmd_buffer);
+	}

 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
 	    cmd_buffer->record_fail)
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3079,9 +3079,13 @@ radv_initialise_color_surface(struct radv_device *device,
 				    format != V_028C70_COLOR_24_8) |
 		S_028C70_NUMBER_TYPE(ntype) |
 		S_028C70_ENDIAN(endian);
-	if (iview->image->info.samples > 1)
-		if (iview->image->fmask.size)
-			cb->cb_color_info |= S_028C70_COMPRESSION(1);
+	if ((iview->image->info.samples > 1) && iview->image->fmask.size) {
+		cb->cb_color_info |= S_028C70_COMPRESSION(1);
+		if (device->physical_device->rad_info.chip_class == SI) {
+			unsigned fmask_bankh = util_logbase2(iview->image->fmask.bank_height);
+			cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
+		}
+	}

 	if (iview->image->cmask.size &&
 	    !(device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS))
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -205,7 +205,6 @@ si_set_mutable_tex_desc_fields(struct radv_device *device,
 {
 	uint64_t gpu_address = image->bo ? device->ws->buffer_get_va(image->bo) + image->offset : 0;
 	uint64_t va = gpu_address;
-	unsigned pitch = base_level_info->nblk_x * block_width;
 	enum chip_class chip_class = device->physical_device->rad_info.chip_class;
 	uint64_t meta_va = 0;
 	if (chip_class >= GFX9) {
@@ -221,9 +220,6 @@ si_set_mutable_tex_desc_fields(struct radv_device *device,
 		state[0] |= image->surface.u.legacy.tile_swizzle;
 	state[1] &= C_008F14_BASE_ADDRESS_HI;
 	state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
-	state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(image, base_level,
-							     is_stencil));
-	state[4] |= S_008F20_PITCH_GFX6(pitch - 1);

 	if (chip_class >= VI) {
 		state[6] &= C_008F28_COMPRESSION_EN;
@@ -559,10 +555,11 @@ radv_query_opaque_metadata(struct radv_device *device,
 	memcpy(&md->metadata[2], desc, sizeof(desc));

 	/* Dwords [10:..] contain the mipmap level offsets. */
-	for (i = 0; i <= image->info.levels - 1; i++)
-		md->metadata[10+i] = image->surface.u.legacy.level[i].offset >> 8;
-
-	md->size_metadata = (11 + image->info.levels - 1) * 4;
+	if (device->physical_device->rad_info.chip_class <= VI) {
+		for (i = 0; i <= image->info.levels - 1; i++)
+			md->metadata[10+i] = image->surface.u.legacy.level[i].offset >> 8;
+		md->size_metadata = (11 + image->info.levels - 1) * 4;
+	}
 }

 void
--- a/src/amd/vulkan/radv_meta.c
+++ b/src/amd/vulkan/radv_meta.c
@@ -477,48 +477,8 @@ radv_meta_build_nir_fs_noop(void)
 	return b.shader;
 }

-static nir_ssa_def *radv_meta_build_resolve_srgb_conversion(nir_builder *b,
-							    nir_ssa_def *input)
-{
-	nir_const_value v;
-	unsigned i;
-	v.u32[0] = 0x3b4d2e1c; // 0.00313080009
-
-	nir_ssa_def *cmp[3];
-	for (i = 0; i < 3; i++)
-		cmp[i] = nir_flt(b, nir_channel(b, input, i),
-				 nir_build_imm(b, 1, 32, v));
-
-	nir_ssa_def *ltvals[3];
-	v.f32[0] = 12.92;
-	for (i = 0; i < 3; i++)
-		ltvals[i] = nir_fmul(b, nir_channel(b, input, i),
-				     nir_build_imm(b, 1, 32, v));
-
-	nir_ssa_def *gtvals[3];
-
-	for (i = 0; i < 3; i++) {
-		v.f32[0] = 1.0/2.4;
-		gtvals[i] = nir_fpow(b, nir_channel(b, input, i),
-				     nir_build_imm(b, 1, 32, v));
-		v.f32[0] = 1.055;
-		gtvals[i] = nir_fmul(b, gtvals[i],
-				     nir_build_imm(b, 1, 32, v));
-		v.f32[0] = 0.055;
-		gtvals[i] = nir_fsub(b, gtvals[i],
-				     nir_build_imm(b, 1, 32, v));
-	}
-
-	nir_ssa_def *comp[4];
-	for (i = 0; i < 3; i++)
-		comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]);
-	comp[3] = nir_channels(b, input, 3);
-	return nir_vec(b, comp, 4);
-}
-
 void radv_meta_build_resolve_shader_core(nir_builder *b,
 					 bool is_integer,
-					 bool is_srgb,
 					 int samples,
 					 nir_variable *input_img,
 					 nir_variable *color,
@@ -596,10 +556,4 @@ void radv_meta_build_resolve_shader_core(nir_builder *b,

 	if (outer_if)
 		b->cursor = nir_after_cf_node(&outer_if->cf_node);
-
-	if (is_srgb) {
-		nir_ssa_def *newv = nir_load_var(b, color);
-		newv = radv_meta_build_resolve_srgb_conversion(b, newv);
-		nir_store_var(b, color, newv, 0xf);
-	}
 }
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -234,7 +234,6 @@ nir_shader *radv_meta_build_nir_fs_noop(void);

 void radv_meta_build_resolve_shader_core(nir_builder *b,
 					 bool is_integer,
-					 bool is_srgb,
 					 int samples,
 					 nir_variable *input_img,
 					 nir_variable *color,
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -979,7 +979,7 @@ emit_fast_color_clear(struct radv_cmd_buffer *cmd_buffer,
 	if (iview->image->info.levels > 1)
 		goto fail;

-	if (iview->image->surface.u.legacy.level[0].mode < RADEON_SURF_MODE_1D)
+	if (iview->image->surface.is_linear)
 		goto fail;
 	if (!radv_image_extent_compare(iview->image, &iview->extent))
 		goto fail;
--- a/src/amd/vulkan/radv_meta_decompress.c
+++ b/src/amd/vulkan/radv_meta_decompress.c
@@ -29,7 +29,9 @@
 #include "sid.h"

 static VkResult
-create_pass(struct radv_device *device)
+create_pass(struct radv_device *device,
+	    uint32_t samples,
+	    VkRenderPass *pass)
 {
 	VkResult result;
 	VkDevice device_h = radv_device_to_handle(device);
@@ -37,7 +39,7 @@ create_pass(struct radv_device *device)
 	VkAttachmentDescription attachment;

 	attachment.format = VK_FORMAT_D32_SFLOAT_S8_UINT;
-	attachment.samples = 1;
+	attachment.samples = samples;
 	attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
 	attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
 	attachment.initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
@@ -65,14 +67,18 @@ create_pass(struct radv_device *device)
 								.dependencyCount = 0,
 								   },
 				       alloc,
-				       &device->meta_state.depth_decomp.pass);
+				       pass);

 	return result;
 }

 static VkResult
 create_pipeline(struct radv_device *device,
-                VkShaderModule vs_module_h)
+                VkShaderModule vs_module_h,
+		uint32_t samples,
+		VkRenderPass pass,
+		VkPipeline *decompress_pipeline,
+		VkPipeline *resummarize_pipeline)
 {
 	VkResult result;
 	VkDevice device_h = radv_device_to_handle(device);
@@ -129,7 +135,7 @@ create_pipeline(struct radv_device *device,
 		},
 		.pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
 			.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
-			.rasterizationSamples = 1,
+			.rasterizationSamples = samples,
 			.sampleShadingEnable = false,
 			.pSampleMask = NULL,
 			.alphaToCoverageEnable = false,
@@ -156,7 +162,7 @@ create_pipeline(struct radv_device *device,
 				VK_DYNAMIC_STATE_SCISSOR,
 			},
 		},
-		.renderPass = device->meta_state.depth_decomp.pass,
+		.renderPass = pass,
 		.subpass = 0,
 	};

@@ -169,7 +175,7 @@ create_pipeline(struct radv_device *device,
 							.db_flush_stencil_inplace = true,
 					       },
 					       &device->meta_state.alloc,
-					       &device->meta_state.depth_decomp.decompress_pipeline);
+					       decompress_pipeline);
 	if (result != VK_SUCCESS)
 		goto cleanup;

@@ -183,7 +189,7 @@ create_pipeline(struct radv_device *device,
 							.db_resummarize = true,
 					       },
 					       &device->meta_state.alloc,
-					       &device->meta_state.depth_decomp.resummarize_pipeline);
+					       resummarize_pipeline);
 	if (result != VK_SUCCESS)
 		goto cleanup;

@@ -199,29 +205,31 @@ radv_device_finish_meta_depth_decomp_state(struct radv_device *device)
 {
 	struct radv_meta_state *state = &device->meta_state;
 	VkDevice device_h = radv_device_to_handle(device);
-	VkRenderPass pass_h = device->meta_state.depth_decomp.pass;
 	const VkAllocationCallbacks *alloc = &device->meta_state.alloc;

-	if (pass_h)
-		radv_DestroyRenderPass(device_h, pass_h,
-					     &device->meta_state.alloc);
-
-	VkPipeline pipeline_h = state->depth_decomp.decompress_pipeline;
-	if (pipeline_h) {
-		radv_DestroyPipeline(device_h, pipeline_h, alloc);
-	}
-	pipeline_h = state->depth_decomp.resummarize_pipeline;
-	if (pipeline_h) {
-		radv_DestroyPipeline(device_h, pipeline_h, alloc);
+	for (uint32_t i = 0; i < ARRAY_SIZE(state->depth_decomp); ++i) {
+		VkRenderPass pass_h = state->depth_decomp[i].pass;
+		if (pass_h) {
+			radv_DestroyRenderPass(device_h, pass_h, alloc);
+		}
+		VkPipeline pipeline_h = state->depth_decomp[i].decompress_pipeline;
+		if (pipeline_h) {
+			radv_DestroyPipeline(device_h, pipeline_h, alloc);
+		}
+		pipeline_h = state->depth_decomp[i].resummarize_pipeline;
+		if (pipeline_h) {
+			radv_DestroyPipeline(device_h, pipeline_h, alloc);
+		}
 	}
 }

 VkResult
 radv_device_init_meta_depth_decomp_state(struct radv_device *device)
 {
+	struct radv_meta_state *state = &device->meta_state;
 	VkResult res = VK_SUCCESS;

-	zero(device->meta_state.depth_decomp);
+	zero(state->depth_decomp);

 	struct radv_shader_module vs_module = { .nir = radv_meta_build_nir_vs_generate_vertices() };
 	if (!vs_module.nir) {
@@ -230,14 +238,22 @@ radv_device_init_meta_depth_decomp_state(struct radv_device *device)
 		goto fail;
 	}

-	res = create_pass(device);
-	if (res != VK_SUCCESS)
-		goto fail;
-
 	VkShaderModule vs_module_h = radv_shader_module_to_handle(&vs_module);
-	res = create_pipeline(device, vs_module_h);
-	if (res != VK_SUCCESS)
-		goto fail;
+
+	for (uint32_t i = 0; i < ARRAY_SIZE(state->depth_decomp); ++i) {
+		uint32_t samples = 1 << i;
+
+		res = create_pass(device, samples, &state->depth_decomp[i].pass);
+		if (res != VK_SUCCESS)
+			goto fail;
+
+		res = create_pipeline(device, vs_module_h, samples,
+				      state->depth_decomp[i].pass,
+				      &state->depth_decomp[i].decompress_pipeline,
+				      &state->depth_decomp[i].resummarize_pipeline);
+		if (res != VK_SUCCESS)
+			goto fail;
+	}

 	goto cleanup;

@@ -283,10 +299,15 @@ emit_depth_decomp(struct radv_cmd_buffer *cmd_buffer,
 }


+enum radv_depth_op {	/* operation selector for radv_process_depth_image_inplace() */
+	DEPTH_DECOMPRESS,	/* use the per-sample-count decompress_pipeline */
+	DEPTH_RESUMMARIZE,	/* use the per-sample-count resummarize_pipeline */
+};
+
 static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 					     struct radv_image *image,
 					     VkImageSubresourceRange *subresourceRange,
-					     VkPipeline pipeline_h)
+					     enum radv_depth_op op)
 {
 	struct radv_meta_saved_state saved_state;
 	struct radv_meta_saved_pass_state saved_pass_state;
@@ -296,6 +317,9 @@ static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 				     subresourceRange->baseMipLevel);
 	uint32_t height = radv_minify(image->info.height,
 				     subresourceRange->baseMipLevel);
+	uint32_t samples = image->info.samples;
+	uint32_t samples_log2 = ffs(samples) - 1;
+	struct radv_meta_state *meta_state = &cmd_buffer->device->meta_state;

 	if (!image->surface.htile_size)
 		return;
@@ -339,7 +363,7 @@ static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 		radv_CmdBeginRenderPass(cmd_buffer_h,
 					      &(VkRenderPassBeginInfo) {
 						      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-							      .renderPass = cmd_buffer->device->meta_state.depth_decomp.pass,
+							      .renderPass = meta_state->depth_decomp[samples_log2].pass,
 							      .framebuffer = fb_h,
 							      .renderArea = {
 							      .offset = {
@@ -356,6 +380,18 @@ static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 					   },
 					   VK_SUBPASS_CONTENTS_INLINE);

+		VkPipeline pipeline_h;
+		switch (op) {
+		case DEPTH_DECOMPRESS:
+			pipeline_h = meta_state->depth_decomp[samples_log2].decompress_pipeline;
+			break;
+		case DEPTH_RESUMMARIZE:
+			pipeline_h = meta_state->depth_decomp[samples_log2].resummarize_pipeline;
+			break;
+		default:
+			unreachable("unknown operation");
+		}
+
 		emit_depth_decomp(cmd_buffer, &(VkOffset2D){0, 0 }, &(VkExtent2D){width, height}, pipeline_h);
 		radv_CmdEndRenderPass(cmd_buffer_h);

@@ -371,8 +407,7 @@ void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 					 VkImageSubresourceRange *subresourceRange)
 {
 	assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
-	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange,
-					 cmd_buffer->device->meta_state.depth_decomp.decompress_pipeline);
+	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange, DEPTH_DECOMPRESS);
 }

 void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
@@ -380,6 +415,5 @@ void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 					 VkImageSubresourceRange *subresourceRange)
 {
 	assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
-	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange,
-					 cmd_buffer->device->meta_state.depth_decomp.resummarize_pipeline);
+	radv_process_depth_image_inplace(cmd_buffer, image, subresourceRange, DEPTH_RESUMMARIZE);
 }
--- a/src/amd/vulkan/radv_meta_resolve.c
+++ b/src/amd/vulkan/radv_meta_resolve.c
@@ -382,6 +382,11 @@ void radv_CmdResolveImage(
 	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);

 	assert(src_image->info.samples > 1);
+	if (src_image->info.samples <= 1) {
+		/* this causes GPU hangs if we get past here */
+		fprintf(stderr, "radv: Illegal resolve operation (src not multisampled), will hang GPU.");
+		return;
+	}
 	assert(dest_image->info.samples == 1);

 	if (src_image->info.samples >= 16) {
--- a/src/amd/vulkan/radv_meta_resolve_cs.c
+++ b/src/amd/vulkan/radv_meta_resolve_cs.c
@@ -31,6 +31,45 @@
 #include "sid.h"
 #include "vk_format.h"

+static nir_ssa_def *radv_meta_build_resolve_srgb_conversion(nir_builder *b,
+							    nir_ssa_def *input)
+{
+	nir_const_value v;
+	unsigned i;
+	v.u32[0] = 0x3b4d2e1c; // 0.00313080009 given as raw 32-bit float bits; the compare threshold below
+	/* Per RGB channel c this builds: c < threshold ? c * 12.92 : 1.055 * c^(1/2.4) - 0.055. */
+	nir_ssa_def *cmp[3];	/* per channel: c < threshold */
+	for (i = 0; i < 3; i++)
+		cmp[i] = nir_flt(b, nir_channel(b, input, i),
+				 nir_build_imm(b, 1, 32, v));
+
+	nir_ssa_def *ltvals[3];	/* value used when the compare is true: c * 12.92 */
+	v.f32[0] = 12.92;
+	for (i = 0; i < 3; i++)
+		ltvals[i] = nir_fmul(b, nir_channel(b, input, i),
+				     nir_build_imm(b, 1, 32, v));
+
+	nir_ssa_def *gtvals[3];	/* value used otherwise: 1.055 * c^(1/2.4) - 0.055 */
+
+	for (i = 0; i < 3; i++) {
+		v.f32[0] = 1.0/2.4;	/* v is reused as the immediate for each step */
+		gtvals[i] = nir_fpow(b, nir_channel(b, input, i),
+				     nir_build_imm(b, 1, 32, v));
+		v.f32[0] = 1.055;
+		gtvals[i] = nir_fmul(b, gtvals[i],
+				     nir_build_imm(b, 1, 32, v));
+		v.f32[0] = 0.055;
+		gtvals[i] = nir_fsub(b, gtvals[i],
+				     nir_build_imm(b, 1, 32, v));
+	}
+
+	nir_ssa_def *comp[4];	/* output channels: three selected values plus channel 3 */
+	for (i = 0; i < 3; i++)
+		comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]);
+	comp[3] = nir_channels(b, input, 1 << 3);	/* mask bit 3 only: channel 3 is taken from input unconverted */
+	return nir_vec(b, comp, 4);
+}
+
 static nir_shader *
 build_resolve_compute_shader(struct radv_device *dev, bool is_integer, bool is_srgb, int samples)
 {
@@ -88,10 +127,13 @@ build_resolve_compute_shader(struct radv_device *dev, bool is_integer, bool is_s
 	nir_ssa_def *img_coord = nir_channels(&b, nir_iadd(&b, global_id, &src_offset->dest.ssa), 0x3);
 	nir_variable *color = nir_local_variable_create(b.impl, glsl_vec4_type(), "color");

-	radv_meta_build_resolve_shader_core(&b, is_integer, is_srgb, samples,
-					    input_img, color, img_coord);
+	radv_meta_build_resolve_shader_core(&b, is_integer, samples, input_img,
+	                                    color, img_coord);

 	nir_ssa_def *outval = nir_load_var(&b, color);
+	if (is_srgb)
+		outval = radv_meta_build_resolve_srgb_conversion(&b, outval);
+
 	nir_ssa_def *coord = nir_iadd(&b, global_id, &dst_offset->dest.ssa);
 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_store);
 	store->src[0] = nir_src_for_ssa(coord);
@@ -402,7 +444,7 @@ void radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer,
 						     .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
 							     .image = radv_image_to_handle(dest_image),
 							     .viewType = radv_meta_get_view_type(dest_image),
-							     .format = dest_image->vk_format,
+							     .format = vk_to_non_srgb_format(dest_image->vk_format),
 							     .subresourceRange = {
 							     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
 							     .baseMipLevel = region->dstSubresource.mipLevel,
--- a/src/amd/vulkan/radv_meta_resolve_fs.c
+++ b/src/amd/vulkan/radv_meta_resolve_fs.c
@@ -51,7 +51,7 @@ build_nir_vertex_shader(void)
 }

 static nir_shader *
-build_resolve_fragment_shader(struct radv_device *dev, bool is_integer, bool is_srgb, int samples)
+build_resolve_fragment_shader(struct radv_device *dev, bool is_integer, int samples)
 {
 	nir_builder b;
 	char name[64];
@@ -62,7 +62,7 @@ build_resolve_fragment_shader(struct radv_device *dev, bool is_integer, bool is_
 								 false,
 								 GLSL_TYPE_FLOAT);

-	snprintf(name, 64, "meta_resolve_fs-%d-%s", samples, is_integer ? "int" : (is_srgb ? "srgb" : "float"));
+	snprintf(name, 64, "meta_resolve_fs-%d-%s", samples, is_integer ? "int" : "float");
 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
 	b.shader->info.name = ralloc_strdup(b.shader, name);

@@ -92,8 +92,8 @@ build_resolve_fragment_shader(struct radv_device *dev, bool is_integer, bool is_
 	nir_ssa_def *img_coord = nir_channels(&b, nir_iadd(&b, pos_int, &src_offset->dest.ssa), 0x3);
 	nir_variable *color = nir_local_variable_create(b.impl, glsl_vec4_type(), "color");

-	radv_meta_build_resolve_shader_core(&b, is_integer, is_srgb,samples,
-					    input_img, color, img_coord);
+	radv_meta_build_resolve_shader_core(&b, is_integer, samples, input_img,
+	                                    color, img_coord);

 	nir_ssa_def *outval = nir_load_var(&b, color);
 	nir_store_var(&b, color_out, outval, 0xf);
@@ -177,31 +177,25 @@ create_resolve_pipeline(struct radv_device *device,
 			VkFormat format)
 {
 	VkResult result;
-	bool is_integer = false, is_srgb = false;
+	bool is_integer = false;
 	uint32_t samples = 1 << samples_log2;
 	unsigned fs_key = radv_format_meta_fs_key(format);
 	const VkPipelineVertexInputStateCreateInfo *vi_create_info;
 	vi_create_info = &normal_vi_create_info;
 	if (vk_format_is_int(format))
 		is_integer = true;
-	else if (vk_format_is_srgb(format))
-		is_srgb = true;

 	struct radv_shader_module fs = { .nir = NULL };
-	fs.nir = build_resolve_fragment_shader(device, is_integer, is_srgb, samples);
+	fs.nir = build_resolve_fragment_shader(device, is_integer, samples);
 	struct radv_shader_module vs = {
 		.nir = build_nir_vertex_shader(),
 	};

-	VkRenderPass *rp = is_srgb ?
-		&device->meta_state.resolve_fragment.rc[samples_log2].srgb_render_pass :
-		&device->meta_state.resolve_fragment.rc[samples_log2].render_pass[fs_key];
+	VkRenderPass *rp = &device->meta_state.resolve_fragment.rc[samples_log2].render_pass[fs_key];

 	assert(!*rp);

-	VkPipeline *pipeline = is_srgb ?
-		&device->meta_state.resolve_fragment.rc[samples_log2].srgb_pipeline :
-		&device->meta_state.resolve_fragment.rc[samples_log2].pipeline[fs_key];
+	VkPipeline *pipeline = &device->meta_state.resolve_fragment.rc[samples_log2].pipeline[fs_key];
 	assert(!*pipeline);

 	VkPipelineShaderStageCreateInfo pipeline_shader_stages[] = {
@@ -350,8 +344,6 @@ radv_device_init_meta_resolve_fragment_state(struct radv_device *device)
 		for (unsigned j = 0; j < ARRAY_SIZE(pipeline_formats); ++j) {
 			res = create_resolve_pipeline(device, i, pipeline_formats[j]);
 		}
-
-		res = create_resolve_pipeline(device, i, VK_FORMAT_R8G8B8A8_SRGB);
 	}

 	return res;
@@ -370,12 +362,6 @@ radv_device_finish_meta_resolve_fragment_state(struct radv_device *device)
 					     state->resolve_fragment.rc[i].pipeline[j],
 					     &state->alloc);
 		}
-		radv_DestroyRenderPass(radv_device_to_handle(device),
-				       state->resolve_fragment.rc[i].srgb_render_pass,
-					       &state->alloc);
-		radv_DestroyPipeline(radv_device_to_handle(device),
-				     state->resolve_fragment.rc[i].srgb_pipeline,
-				     &state->alloc);
 	}

 	radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
@@ -432,9 +418,7 @@ emit_resolve(struct radv_cmd_buffer *cmd_buffer,
 			      push_constants);

 	unsigned fs_key = radv_format_meta_fs_key(dest_iview->vk_format);
-	VkPipeline pipeline_h = vk_format_is_srgb(dest_iview->vk_format) ?
-		device->meta_state.resolve_fragment.rc[samples_log2].srgb_pipeline :
-		device->meta_state.resolve_fragment.rc[samples_log2].pipeline[fs_key];
+	VkPipeline pipeline_h = device->meta_state.resolve_fragment.rc[samples_log2].pipeline[fs_key];

 	radv_CmdBindPipeline(cmd_buffer_h, VK_PIPELINE_BIND_POINT_GRAPHICS,
 			     pipeline_h);
@@ -485,9 +469,7 @@ void radv_meta_resolve_fragment_image(struct radv_cmd_buffer *cmd_buffer,
 		radv_fast_clear_flush_image_inplace(cmd_buffer, src_image, &range);
 	}

-	rp = vk_format_is_srgb(dest_image->vk_format) ?
-		device->meta_state.resolve_fragment.rc[samples_log2].srgb_render_pass :
-		device->meta_state.resolve_fragment.rc[samples_log2].render_pass[fs_key];
+	rp = device->meta_state.resolve_fragment.rc[samples_log2].render_pass[fs_key];
 	radv_meta_save_graphics_reset_vport_scissor_novertex(&saved_state, cmd_buffer);

 	for (uint32_t r = 0; r < region_count; ++r) {
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -433,8 +433,6 @@ struct radv_meta_state {
 		VkPipelineLayout                          p_layout;

 		struct {
-			VkRenderPass srgb_render_pass;
-			VkPipeline   srgb_pipeline;
 			VkRenderPass render_pass[NUM_META_FS_KEYS];
 			VkPipeline   pipeline[NUM_META_FS_KEYS];
 		} rc[MAX_SAMPLES_LOG2];
@@ -444,7 +442,7 @@ struct radv_meta_state {
 		VkPipeline                                decompress_pipeline;
 		VkPipeline                                resummarize_pipeline;
 		VkRenderPass                              pass;
-	} depth_decomp;
+	} depth_decomp[1 + MAX_SAMPLES_LOG2];

 	struct {
 		VkPipeline                                cmask_eliminate_pipeline;
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1133,15 +1133,27 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 void
 si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
 {
-	uint32_t val = 0;
+	/* Emit a SET_PREDICATION packet for the predicate stored at va.
+	 * A zero va must produce an all-zero op dword, exactly as the code
+	 * did before the GFX9 packet layout was added.
+	 */
+	uint32_t op = 0;

-	if (va)
-		val = (((va >> 32) & 0xff) |
-		       PRED_OP(PREDICATION_OP_BOOL64)|
-		       PREDICATION_DRAW_VISIBLE);
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, val);
+	if (va)
+		op = PRED_OP(PREDICATION_OP_BOOL64) | PREDICATION_DRAW_VISIBLE;
+
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		/* GFX9 layout: op dword first, then va and va >> 32. */
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+		radeon_emit(cmd_buffer->cs, op);
+		radeon_emit(cmd_buffer->cs, va);
+		radeon_emit(cmd_buffer->cs, va >> 32);
+	} else {
+		/* Older layout: va first, then op merged with va bits 39:32. */
+		radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+		radeon_emit(cmd_buffer->cs, va);
+		radeon_emit(cmd_buffer->cs, op | ((va >> 32) & 0xFF));
+	}
 }

 /* Set this if you want the 3D engine to wait until CP DMA is done.
--- a/src/amd/vulkan/vk_format.h
+++ b/src/amd/vulkan/vk_format.h
@@ -465,4 +465,43 @@ vk_format_get_component_bits(VkFormat format,
 	}
 }

+/**
+ * Map each of the 8-bit sRGB formats listed below to the UNORM format with
+ * the same channel layout; any other format is returned unchanged.
+ */
+static inline VkFormat
+vk_to_non_srgb_format(VkFormat format)
+{
+	VkFormat unorm = format;
+
+	switch (format) {
+	case VK_FORMAT_R8_SRGB:
+		unorm = VK_FORMAT_R8_UNORM;
+		break;
+	case VK_FORMAT_R8G8_SRGB:
+		unorm = VK_FORMAT_R8G8_UNORM;
+		break;
+	case VK_FORMAT_R8G8B8_SRGB:
+		unorm = VK_FORMAT_R8G8B8_UNORM;
+		break;
+	case VK_FORMAT_B8G8R8_SRGB:
+		unorm = VK_FORMAT_B8G8R8_UNORM;
+		break;
+	case VK_FORMAT_R8G8B8A8_SRGB:
+		unorm = VK_FORMAT_R8G8B8A8_UNORM;
+		break;
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		unorm = VK_FORMAT_B8G8R8A8_UNORM;
+		break;
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		unorm = VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+		break;
+	default:
+		/* Not one of the formats above: keep the input. */
+		break;
+	}
+
+	return unorm;
+}
+
 #endif /* VK_FORMAT_H */
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -46,6 +46,11 @@ do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
 	if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
 		return false;

+	if (ws->info.chip_class >= GFX9) {
+		fprintf(stderr, "radv: VEGA support not completed.\n");
+		return false;
+	}
+
 	/* LLVM 5.0 is required for GFX9. */
 	if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
 		fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n",
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -4495,7 +4495,7 @@ process_initializer(ir_variable *var, ast_declaration *decl,
      } else {
         if (var->type->is_numeric()) {
            /* Reduce cascading errors. */
-            var->constant_value = type->qualifier.flags.q.constant
+            rhs = var->constant_value = type->qualifier.flags.q.constant
               ? ir_constant::zero(state, var->type) : NULL;
         }
      }
--- a/src/compiler/glsl/ir_constant_expression.cpp
+++ b/src/compiler/glsl/ir_constant_expression.cpp
@@ -725,6 +725,8 @@ ir_swizzle::constant_expression_value(struct hash_table *variable_context)
         case GLSL_TYPE_FLOAT: data.f[i] = v->value.f[swiz_idx[i]]; break;
         case GLSL_TYPE_BOOL:  data.b[i] = v->value.b[swiz_idx[i]]; break;
         case GLSL_TYPE_DOUBLE:data.d[i] = v->value.d[swiz_idx[i]]; break;
+         case GLSL_TYPE_UINT64:data.u64[i] = v->value.u64[swiz_idx[i]]; break;
+         case GLSL_TYPE_INT64: data.i64[i] = v->value.i64[swiz_idx[i]]; break;
         default:              assert(!"Should not get here."); break;
         }
      }
--- a/src/compiler/glsl/opt_constant_propagation.cpp
+++ b/src/compiler/glsl/opt_constant_propagation.cpp
@@ -237,6 +237,12 @@ ir_constant_propagation_visitor::constant_propagation(ir_rvalue **rvalue) {
      case GLSL_TYPE_BOOL:
 	 data.b[i] = found->constant->value.b[rhs_channel];
 	 break;
+      case GLSL_TYPE_UINT64:
+	 data.u64[i] = found->constant->value.u64[rhs_channel];
+	 break;
+      case GLSL_TYPE_INT64:
+	 data.i64[i] = found->constant->value.i64[rhs_channel];
+	 break;
      default:
 	 assert(!"not reached");
 	 break;
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -646,6 +646,7 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
       error != NULL || xfixes_query->major_version < 2) {
      _eglLog(_EGL_WARNING, "DRI2: failed to query xfixes version");
      free(error);
+      free(xfixes_query);
      return EGL_FALSE;
   }
   free(xfixes_query);
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -923,7 +923,7 @@ static void *
 _fixupNativeWindow(_EGLDisplay *disp, void *native_window)
 {
 #ifdef HAVE_X11_PLATFORM
-   if (disp->Platform == _EGL_PLATFORM_X11 && native_window != NULL) {
+   if (disp && disp->Platform == _EGL_PLATFORM_X11 && native_window != NULL) {
      /* The `native_window` parameter for the X11 platform differs between
       * eglCreateWindowSurface() and eglCreatePlatformPixmapSurfaceEXT(). In
       * eglCreateWindowSurface(), the type of `native_window` is an Xlib
@@ -985,7 +985,7 @@ _fixupNativePixmap(_EGLDisplay *disp, void *native_pixmap)
       * `Pixmap*`.  Convert `Pixmap*` to `Pixmap` because that's what
       * dri2_x11_create_pixmap_surface() expects.
       */
-   if (disp->Platform == _EGL_PLATFORM_X11 && native_pixmap != NULL)
+   if (disp && disp->Platform == _EGL_PLATFORM_X11 && native_pixmap != NULL)
      return (void *)(* (Pixmap*) native_pixmap);
 #endif
   return native_pixmap;
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -69,10 +69,17 @@ os_time_get_nano(void)

   static LARGE_INTEGER frequency;
   LARGE_INTEGER counter;
+   int64_t secs, nanosecs;
   if(!frequency.QuadPart)
      QueryPerformanceFrequency(&frequency);
   QueryPerformanceCounter(&counter);
-   return counter.QuadPart*INT64_C(1000000000)/frequency.QuadPart;
+   /* Compute seconds and nanoseconds parts separately to
+    * reduce severity of precision loss.
+    */
+   secs = counter.QuadPart / frequency.QuadPart;
+   nanosecs = (counter.QuadPart % frequency.QuadPart) * INT64_C(1000000000)
+      / frequency.QuadPart;
+   return secs*INT64_C(1000000000) + nanosecs;

 #else

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2006,6 +2006,7 @@ CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
 void
 CodeEmitterNVC0::emitMOV(const Instruction *i)
 {
+   assert(!i->saturate);
   if (i->def(0).getFile() == FILE_PREDICATE) {
      if (i->src(0).getFile() == FILE_GPR) {
         code[0] = 0xfc01c003;
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -305,6 +305,8 @@ unsigned int Instruction::srcMask(unsigned int s) const
   case TGSI_OPCODE_TXD:
   case TGSI_OPCODE_TXL:
   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXF:
+   case TGSI_OPCODE_TG4:
   case TGSI_OPCODE_TEX_LZ:
   case TGSI_OPCODE_TXF_LZ:
   case TGSI_OPCODE_LODQ:
@@ -343,6 +345,8 @@ unsigned int Instruction::srcMask(unsigned int s) const
      }
   }
      return mask;
+   case TGSI_OPCODE_TXQ:
+      return 1;
   case TGSI_OPCODE_XPD:
   {
      unsigned int x = 0;
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -727,7 +727,9 @@ ConstantFolding::expr(Instruction *i,
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
-      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+      i->op = i->saturate ? OP_SAT : OP_MOV;
+      if (i->saturate)
+         unary(i, *i->getSrc(0)->asImm());
      break;
   }
   i->subOp = 0;
@@ -1509,6 +1511,17 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
   default:
      return;
   }
+
+   // This can get left behind some of the optimizations which simplify
+   // saturatable values.
+   if (newi->op == OP_MOV && newi->saturate) {
+      ImmediateValue tmp;
+      newi->saturate = 0;
+      newi->op = OP_SAT;
+      if (newi->src(0).getImmediate(tmp))
+         unary(newi, tmp);
+   }
+
   if (newi->op != op)
      foldCount++;
 }
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -771,6 +771,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+." },
 	{ "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." },
 	{ "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" },
+	{ "ce", DBG_CE, "Force enable the constant engine" },
 	{ "noce", DBG_NO_CE, "Disable the constant engine"},
 	{ "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" },
 	{ "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" },
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -65,12 +65,12 @@
 #define R600_PRIM_RECTANGLE_LIST	PIPE_PRIM_MAX

 /* Debug flags. */
-/* logging */
+/* logging and features */
 #define DBG_TEX			(1 << 0)
 /* gap - reuse */
 #define DBG_COMPUTE		(1 << 2)
 #define DBG_VM			(1 << 3)
-/* gap - reuse */
+#define DBG_CE			(1 << 4)
 /* shader logging */
 #define DBG_FS			(1 << 5)
 #define DBG_VS			(1 << 6)
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -198,12 +198,24 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
 				       si_context_gfx_flush, sctx);

-	/* SI + AMDGPU + CE = GPU hang */
-	if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib &&
-	    sscreen->b.chip_class != SI &&
-	    /* These can't use CE due to a power gating bug in the kernel. */
-	    sscreen->b.family != CHIP_CARRIZO &&
-	    sscreen->b.family != CHIP_STONEY) {
+	bool enable_ce = sscreen->b.chip_class != SI && /* SI hangs */
+			 /* These can't use CE due to a power gating bug in the kernel. */
+			 sscreen->b.family != CHIP_CARRIZO &&
+			 sscreen->b.family != CHIP_STONEY;
+
+	/* CE is currently disabled by default, because it makes s_load latency
+	 * worse, because CE IB doesn't run in lockstep with DE.
+	 * Remove this line after that performance issue has been resolved.
+	 */
+	enable_ce = false;
+
+	/* Apply CE overrides. */
+	if (sscreen->b.debug_flags & DBG_NO_CE)
+		enable_ce = false;
+	else if (sscreen->b.debug_flags & DBG_CE)
+		enable_ce = true;
+
+	if (ws->cs_add_const_ib && enable_ce) {
 		sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs);
 		if (!sctx->ce_ib)
 			goto fail;
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -1400,7 +1400,7 @@ static void tex_fetch_args(
 		 * It's unnecessary if the original texture format was
 		 * Z32_FLOAT, but we don't know that here.
 		 */
-		if (ctx->screen->b.chip_class == VI)
+		if (ctx->screen->b.chip_class >= VI)
 			z = ac_build_clamp(&ctx->ac, z);

 		address[count++] = z;
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
@@ -148,7 +148,10 @@ unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);

-	ac_elf_read(buffer_data, buffer_size, binary);
+	if (!ac_elf_read(buffer_data, buffer_size, binary)) {
+		fprintf(stderr, "radeonsi: cannot read an ELF shader binary\n");
+		diag.retval = 1;
+	}

 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3162,14 +3162,13 @@ si_make_texture_descriptor(struct si_screen *screen,
 			   uint32_t *fmask_state)
 {
 	struct pipe_resource *res = &tex->resource.b.b;
-	const struct util_format_description *base_desc, *desc;
+	const struct util_format_description *desc;
 	unsigned char swizzle[4];
 	int first_non_void;
 	unsigned num_format, data_format, type;
 	uint64_t va;

 	desc = util_format_description(pipe_format);
-	base_desc = util_format_description(res->format);

 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
 		const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
@@ -3270,15 +3269,6 @@ si_make_texture_descriptor(struct si_screen *screen,
 		data_format = 0;
 	}

-	/* Enable clamping for UNORM depth formats promoted to Z32F. */
-	if (screen->b.chip_class >= GFX9 &&
-	    util_format_has_depth(desc) &&
-	    num_format == V_008F14_IMG_NUM_FORMAT_FLOAT &&
-	    util_get_depth_format_type(base_desc) != UTIL_FORMAT_TYPE_FLOAT) {
-		/* NUM_FORMAT=FLOAT and DATA_FORMAT=24_8 means "clamp to [0,1]". */
-		data_format = V_008F14_IMG_DATA_FORMAT_24_8;
-	}
-
 	/* S8 with Z32 HTILE needs a special format. */
 	if (screen->b.chip_class >= GFX9 &&
 	    pipe_format == PIPE_FORMAT_S8_UINT &&
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)

    if (IsDraw)
    {
-        InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
+        InterlockedIncrement(&pContext->drawsOutstandingFE);
    }

    _ReadWriteBarrier();
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -409,12 +409,12 @@ struct DRAW_CONTEXT
    bool            dependent;      // Backend work is dependent on all previous BE
    bool            isCompute;      // Is this DC a compute context?
    bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
-    volatile bool   doneFE;         // Is FE work done for this draw?

    FE_WORK         FeWork;

+    volatile OSALIGNLINE(bool)       doneFE;         // Is FE work done for this draw?
    volatile OSALIGNLINE(uint32_t)   FeLock;
-    volatile int32_t    threadsDone;
+    volatile OSALIGNLINE(uint32_t)   threadsDone;

    SYNC_DESC       retireCallback; // Call this func when this DC is retired.
 };
@@ -503,9 +503,9 @@ struct SWR_CONTEXT
    // Scratch space for workers.
    uint8_t** ppScratch;

-    volatile int32_t  drawsOutstandingFE;
+    volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;

-    CachingAllocator cachingArenaAllocator;
+    OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
    uint32_t frameCount;

    uint32_t lastFrameChecked;
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -393,7 +393,7 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONT
 // inlined-only version
 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 {
-    int32_t result = InterlockedDecrement((volatile LONG*)&pDC->threadsDone);
+    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
    SWR_ASSERT(result >= 0);

    AR_FLUSH(pDC->drawId);
@@ -639,7 +639,7 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEX
    _mm_mfence();
    pDC->doneFE = true;

-    InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
+    InterlockedDecrement(&pContext->drawsOutstandingFE);
 }

 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
--- a/src/gallium/drivers/vc4/Android.mk
+++ b/src/gallium/drivers/vc4/Android.mk
@@ -28,6 +28,10 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(C_SOURCES)

+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += $(NEON_C_SOURCES)
+endif
+
 LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
 LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/include/drm-uapi
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -39,6 +39,14 @@ noinst_LTLIBRARIES = libvc4.la

 libvc4_la_SOURCES = $(C_SOURCES)
 libvc4_la_LIBADD = $(SIM_LIB)
+
+if HAVE_ARM_ASM
+noinst_LTLIBRARIES += libvc4_neon.la
+libvc4_la_LIBADD += libvc4_neon.la
+libvc4_neon_la_SOURCES = $(NEON_C_SOURCES)
+libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -mfpu=neon
+endif
+
 libvc4_la_LDFLAGS = $(SIM_LDFLAGS)

 EXTRA_DIST = kernel/README
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -57,7 +57,8 @@ C_SOURCES := \
 	vc4_state.c \
 	vc4_tiling.c \
 	vc4_tiling_lt.c \
-	vc4_tiling_lt_neon.c \
 	vc4_tiling.h \
 	vc4_uniforms.c \
 	$()
+
+NEON_C_SOURCES := vc4_tiling_lt_neon.c
--- a/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
@@ -89,13 +89,15 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride,
                  void *src, uint32_t src_stride,
                  int cpp, const struct pipe_box *box)
 {
+#ifdef USE_ARM_ASM
        if (util_cpu_caps.has_neon) {
                vc4_load_lt_image_neon(dst, dst_stride, src, src_stride,
                                       cpp, box);
-        } else {
-                vc4_load_lt_image_base(dst, dst_stride, src, src_stride,
-                                       cpp, box);
+                return;
        }
+#endif
+        vc4_load_lt_image_base(dst, dst_stride, src, src_stride,
+                               cpp, box);
 }

 static inline void
@@ -103,13 +105,16 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride,
                   void *src, uint32_t src_stride,
                   int cpp, const struct pipe_box *box)
 {
+#ifdef USE_ARM_ASM
        if (util_cpu_caps.has_neon) {
                vc4_store_lt_image_neon(dst, dst_stride, src, src_stride,
                                        cpp, box);
-        } else {
-                vc4_store_lt_image_base(dst, dst_stride, src, src_stride,
-                                        cpp, box);
+                return;
        }
+#endif
+
+        vc4_store_lt_image_base(dst, dst_stride, src, src_stride,
+                                cpp, box);
 }

 #endif /* VC4_TILING_H */
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -601,8 +601,11 @@ wait_swap_interval(struct stw_framebuffer *fb)
      int64_t min_swap_period =
         1.0e6 / stw_dev->refresh_rate * stw_dev->swap_interval;

-      /* if time since last swap is less than wait period, wait */
-      if (delta < min_swap_period) {
+      /* If time since last swap is less than wait period, wait.
+       * Note that it's possible for the delta to be negative because of
+       * rollover.  See https://bugs.freedesktop.org/show_bug.cgi?id=102241
+       */
+      if ((delta >= 0) && (delta < min_swap_period)) {
         float fudge = 1.75f;  /* emperical fudge factor */
         int64_t wait = (min_swap_period - delta) * fudge;
         os_time_sleep(wait);
--- a/src/glx/glxcmds.c
+++ b/src/glx/glxcmds.c
@@ -820,7 +820,7 @@ glXSwapBuffers(Display * dpy, GLXDrawable drawable)
 {
 #ifdef GLX_USE_APPLEGL
   struct glx_context * gc = __glXGetCurrentContext();
-   if(gc != &DummyContext && apple_glx_is_current_drawable(dpy, gc->driContext, drawable)) {
+   if(gc != &dummyContext && apple_glx_is_current_drawable(dpy, gc->driContext, drawable)) {
      apple_glx_swap_buffers(gc->driContext);
   } else {
      __glXSendError(dpy, GLXBadCurrentWindow, 0, X_GLXSwapBuffers, false);
--- a/src/intel/genxml/gen10.xml
+++ b/src/intel/genxml/gen10.xml
@@ -554,7 +554,7 @@
    <field name="Write Disable Blue" start="0" end="0" type="bool"/>
  </struct>

-  <struct name="BLEND_STATE" length="17">
+  <struct name="BLEND_STATE" length="1">
    <field name="Alpha To Coverage Enable" start="31" end="31" type="bool"/>
    <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
    <field name="Alpha To One Enable" start="29" end="29" type="bool"/>
@@ -564,7 +564,7 @@
    <field name="Color Dither Enable" start="23" end="23" type="bool"/>
    <field name="X Dither Offset" start="21" end="22" type="uint"/>
    <field name="Y Dither Offset" start="19" end="20" type="uint"/>
-    <group count="8" start="32" size="64">
+    <group count="0" start="32" size="64">
      <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
    </group>
  </struct>
--- a/src/intel/isl/isl.c
+++ b/src/intel/isl/isl.c
@@ -982,7 +982,8 @@ isl_calc_phys_total_extent_el_gen4_2d(
                                           &phys_slice0_sa);
   *total_extent_el = (struct isl_extent2d) {
      .w = isl_assert_div(phys_slice0_sa.w, fmtl->bw),
-      .h = *array_pitch_el_rows * phys_level0_sa->array_len,
+      .h = *array_pitch_el_rows * (phys_level0_sa->array_len - 1) +
+           isl_assert_div(phys_slice0_sa.h, fmtl->bh),
   };
 }

@@ -1366,124 +1367,19 @@ isl_calc_row_pitch(const struct isl_device *dev,
       !pitch_in_range(row_pitch, _3DSTATE_HIER_DEPTH_BUFFER_SurfacePitch_bits(dev->info)))
      return false;

-   if (surf_info->usage & ISL_SURF_USAGE_STENCIL_BIT)
-      isl_finishme("validate row pitch of stencil surfaces");
+   const uint32_t stencil_pitch_bits = dev->use_separate_stencil ?
+      _3DSTATE_STENCIL_BUFFER_SurfacePitch_bits(dev->info) :
+      _3DSTATE_DEPTH_BUFFER_SurfacePitch_bits(dev->info);
+
+   if ((surf_info->usage & ISL_SURF_USAGE_STENCIL_BIT) &&
+       !pitch_in_range(row_pitch, stencil_pitch_bits))
+      return false;

 done:
   *out_row_pitch = row_pitch;
   return true;
 }

-/**
- * Calculate and apply any padding required for the surface.
- *
- * @param[inout] total_h_el is updated with the new height
- * @param[out] pad_bytes is overwritten with additional padding requirements.
- */
-static void
-isl_apply_surface_padding(const struct isl_device *dev,
-                          const struct isl_surf_init_info *restrict info,
-                          const struct isl_tile_info *tile_info,
-                          uint32_t *total_h_el,
-                          uint32_t *pad_bytes)
-{
-   const struct isl_format_layout *fmtl = isl_format_get_layout(info->format);
-
-   *pad_bytes = 0;
-
-   /* From the Broadwell PRM >> Volume 5: Memory Views >> Common Surface
-    * Formats >> Surface Padding Requirements >> Render Target and Media
-    * Surfaces:
-    *
-    *   The data port accesses data (pixels) outside of the surface if they
-    *   are contained in the same cache request as pixels that are within the
-    *   surface. These pixels will not be returned by the requesting message,
-    *   however if these pixels lie outside of defined pages in the GTT,
-    *   a GTT error will result when the cache request is processed. In
-    *   order to avoid these GTT errors, “padding” at the bottom of the
-    *   surface is sometimes necessary.
-    *
-    * From the Broadwell PRM >> Volume 5: Memory Views >> Common Surface
-    * Formats >> Surface Padding Requirements >> Sampling Engine Surfaces:
-    *
-    *    ... Lots of padding requirements, all listed separately below.
-    */
-
-   /* We can safely ignore the first padding requirement, quoted below,
-    * because isl doesn't do buffers.
-    *
-    *    - [pre-BDW] For buffers, which have no inherent “height,” padding
-    *      requirements are different. A buffer must be padded to the next
-    *      multiple of 256 array elements, with an additional 16 bytes added
-    *      beyond that to account for the L1 cache line.
-    */
-
-   /*
-    *    - For compressed textures [...], padding at the bottom of the surface
-    *      is to an even compressed row.
-    */
-   if (isl_format_is_compressed(info->format))
-      *total_h_el = isl_align(*total_h_el, 2);
-
-   /*
-    *    - For cube surfaces, an additional two rows of padding are required
-    *      at the bottom of the surface.
-    */
-   if (info->usage & ISL_SURF_USAGE_CUBE_BIT)
-      *total_h_el += 2;
-
-   /*
-    *    - For packed YUV, 96 bpt, 48 bpt, and 24 bpt surface formats,
-    *      additional padding is required. These surfaces require an extra row
-    *      plus 16 bytes of padding at the bottom in addition to the general
-    *      padding requirements.
-    */
-   if (isl_format_is_yuv(info->format) &&
-       (fmtl->bpb == 96 || fmtl->bpb == 48|| fmtl->bpb == 24)) {
-      *total_h_el += 1;
-      *pad_bytes += 16;
-   }
-
-   /*
-    *    - For linear surfaces, additional padding of 64 bytes is required at
-    *      the bottom of the surface. This is in addition to the padding
-    *      required above.
-    */
-   if (tile_info->tiling == ISL_TILING_LINEAR)
-      *pad_bytes += 64;
-
-   /* The below text weakens, not strengthens, the padding requirements for
-    * linear surfaces. Therefore we can safely ignore it.
-    *
-    *    - [BDW+] For SURFTYPE_BUFFER, SURFTYPE_1D, and SURFTYPE_2D non-array,
-    *      non-MSAA, non-mip-mapped surfaces in linear memory, the only
-    *      padding requirement is to the next aligned 64-byte boundary beyond
-    *      the end of the surface. The rest of the padding requirements
-    *      documented above do not apply to these surfaces.
-    */
-
-   /*
-    *    - [SKL+] For SURFTYPE_2D and SURFTYPE_3D with linear mode and
-    *      height % 4 != 0, the surface must be padded with
-    *      4-(height % 4)*Surface Pitch # of bytes.
-    */
-   if (ISL_DEV_GEN(dev) >= 9 &&
-       tile_info->tiling == ISL_TILING_LINEAR &&
-       (info->dim == ISL_SURF_DIM_2D || info->dim == ISL_SURF_DIM_3D)) {
-      *total_h_el = isl_align(*total_h_el, 4);
-   }
-
-   /*
-    *    - [SKL+] For SURFTYPE_1D with linear mode, the surface must be padded
-    *      to 4 times the Surface Pitch # of bytes
-    */
-   if (ISL_DEV_GEN(dev) >= 9 &&
-       tile_info->tiling == ISL_TILING_LINEAR &&
-       info->dim == ISL_SURF_DIM_1D) {
-      *total_h_el += 4;
-   }
-}
-
 bool
 isl_surf_init_s(const struct isl_device *dev,
                struct isl_surf *surf,
@@ -1536,10 +1432,6 @@ isl_surf_init_s(const struct isl_device *dev,
                                 array_pitch_span, &array_pitch_el_rows,
                                 &phys_total_el);

-   uint32_t padded_h_el = phys_total_el.h;
-   uint32_t pad_bytes;
-   isl_apply_surface_padding(dev, info, &tile_info, &padded_h_el, &pad_bytes);
-
   uint32_t row_pitch;
   if (!isl_calc_row_pitch(dev, info, &tile_info, dim_layout,
                           &phys_total_el, &row_pitch))
@@ -1548,7 +1440,7 @@ isl_surf_init_s(const struct isl_device *dev,
   uint32_t base_alignment;
   uint64_t size;
   if (tiling == ISL_TILING_LINEAR) {
-      size = (uint64_t) row_pitch * padded_h_el + pad_bytes;
+      size = (uint64_t) row_pitch * phys_total_el.h;

      /* From the Broadwell PRM Vol 2d, RENDER_SURFACE_STATE::SurfaceBaseAddress:
       *
@@ -1569,9 +1461,8 @@ isl_surf_init_s(const struct isl_device *dev,
      }
      base_alignment = isl_round_up_to_power_of_two(base_alignment);
   } else {
-      padded_h_el += isl_align_div_npot(pad_bytes, row_pitch);
      const uint32_t total_h_tl =
-         isl_align_div(padded_h_el, tile_info.logical_extent_el.height);
+         isl_align_div(phys_total_el.h, tile_info.logical_extent_el.height);

      size = (uint64_t) total_h_tl * tile_info.phys_extent_B.height * row_pitch;

--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -395,7 +395,8 @@ anv_physical_device_get_format_properties(struct anv_physical_device *physical_d
      /* Nothing to do here */
   } else if (vk_format_is_depth_or_stencil(format)) {
      tiled |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT;
-      if (physical_device->info.gen >= 8)
+      if (vk_format_aspects(format) == VK_IMAGE_ASPECT_DEPTH_BIT ||
+          physical_device->info.gen >= 8)
         tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;

      tiled |= VK_FORMAT_FEATURE_BLIT_SRC_BIT |
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -304,8 +304,9 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
      src_format = dst_format = MESA_FORMAT_R_FLOAT32;
   }

+   enum isl_format src_isl_format = brw_isl_format_for_mesa_format(src_format);
   enum isl_aux_usage src_aux_usage =
-      intel_miptree_texture_aux_usage(brw, src_mt, src_format);
+      intel_miptree_texture_aux_usage(brw, src_mt, src_isl_format);
   /* We do format workarounds for some depth formats so we can't reliably
    * sample with HiZ.  One of these days, we should fix that.
    */
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -750,7 +750,7 @@ brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
      bo_wait_with_stall_warning(brw, bo, "CPU mapping");
   }

-   if (!bo->cache_coherent) {
+   if (!bo->cache_coherent && !bo->bufmgr->has_llc) {
      /* If we're reusing an existing CPU mapping, the CPU caches may
       * contain stale data from the last time we read from that mapping.
       * (With the BO cache, it might even be data from a previous buffer!)
@@ -760,6 +760,12 @@ brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
       * We need to invalidate those cachelines so that we see the latest
       * contents, and so long as we only read from the CPU mmap we do not
       * need to write those cachelines back afterwards.
+       *
+       * On LLC, the empirical evidence suggests that writes from the GPU
+       * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU
+       * cachelines. (Other reads, such as the display engine, bypass the
+       * LLC entirely requiring us to keep dirty pixels for the scanout
+       * out of any cache.)
       */
      gen_invalidate_range(bo->map_cpu, bo->size);
   }
@@ -897,6 +903,14 @@ can_map_cpu(struct brw_bo *bo, unsigned flags)
   if (bo->cache_coherent)
      return true;

+   /* Even if the buffer itself is not cache-coherent (such as a scanout), on
+    * an LLC platform reads always are coherent (as they are performed via the
+    * central system agent). It is only the writes that need special care, to
+    * ensure that they land in main memory and do not stick in the CPU cache.
+    */
+   if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc)
+      return true;
+
   /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid
    * across batch flushes where the kernel will change cache domains of the
    * bo, invalidating continued access to the CPU mmap on non-LLC device.
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -847,9 +847,15 @@ intel_miptree_create_for_bo(struct brw_context *brw,
   mt->bo = bo;
   mt->offset = offset;

-   if (!(layout_flags & MIPTREE_LAYOUT_DISABLE_AUX))
+   if (!(layout_flags & MIPTREE_LAYOUT_DISABLE_AUX)) {
      intel_miptree_choose_aux_usage(brw, mt);

+      if (!intel_miptree_alloc_aux(brw, mt)) {
+         intel_miptree_release(&mt);
+         return NULL;
+      }
+   }
+
   return mt;
 }

@@ -979,11 +985,6 @@ intel_miptree_create_for_dri_image(struct brw_context *brw,
      }
   }

-   if (!intel_miptree_alloc_aux(brw, mt)) {
-      intel_miptree_release(&mt);
-      return NULL;
-   }
-
   return mt;
 }

@@ -2719,6 +2720,7 @@ intel_miptree_make_shareable(struct brw_context *brw,
   }

   mt->aux_usage = ISL_AUX_USAGE_NONE;
+   mt->supports_fast_clear = false;
 }


--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2409,8 +2409,10 @@ namespace {
 class add_uniform_to_shader : public program_resource_visitor {
 public:
   add_uniform_to_shader(struct gl_shader_program *shader_program,
-			 struct gl_program_parameter_list *params)
-      : shader_program(shader_program), params(params), idx(-1)
+			 struct gl_program_parameter_list *params,
+                         gl_shader_stage shader_type)
+      : shader_program(shader_program), params(params), idx(-1),
+        shader_type(shader_type)
   {
      /* empty */
   }
@@ -2433,6 +2435,7 @@ private:
   struct gl_program_parameter_list *params;
   int idx;
   ir_variable *var;
+   gl_shader_stage shader_type;
 };

 } /* anonymous namespace */
@@ -2444,18 +2447,49 @@ add_uniform_to_shader::visit_field(const glsl_type *type, const char *name,
                                   const enum glsl_interface_packing,
                                   bool /* last_field */)
 {
-   /* opaque types don't use storage in the param list unless they are
-    * bindless samplers or images.
-    */
-   if (type->contains_opaque() && !var->data.bindless)
+   /* atomics don't get real storage */
+   if (type->contains_atomic())
      return;

-   assert(_mesa_lookup_parameter_index(params, name) < 0);
+   gl_register_file file;
+   if (type->without_array()->is_sampler() && !var->data.bindless) {
+      file = PROGRAM_SAMPLER;
+   } else {
+      file = PROGRAM_UNIFORM;
+   }

-   unsigned size = type_size(type) * 4;
+   int index = _mesa_lookup_parameter_index(params, name);
+   if (index < 0) {
+      unsigned size = type_size(type) * 4;

-   int index = _mesa_add_parameter(params, PROGRAM_UNIFORM, name, size,
-                                   type->gl_type, NULL, NULL);
+      index = _mesa_add_parameter(params, file, name, size, type->gl_type,
+				  NULL, NULL);
+
+      /* Sampler uniform values are stored in prog->SamplerUnits,
+       * and the entry in that array is selected by this index we
+       * store in ParameterValues[].
+       */
+      if (file == PROGRAM_SAMPLER) {
+	 unsigned location;
+	 const bool found =
+	    this->shader_program->UniformHash->get(location,
+						   params->Parameters[index].Name);
+	 assert(found);
+
+	 if (!found)
+	    return;
+
+	 struct gl_uniform_storage *storage =
+            &this->shader_program->data->UniformStorage[location];
+
+         assert(storage->type->is_sampler() &&
+                storage->opaque[shader_type].active);
+
+	 for (unsigned int j = 0; j < size / 4; j++)
+            params->ParameterValues[index + j][0].f =
+               storage->opaque[shader_type].index + j;
+      }
+   }

   /* The first part of the uniform that's processed determines the base
    * location of the whole uniform (for structures).
@@ -2479,7 +2513,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_shader_program
 					    struct gl_program_parameter_list
 					    *params)
 {
-   add_uniform_to_shader add(shader_program, params);
+   add_uniform_to_shader add(shader_program, params, sh->Stage);

   foreach_in_list(ir_instruction, node, sh->ir) {
      ir_variable *var = node->as_variable();
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -634,7 +634,7 @@ st_context_flush(struct st_context_iface *stctxi, unsigned flags,

   st_flush(st, fence, pipe_flags);

-   if ((flags & ST_FLUSH_WAIT) && fence) {
+   if ((flags & ST_FLUSH_WAIT) && fence && *fence) {
      st->pipe->screen->fence_finish(st->pipe->screen, NULL, *fence,
                                     PIPE_TIMEOUT_INFINITE);
      st->pipe->screen->fence_reference(st->pipe->screen, fence, NULL);
--- a/src/util/Makefile.am
+++ b/src/util/Makefile.am
@@ -44,7 +44,9 @@ libmesautil_la_SOURCES = \
 	$(MESA_UTIL_FILES) \
 	$(MESA_UTIL_GENERATED_FILES)

-libmesautil_la_LIBADD = $(ZLIB_LIBS)
+libmesautil_la_LIBADD = \
+	$(CLOCK_LIB) \
+	$(ZLIB_LIBS)

 roundeven_test_LDADD = -lm