version: bump for 19.0-rc4

anv/cmd_buffer: check for NULL framebuffer
This can happen when we record a vkCmdDraw in a secondary command buffer that was created inheriting from the primary buffer, but with the framebuffer set to NULL in the VkCommandBufferInheritanceInfo. The Vulkan 1.1.81 spec says that "the application must ensure (using scissor if necessary) that all rendering is contained in the render area [...] [which] must be contained within the framebuffer dimensions". While this should be done by the application, commit 465e5a86 added a clamp to the framebuffer size in case the application does not do it. But this requires knowing the framebuffer dimensions. If we do not have a framebuffer at that moment, the best compromise we can make is to just apply the scissor as it is, and let the application ensure the rendering is contained in the render area. v2: do not clamp to framebuffer if there isn't a framebuffer v3 (Jason): - clamp earlier in the conditional - clamp to render area if command buffer is primary v4: clamp also x and y to render area (Jason) v5: rename used variables (Jason) Fixes: 465e5a86 ("anv: Clamp scissors to the framebuffer boundary") CC: Jason Ekstrand <jason@jlekstrand.net> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> (cherry picked from commit 1ad26f9417)
2019-02-13 09:11:02 -08:00 · 2019-02-12 14:19:52 -08:00 · 2019-02-12 14:19:52 -08:00 · 2019-02-12 14:19:52 -08:00 · 2019-02-12 14:19:52 -08:00 · 2019-02-12 14:19:52 -08:00
96 changed files with 1700 additions and 371 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,6 +22,7 @@
 SUBDIRS = src

 AM_DISTCHECK_CONFIGURE_FLAGS = \
+	--enable-autotools \
 	--enable-dri \
 	--enable-dri3 \
 	--enable-egl \
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-19.0.0-devel
+19.0.0-rc4
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,3 @@
+# Both of these were already merged with different shas
+da48cba61ef6fefb799bf96e6364b70dbf4ec712
+c812c740e60c14060eb89db66039111881a0f42f
--- a/configure.ac
+++ b/configure.ac
@@ -122,7 +122,7 @@ LLVM_REQUIRED_OPENCL=3.9.0
 LLVM_REQUIRED_R600=3.9.0
 LLVM_REQUIRED_RADEONSI=7.0.0
 LLVM_REQUIRED_RADV=7.0.0
-LLVM_REQUIRED_SWR=6.0.0
+LLVM_REQUIRED_SWR=7.0.0

 dnl Check for progs
 AC_PROG_CPP
@@ -2845,8 +2845,8 @@ if test -n "$with_gallium_drivers"; then
 fi

 # XXX: Keep in sync with LLVM_REQUIRED_SWR
-AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x6.0.0 -a \
-                                              "x$LLVM_VERSION" != x6.0.1)
+AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x7.0.0 -a \
+                                              "x$LLVM_VERSION" != x7.0.1)

 if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then
    llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium"
--- a/meson.build
+++ b/meson.build
@@ -1400,7 +1400,7 @@ if with_platform_x11
    dep_xcb_xfixes = dependency('xcb-xfixes')
  endif
  if with_xlib_lease
-    dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12')
+    dep_xcb_xrandr = dependency('xcb-randr')
    dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3')
  endif
 endif
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -923,6 +923,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 				  ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 }

+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+	         LLVMValueRef base_ptr,
+	         LLVMValueRef index)
+{
+	return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
+}
+
 LLVMValueRef
 ac_build_gep0(struct ac_llvm_context *ctx,
 	      LLVMValueRef base_ptr,
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -223,6 +223,11 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 		       LLVMValueRef attr_number,
 		       LLVMValueRef params);

+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+	         LLVMValueRef base_ptr,
+	         LLVMValueRef index);
+
 LLVMValueRef
 ac_build_gep0(struct ac_llvm_context *ctx,
 	      LLVMValueRef base_ptr,
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2006,18 +2006,23 @@ static void
 visit_store_var(struct ac_nir_context *ctx,
 		nir_intrinsic_instr *instr)
 {
-        nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+	nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+	nir_variable *var = nir_deref_instr_get_variable(deref);

 	LLVMValueRef temp_ptr, value;
-	int idx = var->data.driver_location;
-	unsigned comp = var->data.location_frac;
+	int idx = 0;
+	unsigned comp = 0;
 	LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1]));
 	int writemask = instr->const_index[0];
 	LLVMValueRef indir_index;
 	unsigned const_index;

-	get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false,
-	                 NULL, NULL, &const_index, &indir_index);
+	if (var) {
+		get_deref_offset(ctx, deref, false,
+		                 NULL, NULL, &const_index, &indir_index);
+		idx = var->data.driver_location;
+		comp = var->data.location_frac;
+	}

 	if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {

@@ -2030,7 +2035,7 @@ visit_store_var(struct ac_nir_context *ctx,

 	writemask = writemask << comp;

-	switch (var->data.mode) {
+	switch (deref->mode) {
 	case nir_var_shader_out:

 		if (ctx->stage == MESA_SHADER_TESS_CTRL) {
@@ -2039,8 +2044,8 @@ visit_store_var(struct ac_nir_context *ctx,
 			unsigned const_index = 0;
 			const bool is_patch = var->data.patch;

-			get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
-			                 false, NULL, is_patch ? NULL : &vertex_index,
+			get_deref_offset(ctx, deref, false, NULL,
+			                 is_patch ? NULL : &vertex_index,
 			                 &const_index, &indir_index);

 			ctx->abi->store_tcs_outputs(ctx->abi, var,
@@ -2107,7 +2112,7 @@ visit_store_var(struct ac_nir_context *ctx,
 		int writemask = instr->const_index[0];
 		LLVMValueRef address = get_src(ctx, instr->src[0]);
 		LLVMValueRef val = get_src(ctx, instr->src[1]);
-		if (util_is_power_of_two_nonzero(writemask)) {
+		if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) {
 			val = LLVMBuildBitCast(
 			   ctx->ac.builder, val,
 			   LLVMGetElementType(LLVMTypeOf(address)), "");
@@ -3818,6 +3823,73 @@ static void visit_jump(struct ac_llvm_context *ctx,
 	}
 }

+static LLVMTypeRef
+glsl_base_to_llvm_type(struct ac_llvm_context *ac,
+		       enum glsl_base_type type)
+{
+	switch (type) {
+	case GLSL_TYPE_INT:
+	case GLSL_TYPE_UINT:
+	case GLSL_TYPE_BOOL:
+	case GLSL_TYPE_SUBROUTINE:
+		return ac->i32;
+	case GLSL_TYPE_INT16:
+	case GLSL_TYPE_UINT16:
+		return ac->i16;
+	case GLSL_TYPE_FLOAT:
+		return ac->f32;
+	case GLSL_TYPE_FLOAT16:
+		return ac->f16;
+	case GLSL_TYPE_INT64:
+	case GLSL_TYPE_UINT64:
+		return ac->i64;
+	case GLSL_TYPE_DOUBLE:
+		return ac->f64;
+	default:
+		unreachable("unknown GLSL type");
+	}
+}
+
+static LLVMTypeRef
+glsl_to_llvm_type(struct ac_llvm_context *ac,
+		  const struct glsl_type *type)
+{
+	if (glsl_type_is_scalar(type)) {
+		return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
+	}
+
+	if (glsl_type_is_vector(type)) {
+		return LLVMVectorType(
+		   glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
+		   glsl_get_vector_elements(type));
+	}
+
+	if (glsl_type_is_matrix(type)) {
+		return LLVMArrayType(
+		   glsl_to_llvm_type(ac, glsl_get_column_type(type)),
+		   glsl_get_matrix_columns(type));
+	}
+
+	if (glsl_type_is_array(type)) {
+		return LLVMArrayType(
+		   glsl_to_llvm_type(ac, glsl_get_array_element(type)),
+		   glsl_get_length(type));
+	}
+
+	assert(glsl_type_is_struct(type));
+
+	LLVMTypeRef member_types[glsl_get_length(type)];
+
+	for (unsigned i = 0; i < glsl_get_length(type); i++) {
+		member_types[i] =
+			glsl_to_llvm_type(ac,
+					  glsl_get_struct_field(type, i));
+	}
+
+	return LLVMStructTypeInContext(ac->context, member_types,
+				       glsl_get_length(type), false);
+}
+
 static void visit_deref(struct ac_nir_context *ctx,
                        nir_deref_instr *instr)
 {
@@ -3839,9 +3911,27 @@ static void visit_deref(struct ac_nir_context *ctx,
 		result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
 		                       get_src(ctx, instr->arr.index));
 		break;
-	case nir_deref_type_cast:
-		result = get_src(ctx, instr->parent);
+	case nir_deref_type_ptr_as_array:
+		result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent),
+		                          get_src(ctx, instr->arr.index));
 		break;
+	case nir_deref_type_cast: {
+		result = get_src(ctx, instr->parent);
+
+		LLVMTypeRef pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type);
+		LLVMTypeRef type = LLVMPointerType(pointee_type, AC_ADDR_SPACE_LDS);
+
+		if (LLVMTypeOf(result) != type) {
+			if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) {
+				result = LLVMBuildBitCast(ctx->ac.builder, result,
+				                          type, "");
+			} else {
+				result = LLVMBuildIntToPtr(ctx->ac.builder, result,
+				                           type, "");
+			}
+		}
+		break;
+	}
 	default:
 		unreachable("Unhandled deref_instr deref type");
 	}
@@ -3990,73 +4080,6 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
 	}
 }

-static LLVMTypeRef
-glsl_base_to_llvm_type(struct ac_llvm_context *ac,
-		       enum glsl_base_type type)
-{
-	switch (type) {
-	case GLSL_TYPE_INT:
-	case GLSL_TYPE_UINT:
-	case GLSL_TYPE_BOOL:
-	case GLSL_TYPE_SUBROUTINE:
-		return ac->i32;
-	case GLSL_TYPE_INT16:
-	case GLSL_TYPE_UINT16:
-		return ac->i16;
-	case GLSL_TYPE_FLOAT:
-		return ac->f32;
-	case GLSL_TYPE_FLOAT16:
-		return ac->f16;
-	case GLSL_TYPE_INT64:
-	case GLSL_TYPE_UINT64:
-		return ac->i64;
-	case GLSL_TYPE_DOUBLE:
-		return ac->f64;
-	default:
-		unreachable("unknown GLSL type");
-	}
-}
-
-static LLVMTypeRef
-glsl_to_llvm_type(struct ac_llvm_context *ac,
-		  const struct glsl_type *type)
-{
-	if (glsl_type_is_scalar(type)) {
-		return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
-	}
-
-	if (glsl_type_is_vector(type)) {
-		return LLVMVectorType(
-		   glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
-		   glsl_get_vector_elements(type));
-	}
-
-	if (glsl_type_is_matrix(type)) {
-		return LLVMArrayType(
-		   glsl_to_llvm_type(ac, glsl_get_column_type(type)),
-		   glsl_get_matrix_columns(type));
-	}
-
-	if (glsl_type_is_array(type)) {
-		return LLVMArrayType(
-		   glsl_to_llvm_type(ac, glsl_get_array_element(type)),
-		   glsl_get_length(type));
-	}
-
-	assert(glsl_type_is_struct(type));
-
-	LLVMTypeRef member_types[glsl_get_length(type)];
-
-	for (unsigned i = 0; i < glsl_get_length(type); i++) {
-		member_types[i] =
-			glsl_to_llvm_type(ac,
-					  glsl_get_struct_field(type, i));
-	}
-
-	return LLVMStructTypeInContext(ac->context, member_types,
-				       glsl_get_length(type), false);
-}
-
 static void
 setup_locals(struct ac_nir_context *ctx,
 	     struct nir_function *func)
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1356,7 +1356,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,

 	uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;

-	if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
+	if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
 		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
@@ -1518,14 +1518,13 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,

 	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;

-	if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
+	if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
 		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
 		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
 		radeon_emit(cs, 2);
 	} else {
-		/* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */
 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
 				COPY_DATA_DST_SEL(COPY_DATA_REG) |
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout(
 	uint32_t immutable_sampler_count = 0;
 	for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
 		max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
-		if (pCreateInfo->pBindings[j].pImmutableSamplers)
+		if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+		     pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
+		     pCreateInfo->pBindings[j].pImmutableSamplers)
 			immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
 	}

@@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout(
 			set_layout->has_variable_descriptors = true;
 		}

-		if (binding->pImmutableSamplers) {
+		if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+		     binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
+		    binding->pImmutableSamplers) {
 			set_layout->binding[b].immutable_samplers_offset = samplers_offset;
 			set_layout->binding[b].immutable_samplers_equal =
 				has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount);
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -369,6 +369,11 @@ radv_physical_device_init(struct radv_physical_device *device,
 	device->dcc_msaa_allowed =
 		(device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);

+	/* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */
+	device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 ||
+				       (device->rad_info.chip_class >= VI &&
+				        device->rad_info.me_fw_feature >= 41);
+
 	radv_physical_device_init_mem_types(device);
 	radv_fill_device_extension_table(device, &device->supported_extensions);

--- a/src/amd/vulkan/radv_meta_blit.c
+++ b/src/amd/vulkan/radv_meta_blit.c
@@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device,
 		.subpass = 0,
 	};

-	switch(aspect) {
-	case VK_IMAGE_ASPECT_COLOR_BIT:
-		vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
-			.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
-			.attachmentCount = 1,
-			.pAttachments = (VkPipelineColorBlendAttachmentState []) {
-				{ .colorWriteMask =
-				VK_COLOR_COMPONENT_A_BIT |
-				VK_COLOR_COMPONENT_R_BIT |
-				VK_COLOR_COMPONENT_G_BIT |
-				VK_COLOR_COMPONENT_B_BIT },
+	VkPipelineColorBlendStateCreateInfo color_blend_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+		.attachmentCount = 1,
+		.pAttachments = (VkPipelineColorBlendAttachmentState []) {
+			{
+				.colorWriteMask = VK_COLOR_COMPONENT_A_BIT |
+						  VK_COLOR_COMPONENT_R_BIT |
+						  VK_COLOR_COMPONENT_G_BIT |
+						  VK_COLOR_COMPONENT_B_BIT },
 			}
 		};
+
+	VkPipelineDepthStencilStateCreateInfo depth_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+		.depthTestEnable = true,
+		.depthWriteEnable = true,
+		.depthCompareOp = VK_COMPARE_OP_ALWAYS,
+	};
+
+	VkPipelineDepthStencilStateCreateInfo stencil_info = {
+		.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+		.depthTestEnable = false,
+		.depthWriteEnable = false,
+		.stencilTestEnable = true,
+		.front = {
+			.failOp = VK_STENCIL_OP_REPLACE,
+			.passOp = VK_STENCIL_OP_REPLACE,
+			.depthFailOp = VK_STENCIL_OP_REPLACE,
+			.compareOp = VK_COMPARE_OP_ALWAYS,
+			.compareMask = 0xff,
+			.writeMask = 0xff,
+			.reference = 0
+		},
+		.back = {
+			.failOp = VK_STENCIL_OP_REPLACE,
+			.passOp = VK_STENCIL_OP_REPLACE,
+			.depthFailOp = VK_STENCIL_OP_REPLACE,
+			.compareOp = VK_COMPARE_OP_ALWAYS,
+			.compareMask = 0xff,
+			.writeMask = 0xff,
+			.reference = 0
+		},
+		.depthCompareOp = VK_COMPARE_OP_ALWAYS,
+	};
+
+	switch(aspect) {
+	case VK_IMAGE_ASPECT_COLOR_BIT:
+		vk_pipeline_info.pColorBlendState = &color_blend_info;
 		break;
 	case VK_IMAGE_ASPECT_DEPTH_BIT:
-		vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
-			.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
-			.depthTestEnable = true,
-			.depthWriteEnable = true,
-			.depthCompareOp = VK_COMPARE_OP_ALWAYS,
-		};
+		vk_pipeline_info.pDepthStencilState = &depth_info;
 		break;
 	case VK_IMAGE_ASPECT_STENCIL_BIT:
-		vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
-			.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
-			.depthTestEnable = false,
-			.depthWriteEnable = false,
-			.stencilTestEnable = true,
-			.front = {
-				.failOp = VK_STENCIL_OP_REPLACE,
-				.passOp = VK_STENCIL_OP_REPLACE,
-				.depthFailOp = VK_STENCIL_OP_REPLACE,
-				.compareOp = VK_COMPARE_OP_ALWAYS,
-				.compareMask = 0xff,
-				.writeMask = 0xff,
-				.reference = 0
-			},
-			.back = {
-				.failOp = VK_STENCIL_OP_REPLACE,
-				.passOp = VK_STENCIL_OP_REPLACE,
-				.depthFailOp = VK_STENCIL_OP_REPLACE,
-				.compareOp = VK_COMPARE_OP_ALWAYS,
-				.compareMask = 0xff,
-				.writeMask = 0xff,
-				.reference = 0
-			},
-			.depthCompareOp = VK_COMPARE_OP_ALWAYS,
-		};
+		vk_pipeline_info.pDepthStencilState = &stencil_info;
 		break;
 	default:
 		unreachable("Unhandled aspect");
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -306,6 +306,9 @@ struct radv_physical_device {
 	/* Whether DCC should be enabled for MSAA textures. */
 	bool dcc_msaa_allowed;

+	/* Whether LOAD_CONTEXT_REG packets are supported. */
+	bool has_load_ctx_reg_pkt;
+
 	/* This is the drivers on-disk cache used as a fallback as opposed to
 	 * the pipeline cache defined by apps.
 	 */
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
                NIR_PASS(progress, shader, nir_opt_if);
                NIR_PASS(progress, shader, nir_opt_dead_cf);
                NIR_PASS(progress, shader, nir_opt_cse);
-                NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
+                NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true);
                NIR_PASS(progress, shader, nir_opt_algebraic);
                NIR_PASS(progress, shader, nir_opt_constant_folding);
                NIR_PASS(progress, shader, nir_opt_undef);
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -101,7 +101,7 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
 	case MESA_SHADER_VERTEX: {
 		nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));

-		if (var->data.mode == nir_var_shader_in) {
+		if (var && var->data.mode == nir_var_shader_in) {
 			unsigned idx = var->data.location;
 			uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa);

@@ -150,7 +150,7 @@ gather_intrinsic_store_deref_info(const nir_shader *nir,
 {
 	nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));

-	if (var->data.mode == nir_var_shader_out) {
+	if (var && var->data.mode == nir_var_shader_out) {
 		unsigned idx = var->data.location;

 		switch (nir->info.stage) {
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -543,7 +543,7 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
 	cs->handles[cs->num_buffers].bo_handle = bo;
 	cs->handles[cs->num_buffers].bo_priority = priority;

-	hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
+	hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
 	cs->buffer_hash_table[hash] = cs->num_buffers;

 	++cs->num_buffers;
--- a/src/broadcom/common/v3d_cpu_tiling.h
+++ b/src/broadcom/common/v3d_cpu_tiling.h
@@ -159,9 +159,8 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
                         * d0-d7.
                         */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1455,7 +1455,7 @@ v3d_optimize_nir(struct nir_shader *s)
                NIR_PASS(progress, s, nir_opt_dce);
                NIR_PASS(progress, s, nir_opt_dead_cf);
                NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
                NIR_PASS(progress, s, nir_opt_algebraic);
                NIR_PASS(progress, s, nir_opt_constant_folding);
                NIR_PASS(progress, s, nir_opt_undef);
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -156,7 +156,7 @@ pack_sint(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
          int num_components)
 {
        color = nir_channels(b, color, (1 << num_components) - 1);
-        color = nir_format_clamp_uint(b, color, bits);
+        color = nir_format_clamp_sint(b, color, bits);
        return pack_bits(b, color, bits, num_components, true);
 }

--- a/src/compiler/Android.glsl.gen.mk
+++ b/src/compiler/Android.glsl.gen.mk
@@ -104,6 +104,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e
 	@mkdir -p $(dir $@)
 	$(hide) $(MESA_PYTHON2) $< strings > $@

-$(intermediates)/compiler/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py
+$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py
 	@mkdir -p $(dir $@)
 	$(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2825,7 +2825,7 @@ should_print_nir(void)
 static inline void nir_validate_shader(nir_shader *shader, const char *when) { (void) shader; (void)when; }
 static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
 static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
-static inline bool should_skip_nir(const char *pass_name) { return false; }
+static inline bool should_skip_nir(UNUSED const char *pass_name) { return false; }
 static inline bool should_clone_nir(void) { return false; }
 static inline bool should_serialize_deserialize_nir(void) { return false; }
 static inline bool should_print_nir(void) { return false; }
@@ -3316,7 +3316,7 @@ bool nir_opt_move_comparisons(nir_shader *shader);
 bool nir_opt_move_load_ubo(nir_shader *shader);

 bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
-                             bool indirect_load_ok, bool expensive_alu_ok);
+                             bool indirect_load_ok);

 bool nir_opt_remove_phis(nir_shader *shader);

--- a/src/compiler/nir/nir_deref.c
+++ b/src/compiler/nir/nir_deref.c
@@ -574,10 +574,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl)
         _mesa_hash_table_clear(state.cache, NULL);

      nir_foreach_instr_safe(instr, block) {
-         if (instr->type == nir_instr_type_deref) {
-            nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr));
+         if (instr->type == nir_instr_type_deref &&
+             nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)))
            continue;
-         }

         state.builder.cursor = nir_before_instr(instr);
         nir_foreach_src(instr, rematerialize_deref_src, &state);
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -59,8 +59,7 @@

 static bool
 block_check_for_allowed_instrs(nir_block *block, unsigned *count,
-                               bool alu_ok, bool indirect_load_ok,
-                               bool expensive_alu_ok)
+                               bool alu_ok, bool indirect_load_ok)
 {
   nir_foreach_instr(instr, block) {
      switch (instr->type) {
@@ -118,25 +117,6 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count,
         case nir_op_vec3:
         case nir_op_vec4:
            break;
-
-         case nir_op_fcos:
-         case nir_op_fdiv:
-         case nir_op_fexp2:
-         case nir_op_flog2:
-         case nir_op_fmod:
-         case nir_op_fpow:
-         case nir_op_frcp:
-         case nir_op_frem:
-         case nir_op_frsq:
-         case nir_op_fsin:
-         case nir_op_idiv:
-         case nir_op_irem:
-         case nir_op_udiv:
-            if (!alu_ok || !expensive_alu_ok)
-               return false;
-
-            break;
-
         default:
            if (!alu_ok) {
               /* It must be a move-like operation. */
@@ -180,8 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count,

 static bool
 nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
-                              unsigned limit, bool indirect_load_ok,
-                              bool expensive_alu_ok)
+                              unsigned limit, bool indirect_load_ok)
 {
   if (nir_cf_node_is_first(&block->cf_node))
      return false;
@@ -202,9 +181,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
   /* ... and those blocks must only contain "allowed" instructions. */
   unsigned count = 0;
   if (!block_check_for_allowed_instrs(then_block, &count, limit != 0,
-                                       indirect_load_ok, expensive_alu_ok) ||
+                                       indirect_load_ok) ||
       !block_check_for_allowed_instrs(else_block, &count, limit != 0,
-                                       indirect_load_ok, expensive_alu_ok))
+                                       indirect_load_ok))
      return false;

   if (count > limit)
@@ -271,15 +250,14 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,

 static bool
 nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,
-                             bool indirect_load_ok, bool expensive_alu_ok)
+                             bool indirect_load_ok)
 {
   nir_shader *shader = impl->function->shader;
   bool progress = false;

   nir_foreach_block_safe(block, impl) {
      progress |= nir_opt_peephole_select_block(block, shader, limit,
-                                                indirect_load_ok,
-                                                expensive_alu_ok);
+                                                indirect_load_ok);
   }

   if (progress) {
@@ -295,15 +273,14 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,

 bool
 nir_opt_peephole_select(nir_shader *shader, unsigned limit,
-                        bool indirect_load_ok, bool expensive_alu_ok)
+                        bool indirect_load_ok)
 {
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_peephole_select_impl(function->impl, limit,
-                                                  indirect_load_ok,
-                                                  expensive_alu_ok);
+                                                  indirect_load_ok);
   }

   return progress;
--- a/src/freedreno/Makefile.am
+++ b/src/freedreno/Makefile.am
@@ -45,6 +45,7 @@ TESTS =
 BUILT_SOURCES =
 CLEANFILES =
 EXTRA_DIST = \
+	meson.build \
 	drm/meson.build \
 	ir3/ir3_nir_trig.py \
 	ir3/meson.build
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s)
 			progress |= OPT(s, nir_opt_gcm, true);
 		else if (gcm == 2)
 			progress |= OPT(s, nir_opt_gcm, false);
-		progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
+		progress |= OPT(s, nir_opt_peephole_select, 16, true);
 		progress |= OPT(s, nir_opt_intrinsics);
 		progress |= OPT(s, nir_opt_algebraic);
 		progress |= OPT(s, nir_opt_constant_folding);
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1524,7 +1524,8 @@ tc_buffer_do_flush_region(struct threaded_context *tc,
   if (ttrans->staging) {
      struct pipe_box src_box;

-      u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
+      u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment +
+               (box->x - ttrans->b.box.x),
               box->width, &src_box);

      /* Copy the staging buffer into the original one. */
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -487,6 +487,10 @@ The integer capabilities:
 * ``PIPE_CAP_DEST_SURFACE_SRGB_CONTROL``: Indicates whether the drivers
  supports switching the format between sRGB and linear for a surface that is
  used as destination in draw and blit calls.
+* ``PIPE_CAP_MAX_VARYINGS``: The maximum number of fragment shader
+  varyings. This will generally correspond to
+  ``PIPE_SHADER_CAP_MAX_INPUTS`` for the fragment shader, but in some
+  cases may be a smaller number.

 .. _pipe_capf:

--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -360,6 +360,9 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
      return 0;

+   case PIPE_CAP_MAX_VARYINGS:
+      return screen->specs.max_varyings;
+
   case PIPE_CAP_PCI_GROUP:
   case PIPE_CAP_PCI_BUS:
   case PIPE_CAP_PCI_DEVICE:
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -23,4 +23,6 @@ libfreedreno_la_SOURCES = \
 	$(a6xx_SOURCES) \
 	$(ir3_SOURCES)

-EXTRA_DIST = meson.build
+EXTRA_DIST = \
+	ir3/ir3_cmdline.c \
+	meson.build
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -339,7 +339,6 @@ clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
 	OUT_RINGP(ring, patch_type, &batch->gmem_patches);
-	OUT_RING(ring, 0);

 	OUT_PKT3(ring, CP_SET_CONSTANT, 4);
 	OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
--- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
@@ -74,7 +74,7 @@ ir2_optimize_loop(nir_shader *s)
 		progress |= OPT(s, nir_opt_dce);
 		progress |= OPT(s, nir_opt_cse);
 		/* progress |= OPT(s, nir_opt_gcm, true); */
-		progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
+		progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true);
 		progress |= OPT(s, nir_opt_intrinsics);
 		progress |= OPT(s, nir_opt_algebraic);
 		progress |= OPT(s, nir_opt_constant_folding);
--- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
@@ -438,7 +438,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info)
 		OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) |
 				 A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) |
 				 A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap));
-		OUT_RELOC(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
+		OUT_RELOCW(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
 		OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch));
 		OUT_RING(ring, 0x00000000);
 		OUT_RING(ring, 0x00000000);
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -317,6 +317,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_VIEWPORTS:
 		return 1;

+	case PIPE_CAP_MAX_VARYINGS:
+		return 16;
+
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
 	/* manage the variants for these ourself, to avoid breaking precompile: */
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -402,6 +402,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
      return 0;
   case PIPE_CAP_ENDIANNESS:
      return PIPE_ENDIAN_LITTLE;
+   case PIPE_CAP_MAX_VARYINGS:
+      return 10;

   case PIPE_CAP_VENDOR_ID:
      return 0x8086;
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -310,6 +310,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
      return 1;
   case PIPE_CAP_CLEAR_TEXTURE:
      return 1;
+   case PIPE_CAP_MAX_VARYINGS:
+      return 32;
   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
   case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
--- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
@@ -543,6 +543,8 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
 $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
 long mov b32 $r3 0x3f800000
 long nop
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long nop
 long ret


@@ -554,7 +556,144 @@ long ret
 // SIZE:    9 * 8 bytes
 //
 gk104_rcp_f64:
-   long nop
+   // Step 1: classify input according to exponent and value, and calculate
+   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
+   // bit 52 (bit 20 of the upper half) and is 11 bits in length
+   ext u32 $r2 $r1 0xb14
+   add b32 $r3 $r2 0xffffffff
+   joinat #rcp_rejoin
+   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
+   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
+   // mean that it's > 0x7fd in those cases when doing unsigned comparison
+   set $p0 0x1 gt u32 $r3 0x7fd
+   // $r3: 0 for norms, 0x36 for denorms, -1 for others
+   long mov b32 $r3 0x0
+   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
+   join (not $p0) nop
+   // Process all special values: NaN, inf, denorm, 0
+   mov b32 $r3 0xffffffff
+   // A number is NaN if its abs value is greater than or unordered with inf
+   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+   (not $p0) bra #rcp_inf_or_denorm_or_zero
+   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
+   // behavior is seen on both the CPU and the blob
+   join or b32 $r1 $r1 0x80000
+rcp_inf_or_denorm_or_zero:
+   and b32 $r4 $r1 0x7ff00000
+   // Other values with nonzero in exponent field should be inf
+   set $p0 0x1 eq s32 $r4 0x0
+   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
+   $p0 bra #rcp_denorm_or_zero
+   // +/-Inf -> +/-0
+   xor b32 $r1 $r1 0x7ff00000
+   join mov b32 $r0 0x0
+rcp_denorm_or_zero:
+   set $p0 0x1 gtu f64 abs $r0d 0x0
+   $p0 bra #rcp_denorm
+   // +/-0 -> +/-Inf
+   join or b32 $r1 $r1 0x7ff00000
+rcp_denorm:
+   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
+   mul rn f64 $r0d $r0d 0x4350000000000000
+   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
+   join mov b32 $r3 0x36
+rcp_rejoin:
+   // All numbers with -1 in $r3 have their result ready in $r0d, return them
+   // others need further calculation
+   set $p0 0x1 lt s32 $r3 0x0
+   $p0 bra #rcp_end
+   // Step 2: Before the real calculation goes on, renormalize the values to
+   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
+   // result in $r6d. The exponent will be recovered later.
+   ext u32 $r2 $r1 0xb14
+   and b32 $r7 $r1 0x800fffff
+   add b32 $r7 $r7 0x3ff00000
+   long mov b32 $r6 $r0
+   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
+   // Step 3: Convert new value to float (no overflow will occur due to step
+   // 2), calculate rcp and do newton-raphson step once
+   cvt rz f32 $r5 f64 $r6d
+   long rcp f32 $r4 $r5
+   mov b32 $r0 0xbf800000
+   fma rn f32 $r5 $r4 $r5 $r0
+   fma rn f32 $r0 neg $r4 $r5 $r4
+   // Step 4: convert result $r0 back to double, do newton-raphson steps
+   cvt f64 $r0d f32 $r0
+   cvt f64 $r6d neg f64 $r6d
+   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
+   cvt f64 $r8d f32 0x3f800000
+   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
+   // The formula used here (and above) is:
+   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
+   // The following code uses 2 FMAs for each step, and it will basically
+   // look like:
+   //     tmp = -src * RCP_{n} + 1
+   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   // Step 5: Exponent recovery and final processing
+   // The exponent is recovered by adding what we added to the exponent.
+   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
+   //     rcp(x) = c * rcp(cx)
+   // The delta in exponent comes from two sources:
+   //   1) The renormalization in step 2. The delta is:
+   //      0x3ff - $r2
+   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
+   //      in $r3
+   // These 2 sources are calculated in the first two lines below, and then
+   // added to the exponent extracted from the result above.
+   // Note that after processing, the new exponent may be >= 0x7ff (inf)
+   // or <= 0 (denorm). Those cases will be handled respectively below
+   subr b32 $r2 $r2 0x3ff
+   long add b32 $r4 $r2 $r3
+   ext u32 $r3 $r1 0xb14
+   // New exponent in $r3
+   long add b32 $r3 $r3 $r4
+   add b32 $r2 $r3 0xffffffff
+   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
+   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
+   // (same logic as in step 1)
+   set $p0 0x1 lt u32 $r2 0x7fe
+   (not $p0) bra #rcp_result_inf_or_denorm
+   // Norms: convert exponents back and return
+   shl b32 $r4 $r4 clamp 0x14
+   long add b32 $r1 $r4 $r1
+   bra #rcp_end
+rcp_result_inf_or_denorm:
+   // New exponent >= 0x7ff means that result is inf
+   set $p0 0x1 ge s32 $r3 0x7ff
+   (not $p0) bra #rcp_result_denorm
+   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
+   // Infinity
+   and b32 $r1 $r1 0x80000000
+   long mov b32 $r0 0x0
+   add b32 $r1 $r1 0x7ff00000
+   bra #rcp_end
+rcp_result_denorm:
+   // Denorm result comes from huge input. The greatest possible fp64, i.e.
+   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
+   // normal value. Other rcp result should be greater than that. If we
+   // set the exponent field to 1, we can recover the result by multiplying
+   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
+   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
+   // the logic here.
+   set $p0 0x1 ne u32 $r3 0x0
+   and b32 $r1 $r1 0x800fffff
+   // 0x3e800000: 1/4
+   $p0 cvt f64 $r6d f32 0x3e800000
+   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
+   // 0x3f000000: 1/2
+   (not $p0) cvt f64 $r6d f32 0x3f000000
+   add b32 $r1 $r1 0x00100000
+   mul rn f64 $r0d $r0d $r6d
+rcp_end:
   long ret

 // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
@@ -565,7 +704,67 @@ gk104_rcp_f64:
 // SIZE:    14 * 8 bytes
 //
 gk104_rsq_f64:
-   long nop
+   // Before getting initial result rsqrt64h, two special cases should be
+   // handled first.
+   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
+   //    as NaN in rsqrt64h
+   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+   $p0 or b32 $r1 $r1 0x00080000
+   and b32 $r2 $r1 0x7fffffff
+   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
+   // 2. denorms and small normal values: using their original value will
+   //    lose precision either at rsqrt64h or the first step in newton-raphson
+   //    steps below. Take 2 as a threshold in exponent field, and multiply
+   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
+   //    to recover in the end)
+   ext u32 $r3 $r1 0xb14
+   set $p1 0x1 le u32 $r3 0x2
+   long or b32 $r2 $r0 $r2
+   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
+   rsqrt64h $r5 $r1
+   // rsqrt64h will give correct result for 0/inf/nan, the following logic
+   // checks whether the input is one of those (exponent is 0x7ff or all 0
+   // except for the sign bit)
+   set b32 $r6 ne u32 $r3 0x7ff
+   long and b32 $r2 $r2 $r6
+   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
+   set $p0 0x1 ne u32 $r2 0x0
+   $p0 bra #rsq_norm
+   // For 0/inf/nan, make sure the sign bit agrees with input and return
+   and b32 $r1 $r1 0x80000000
+   long mov b32 $r0 0x0
+   long or b32 $r1 $r1 $r5
+   long ret
+rsq_norm:
+   // For others, do 4 Newton-Raphson steps with the formula:
+   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+   // In the code below, each step is written as:
+   //     tmp1 = 0.5 * x * RSQ_{n}
+   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
+   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+   long mov b32 $r4 0x0
+   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
+   // 0x3f000000: 1/2
+   cvt f64 $r8d f32 0x3f000000
+   mul rn f64 $r2d $r0d $r8d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
+   // Multiply 2^27 to result for small inputs to recover
+   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
+   long mov b32 $r1 $r5
+   long mov b32 $r0 $r4
   long ret

 //
--- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
@@ -481,12 +481,132 @@ uint64_t gk104_builtin_code[] = {
 	0xd40040000840c785,
 	0x18fe00000000dde2,
 	0x4000000000001de4,
-	0x9000000000001de7,
-/* 0x0f08: gk104_rcp_f64 */
+	0x2000000000000007,
 	0x4000000000001de4,
 	0x9000000000001de7,
-/* 0x0f18: gk104_rsq_f64 */
-	0x4000000000001de4,
+/* 0x0f18: gk104_rcp_f64 */
+	0x7000c02c50109c03,
+	0x0bfffffffc20dc02,
+	0x6000000280000007,
+	0x1a0ec01ff431dc03,
+	0x180000000000dde2,
+	0x228282f2b2d042f7,
+	0x40000000000021f4,
+	0x1bfffffffc00dde2,
+	0x1e0edffc0001dc81,
+	0x40000000200021e7,
+	0x3800200000105c52,
+/* 0x0f70: rcp_inf_or_denorm_or_zero */
+	0x39ffc00000111c02,
+	0x190e0000fc41dc23,
+	0x2202f2b2d2f042b7,
+	0x40000000400001e7,
+	0x39ffc00000105c82,
+	0x1800000000001df2,
+/* 0x0fa0: rcp_denorm_or_zero */
+	0x1e0ec0000001dc81,
+	0x40000000200001e7,
+	0x39ffc00000105c52,
+/* 0x0fb8: rcp_denorm */
+	0x5000d0d400001c01,
+	0x2280428282b282f7,
+	0x18000000d800ddf2,
+/* 0x0fd0: rcp_rejoin */
+	0x188e0000fc31dc23,
+	0x40000006000001e7,
+	0x7000c02c50109c03,
+	0x3a003ffffc11dc02,
+	0x08ffc0000071dc02,
+	0x2800000000019de4,
+	0x22e2b2a2828042b7,
+	0x1006000019a15c04,
+	0xc800000010511c00,
+	0x1afe000000001de2,
+	0x3000000014415c00,
+	0x3008000014401e00,
+	0x1000000001301c04,
+	0x1000000019b19d04,
+	0x22929292929292e7,
+	0x1000cfe001321c04,
+	0x2010000000611c01,
+	0x2000000010001c01,
+	0x2010000000611c01,
+	0x2000000010001c01,
+	0x2010000000611c01,
+	0x2000000010001c01,
+	0x2282828282820297,
+	0x2010000000611c01,
+	0x2000000010001c01,
+	0x0800000ffc209e02,
+	0x480000000c211c03,
+	0x7000c02c5010dc03,
+	0x480000001030dc03,
+	0x0bfffffffc309c02,
+	0x22b28282b282b287,
+	0x188ec01ff821dc03,
+	0x40000000600021e7,
+	0x6000c00050411c03,
+	0x4800000004405c03,
+	0x40000001c0001de7,
+/* 0x10f0: rcp_result_inf_or_denorm */
+	0x1b0ec01ffc31dc23,
+	0x40000000a00021e7,
+	0x22f25232b2825207,
+	0x3a00000000105c02,
+	0x1800000000001de2,
+	0x09ffc00000105c02,
+	0x40000000e0001de7,
+/* 0x1128: rcp_result_denorm */
+	0x1a8e0000fc31dc03,
+	0x3a003ffffc105c02,
+	0x1000cfa001318004,
+	0x227202a2e2c282f7,
+	0x1000cfc00131a004,
+	0x0800400000105c02,
+	0x5000000018001c01,
+/* 0x1160: rcp_end */
+	0x9000000000001de7,
+/* 0x1168: gk104_rsq_f64 */
+	0x1e0edffc0001dc81,
+	0x3800200000104042,
+	0x39fffffffc109c02,
+	0x22828252c2820277,
+	0x7000c02c5010dc03,
+	0x198ec0000833dc03,
+	0x6800000008009c43,
+	0x5000d0d400000401,
+	0xc80000001c115c00,
+	0x128ec01ffc319c03,
+	0x6800000018209c03,
+	0x2282e2827202b287,
+	0x1a8e0000fc21dc03,
+	0x40000000800001e7,
+	0x3a00000000105c02,
+	0x1800000000001de2,
+	0x6800000014105c43,
+	0x9000000000001de7,
+/* 0x11f8: rsq_norm */
+	0x1800000000011de2,
+	0x22929292929292f7,
+	0x1000cfc001321c04,
+	0x5000000020009c01,
+	0x5000000010201c01,
+	0x2010000000419e01,
+	0x2008000018411c01,
+	0x5000000010201c01,
+	0x2010000000419e01,
+	0x2292929292929297,
+	0x2008000018411c01,
+	0x5000000010201c01,
+	0x2010000000419e01,
+	0x2008000018411c01,
+	0x5000000010201c01,
+	0x2010000000419e01,
+	0x2008000018411c01,
+	0x20000002e2820297,
+	0x5000d06800410401,
+	0x2800000014005de4,
+	0x2800000010001de4,
 	0x9000000000001de7,
 	0xc800000003f01cc5,
 	0x2c00000100005c04,
@@ -495,7 +615,7 @@ uint64_t gk104_builtin_code[] = {
 	0x680100000c1fdc03,
 	0x4000000a60001c47,
 	0x180000004000dde2,
-/* 0x0f60: spill_cfstack */
+/* 0x12e0: spill_cfstack */
 	0x78000009c0000007,
 	0x0c0000000430dd02,
 	0x4003ffffa0001ca7,
@@ -543,14 +663,14 @@ uint64_t gk104_builtin_code[] = {
 	0x4000000100001ea7,
 	0x480100000c001c03,
 	0x0800000000105c42,
-/* 0x10d8: shared_loop */
+/* 0x1458: shared_loop */
 	0xc100000000309c85,
 	0x9400000500009c85,
 	0x0c00000010001d02,
 	0x0800000000105d42,
 	0x0c0000001030dd02,
 	0x4003ffff40001ca7,
-/* 0x1108: shared_done */
+/* 0x1488: shared_done */
 	0x2800406420001de4,
 	0x2800406430005de4,
 	0xe000000000001c45,
@@ -564,7 +684,7 @@ uint64_t gk104_builtin_code[] = {
 	0x480000000c209c03,
 	0x4801000008001c03,
 	0x0800000000105c42,
-/* 0x1170: search_cstack */
+/* 0x14f0: search_cstack */
 	0x280040646000dde4,
 	0x8400000020009f05,
 	0x190ec0002821dc03,
@@ -573,17 +693,17 @@ uint64_t gk104_builtin_code[] = {
 	0x0800000000105c42,
 	0x0c0000004030dd02,
 	0x00029dff0ffc5cbf,
-/* 0x11b0: entry_found */
+/* 0x1530: entry_found */
 	0x8400000000009f85,
 	0x2800406400001de4,
 	0x2800406410005de4,
 	0x9400000010009c85,
 	0x4000000000001df4,
-/* 0x11d8: end_exit */
+/* 0x1558: end_exit */
 	0x9800000003ffdcc5,
 	0xd000000000008007,
 	0xa000000000004007,
-/* 0x11f0: end_cont */
+/* 0x1570: end_cont */
 	0xd000000000008007,
 	0x3400c3fffc201c04,
 	0xc000000003f01ec5,
@@ -593,6 +713,6 @@ uint64_t gk104_builtin_code[] = {
 uint64_t gk104_builtin_offsets[] = {
 	0x0000000000000000,
 	0x00000000000000f0,
-	0x0000000000000f08,
 	0x0000000000000f18,
+	0x0000000000001168,
 };
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -83,12 +83,229 @@ gk110_div_s32:
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p3 cvt s32 $r0 neg s32 $r0
-   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
+   sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
   $p2 cvt s32 $r1 neg s32 $r1
   ret

+// RCP F64
+//
+// INPUT:   $r0d
+// OUTPUT:  $r0d
+// CLOBBER: $r2 - $r9, $p0
+//
+// The core of RCP and RSQ implementation is Newton-Raphson step, which is
+// used to find successively better approximation from an imprecise initial
+// value (single precision rcp in RCP and rsqrt64h in RSQ).
+//
 gk110_rcp_f64:
+   // Step 1: classify input according to exponent and value, and calculate
+   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
+   // bit 52 (bit 20 of the upper half) and is 11 bits in length
+   ext u32 $r2 $r1 0xb14
+   add b32 $r3 $r2 0xffffffff
+   joinat #rcp_rejoin
+   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
+   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
+   // mean that it's > 0x7fd in those cases when doing unsigned comparison
+   set b32 $p0 0x1 gt u32 $r3 0x7fd
+   // $r3: 0 for norms, 0x36 for denorms, -1 for others
+   mov b32 $r3 0x0
+   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
+   join (not $p0) nop
+   // Process all special values: NaN, inf, denorm, 0
+   mov b32 $r3 0xffffffff
+   // A number is NaN if its abs value is greater than or unordered with inf
+   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+   (not $p0) bra #rcp_inf_or_denorm_or_zero
+   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
+   // behavior is seen on both the CPU and the blob
+   join or b32 $r1 $r1 0x80000
+rcp_inf_or_denorm_or_zero:
+   and b32 $r4 $r1 0x7ff00000
+   // Other values with nonzero in exponent field should be inf
+   set b32 $p0 0x1 eq s32 $r4 0x0
+   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
+   $p0 bra #rcp_denorm_or_zero
+   // +/-Inf -> +/-0
+   xor b32 $r1 $r1 0x7ff00000
+   join mov b32 $r0 0x0
+rcp_denorm_or_zero:
+   set $p0 0x1 gtu f64 abs $r0d 0x0
+   $p0 bra #rcp_denorm
+   // +/-0 -> +/-Inf
+   join or b32 $r1 $r1 0x7ff00000
+rcp_denorm:
+   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
+   mul rn f64 $r0d $r0d 0x4350000000000000
+   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
+   join mov b32 $r3 0x36
+rcp_rejoin:
+   // All numbers with -1 in $r3 have their result ready in $r0d, return them
+   // others need further calculation
+   set b32 $p0 0x1 lt s32 $r3 0x0
+   $p0 bra #rcp_end
+   // Step 2: Before the real calculation goes on, renormalize the values to
+   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
+   // result in $r6d. The exponent will be recovered later.
+   ext u32 $r2 $r1 0xb14
+   and b32 $r7 $r1 0x800fffff
+   add b32 $r7 $r7 0x3ff00000
+   mov b32 $r6 $r0
+   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
+   // Step 3: Convert new value to float (no overflow will occur due to step
+   // 2), calculate rcp and do newton-raphson step once
+   cvt rz f32 $r5 f64 $r6d
+   rcp f32 $r4 $r5
+   mov b32 $r0 0xbf800000
+   fma rn f32 $r5 $r4 $r5 $r0
+   fma rn f32 $r0 neg $r4 $r5 $r4
+   // Step 4: convert result $r0 back to double, do newton-raphson steps
+   cvt f64 $r0d f32 $r0
+   cvt f64 $r6d f64 neg $r6d
+   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
+   cvt f64 $r8d f32 0x3f800000
+   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
+   // The formula used here (and above) is:
+   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
+   // The following code uses 2 FMAs for each step, and it will basically
+   // look like:
+   //     tmp = -src * RCP_{n} + 1
+   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
+   fma rn f64 $r4d $r6d $r0d $r8d
+   fma rn f64 $r0d $r0d $r4d $r0d
+   // Step 5: Exponent recovery and final processing
+   // The exponent is recovered by adding what we added to the exponent.
+   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
+   //     rcp(x) = c * rcp(cx)
+   // The delta in exponent comes from two sources:
+   //   1) The renormalization in step 2. The delta is:
+   //      0x3ff - $r2
+   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
+   //      in $r3
+   // These 2 sources are calculated in the first two lines below, and then
+   // added to the exponent extracted from the result above.
+   // Note that after processing, the new exponent may be >= 0x7ff (inf)
+   // or <= 0 (denorm). Those cases will be handled respectively below
+   subr b32 $r2 $r2 0x3ff
+   add b32 $r4 $r2 $r3
+   ext u32 $r3 $r1 0xb14
+   // New exponent in $r3
+   add b32 $r3 $r3 $r4
+   add b32 $r2 $r3 0xffffffff
+   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
+   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
+   // (same logic as in step 1)
+   set b32 $p0 0x1 lt u32 $r2 0x7fe
+   (not $p0) bra #rcp_result_inf_or_denorm
+   // Norms: convert exponents back and return
+   shl b32 $r4 $r4 clamp 0x14
+   add b32 $r1 $r4 $r1
+   bra #rcp_end
+rcp_result_inf_or_denorm:
+   // New exponent >= 0x7ff means that result is inf
+   set b32 $p0 0x1 ge s32 $r3 0x7ff
+   (not $p0) bra #rcp_result_denorm
+   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
+   // Infinity
+   and b32 $r1 $r1 0x80000000
+   mov b32 $r0 0x0
+   add b32 $r1 $r1 0x7ff00000
+   bra #rcp_end
+rcp_result_denorm:
+   // Denorm result comes from huge input. The greatest possible fp64, i.e.
+   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
+   // normal value. Other rcp result should be greater than that. If we
+   // set the exponent field to 1, we can recover the result by multiplying
+   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
+   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
+   // the logic here.
+   set b32 $p0 0x1 ne u32 $r3 0x0
+   and b32 $r1 $r1 0x800fffff
+   // 0x3e800000: 1/4
+   $p0 cvt f64 $r6d f32 0x3e800000
+   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
+   // 0x3f000000: 1/2
+   (not $p0) cvt f64 $r6d f32 0x3f000000
+   add b32 $r1 $r1 0x00100000
+   mul rn f64 $r0d $r0d $r6d
+rcp_end:
+   ret
+
+// RSQ F64
+//
+// INPUT:   $r0d
+// OUTPUT:  $r0d
+// CLOBBER: $r2 - $r9, $p0 - $p1
+//
 gk110_rsq_f64:
+   // Before getting initial result rsqrt64h, two special cases should be
+   // handled first.
+   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
+   //    as NaN in rsqrt64h
+   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+   $p0 or b32 $r1 $r1 0x00080000
+   and b32 $r2 $r1 0x7fffffff
+   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
+   // 2. denorms and small normal values: using their original value will
+   //    lose precision either at rsqrt64h or the first step in newton-raphson
+   //    steps below. Take 2 as a threshold in exponent field, and multiply
+   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
+   //    to recover in the end)
+   ext u32 $r3 $r1 0xb14
+   set b32 $p1 0x1 le u32 $r3 0x2
+   or b32 $r2 $r0 $r2
+   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
+   rsqrt64h f32 $r5 $r1
+   // rsqrt64h will give correct result for 0/inf/nan, the following logic
+   // checks whether the input is one of those (exponent is 0x7ff or all 0
+   // except for the sign bit)
+   set b32 $r6 ne u32 $r3 0x7ff
+   and b32 $r2 $r2 $r6
+   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
+   set b32 $p0 0x1 ne u32 $r2 0x0
+   $p0 bra #rsq_norm
+   // For 0/inf/nan, make sure the sign bit agrees with input and return
+   and b32 $r1 $r1 0x80000000
+   mov b32 $r0 0x0
+   or b32 $r1 $r1 $r5
+   ret
+rsq_norm:
+   // For others, do 4 Newton-Raphson steps with the formula:
+   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+   // In the code below, each step is written as:
+   //     tmp1 = 0.5 * x * RSQ_{n}
+   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
+   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+   mov b32 $r4 0x0
+   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
+   // 0x3f000000: 1/2
+   cvt f64 $r8d f32 0x3f000000
+   mul rn f64 $r2d $r0d $r8d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
+   // Multiply 2^27 to result for small inputs to recover
+   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
+   mov b32 $r1 $r5
+   mov b32 $r0 $r4
   ret

 .section #gk110_builtin_offsets
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
@@ -65,11 +65,132 @@ uint64_t gk110_builtin_code[] = {
 	0xe088000001000406,
 	0x4000000000800001,
 	0xe6010000000ce802,
-	0x08b08010a010b810,
+	0x08a0a0a010a0b810,
 	0xe60100000088e806,
 	0x19000000001c003c,
 /* 0x0218: gk110_rcp_f64 */
-/* 0x0218: gk110_rsq_f64 */
+	0xc00000058a1c0409,
+	0x407fffffff9c080d,
+	0x1480000050000000,
+	0xb3401c03fe9c0c1d,
+	0xe4c03c007f9c000e,
+	0x08a0a0bcacb410bc,
+	0x8580000000603c02,
+	0x747fffffff9fc00e,
+	0xb4601fff801c021d,
+	0x120000000420003c,
+	0x21000400005c0404,
+/* 0x0270: rcp_inf_or_denorm_or_zero */
+	0x203ff800001c0410,
+	0xb3281c00001c101d,
+	0x0880bcacb4bc10ac,
+	0x120000000800003c,
+	0x223ff800001c0404,
+	0xe4c03c007fdc0002,
+/* 0x02a0: rcp_denorm_or_zero */
+	0xb4601c00001c021d,
+	0x120000000400003c,
+	0x213ff800005c0404,
+/* 0x02b8: rcp_denorm */
+	0xc400021a801c0001,
+	0x08a010a0a0aca0bc,
+	0x740000001b5fc00e,
+/* 0x02d0: rcp_rejoin */
+	0xb3181c00001c0c1d,
+	0x12000000c000003c,
+	0xc00000058a1c0409,
+	0x204007ffff9c041c,
+	0x401ff800001c1c1d,
+	0xe4c03c00001c001a,
+	0x08b8aca8a0a010ac,
+	0xe5400c00031c3816,
+	0x84000000021c1412,
+	0x745fc000001fc002,
+	0xcc000000029c1016,
+	0xcc081000029c1002,
+	0xe5400000001c2c02,
+	0xe5410000031c3c1a,
+	0x08a4a4a4a4a4a4b8,
+	0xc54001fc001c2c21,
+	0xdb802000001c1812,
+	0xdb800000021c0002,
+	0xdb802000001c1812,
+	0xdb800000021c0002,
+	0xdb802000001c1812,
+	0xdb800000021c0002,
+	0x08a0a0a0a0a080a4,
+	0xdb802000001c1812,
+	0xdb800000021c0002,
+	0x48000001ff9c0809,
+	0xe0800000019c0812,
+	0xc00000058a1c040d,
+	0xe0800000021c0c0e,
+	0x407fffffff9c0c09,
+	0x08aca0a0aca0aca0,
+	0xb3101c03ff1c081d,
+	0x120000000c20003c,
+	0xc24000000a1c1011,
+	0xe0800000009c1006,
+	0x12000000381c003c,
+/* 0x03f0: rcp_result_inf_or_denorm */
+	0xb3681c03ff9c0c1d,
+	0x120000001420003c,
+	0x08bc948caca09480,
+	0x20400000001c0404,
+	0xe4c03c007f9c0002,
+	0x403ff800001c0405,
+	0x120000001c1c003c,
+/* 0x0428: rcp_result_denorm */
+	0xb3501c00001c0c1d,
+	0x204007ffff9c0404,
+	0xc54001f400002c19,
+	0x089c80a8b8b0a0bc,
+	0xc54001f800202c19,
+	0x40000800001c0405,
+	0xe4000000031c0002,
+/* 0x0460: rcp_end */
+	0x19000000001c003c,
+/* 0x0468: gk110_rsq_f64 */
+	0xb4601fff801c021d,
+	0x2100040000000404,
+	0x203fffffff9c0408,
+	0x08a0a094b0a0809c,
+	0xc00000058a1c040d,
+	0xb3301c00011c0c3d,
+	0xe2001000011c000a,
+	0xc400021a80040001,
+	0x84000000039c0416,
+	0xb2d01c03ff9c0c19,
+	0xe2000000031c080a,
+	0x08a0b8a09c80aca0,
+	0xb3501c00001c081d,
+	0x120000001000003c,
+	0x20400000001c0404,
+	0xe4c03c007f9c0002,
+	0xe2001000029c0406,
+	0x19000000001c003c,
+/* 0x04f8: rsq_norm */
+	0xe4c03c007f9c0012,
+	0x08a4a4a4a4a4a4bc,
+	0xc54001f8001c2c21,
+	0xe4000000041c000a,
+	0xe4000000021c0802,
+	0xdb882000001c101a,
+	0xdb801000031c1012,
+	0xe4000000021c0802,
+	0xdb882000001c101a,
+	0x08a4a4a4a4a4a4a4,
+	0xdb801000031c1012,
+	0xe4000000021c0802,
+	0xdb882000001c101a,
+	0xdb801000031c1012,
+	0xe4000000021c0802,
+	0xdb882000001c101a,
+	0xdb801000031c1012,
+	0x08000000b8a080a4,
+	0xc400020d00041011,
+	0xe4c03c00029c0006,
+	0xe4c03c00021c0002,
 	0x19000000001c003c,
 };

@@ -77,5 +198,5 @@ uint64_t gk110_builtin_offsets[] = {
 	0x0000000000000000,
 	0x00000000000000f0,
 	0x0000000000000218,
-	0x0000000000000218,
+	0x0000000000000468,
 };
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
@@ -100,10 +100,253 @@ gm107_div_s32:
   ret
   nop 0

-// STUB
+// RCP F64
+//
+// INPUT:   $r0d
+// OUTPUT:  $r0d
+// CLOBBER: $r2 - $r9, $p0
+//
+// The core of the RCP and RSQ implementations is the Newton-Raphson step,
+// which is used to find successively better approximations from an imprecise
+// initial value (single precision rcp in RCP and rsqrt64h in RSQ).
+//
 gm107_rcp_f64:
-gm107_rsq_f64:
+   // Step 1: classify input according to exponent and value, and calculate
+   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
+   // bit 52 (bit 20 of the upper half) and is 11 bits in length
   sched (st 0x0) (st 0x0) (st 0x0)
+   bfe u32 $r2 $r1 0xb14
+   iadd32i $r3 $r2 -1
+   ssy #rcp_rejoin
+   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
+   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
+   // mean that it's > 0x7fd in those cases when doing unsigned comparison
+   sched (st 0x0) (st 0x0) (st 0x0)
+   isetp gt u32 and $p0 1 $r3 0x7fd 1
+   // $r3: 0 for norms, 0x36 for denorms, -1 for others
+   mov $r3 0x0 0xf
+   not $p0 sync
+   // Process all special values: NaN, inf, denorm, 0
+   sched (st 0x0) (st 0x0) (st 0x0)
+   mov32i $r3 0xffffffff 0xf
+   // A number is NaN if its abs value is greater than or unordered with inf
+   dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
+   not $p0 bra #rcp_inf_or_denorm_or_zero
+   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
+   // behavior is both seen on the CPU and the blob
+   sched (st 0x0) (st 0x0) (st 0x0)
+   lop32i or $r1 $r1 0x80000
+   sync
+rcp_inf_or_denorm_or_zero:
+   lop32i and $r4 $r1 0x7ff00000
+   sched (st 0x0) (st 0x0) (st 0x0)
+   // Other values with nonzero in exponent field should be inf
+   isetp eq and $p0 1 $r4 0x0 1
+   $p0 bra #rcp_denorm_or_zero
+   // +/-Inf -> +/-0
+   lop32i xor $r1 $r1 0x7ff00000
+   sched (st 0x0) (st 0x0) (st 0x0)
+   mov $r0 0x0 0xf
+   sync
+rcp_denorm_or_zero:
+   dsetp gtu and $p0 1 abs $r0 0x0 1
+   sched (st 0x0) (st 0x0) (st 0x0)
+   $p0 bra #rcp_denorm
+   // +/-0 -> +/-Inf
+   lop32i or $r1 $r1 0x7ff00000
+   sync
+rcp_denorm:
+   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
+   sched (st 0x0) (st 0x0) (st 0x0)
+   dmul $r0 $r0 0x4350000000000000
+   mov $r3 0x36 0xf
+   sync
+rcp_rejoin:
+   // All numbers with -1 in $r3 have their result ready in $r0d, return them
+   // others need further calculation
+   sched (st 0x0) (st 0x0) (st 0x0)
+   isetp lt and $p0 1 $r3 0x0 1
+   $p0 bra #rcp_end
+   // Step 2: Before the real calculation goes on, renormalize the values to
+   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
+   // result in $r6d. The exponent will be recovered later.
+   bfe u32 $r2 $r1 0xb14
+   sched (st 0x0) (st 0x0) (st 0x0)
+   lop32i and $r7 $r1 0x800fffff
+   iadd32i $r7 $r7 0x3ff00000
+   mov $r6 $r0 0xf
+   // Step 3: Convert new value to float (no overflow will occur due to step
+   // 2), calculate rcp and do newton-raphson step once
+   sched (st 0x0) (st 0x0) (st 0x0)
+   f2f ftz f64 f32 $r5 $r6
+   mufu rcp $r4 $r5
+   mov32i $r0 0xbf800000 0xf
+   sched (st 0x0) (st 0x0) (st 0x0)
+   ffma $r5 $r4 $r5 $r0
+   ffma $r0 $r5 neg $r4 $r4
+   // Step 4: convert result $r0 back to double, do newton-raphson steps
+   f2f f32 f64 $r0 $r0
+   sched (st 0x0) (st 0x0) (st 0x0)
+   f2f f64 f64 $r6 neg $r6
+   f2f f32 f64 $r8 0x3f800000
+   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
+   // The formula used here (and above) is:
+   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
+   // The following code uses 2 FMAs for each step, and it will basically
+   // look like:
+   //     tmp = -src * RCP_{n} + 1
+   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
+   dfma $r4 $r6 $r0 $r8
+   sched (st 0x0) (st 0x0) (st 0x0)
+   dfma $r0 $r0 $r4 $r0
+   dfma $r4 $r6 $r0 $r8
+   dfma $r0 $r0 $r4 $r0
+   sched (st 0x0) (st 0x0) (st 0x0)
+   dfma $r4 $r6 $r0 $r8
+   dfma $r0 $r0 $r4 $r0
+   dfma $r4 $r6 $r0 $r8
+   sched (st 0x0) (st 0x0) (st 0x0)
+   dfma $r0 $r0 $r4 $r0
+   // Step 5: Exponent recovery and final processing
+   // The exponent is recovered by adding what we added to the exponent.
+   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
+   //     rcp(x) = c * rcp(cx)
+   // The delta in exponent comes from two sources:
+   //   1) The renormalization in step 2. The delta is:
+   //      0x3ff - $r2
+   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
+   //      in $r3
+   // These 2 sources are calculated in the first two lines below, and then
+   // added to the exponent extracted from the result above.
+   // Note that after processing, the new exponent may be >= 0x7ff (inf)
+   // or <= 0 (denorm). Those cases will be handled respectively below
+   iadd $r2 neg $r2 0x3ff
+   iadd $r4 $r2 $r3
+   sched (st 0x0) (st 0x0) (st 0x0)
+   bfe u32 $r3 $r1 0xb14
+   // New exponent in $r3
+   iadd $r3 $r3 $r4
+   iadd32i $r2 $r3 -1
+   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
+   // (same logic as in step 1)
+   sched (st 0x0) (st 0x0) (st 0x0)
+   isetp lt u32 and $p0 1 $r2 0x7fe 1
+   not $p0 bra #rcp_result_inf_or_denorm
+   // Norms: convert exponents back and return
+   shl $r4 $r4 0x14
+   sched (st 0x0) (st 0x0) (st 0x0)
+   iadd $r1 $r4 $r1
+   bra #rcp_end
+rcp_result_inf_or_denorm:
+   // New exponent >= 0x7ff means that result is inf
+   isetp ge and $p0 1 $r3 0x7ff 1
+   sched (st 0x0) (st 0x0) (st 0x0)
+   not $p0 bra #rcp_result_denorm
+   // Infinity
+   lop32i and $r1 $r1 0x80000000
+   mov $r0 0x0 0xf
+   sched (st 0x0) (st 0x0) (st 0x0)
+   iadd32i $r1 $r1 0x7ff00000
+   bra #rcp_end
+rcp_result_denorm:
+   // Denorm result comes from huge input. The greatest possible fp64, i.e.
+   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
+   // normal value. Other rcp results should be greater than that. If we
+   // set the exponent field to 1, we can recover the result by multiplying
+   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
+   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
+   // the logic here.
+   isetp ne u32 and $p0 1 $r3 0x0 1
+   sched (st 0x0) (st 0x0) (st 0x0)
+   lop32i and $r1 $r1 0x800fffff
+   // 0x3e800000: 1/4
+   $p0 f2f f32 f64 $r6 0x3e800000
+   // 0x3f000000: 1/2
+   not $p0 f2f f32 f64 $r6 0x3f000000
+   sched (st 0x0) (st 0x0) (st 0x0)
+   iadd32i $r1 $r1 0x00100000
+   dmul $r0 $r0 $r6
+rcp_end:
+   ret
+
+// RSQ F64
+//
+// INPUT:   $r0d
+// OUTPUT:  $r0d
+// CLOBBER: $r2 - $r9, $p0 - $p1
+//
+gm107_rsq_f64:
+   // Before getting initial result rsqrt64h, two special cases should be
+   // handled first.
+   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
+   //    as NaN in rsqrt64h
+   sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd)
+   dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
+   $p0 lop32i or $r1 $r1 0x00080000
+   lop32i and $r2 $r1 0x7fffffff
+   // 2. denorms and small normal values: using their original value will
+   //    lose precision either at rsqrt64h or the first step in newton-raphson
+   //    steps below. Take 2 as a threshold in exponent field, and multiply
+   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
+   //    to recover in the end)
+   sched (st 0xd) (st 0xd) (st 0xd)
+   bfe u32 $r3 $r1 0xb14
+   isetp le u32 and $p1 1 $r3 0x2 1
+   lop or 1 $r2 $r0 $r2
+   sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd)
+   $p1 dmul $r0 $r0 0x4350000000000000
+   mufu rsq64h $r5 $r1
+   // rsqrt64h will give correct result for 0/inf/nan, the following logic
+   // checks whether the input is one of those (exponent is 0x7ff or all 0
+   // except for the sign bit)
+   iset ne u32 and $r6 $r3 0x7ff 1
+   sched (st 0xd) (st 0xd) (st 0xd)
+   lop and 1 $r2 $r2 $r6
+   isetp ne u32 and $p0 1 $r2 0x0 1
+   $p0 bra #rsq_norm
+   // For 0/inf/nan, make sure the sign bit agrees with input and return
+   sched (st 0xd) (st 0xd) (st 0xd wt 0x1)
+   lop32i and $r1 $r1 0x80000000
+   mov $r0 0x0 0xf
+   lop or 1 $r1 $r1 $r5
+   sched (st 0xd) (st 0xf) (st 0xf)
+   ret
+   nop 0
+   nop 0
+rsq_norm:
+   // For others, do 4 Newton-Raphson steps with the formula:
+   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+   // In the code below, each step is written as:
+   //     tmp1 = 0.5 * x * RSQ_{n}
+   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
+   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+   sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3)
+   mov $r4 0x0 0xf
+   // 0x3f000000: 1/2
+   f2f f32 f64 $r8 0x3f000000
+   dmul $r2 $r0 $r8
+   sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   // Multiply the result by 2^27 for small inputs to recover
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd)
+   $p1 dmul $r4 $r4 0x41a0000000000000
+   mov $r1 $r5 0xf
+   mov $r0 $r4 0xf
+   sched (st 0xd) (st 0xf) (st 0xf)
   ret
   nop 0
   nop 0
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
@@ -82,8 +82,156 @@ uint64_t gm107_builtin_code[] = {
 	0xe32000000007000f,
 	0x50b0000000070f00,
 /* 0x0280: gm107_rcp_f64 */
-/* 0x0280: gm107_rsq_f64 */
 	0x001f8000fc0007e0,
+	0x38000000b1470102,
+	0x1c0ffffffff70203,
+	0xe29000000e000000,
+	0x001f8000fc0007e0,
+	0x366803807fd70307,
+	0x5c9807800ff70003,
+	0xf0f800000008000f,
+	0x001f8000fc0007e0,
+	0x010ffffffff7f003,
+	0x368c03fff0070087,
+	0xe24000000188000f,
+	0x001f8000fc0007e0,
+	0x0420008000070101,
+	0xf0f800000007000f,
+/* 0x02f8: rcp_inf_or_denorm_or_zero */
+	0x0407ff0000070104,
+	0x001f8000fc0007e0,
+	0x5b6503800ff70407,
+	0xe24000000200000f,
+	0x0447ff0000070101,
+	0x001f8000fc0007e0,
+	0x5c9807800ff70000,
+	0xf0f800000007000f,
+/* 0x0338: rcp_denorm_or_zero */
+	0x5b8c03800ff70087,
+	0x001f8000fc0007e0,
+	0xe24000000100000f,
+	0x0427ff0000070101,
+	0xf0f800000007000f,
+/* 0x0360: rcp_denorm */
+	0x001f8000fc0007e0,
+	0x3880004350070000,
+	0x3898078003670003,
+	0xf0f800000007000f,
+/* 0x0380: rcp_rejoin */
+	0x001f8000fc0007e0,
+	0x5b6303800ff70307,
+	0xe24000001c00000f,
+	0x38000000b1470102,
+	0x001f8000fc0007e0,
+	0x040800fffff70107,
+	0x1c03ff0000070707,
+	0x5c98078000070006,
+	0x001f8000fc0007e0,
+	0x5ca8100000670e05,
+	0x5080000000470504,
+	0x010bf8000007f000,
+	0x001f8000fc0007e0,
+	0x5980000000570405,
+	0x5981020000470500,
+	0x5ca8000000070b00,
+	0x001f8000fc0007e0,
+	0x5ca8200000670f06,
+	0x38a8003f80070b08,
+	0x5b70040000070604,
+	0x001f8000fc0007e0,
+	0x5b70000000470000,
+	0x5b70040000070604,
+	0x5b70000000470000,
+	0x001f8000fc0007e0,
+	0x5b70040000070604,
+	0x5b70000000470000,
+	0x5b70040000070604,
+	0x001f8000fc0007e0,
+	0x5b70000000470000,
+	0x381200003ff70202,
+	0x5c10000000370204,
+	0x001f8000fc0007e0,
+	0x38000000b1470103,
+	0x5c10000000470303,
+	0x1c0ffffffff70302,
+	0x001f8000fc0007e0,
+	0x366203807fe70207,
+	0xe24000000208000f,
+	0x3848000001470404,
+	0x001f8000fc0007e0,
+	0x5c10000000170401,
+	0xe24000000807000f,
+/* 0x04d8: rcp_result_inf_or_denorm */
+	0x366d03807ff70307,
+	0x001f8000fc0007e0,
+	0xe24000000288000f,
+	0x0408000000070101,
+	0x5c9807800ff70000,
+	0x001f8000fc0007e0,
+	0x1c07ff0000070101,
+	0xe24000000407000f,
+/* 0x0518: rcp_result_denorm */
+	0x5b6a03800ff70307,
+	0x001f8000fc0007e0,
+	0x040800fffff70101,
+	0x38a8003e80000b06,
+	0x38a8003f00080b06,
+	0x001f8000fc0007e0,
+	0x1c00010000070101,
+	0x5c80000000670000,
+/* 0x0558: rcp_end */
+	0xe32000000007000f,
+/* 0x0560: gm107_rsq_f64 */
+	0x001fb401fda1ff0d,
+	0x368c03fff0070087,
+	0x0420008000000101,
+	0x0407fffffff70102,
+	0x001fb400fda007ed,
+	0x38000000b1470103,
+	0x366603800027030f,
+	0x5c47020000270002,
+	0x001fb401e1a0070d,
+	0x3880004350010000,
+	0x5080000000770105,
+	0x365a03807ff70306,
+	0x001fb400fda007ed,
+	0x5c47000000670202,
+	0x5b6a03800ff70207,
+	0xe24000000400000f,
+	0x003fb400fda007ed,
+	0x0408000000070101,
+	0x5c9807800ff70000,
+	0x5c47020000570101,
+	0x001fbc00fde007ed,
+	0xe32000000007000f,
+	0x50b0000000070f00,
+	0x50b0000000070f00,
+/* 0x0620: rsq_norm */
+	0x0060b400e5a007ed,
+	0x5c9807800ff70004,
+	0x38a8003f00070b08,
+	0x5c80000000870002,
+	0x003c3401e1a01f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x003c3401e1a00f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x003c3401e1a00f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x003c3401e1a00f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x001fb401fda00f0d,
+	0x38800041a0010404,
+	0x5c98078000570001,
+	0x5c98078000470000,
+	0x001fbc00fde007ed,
 	0xe32000000007000f,
 	0x50b0000000070f00,
 	0x50b0000000070f00,
@@ -93,5 +241,5 @@ uint64_t gm107_builtin_offsets[] = {
 	0x0000000000000000,
 	0x0000000000000120,
 	0x0000000000000280,
-	0x0000000000000280,
+	0x0000000000000560,
 };
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch)
   binSize = 0;

   maxGPR = -1;
+   fp64 = false;

   main = new Function(this, "MAIN", ~0);
   calls.insert(&main->call);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -1311,6 +1311,7 @@ public:
   uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL

   int maxGPR;
+   bool fp64;

   MemoryPool mem_Instruction;
   MemoryPool mem_CmpInstruction;
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1087,6 +1087,8 @@ public:
   };
   std::vector<MemoryFile> memoryFiles;

+   std::vector<bool> bufferAtomics;
+
 private:
   int inferSysValDirection(unsigned sn) const;
   bool scanDeclaration(const struct tgsi_full_declaration *);
@@ -1137,6 +1139,7 @@ bool Source::scanSource()
   //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
   tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1);
   memoryFiles.resize(scan.file_max[TGSI_FILE_MEMORY] + 1);
+   bufferAtomics.resize(scan.file_max[TGSI_FILE_BUFFER] + 1);

   info->immd.bufSize = 0;

@@ -1483,11 +1486,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
         tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair(
                                                   first, last - first + 1)));
      break;
+   case TGSI_FILE_BUFFER:
+      for (i = first; i <= last; ++i)
+         bufferAtomics[i] = decl->Declaration.Atomic;
+      break;
   case TGSI_FILE_ADDRESS:
   case TGSI_FILE_CONSTANT:
   case TGSI_FILE_IMMEDIATE:
   case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_BUFFER:
   case TGSI_FILE_IMAGE:
      break;
   default:
@@ -2720,7 +2726,11 @@ Converter::handleLOAD(Value *dst0[4])
         }

         Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
-         ld->cache = tgsi.getCacheMode();
+         if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER &&
+             code->bufferAtomics[r])
+            ld->cache = nv50_ir::CACHE_CG;
+         else
+            ld->cache = tgsi.getCacheMode();
         if (ind)
            ld->setIndirect(0, 1, ind);
      }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -83,6 +83,38 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
   delete_Instruction(prog, i);
 }

+void
+NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
+{ // Replaces i (OP_RCP, otherwise treated as RSQ) with an OP_CALL to a builtin.
+   FlowInstruction *call;
+   Value *def[2];
+   int builtin;
+
+   def[0] = bld.mkMovToReg(0, src[0])->getDef(0); // src[0]/src[1] go to regs 0/1;
+   def[1] = bld.mkMovToReg(1, src[1])->getDef(0); // these def[] values are overwritten below
+
+   if (i->op == OP_RCP)
+      builtin = NVC0_BUILTIN_RCP_F64;
+   else
+      builtin = NVC0_BUILTIN_RSQ_F64;
+
+   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
+   def[0] = bld.getSSA(); // fresh values that receive the call result
+   def[1] = bld.getSSA();
+   bld.mkMovFromReg(def[0], 0); // result is read back from regs 0/1
+   bld.mkMovFromReg(def[1], 1);
+   bld.mkClobber(FILE_GPR, 0x3fc, 2); // mask 0x3fc = bits 2..9; gm107 builtins list $r2 - $r9 as clobbered
+   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0); // gm107 RSQ lists $p0 - $p1, RCP only $p0
+   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]); // rebuild the 64-bit result into i's def
+
+   call->fixed = 1;
+   call->absolute = call->builtin = 1;
+   call->target.builtin = builtin;
+   delete_Instruction(prog, i); // i is fully replaced by the sequence above
+
+   prog->fp64 = true; // OR'ed into info->io.fp64 in Program::emitBinary
+}
+
 void
 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
 {
@@ -96,6 +128,12 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

+   int chip = prog->getTarget()->getChipset();
+   if (chip >= NVISA_GK104_CHIPSET) {
+      handleRCPRSQLib(i, src);
+      return;
+   }
+
   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();
@@ -1063,22 +1101,6 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
      }
   }

-   if (chipset >= NVISA_GK104_CHIPSET) {
-      //
-      // If TEX requires more than 4 sources, the 2nd register tuple must be
-      // aligned to 4, even if it consists of just a single 4-byte register.
-      //
-      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
-      //
-      int s = i->srcCount(0xff, true);
-      if (s > 4 && s < 7) {
-         if (i->srcExists(s)) // move potential predicate out of the way
-            i->moveSources(s, 7 - s);
-         while (s < 7)
-            i->setSrc(s++, bld.loadImm(NULL, 0));
-      }
-   }
-
   return true;
 }

@@ -1887,7 +1909,8 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
-   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+   const bool array = su->tex.target.isArray() || su->tex.target.isCube();
+   const int arg = dim + array;
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
@@ -1896,6 +1919,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
   Value *bf, *eau, *off;
   Value *addr, *pred;
   Value *ind = su->getIndirectR();
+   Value *y, *z;

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
@@ -1926,34 +1950,42 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
   for (; c < 3; ++c)
      src[c] = zero;

+   if (dim == 2 && !array) {
+      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+      src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
+                          v, bld.loadImm(NULL, 16));
+
+      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
+      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
+         ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
+   }
+
   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
-   if (su->tex.target.isArray() || su->tex.target.isCube()) {
+   if (array) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
+      y = z = zero;
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
-   } else
-   if (dim == 3) {
+   } else {
+      y = src[1];
+      z = src[2];
+
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
-         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+         ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
-         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
-   } else {
-      assert(dim == 2);
-      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
-      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
-         ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
-         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+         ->subOp = array ?
+         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   }

   // calculate effective address part 1
@@ -1966,19 +1998,15 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
-      Value *y = src[1];
-      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
-         y = zero;
-         z = zero;
         break;
      case 2:
-         z = off;
-         if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
-            z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+         if (array) {
+            z = off;
+         } else {
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
@@ -2001,7 +2029,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
-   if (su->tex.target.isArray() || su->tex.target.isCube()) {
+   if (array) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -62,6 +62,7 @@ private:

   // we want to insert calls to the builtin library only after optimization
   void handleDIV(Instruction *); // integer division, modulus
+   void handleRCPRSQLib(Instruction *, Value *[]);
   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
   void handleFTZ(Instruction *);
   void handleSET(CmpInstruction *);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -2341,9 +2341,19 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
            if (!tex->tex.target.isArray() && tex->tex.useOffsets)
               s++;
         }
-         n = tex->srcCount(0xff) - s;
+         n = tex->srcCount(0xff, true) - s;
+         // TODO: Is this necessary? Perhaps just has to be aligned to the
+         // level that the first arg is, not necessarily to 4. This
+         // requirement has not been rigorously verified, as it has been on
+         // Kepler.
+         if (n > 0 && n < 3) {
+            if (tex->srcExists(n + s)) // move potential predicate out of the way
+               tex->moveSources(n + s, 3 - n);
+            while (n < 3)
+               tex->setSrc(s + n++, new_LValue(func, FILE_GPR));
+         }
      } else {
-         s = tex->srcCount(0xff);
+         s = tex->srcCount(0xff, true);
         n = 0;
      }

@@ -2366,14 +2376,18 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
   } else
   if (isTextureOp(tex->op)) {
      int n = tex->srcCount(0xff, true);
-      if (n > 4) {
-         condenseSrcs(tex, 0, 3);
-         if (n > 5) // NOTE: first call modified positions already
-            condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
-      } else
-      if (n > 1) {
-         condenseSrcs(tex, 0, n - 1);
+      int s = n > 4 ? 4 : n;
+      if (n > 4 && n < 7) {
+         if (tex->srcExists(n)) // move potential predicate out of the way
+            tex->moveSources(n, 7 - n);
+
+         while (n < 7)
+            tex->setSrc(n++, new_LValue(func, FILE_GPR));
      }
+      if (s > 1)
+         condenseSrcs(tex, 0, s - 1);
+      if (n > 4)
+         condenseSrcs(tex, 1, n - s);
   }
 }

@@ -2510,6 +2524,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s)
   assert(cst->getSrc(s)->defs.size() == 1); // still SSA

   Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
+
   bool imm = defi->op == OP_MOV &&
      defi->src(0).getFile() == FILE_IMMEDIATE;
   bool load = defi->op == OP_LOAD &&
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -399,6 +399,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
         }
      }
   }
+   info->io.fp64 |= fp64;
   info->bin.relocData = emit->getRelocInfo();
   info->bin.fixupData = emit->getFixupInfo();

--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -79,6 +79,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
      return 2048;
   case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
      return 8 * 1024 * 1024;
+   case PIPE_CAP_MAX_VARYINGS:
+      return 8;
+
   /* supported capabilities */
   case PIPE_CAP_ANISOTROPIC_FILTER:
   case PIPE_CAP_POINT_SPRITE:
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -98,12 +98,10 @@ nv50_render_condition(struct pipe_context *pipe,
      case PIPE_QUERY_OCCLUSION_COUNTER:
      case PIPE_QUERY_OCCLUSION_PREDICATE:
      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+         if (hq->state == NV50_HW_QUERY_STATE_READY)
+            wait = true;
         if (likely(!condition)) {
-            if (unlikely(hq->nesting))
-               cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
-                             NV50_3D_COND_MODE_ALWAYS;
-            else
-               cond = NV50_3D_COND_MODE_RES_NON_ZERO;
+            cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS;
         } else {
            cond = wait ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_ALWAYS;
         }
@@ -129,7 +127,7 @@ nv50_render_condition(struct pipe_context *pipe,

   PUSH_SPACE(push, 9);

-   if (wait) {
+   if (wait && hq->state != NV50_HW_QUERY_STATE_READY) {
      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
      PUSH_DATA (push, 0);
   }
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -29,11 +29,6 @@
 #include "nv50/nv50_query_hw_sm.h"
 #include "nv_object.xml.h"

-#define NV50_HW_QUERY_STATE_READY   0
-#define NV50_HW_QUERY_STATE_ACTIVE  1
-#define NV50_HW_QUERY_STATE_ENDED   2
-#define NV50_HW_QUERY_STATE_FLUSHED 3
-
 /* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
 * (since we use only a single GPU channel per screen) will not work properly.
 *
@@ -158,8 +153,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-      hq->nesting = nv50->screen->num_occlusion_queries_active++;
-      if (hq->nesting) {
+      if (nv50->screen->num_occlusion_queries_active++) {
         nv50_hw_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 4);
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
@@ -6,6 +6,11 @@

 #include "nv50_query.h"

+#define NV50_HW_QUERY_STATE_READY   0
+#define NV50_HW_QUERY_STATE_ACTIVE  1
+#define NV50_HW_QUERY_STATE_ENDED   2
+#define NV50_HW_QUERY_STATE_FLUSHED 3
+
 #define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)

 struct nv50_hw_query;
@@ -29,7 +34,6 @@ struct nv50_hw_query {
   uint8_t state;
   bool is64bit;
   uint8_t rotate;
-   int nesting; /* only used for occlusion queries */
   struct nouveau_mm_allocation *mm;
   struct nouveau_fence *fence;
 };
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -156,6 +156,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
      return NV50_MAX_WINDOW_RECTANGLES;
   case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
      return 16 * 1024 * 1024;
+   case PIPE_CAP_MAX_VARYINGS:
+      return 15;

   /* supported caps */
   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -215,6 +217,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_TGSI_CLOCK:
   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+   case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
      return 1;
   case PIPE_CAP_SEAMLESS_CUBE_MAP:
      return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -312,6 +315,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_TGSI_ATOMFADD:
   case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
   case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+   case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
      return 0;

   case PIPE_CAP_VENDOR_ID:
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -434,6 +434,7 @@ nvc0_video_buffer_create(struct pipe_context *pipe,

 /* nvc0_push.c */
 void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *);
+void nvc0_push_vbo_indirect(struct nvc0_context *, const struct pipe_draw_info *);

 /* nve4_compute.c */
 void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -121,12 +121,10 @@ nvc0_render_condition(struct pipe_context *pipe,
      case PIPE_QUERY_OCCLUSION_COUNTER:
      case PIPE_QUERY_OCCLUSION_PREDICATE:
      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+         if (hq->state == NVC0_HW_QUERY_STATE_READY)
+            wait = true;
         if (likely(!condition)) {
-            if (unlikely(hq->nesting))
-               cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
-                             NVC0_3D_COND_MODE_ALWAYS;
-            else
-               cond = NVC0_3D_COND_MODE_RES_NON_ZERO;
+            cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
         } else {
            cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
         }
@@ -151,7 +149,7 @@ nvc0_render_condition(struct pipe_context *pipe,
      return;
   }

-   if (wait)
+   if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
      nvc0_hw_query_fifo_wait(nvc0, q);

   PUSH_SPACE(push, 10);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -28,11 +28,6 @@
 #include "nvc0/nvc0_query_hw_metric.h"
 #include "nvc0/nvc0_query_hw_sm.h"

-#define NVC0_HW_QUERY_STATE_READY   0
-#define NVC0_HW_QUERY_STATE_ACTIVE  1
-#define NVC0_HW_QUERY_STATE_ENDED   2
-#define NVC0_HW_QUERY_STATE_FLUSHED 3
-
 #define NVC0_HW_QUERY_ALLOC_SPACE 256

 bool
@@ -158,14 +153,18 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-      hq->nesting = nvc0->screen->num_occlusion_queries_active++;
-      if (hq->nesting) {
+      if (nvc0->screen->num_occlusion_queries_active++) {
         nvc0_hw_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 3);
         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
+         /* Given that the counter is reset, the contents at 0x10 are
+          * equivalent to doing the query -- we would get hq->sequence as the
+          * payload and 0 as the reported value. This is already set up above
+          * as in the hq->rotate case.
+          */
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -199,6 +198,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
      nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
+      ((uint64_t *)hq->data)[(12 + 10) * 2] = 0;
      break;
   default:
      break;
@@ -271,6 +271,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
      nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
+      ((uint64_t *)hq->data)[10 * 2] = 0;
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* This query is not issued on GPU because disjoint is forced to false */
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
@@ -6,6 +6,11 @@

 #include "nvc0_query.h"

+#define NVC0_HW_QUERY_STATE_READY   0
+#define NVC0_HW_QUERY_STATE_ACTIVE  1
+#define NVC0_HW_QUERY_STATE_ENDED   2
+#define NVC0_HW_QUERY_STATE_FLUSHED 3
+
 #define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)

 struct nvc0_hw_query;
@@ -29,7 +34,6 @@ struct nvc0_hw_query {
   uint8_t state;
   boolean is64bit;
   uint8_t rotate;
-   int nesting; /* only used for occlusion queries */
   struct nouveau_mm_allocation *mm;
   struct nouveau_fence *fence;
 };
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -182,6 +182,13 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
      return class_3d >= GM200_3D_CLASS ? 8 : 0;
   case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
      return 64 * 1024 * 1024;
+   case PIPE_CAP_MAX_VARYINGS:
+      /* NOTE: These only count our slots for GENERIC varyings.
+       * The address space may be larger, but the actual hard limit seems to be
+       * less than what the address space layout permits, so don't add TEXCOORD,
+       * COLOR, etc. here.
+       */
+      return 0x1f0 / 16;

   /* supported caps */
   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -266,6 +273,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
   case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
   case PIPE_CAP_QUERY_SO_OVERFLOW:
+   case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
      return 1;
   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
      return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
@@ -336,6 +344,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_SURFACE_SAMPLE_COUNT:
   case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
   case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+   case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
      return 0;

   case PIPE_CAP_VENDOR_ID:
@@ -392,18 +401,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
      return 16;
   case PIPE_SHADER_CAP_MAX_INPUTS:
-      if (shader == PIPE_SHADER_VERTEX)
-         return 32;
-      /* NOTE: These only count our slots for GENERIC varyings.
-       * The address space may be larger, but the actual hard limit seems to be
-       * less than what the address space layout permits, so don't add TEXCOORD,
-       * COLOR, etc. here.
-       */
-      if (shader == PIPE_SHADER_FRAGMENT)
-         return 0x1f0 / 16;
-      /* Actually this counts CLIPVERTEX, which occupies the last generic slot,
-       * and excludes 0x60 per-patch inputs.
-       */
      return 0x200 / 16;
   case PIPE_SHADER_CAP_MAX_OUTPUTS:
      return 32;
@@ -1286,8 +1283,8 @@ nvc0_screen_create(struct nouveau_device *dev)
   for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) {
      BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3);
      PUSH_DATA (push, 1);
-      PUSH_DATA (push, 8192 << 16);
-      PUSH_DATA (push, 8192 << 16);
+      PUSH_DATA (push, 16384 << 16);
+      PUSH_DATA (push, 16384 << 16);
   }

 #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -1051,21 +1051,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
   } else {
      struct nv50_miptree *mt = nv50_miptree(&res->base);
      struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level];
-      const unsigned z = view->u.tex.first_layer;
+      unsigned z = view->u.tex.first_layer;

-      if (z) {
-         if (mt->layout_3d) {
-            address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z);
-            /* doesn't work if z passes z-tile boundary */
-            if (depth > 1) {
-               pipe_debug_message(&nvc0->base.debug, CONFORMANCE,
-                                  "3D images are not really supported!");
-               debug_printf("3D images are not really supported!\n");
-            }
-         } else {
-            address += mt->layer_stride * z;
-         }
+      if (!mt->layout_3d) {
+         address += mt->layer_stride * z;
+         z = 0;
      }
+
      address += lvl->offset;

      info[0]  = address >> 8;
@@ -1080,7 +1072,8 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
      info[6]  = depth - 1;
      info[6] |= (lvl->tile_mode & 0xf00) << 21;
      info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22;
-      info[7]  = 0;
+      info[7]  = mt->layout_3d ? 1 : 0;
+      info[7] |= z << 16;
      info[14] = mt->ms_x;
      info[15] = mt->ms_y;
   }
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -1040,7 +1040,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
   }

   if (nvc0->state.vbo_mode) {
-      nvc0_push_vbo(nvc0, info);
+      if (info->indirect)
+         nvc0_push_vbo_indirect(nvc0, info);
+      else
+         nvc0_push_vbo(nvc0, info);
      goto cleanup;
   }

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -466,6 +466,83 @@ nvc0_prim_gl(unsigned prim)
   }
 }

+typedef struct { /* One non-indexed draw command, as decoded from the indirect buffer. */
+   uint32_t count;        /* copied to pipe_draw_info::count */
+   uint32_t primCount;    /* copied to pipe_draw_info::instance_count */
+   uint32_t first;        /* copied to pipe_draw_info::start */
+   uint32_t baseInstance; /* copied to pipe_draw_info::start_instance */
+} DrawArraysIndirectCommand;
+
+typedef struct { /* One indexed draw command, as decoded from the indirect buffer. */
+   uint32_t count;        /* copied to pipe_draw_info::count */
+   uint32_t primCount;    /* copied to pipe_draw_info::instance_count */
+   uint32_t firstIndex;   /* added to the caller's info->start to form the start */
+   int32_t  baseVertex;   /* copied to pipe_draw_info::index_bias */
+   uint32_t baseInstance; /* copied to pipe_draw_info::start_instance */
+} DrawElementsIndirectCommand;
+
+void
+nvc0_push_vbo_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
+{
+   /* Replay an indirect draw from the CPU: read each command out of the
+    * indirect buffer and submit it through nvc0_push_vbo() as a direct draw.
+    * Suboptimal, but only needed when FIXED or DOUBLE inputs require conversion.
+    */
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv04_resource *buf = nv04_resource(info->indirect->buffer);
+   struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count);
+   unsigned i;
+
+   unsigned draw_count = info->indirect->draw_count; /* used as-is when no count buffer is bound */
+   if (buf_count) {
+      uint32_t *count = nouveau_resource_map_offset(
+            &nvc0->base, buf_count, info->indirect->indirect_draw_count_offset,
+            NOUVEAU_BO_RD);
+      draw_count = *count; /* NOTE(review): map result unchecked; value not clamped to indirect->draw_count */
+   }
+
+   uint8_t *buf_data = nouveau_resource_map_offset(
+            &nvc0->base, buf, info->indirect->offset, NOUVEAU_BO_RD); /* NOTE(review): result is not NULL-checked */
+   struct pipe_draw_info single = *info; /* per-command copy; only the decoded fields are overwritten */
+   single.indirect = NULL; /* each replayed draw is a direct draw */
+   for (i = 0; i < draw_count; i++, buf_data += info->indirect->stride) { /* stride: bytes between commands */
+      if (info->index_size) { /* non-zero index size selects the indexed command layout */
+         DrawElementsIndirectCommand *cmd = (void *)buf_data;
+         single.start = info->start + cmd->firstIndex; /* firstIndex is relative to the caller's start */
+         single.count = cmd->count;
+         single.start_instance = cmd->baseInstance;
+         single.instance_count = cmd->primCount;
+         single.index_bias = cmd->baseVertex;
+      } else {
+         DrawArraysIndirectCommand *cmd = (void *)buf_data;
+         single.start = cmd->first;
+         single.count = cmd->count;
+         single.start_instance = cmd->baseInstance;
+         single.instance_count = cmd->primCount;
+      }
+
+      if (nvc0->vertprog->vp.need_draw_parameters) { /* vertex program wants per-draw parameters */
+         PUSH_SPACE(push, 9); /* presumably 2 method headers + 7 data words below */
+         BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+         PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+         PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+         PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); /* position word + 3 values */
+         PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
+         PUSH_DATA (push, single.index_bias);
+         PUSH_DATA (push, single.start_instance);
+         PUSH_DATA (push, single.drawid + i); /* draw id advances with each replayed command */
+      }
+
+      nvc0_push_vbo(nvc0, &single);
+   }
+
+   nouveau_resource_unmap(buf);
+   if (buf_count) /* count buffer was only mapped when present */
+      nouveau_resource_unmap(buf_count);
+}
+
 void
 nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -304,6 +304,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
            return 2048;

+        case PIPE_CAP_MAX_VARYINGS:
+            return 10;
+
        case PIPE_CAP_VENDOR_ID:
                return 0x1002;
        case PIPE_CAP_DEVICE_ID:
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -536,6 +536,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_TEXEL_OFFSET:
 		return 7;

+	case PIPE_CAP_MAX_VARYINGS:
+		return 32;
+
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
 		return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600;
 	case PIPE_CAP_ENDIANNESS:
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
 	struct si_resource *buf = si_resource(transfer->resource);

 	if (stransfer->staging) {
+		unsigned src_offset = stransfer->offset +
+				      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
+				      (box->x - transfer->box.x);
+
 		/* Copy the staging buffer into the original one. */
 		si_copy_buffer((struct si_context*)ctx, transfer->resource,
-			       &stransfer->staging->b.b, box->x,
-			       stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
+			       &stransfer->staging->b.b, box->x, src_offset,
 			       box->width);
 	}

--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -254,6 +254,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 		return 30;

+	case PIPE_CAP_MAX_VARYINGS:
+		return 32;
+
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
 		return sscreen->info.chip_class <= VI ?
 			PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -1333,7 +1333,7 @@ void si_init_perfcounters(struct si_screen *screen)
 	for (i = 0; i < num_blocks; ++i) {
 		struct si_pc_block *block = &pc->blocks[i];
 		block->b = &blocks[i];
-		block->num_instances = block->b->instances;
+		block->num_instances = MAX2(1, block->b->instances);

 		if (!strcmp(block->b->b->name, "CB") ||
 		    !strcmp(block->b->b->name, "DB"))
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -834,7 +834,7 @@ si_lower_nir(struct si_shader_selector* sel)
 		NIR_PASS(progress, sel->nir, nir_opt_if);
 		NIR_PASS(progress, sel->nir, nir_opt_dead_cf);
 		NIR_PASS(progress, sel->nir, nir_opt_cse);
-		NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true, true);
+		NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true);

 		/* Needed for algebraic lowering */
 		NIR_PASS(progress, sel->nir, nir_opt_algebraic);
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -265,6 +265,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
      return 1;
   case PIPE_CAP_CLEAR_TEXTURE:
      return 1;
+   case PIPE_CAP_MAX_VARYINGS:
+      return TGSI_EXEC_MAX_INPUT_ATTRIBS;
   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
   case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -350,6 +350,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)

   case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
      return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */
+   case PIPE_CAP_MAX_VARYINGS:
+      return sws->have_vgpu10 ? VGPU10_MAX_FS_INPUTS : 10;

   /* Unsupported features */
   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
--- a/src/gallium/drivers/v3d/v3d_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -780,7 +780,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
                rsc->tiled = false;
        } else {
                fprintf(stderr, "Unsupported modifier requested\n");
-                return NULL;
+                goto fail;
        }

        rsc->internal_format = prsc->format;
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -177,6 +177,9 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
                return 4;

+        case PIPE_CAP_MAX_VARYINGS:
+                return V3D_MAX_FS_INPUTS / 4;
+
                /* Texturing. */
        case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
        case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s)
                NIR_PASS(progress, s, nir_opt_dce);
                NIR_PASS(progress, s, nir_opt_dead_cf);
                NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
                NIR_PASS(progress, s, nir_opt_algebraic);
                NIR_PASS(progress, s, nir_opt_constant_folding);
                NIR_PASS(progress, s, nir_opt_undef);
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,

        /* We can't mix HW and non-HW queries. */
        if (nhwqueries && nhwqueries != num_queries)
-                return NULL;
+                goto err_free_query;

        if (!nhwqueries)
                return (struct pipe_query *)query;
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
                /* Note: Not supported in hardware, just faking it. */
                return 5;

+        case PIPE_CAP_MAX_VARYINGS:
+                return 8;
+
        case PIPE_CAP_VENDOR_ID:
                return 0x14E4;
        case PIPE_CAP_ACCELERATED:
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -258,6 +258,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
      return 1; /* TODO: need to introduce a hw-cap for this */
+   case PIPE_CAP_MAX_VARYINGS:
+      if (vscreen->caps.caps.v1.glsl_level < 150)
+         return vscreen->caps.caps.v2.max_vertex_attribs;
+      return 32;
   case PIPE_CAP_TEXTURE_GATHER_SM5:
   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
   case PIPE_CAP_FAKE_SW_MSAA:
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -856,6 +856,7 @@ enum pipe_cap
   PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE,
   PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND,
   PIPE_CAP_DEST_SURFACE_SRGB_CONTROL,
+   PIPE_CAP_MAX_VARYINGS,
 };

 /**
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -70,7 +70,8 @@ enum pipe_video_profile
   PIPE_VIDEO_PROFILE_HEVC_MAIN_444,
   PIPE_VIDEO_PROFILE_JPEG_BASELINE,
   PIPE_VIDEO_PROFILE_VP9_PROFILE0,
-   PIPE_VIDEO_PROFILE_VP9_PROFILE2
+   PIPE_VIDEO_PROFILE_VP9_PROFILE2,
+   PIPE_VIDEO_PROFILE_MAX
 };

 /* Video caps, can be different for each codec/profile */
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
   ctx->version_minor = 1;
   *ctx->vtable = vtable;
   *ctx->vtable_vpp = vtable_vpp;
-   ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN;
+   ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1;
   ctx->max_entrypoints = 2;
   ctx->max_attributes = 1;
   ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS;
--- a/src/gallium/state_trackers/va/picture_vp9.c
+++ b/src/gallium/state_trackers/va/picture_vp9.c
@@ -28,6 +28,8 @@
 #include "vl/vl_vlc.h"
 #include "va_private.h"

+#define NUM_VP9_REFS 8
+
 void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
   VADecPictureParameterBufferVP9 *vp9 = buf->data;
@@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context,

   context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth;

-   for (i = 0 ; i < 8 ; i++)
+   for (i = 0 ; i < NUM_VP9_REFS ; i++)
      vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]);
+
+   if (!context->decoder && !context->templat.max_references)
+      context->templat.max_references = NUM_VP9_REFS;
 }

 void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf)
--- a/src/gallium/state_trackers/xvmc/attributes.c
+++ b/src/gallium/state_trackers/xvmc/attributes.c
@@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
   if (!attr)
      return XvMCBadContext;

-   if (strcmp(attr, XV_BRIGHTNESS))
+   if (strcmp(attr, XV_BRIGHTNESS) == 0)
      context_priv->procamp.brightness = value / 1000.0f;
-   else if (strcmp(attr, XV_CONTRAST))
+   else if (strcmp(attr, XV_CONTRAST) == 0)
      context_priv->procamp.contrast = value / 1000.0f + 1.0f;
-   else if (strcmp(attr, XV_SATURATION))
+   else if (strcmp(attr, XV_SATURATION) == 0)
      context_priv->procamp.saturation = value / 1000.0f + 1.0f;
-   else if (strcmp(attr, XV_HUE))
+   else if (strcmp(attr, XV_HUE) == 0)
      context_priv->procamp.hue = value / 1000.0f;
-   else if (strcmp(attr, XV_COLORSPACE))
+   else if (strcmp(attr, XV_COLORSPACE) == 0)
      context_priv->color_standard = value ?
         VL_CSC_COLOR_STANDARD_BT_601 :
         VL_CSC_COLOR_STANDARD_BT_709;
@@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
   if (!attr)
      return XvMCBadContext;

-   if (strcmp(attr, XV_BRIGHTNESS))
+   if (strcmp(attr, XV_BRIGHTNESS) == 0)
      *value = context_priv->procamp.brightness * 1000;
-   else if (strcmp(attr, XV_CONTRAST))
+   else if (strcmp(attr, XV_CONTRAST) == 0)
      *value = context_priv->procamp.contrast * 1000 - 1000;
-   else if (strcmp(attr, XV_SATURATION))
+   else if (strcmp(attr, XV_SATURATION) == 0)
      *value = context_priv->procamp.saturation * 1000 + 1000;
-   else if (strcmp(attr, XV_HUE))
+   else if (strcmp(attr, XV_HUE) == 0)
      *value = context_priv->procamp.hue * 1000;
-   else if (strcmp(attr, XV_COLORSPACE))
+   else if (strcmp(attr, XV_COLORSPACE) == 0)
      *value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709;
   else
      return BadName;
--- a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
+++ b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
@@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config)

 			while (token && !fail)
 			{
-				if (strcmp(token, "i"))
+				if (strcmp(token, "i") == 0)
 					config->mb_types |= MB_TYPE_I;
-				else if (strcmp(token, "p"))
+				else if (strcmp(token, "p") == 0)
 					config->mb_types |= MB_TYPE_P;
-				else if (strcmp(token, "b"))
+				else if (strcmp(token, "b") == 0)
 					config->mb_types |= MB_TYPE_B;
 				else
 					fail = 1;
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1219,8 +1219,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs)
 {
   struct amdgpu_cs_context *cs = acs->csc;

-   cs->num_fence_dependencies = 0;
-
   amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
   amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
   amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -396,6 +396,7 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
 {
   struct xlib_displaytarget *xlib_dt;
   unsigned nblocksy, size;
+   int ignore;

   xlib_dt = CALLOC_STRUCT(xlib_displaytarget);
   if (!xlib_dt)
@@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
   xlib_dt->stride = align(util_format_get_stride(format, width), alignment);
   size = xlib_dt->stride * nblocksy;

-   if (!debug_get_option_xlib_no_shm()) {
+   if (!debug_get_option_xlib_no_shm() &&
+       XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) {
      xlib_dt->data = alloc_shm(xlib_dt, size);
      if (xlib_dt->data) {
         xlib_dt->shm = True;
--- a/src/intel/Makefile.vulkan.am
+++ b/src/intel/Makefile.vulkan.am
@@ -253,6 +253,7 @@ VULKAN_TESTS = \
 	vulkan/tests/block_pool_no_free \
 	vulkan/tests/state_pool_no_free \
 	vulkan/tests/state_pool_free_list_only \
+	vulkan/tests/state_pool_padding \
 	vulkan/tests/state_pool

 VULKAN_TEST_LDADD = \
@@ -274,6 +275,10 @@ vulkan_tests_state_pool_free_list_only_CFLAGS = $(VULKAN_CFLAGS)
 vulkan_tests_state_pool_free_list_only_CPPFLAGS = $(VULKAN_CPPFLAGS)
 vulkan_tests_state_pool_free_list_only_LDADD = $(VULKAN_TEST_LDADD)

+vulkan_tests_state_pool_padding_CFLAGS = $(VULKAN_CFLAGS)
+vulkan_tests_state_pool_padding_CPPFLAGS = $(VULKAN_CPPFLAGS)
+vulkan_tests_state_pool_padding_LDADD = $(VULKAN_TEST_LDADD)
+
 vulkan_tests_state_pool_CFLAGS = $(VULKAN_CFLAGS)
 vulkan_tests_state_pool_CPPFLAGS = $(VULKAN_CPPFLAGS)
 vulkan_tests_state_pool_LDADD = $(VULKAN_TEST_LDADD)
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
       * messages adding a node interference to the grf127_send_hack_node.
       * This node has a fixed asignment to grf127.
       *
-       * We don't apply it to SIMD16 because previous code avoids any register
-       * overlap between sources and destination.
+       * We don't apply it to SIMD16 instructions because previous code avoids
+       * any register overlap between sources and destination.
       */
      ra_set_node_reg(g, grf127_send_hack_node, 127);
-      if (dispatch_width == 8) {
-         foreach_block_and_inst(block, fs_inst, inst, cfg) {
-            if (inst->is_send_from_grf() && inst->dst.file == VGRF)
-               ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
-         }
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->exec_size < 16 && inst->is_send_from_grf() &&
+             inst->dst.file == VGRF)
+            ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
      }

      if (spilled_any_registers) {
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -570,18 +570,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
      OPT(nir_opt_dce);
      OPT(nir_opt_cse);

-      /* Passing 0 to the peephole select pass causes it to convert
-       * if-statements that contain only move instructions in the branches
-       * regardless of the count.
-       *
-       * Passing 1 to the peephole select pass causes it to convert
-       * if-statements that contain at most a single ALU instruction (total)
-       * in both branches.  Before Gen6, some math instructions were
-       * prohibitively expensive and the results of compare operations need an
-       * extra resolve step.  For these reasons, this pass is more harmful
-       * than good on those platforms.
-       *
-       * For indirect loads of uniforms (push constants), we assume that array
+      /* For indirect loads of uniforms (push constants), we assume that array
       * indices will nearly always be in bounds and the cost of the load is
       * low.  Therefore there shouldn't be a performance benefit to avoid it.
       * However, in vec4 tessellation shaders, these loads operate by
@@ -590,9 +579,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
      const bool is_vec4_tessellation = !is_scalar &&
         (nir->info.stage == MESA_SHADER_TESS_CTRL ||
          nir->info.stage == MESA_SHADER_TESS_EVAL);
-      OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
-      OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation,
-          compiler->devinfo->gen >= 6);
+      OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation);

      OPT(nir_opt_intrinsics);
      OPT(nir_opt_idiv_const, 32);
--- a/src/intel/vulkan/gen7_cmd_buffer.c
+++ b/src/intel/vulkan/gen7_cmd_buffer.c
@@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
      };

      const int max = 0xffff;
+
+      uint32_t y_min = s->offset.y;
+      uint32_t x_min = s->offset.x;
+      uint32_t y_max = s->offset.y + s->extent.height - 1;
+      uint32_t x_max = s->offset.x + s->extent.width - 1;
+
+      /* Do this math using int64_t so overflow gets clamped correctly. */
+      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+         y_min = clamp_int64((uint64_t) y_min,
+                             cmd_buffer->state.render_area.offset.y, max);
+         x_min = clamp_int64((uint64_t) x_min,
+                             cmd_buffer->state.render_area.offset.x, max);
+         y_max = clamp_int64((uint64_t) y_max, 0,
+                             cmd_buffer->state.render_area.offset.y +
+                             cmd_buffer->state.render_area.extent.height - 1);
+         x_max = clamp_int64((uint64_t) x_max, 0,
+                             cmd_buffer->state.render_area.offset.x +
+                             cmd_buffer->state.render_area.extent.width - 1);
+      } else if (fb) {
+         y_min = clamp_int64((uint64_t) y_min, 0, max);
+         x_min = clamp_int64((uint64_t) x_min, 0, max);
+         y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1);
+         x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1);
+      }
+
      struct GEN7_SCISSOR_RECT scissor = {
-         /* Do this math using int64_t so overflow gets clamped correctly. */
-         .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
-         .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
-         .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1),
-         .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1)
+         .ScissorRectangleYMin = y_min,
+         .ScissorRectangleXMin = x_min,
+         .ScissorRectangleYMax = y_max,
+         .ScissorRectangleXMax = x_max
      };

      if (s->extent.width <= 0 || s->extent.height <= 0) {
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1211,13 +1211,30 @@ emit_3dstate_streamout(struct anv_pipeline *pipeline,
            hole_dwords -= 4;
         }

+         int varying = output->location;
+         uint8_t component_mask = output->component_mask;
+         /* VARYING_SLOT_PSIZ contains three scalar fields packed together:
+          * - VARYING_SLOT_LAYER    in VARYING_SLOT_PSIZ.y
+          * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
+          * - VARYING_SLOT_PSIZ     in VARYING_SLOT_PSIZ.w
+          */
+         if (varying == VARYING_SLOT_LAYER) {
+            varying = VARYING_SLOT_PSIZ;
+            component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
+         } else if (varying == VARYING_SLOT_VIEWPORT) {
+            varying = VARYING_SLOT_PSIZ;
+            component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
+         } else if (varying == VARYING_SLOT_PSIZ) {
+            component_mask = 1 << 3; // SO_DECL_COMPMASK_W
+         }
+
         next_offset[buffer] = output->offset +
-                               __builtin_popcount(output->component_mask) * 4;
+                               __builtin_popcount(component_mask) * 4;

         so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
            .OutputBufferSlot = buffer,
-            .RegisterIndex = vue_map->varying_to_slot[output->location],
-            .ComponentMask = output->component_mask,
+            .RegisterIndex = vue_map->varying_to_slot[varying],
+            .ComponentMask = component_mask,
         };
      }

--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -111,7 +111,7 @@ set_adaptive_sync_property(xcb_connection_t *conn, xcb_drawable_t drawable,
   xcb_intern_atom_reply_t* reply;
   xcb_void_cookie_t check;

-   cookie = xcb_intern_atom(conn, 0, sizeof(name), name);
+   cookie = xcb_intern_atom(conn, 0, strlen(name), name);
   reply = xcb_intern_atom_reply(conn, cookie, NULL);
   if (reply == NULL)
      return;
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -34,6 +34,8 @@ AM_CFLAGS = \
 	-I$(top_builddir)/src/util \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/gtest/include \
+	-I$(top_builddir)/src/compiler \
+	-I$(top_srcdir)/src/compiler \
 	-I$(top_builddir)/src/compiler/glsl \
 	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_srcdir)/src/compiler/nir \
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -42,7 +42,7 @@
 #include "compiler/glsl/ir.h"
 #include "compiler/glsl/program.h"
 #include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/glsl/float64_glsl.h"
+#include "glsl/float64_glsl.h"

 #include "brw_program.h"
 #include "brw_context.h"
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -4691,6 +4691,29 @@ discard_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
      if (!att)
         continue;

+      /* If we're asked to invalidate just depth or just stencil, but the
+       * attachment is packed depth/stencil, then we can only use
+       * Driver.DiscardFramebuffer if the attachments list includes both depth
+       * and stencil and they both point at the same renderbuffer.
+       */
+      if ((attachments[i] == GL_DEPTH_ATTACHMENT ||
+           attachments[i] == GL_STENCIL_ATTACHMENT) &&
+          (!att->Renderbuffer ||
+           att->Renderbuffer->_BaseFormat == GL_DEPTH_STENCIL)) {
+         GLenum other_format = (attachments[i] == GL_DEPTH_ATTACHMENT ?
+                                GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT);
+         bool has_both = false;
+         /* The counterpart may sit at any index, so scan until it is found. */
+         for (int j = 0; j < numAttachments && !has_both; j++) {
+            if (attachments[j] == other_format)
+               has_both = true;
+         }
+
+         if (fb->Attachment[BUFFER_DEPTH].Renderbuffer !=
+             fb->Attachment[BUFFER_STENCIL].Renderbuffer || !has_both)
+            continue;
+      }
+
      ctx->Driver.DiscardFramebuffer(ctx, fb, att);
   }
 }
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -208,6 +208,10 @@ new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw)
   rs->prim.end = 1;
   rs->prim.start = 0;
   rs->prim.count = 1;
+   rs->prim.pad = 0;
+   rs->prim.num_instances = 1;
+   rs->prim.base_instance = 0;
+   rs->prim.is_indirect = 0;

   return rs;
 }
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -223,8 +223,13 @@ void st_init_limits(struct pipe_screen *screen,
      pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents,
                                      MAX_UNIFORMS * 4);

+      /* For ARB programs, prog_src_register::Index is a signed 13-bit number.
+       * This gives us a limit of 4096 values - but we may need to generate
+       * internal values in addition to what the source program uses.  So, we
+       * drop the limit one step lower, to 2048, to be safe.
+       */
      pc->MaxParameters =
-      pc->MaxNativeParameters = pc->MaxUniformComponents / 4;
+      pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048);
      pc->MaxInputComponents =
         screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4;
      pc->MaxOutputComponents =
@@ -362,10 +367,7 @@ void st_init_limits(struct pipe_screen *screen,
   c->Program[MESA_SHADER_VERTEX].MaxAttribs =
      MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16);

-   /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number
-    * of inputs. It's always 2 colors + N generic inputs. */
-   c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
-                                            PIPE_SHADER_CAP_MAX_INPUTS);
+   c->MaxVarying = screen->get_param(screen, PIPE_CAP_MAX_VARYINGS);
   c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING);
   c->MaxGeometryOutputVertices =
      screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES);
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -2356,6 +2356,8 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
      bindings |= PIPE_BIND_DEPTH_STENCIL;
   else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 ||
            internalFormat == GL_RGB || internalFormat == GL_RGBA ||
+            internalFormat == GL_RGBA2 ||
+            internalFormat == GL_RGB4 || internalFormat == GL_RGBA4 ||
            internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 ||
            internalFormat == GL_BGRA ||
            internalFormat == GL_RGB16F ||
--- a/src/mesa/state_tracker/st_glsl_to_nir.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp
@@ -327,7 +327,7 @@ st_nir_opts(nir_shader *nir, bool scalar)
      NIR_PASS(progress, nir, nir_opt_if);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);
-      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
+      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true);

      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);