Compare commits

60 Commits: 20.3-branc ... mesa-18.2.

SHA1:
86aa912dda
2ec87de498
54cd81dfc5
5457e58a64
1e9c422894
4320851198
f69fcede0a
26c07daf9d
f3fc2d40fe
4477635b69
bc6b6cb290
3ff3bfa3f5
c2268223c8
b9a97a8b88
dbb5396667
586ac9c237
f070d5a568
b1e0876a6b
dcd3786e6e
d82c36a4c7
8061ee5883
bbd95de921
b696ab172c
f7e8bc0f23
90278c7f95
0c1832765f
94da454726
dadc50add5
e91782ed55
9df3460724
8be5985e65
6606cacd3d
1378f33142
9dacf10ca8
7af6be8864
9ad14f71e6
6ae0a639ec
c709206977
33ac5fb678
f0ae95492a
a42afc8504
adfbf1fe84
4a25d8b623
4a769c8850
d39fb6d157
ed117c27e1
fdbbe4c50c
3c3589a0ba
37fa81f631
71aa72d695
c8d41bc58d
c3b1a6d7fa
cce78368df
b6e9ef1556
c18ed873a5
88c36f4379
bbeb78620c
9ddff68f6f
2e903df72f
cb542ac550
bin/.cherry-ignore (new file, 3 lines)

@@ -0,0 +1,3 @@
# fixes: This commit has more than one Fixes tag but the commit it
# addresses didn't land in branch.
6ff1c479968819b93c46d24bd898e89ce14ac401 autotools: don't ship the git_sha1.h generated in git in the tarballs
@@ -43,13 +43,15 @@ def main():
    master = os.path.join(to, os.path.basename(args.megadriver))

    if not os.path.exists(to):
        if os.path.lexists(to):
            os.unlink(to)
        os.makedirs(to)
    shutil.copy(args.megadriver, master)

    for driver in args.drivers:
        abs_driver = os.path.join(to, driver)

        if os.path.exists(abs_driver):
        if os.path.lexists(abs_driver):
            os.unlink(abs_driver)
        print('installing {} to {}'.format(args.megadriver, abs_driver))
        os.link(master, abs_driver)
@@ -60,7 +62,7 @@ def main():

        name, ext = os.path.splitext(driver)
        while ext != '.so':
            if os.path.exists(name):
            if os.path.lexists(name):
                os.unlink(name)
            os.symlink(driver, name)
            name, ext = os.path.splitext(name)
configure.ac (26 lines changed)

@@ -1503,15 +1503,15 @@ fi
AC_ARG_WITH([gl-lib-name],
  [AS_HELP_STRING([--with-gl-lib-name@<:@=NAME@:>@],
    [specify GL library name @<:@default=GL@:>@])],
  [GL_LIB=$withval],
  [GL_LIB="$DEFAULT_GL_LIB_NAME"])
  [AC_MSG_ERROR([--with-gl-lib-name is no longer supported. Rename the library manually if needed.])],
  [])
AC_ARG_WITH([osmesa-lib-name],
  [AS_HELP_STRING([--with-osmesa-lib-name@<:@=NAME@:>@],
    [specify OSMesa library name @<:@default=OSMesa@:>@])],
  [OSMESA_LIB=$withval],
  [OSMESA_LIB=OSMesa])
AS_IF([test "x$GL_LIB" = xyes], [GL_LIB="$DEFAULT_GL_LIB_NAME"])
AS_IF([test "x$OSMESA_LIB" = xyes], [OSMESA_LIB=OSMesa])
  [AC_MSG_ERROR([--with-osmesa-lib-name is no longer supported. Rename the library manually if needed.])],
  [])
GL_LIB="$DEFAULT_GL_LIB_NAME"
OSMESA_LIB=OSMesa

dnl
dnl Mangled Mesa support
@@ -1523,6 +1523,9 @@ AC_ARG_ENABLE([mangling],
  [enable_mangling=no]
)
if test "x${enable_mangling}" = "xyes" ; then
  if test "x$enable_libglvnd" = xyes; then
    AC_MSG_ERROR([Conflicting options --enable-mangling and --enable-libglvnd.])
  fi
  DEFINES="${DEFINES} -DUSE_MGL_NAMESPACE"
  GL_LIB="Mangled${GL_LIB}"
  OSMESA_LIB="Mangled${OSMESA_LIB}"
@@ -1530,6 +1533,15 @@ fi
AC_SUBST([GL_LIB])
AC_SUBST([OSMESA_LIB])

dnl HACK when building glx + glvnd we ship gl.pc, despite that glvnd should do it
dnl Thus we need to use GL as a DSO name.
if test "x$enable_libglvnd" = xyes -a "x$enable_glx" != xno; then
    GL_PKGCONF_LIB="GL"
else
    GL_PKGCONF_LIB="$GL_LIB"
fi
AC_SUBST([GL_PKGCONF_LIB])

# Check for libdrm
PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
                  [have_libdrm=yes], [have_libdrm=no])
@@ -1658,6 +1670,8 @@ xxlib | xgallium-xlib)
xdri)
    # DRI-based GLX

    require_dri_shared_libs_and_glapi "GLX"

    # find the DRI deps for libGL
    dri_modules="x11 xext xdamage >= $XDAMAGE_REQUIRED xfixes x11-xcb xcb xcb-glx >= $XCBGLX_REQUIRED"
@@ -989,7 +989,7 @@ if cc.links('''
      freelocale(loc);
      return 0;
    }''',
    extra_args : pre_args,
    args : pre_args,
    name : 'strtod has locale support')
  pre_args += '-DHAVE_STRTOD_L'
endif
@@ -27,4 +27,6 @@ include $(LOCAL_PATH)/Makefile.sources

include $(LOCAL_PATH)/Android.addrlib.mk
include $(LOCAL_PATH)/Android.common.mk
ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),)
include $(LOCAL_PATH)/vulkan/Android.mk
endif
@@ -149,7 +149,8 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
	char features[256];
	const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--";
	LLVMTargetRef target = ac_get_llvm_target(triple);
	bool barrier_does_waitcnt = family != CHIP_VEGA20;
	bool barrier_does_waitcnt = (tm_options & AC_TM_AUTO_WAITCNT_BEFORE_BARRIER) &&
				    family != CHIP_VEGA20;

	snprintf(features, sizeof(features),
		 "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s%s%s",
@@ -65,6 +65,7 @@ enum ac_target_machine_options {
	AC_TM_CHECK_IR = (1 << 5),
	AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6),
	AC_TM_CREATE_LOW_OPT = (1 << 7),
	AC_TM_AUTO_WAITCNT_BEFORE_BARRIER = (1 << 8),
};

enum ac_float_mode {
@@ -62,6 +62,7 @@ LOCAL_SRC_FILES := \
	$(VULKAN_FILES)

LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions
LOCAL_CFLAGS += -DVK_USE_PLATFORM_ANDROID_KHR

$(call mesa-build-with-llvm)

@@ -140,6 +141,7 @@ LOCAL_SRC_FILES := \
	$(VULKAN_ANDROID_FILES)

LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions
LOCAL_CFLAGS += -DVK_USE_PLATFORM_ANDROID_KHR

$(call mesa-build-with-llvm)
@@ -124,7 +124,7 @@ VULKAN_LIB_DEPS += \
endif

if HAVE_PLATFORM_ANDROID
AM_CPPFLAGS += $(ANDROID_CPPFLAGS)
AM_CPPFLAGS += $(ANDROID_CPPFLAGS) -DVK_USE_PLATFORM_ANDROID_KHR
AM_CFLAGS += $(ANDROID_CFLAGS)
VULKAN_LIB_DEPS += $(ANDROID_LIBS)
VULKAN_SOURCES += $(VULKAN_ANDROID_FILES)
@@ -2307,6 +2307,7 @@ VkResult radv_BeginCommandBuffer(
	cmd_buffer->state.last_num_instances = -1;
	cmd_buffer->state.last_vertex_offset = -1;
	cmd_buffer->state.last_first_instance = -1;
	cmd_buffer->state.predication_type = -1;
	cmd_buffer->usage_flags = pBeginInfo->flags;

	/* setup initial configuration into command buffer */
@@ -4126,15 +4127,18 @@ static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,

	if (radv_image_has_dcc(image)) {
		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
		bool need_decompress_pass = false;

		if (radv_layout_dcc_compressed(image, dst_layout,
					       dst_queue_mask)) {
			value = 0x20202020u;
			need_decompress_pass = true;
		}

		radv_initialize_dcc(cmd_buffer, image, value);

		radv_set_dcc_need_cmask_elim_pred(cmd_buffer, image, false);
		radv_set_dcc_need_cmask_elim_pred(cmd_buffer, image,
						  need_decompress_pass);
	}

	if (radv_image_has_cmask(image) || radv_image_has_dcc(image)) {
@@ -480,6 +480,9 @@ radv_handle_per_app_options(struct radv_instance *instance,
			 */
			instance->perftest_flags |= RADV_PERFTEST_SISCHED;
		}
	} else if (!strcmp(name, "DOOM_VFR")) {
		/* Work around a Doom VFR game bug */
		instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS;
	}
}

@@ -105,7 +105,7 @@ EXTENSIONS = [
    Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= CIK'),
    Extension('VK_EXT_shader_viewport_index_layer', 1, True),
    Extension('VK_EXT_shader_stencil_export', 1, True),
    Extension('VK_EXT_vertex_attribute_divisor', 1, True),
    Extension('VK_EXT_vertex_attribute_divisor', 2, True),
    Extension('VK_AMD_draw_indirect_count', 1, True),
    Extension('VK_AMD_gcn_shader', 1, True),
    Extension('VK_AMD_rasterization_order', 1, 'device->has_out_of_order_rast'),
@@ -612,7 +612,8 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
	}

	if (desc->layout == VK_FORMAT_LAYOUT_ETC &&
	    physical_device->rad_info.chip_class < GFX9 &&
	    physical_device->rad_info.family != CHIP_VEGA10 &&
	    physical_device->rad_info.family != CHIP_RAVEN &&
	    physical_device->rad_info.family != CHIP_STONEY) {
		out_properties->linearTilingFeatures = linear;
		out_properties->optimalTilingFeatures = tiled;
@@ -603,7 +603,7 @@ radv_emit_color_decompress(struct radv_cmd_buffer *cmd_buffer,
		pipeline = cmd_buffer->device->meta_state.fast_clear_flush.cmask_eliminate_pipeline;
	}

	if (radv_image_has_dcc(image)) {
	if (!decompress_dcc && radv_image_has_dcc(image)) {
		old_predicating = cmd_buffer->state.predicating;

		radv_emit_set_predication_state_from_image(cmd_buffer, image, true);
@@ -671,7 +671,7 @@ radv_emit_color_decompress(struct radv_cmd_buffer *cmd_buffer,
					   &cmd_buffer->pool->alloc);

	}
	if (radv_image_has_dcc(image)) {
	if (!decompress_dcc && radv_image_has_dcc(image)) {
		cmd_buffer->state.predicating = old_predicating;

		radv_emit_set_predication_state_from_image(cmd_buffer, image, false);
@@ -1991,8 +1991,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
		uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];

		if (divisor) {
			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.instance_id,
						    ctx->abi.start_instance, "");
			buffer_index = ctx->abi.instance_id;

			if (divisor != 1) {
				buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
@@ -2009,6 +2008,8 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
		} else {
			buffer_index = ctx->ac.i32_0;
		}

		buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, "");
	} else
		buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
					    ctx->abi.base_vertex, "");
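The hunk above changes where the start instance enters the instance-rate index computation: the instance id is divided by the divisor first, and start_instance is added afterwards. A minimal standalone C sketch of that arithmetic, with a hypothetical helper name and plain integers standing in for the LLVM IR values built above:

#include <stdint.h>

/* Hypothetical helper, for illustration only: computes the vertex-buffer
 * fetch index for an instanced attribute with a divisor, mirroring the
 * order of operations in the hunk above. */
static uint32_t
instanced_fetch_index(uint32_t instance_id, uint32_t start_instance,
                      uint32_t divisor)
{
        uint32_t index;

        if (divisor == 0) {
                /* Divisor 0: every instance reads element 0 of the attribute. */
                index = 0;
        } else {
                index = instance_id;
                if (divisor != 1)
                        index /= divisor;      /* the LLVMBuildUDiv above */
        }

        /* start_instance is added after the division, not before it. */
        return start_instance + index;
}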
@@ -673,7 +673,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
			if (!cs->num_buffers)
				continue;

			if (unique_bo_count == 0) {
			if (unique_bo_count == 0 && !cs->num_virtual_buffers) {
				memcpy(handles, cs->handles, cs->num_buffers * sizeof(amdgpu_bo_handle));
				unique_bo_count = cs->num_buffers;
				continue;
@@ -528,6 +528,16 @@
    <field name="number of attribute arrays" size="5" start="0" type="uint"/>
  </packet>

  <packet code="71" name="VCM Cache Size" min_ver="41">
    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
  </packet>

  <packet code="73" name="VCM Cache Size" max_ver="33">
    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
  </packet>

  <packet code="73" name="Transform Feedback Buffer" min_ver="41">
    <field name="Buffer Address" size="32" start="32" type="address"/>
    <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
@@ -27,13 +27,14 @@
#include <stdint.h>

/**
 * Struct for tracking features of the V3D chip. This is where we'll store
 * boolean flags for features in a specific version, but for now it's just the
 * version
 * Struct for tracking features of the V3D chip across driver and compiler.
 */
struct v3d_device_info {
        /** Simple V3D version: major * 10 + minor */
        uint8_t ver;

        /** Size of the VPM, in bytes. */
        int vpm_size;
};

#endif
@@ -462,6 +462,7 @@ struct choose_scoreboard {
        int last_magic_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        bool tlb_locked;
};

@@ -1095,10 +1096,16 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
}

static bool
valid_thrsw_sequence(struct v3d_compile *c,
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                /* No scheduling SFU when the result would land in the other
                 * thread. The simulator complains for safety, though it
@@ -1159,7 +1166,8 @@ emit_thrsw(struct v3d_compile *c,
                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
                if (!valid_thrsw_sequence(c, scoreboard,
                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }
@@ -1173,7 +1181,9 @@ emit_thrsw(struct v3d_compile *c,
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
@@ -1475,6 +1485,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
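The new last_thrsw_tick bookkeeping above exists to keep thread switches far enough apart that the previous one has taken effect before the next sequence starts. A tiny self-contained illustration of the same predicate, with invented tick values:

#include <stdbool.h>
#include <stdio.h>

/* Same check as the hunk above, pulled out for illustration: a new thrsw
 * sequence is only valid if the previous thrsw (which takes effect 3 ticks
 * after it is emitted) has happened by the sequence's first slot. */
static bool
thrsw_spacing_ok(int last_thrsw_tick, int tick, int instructions_in_sequence)
{
        return !(last_thrsw_tick + 3 > tick - instructions_in_sequence);
}

int main(void)
{
        /* Invented values: previous thrsw at tick 10. */
        printf("%d\n", thrsw_spacing_ok(10, 14, 2)); /* 0: 13 > 12, too close */
        printf("%d\n", thrsw_spacing_ok(10, 16, 2)); /* 1: 13 <= 14, far enough */
        return 0;
}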
@@ -648,6 +648,9 @@ struct v3d_vs_prog_data {

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Value to be programmed in VCM_CACHE_SIZE. */
        uint8_t vcm_cache_size;
};

struct v3d_fs_prog_data {
@@ -928,7 +931,7 @@ VIR_A_ALU2(OR)
VIR_A_ALU2(XOR)
VIR_A_ALU2(VADD)
VIR_A_ALU2(VSUB)
VIR_A_ALU2(STVPMV)
VIR_A_NODST_2(STVPMV)
VIR_A_ALU1(NOT)
VIR_A_ALU1(NEG)
VIR_A_ALU1(FLAPUSH)
@@ -452,6 +452,16 @@ vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
@@ -746,10 +756,28 @@ uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

        /* Input/output segment size are in 8x32-bit multiples. */
        /* Input/output segment size are in sectors (8 rows of 32 bits per
         * channel).
         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;

        /* Compute VCM cache size. We set up our program to take up less than
         * half of the VPM, so that any set of bin and render programs won't
         * run out of space. We need space for at least one input segment,
         * and then allocate the rest to output segments (one for the current
         * program, the rest to VCM). The valid range of the VCM cache size
         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
         * batches.
         */
        assert(c->devinfo->vpm_size);
        int sector_size = 16 * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_batches = half_vpm - prog_data->vpm_input_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);

        return v3d_return_qpu_insts(c, final_assembly_size);
}

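The VCM sizing logic above is mostly unit conversion. A small self-contained sketch of the same arithmetic, using made-up VPM and input-segment sizes rather than values read from real hardware:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

int main(void)
{
        /* Hypothetical inputs: an 8 KB VPM and a 1-sector VS input segment. */
        int vpm_size = 8192;
        int vpm_input_size = 1;

        /* A sector is 8 rows of one 32-bit value for each of 16 channels. */
        int sector_size = 16 * sizeof(uint32_t) * 8;              /* 512 bytes */
        int vpm_size_in_sectors = vpm_size / sector_size;         /* 16 */
        int half_vpm = vpm_size_in_sectors / 2;                   /* 8 */
        int vpm_output_batches = half_vpm - vpm_input_size;       /* 7 */
        assert(vpm_output_batches >= 2);                          /* GFXH-1744 */
        int vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4); /* 6 -> 4 */

        printf("vcm_cache_size = %d\n", vcm_cache_size);
        return 0;
}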
@@ -94,6 +94,15 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                        }
                }

                /* Refuse to spill a ldvary's dst, because that means
                 * that ldvary's r5 would end up being used across a
                 * thrsw.
                 */
                if (inst->qpu.sig.ldvary) {
                        assert(inst->dst.file == QFILE_TEMP);
                        BITSET_CLEAR(c->spillable, inst->dst.index);
                }

                if (inst->is_last_thrsw)
                        started_last_seg = true;

@@ -102,7 +111,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                        started_last_seg = true;

                /* Track when we're in between a TMU setup and the
                 * final LDTMU from that TMU setup. We can't
                 * final LDTMU or TMUWT from that TMU setup. We can't
                 * spill/fill any temps during that time, because that
                 * involves inserting a new TMU setup/LDTMU sequence.
                 */
@@ -110,6 +119,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                    is_last_ldtmu(inst, block))
                        in_tmu_operation = false;

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                    inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
                        in_tmu_operation = false;

                if (v3d_qpu_writes_tmu(&inst->qpu))
                        in_tmu_operation = true;
        }
@@ -206,6 +219,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
                                  inst->dst);
                        v3d_emit_spill_tmua(c, spill_offset);
                        vir_emit_thrsw(c);
                        vir_TMUWT(c);
                        c->spills++;
                }

@@ -1928,6 +1928,11 @@ ast_expression::do_hir(exec_list *instructions,

      error_emitted = op[0]->type->is_error() || op[1]->type->is_error();

      if (error_emitted) {
         result = ir_rvalue::error_value(ctx);
         break;
      }

      type = arithmetic_result_type(op[0], op[1], false, state, & loc);

      ir_rvalue *temp_rhs;
@@ -826,7 +826,7 @@ ir_dereference_array::constant_expression_value(void *mem_ctx,
         const unsigned component = idx->value.u[0];

         return new(mem_ctx) ir_constant(array, component);
      } else {
      } else if (array->type->is_array()) {
         const unsigned index = idx->value.u[0];
         return array->get_array_element(index)->clone(mem_ctx, NULL);
      }
@@ -1134,6 +1134,25 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
   return (config_count != 0);
}

#ifdef HAVE_DRM_GRALLOC
static int
droid_open_device_drm_gralloc(struct dri2_egl_display *dri2_dpy)
{
   int fd = -1, err = -EINVAL;

   if (dri2_dpy->gralloc->perform)
      err = dri2_dpy->gralloc->perform(dri2_dpy->gralloc,
                                       GRALLOC_MODULE_PERFORM_GET_DRM_FD,
                                       &fd);
   if (err || fd < 0) {
      _eglLog(_EGL_WARNING, "fail to get drm fd");
      fd = -1;
   }

   return (fd >= 0) ? fcntl(fd, F_DUPFD_CLOEXEC, 3) : -1;
}
#endif /* HAVE_DRM_GRALLOC */

static const struct dri2_egl_display_vtbl droid_display_vtbl = {
   .authenticate = NULL,
   .create_window_surface = droid_create_window_surface,
@@ -1384,7 +1403,11 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp)

   disp->DriverData = (void *) dri2_dpy;

#ifdef HAVE_DRM_GRALLOC
   dri2_dpy->fd = droid_open_device_drm_gralloc(dri2_dpy);
#else
   dri2_dpy->fd = droid_open_device(disp);
#endif
   if (dri2_dpy->fd < 0) {
      err = "DRI2: failed to open device";
      goto cleanup;
@@ -201,6 +201,17 @@ resize_callback(struct wl_egl_window *wl_win, void *data)
   struct dri2_egl_display *dri2_dpy =
      dri2_egl_display(dri2_surf->base.Resource.Display);

   /* Update the surface size as soon as native window is resized; from user
    * pov, this makes the effect that resize is done inmediately after native
    * window resize, without requiring to wait until the first draw.
    *
    * A more detailed and lengthy explanation can be found at
    * https://lists.freedesktop.org/archives/mesa-dev/2018-June/196474.html
    */
   if (!dri2_surf->back) {
      dri2_surf->base.Width = wl_win->width;
      dri2_surf->base.Height = wl_win->height;
   }
   dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
}

@@ -258,6 +269,9 @@ dri2_wl_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
      goto cleanup_surf;
   }

   dri2_surf->base.Width = window->width;
   dri2_surf->base.Height = window->height;

   visual_idx = dri2_wl_visual_idx_from_config(dri2_dpy, config);
   assert(visual_idx != -1);

@@ -577,8 +591,8 @@ update_buffers(struct dri2_egl_surface *dri2_surf)
   struct dri2_egl_display *dri2_dpy =
      dri2_egl_display(dri2_surf->base.Resource.Display);

   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
       dri2_surf->base.Height != dri2_surf->wl_win->height) {
   if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
       dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {

      dri2_wl_release_buffers(dri2_surf);

@@ -1632,8 +1646,8 @@ swrast_update_buffers(struct dri2_egl_surface *dri2_surf)
   if (dri2_surf->back)
      return 0;

   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
       dri2_surf->base.Height != dri2_surf->wl_win->height) {
   if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
       dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {

      dri2_wl_release_buffers(dri2_surf);

@@ -107,12 +107,17 @@ static const struct loader_dri3_vtable egl_dri3_vtable = {
static EGLBoolean
dri3_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
{
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
   xcb_drawable_t drawable = dri3_surf->loader_drawable.drawable;

   (void) drv;

   loader_dri3_drawable_fini(&dri3_surf->loader_drawable);

   if (surf->Type == EGL_PBUFFER_BIT)
      xcb_free_pixmap (dri2_dpy->conn, drawable);

   dri2_fini_surface(surf);
   free(surf);

@@ -99,10 +99,10 @@ endif

if with_platform_x11
  files_egl += files('drivers/dri2/platform_x11.c')
  incs_for_egl += inc_loader
  if with_dri3
    files_egl += files('drivers/dri2/platform_x11_dri3.c')
    link_for_egl += libloader_dri3_helper
    incs_for_egl += inc_loader
  endif
  deps_for_egl += [dep_x11_xcb, dep_xcb_dri2, dep_xcb_xfixes]
endif

@@ -1131,6 +1131,31 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
|
||||
mgr->dirty_real_vb_mask = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
|
||||
unsigned *indirect_data, unsigned stride,
|
||||
unsigned draw_count)
|
||||
{
|
||||
assert(info->index_size);
|
||||
info->indirect = NULL;
|
||||
|
||||
for (unsigned i = 0; i < draw_count; i++) {
|
||||
unsigned offset = i * stride / 4;
|
||||
|
||||
info->count = indirect_data[offset + 0];
|
||||
info->instance_count = indirect_data[offset + 1];
|
||||
|
||||
if (!info->count || !info->instance_count)
|
||||
continue;
|
||||
|
||||
info->start = indirect_data[offset + 2];
|
||||
info->index_bias = indirect_data[offset + 3];
|
||||
info->start_instance = indirect_data[offset + 4];
|
||||
|
||||
u_vbuf_draw_vbo(mgr, info);
|
||||
}
|
||||
}
|
||||
|
||||
void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
|
||||
{
|
||||
struct pipe_context *pipe = mgr->pipe;
|
||||
@@ -1160,33 +1185,163 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
|
||||
|
||||
new_info = *info;
|
||||
|
||||
/* Fallback. We need to know all the parameters. */
|
||||
/* Handle indirect (multi)draws. */
|
||||
if (new_info.indirect) {
|
||||
struct pipe_transfer *transfer = NULL;
|
||||
int *data;
|
||||
const struct pipe_draw_indirect_info *indirect = new_info.indirect;
|
||||
unsigned draw_count = 0;
|
||||
|
||||
if (new_info.index_size) {
|
||||
data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
|
||||
new_info.indirect->offset, 20,
|
||||
PIPE_TRANSFER_READ, &transfer);
|
||||
new_info.index_bias = data[3];
|
||||
new_info.start_instance = data[4];
|
||||
}
|
||||
else {
|
||||
data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
|
||||
new_info.indirect->offset, 16,
|
||||
PIPE_TRANSFER_READ, &transfer);
|
||||
new_info.start_instance = data[3];
|
||||
/* Get the number of draws. */
|
||||
if (indirect->indirect_draw_count) {
|
||||
pipe_buffer_read(pipe, indirect->indirect_draw_count,
|
||||
indirect->indirect_draw_count_offset,
|
||||
4, &draw_count);
|
||||
} else {
|
||||
draw_count = indirect->draw_count;
|
||||
}
|
||||
|
||||
new_info.count = data[0];
|
||||
new_info.instance_count = data[1];
|
||||
new_info.start = data[2];
|
||||
pipe_buffer_unmap(pipe, transfer);
|
||||
new_info.indirect = NULL;
|
||||
|
||||
if (!new_info.count)
|
||||
if (!draw_count)
|
||||
return;
|
||||
|
||||
unsigned data_size = (draw_count - 1) * indirect->stride +
|
||||
(new_info.index_size ? 20 : 16);
|
||||
unsigned *data = malloc(data_size);
|
||||
if (!data)
|
||||
return; /* report an error? */
|
||||
|
||||
/* Read the used buffer range only once, because the read can be
|
||||
* uncached.
|
||||
*/
|
||||
pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
|
||||
data);
|
||||
|
||||
if (info->index_size) {
|
||||
/* Indexed multidraw. */
|
||||
unsigned index_bias0 = data[3];
|
||||
bool index_bias_same = true;
|
||||
|
||||
/* If we invoke the translate path, we have to split the multidraw. */
|
||||
if (incompatible_vb_mask ||
|
||||
mgr->ve->incompatible_elem_mask) {
|
||||
u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
|
||||
indirect->stride, draw_count);
|
||||
free(data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* See if index_bias is the same for all draws. */
|
||||
for (unsigned i = 1; i < draw_count; i++) {
|
||||
if (data[i * indirect->stride / 4 + 3] != index_bias0) {
|
||||
index_bias_same = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Split the multidraw if index_bias is different. */
|
||||
if (!index_bias_same) {
|
||||
u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
|
||||
indirect->stride, draw_count);
|
||||
free(data);
|
||||
return;
|
||||
}
|
||||
|
||||
/* If we don't need to use the translate path and index_bias is
|
||||
* the same, we can process the multidraw with the time complexity
|
||||
* equal to 1 draw call (except for the index range computation).
|
||||
* We only need to compute the index range covering all draw calls
|
||||
* of the multidraw.
|
||||
*
|
||||
* The driver will not look at these values because indirect != NULL.
|
||||
* These values determine the user buffer bounds to upload.
|
||||
*/
|
||||
new_info.index_bias = index_bias0;
|
||||
new_info.min_index = ~0u;
|
||||
new_info.max_index = 0;
|
||||
new_info.start_instance = ~0u;
|
||||
unsigned end_instance = 0;
|
||||
|
||||
struct pipe_transfer *transfer = NULL;
|
||||
const uint8_t *indices;
|
||||
|
||||
if (info->has_user_indices) {
|
||||
indices = (uint8_t*)info->index.user;
|
||||
} else {
|
||||
indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
|
||||
PIPE_TRANSFER_READ, &transfer);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < draw_count; i++) {
|
||||
unsigned offset = i * indirect->stride / 4;
|
||||
unsigned start = data[offset + 2];
|
||||
unsigned count = data[offset + 0];
|
||||
unsigned start_instance = data[offset + 4];
|
||||
unsigned instance_count = data[offset + 1];
|
||||
|
||||
if (!count || !instance_count)
|
||||
continue;
|
||||
|
||||
/* Update the ranges of instances. */
|
||||
new_info.start_instance = MIN2(new_info.start_instance,
|
||||
start_instance);
|
||||
end_instance = MAX2(end_instance, start_instance + instance_count);
|
||||
|
||||
/* Update the index range. */
|
||||
unsigned min, max;
|
||||
new_info.count = count; /* only used by get_minmax_index */
|
||||
u_vbuf_get_minmax_index_mapped(&new_info,
|
||||
indices +
|
||||
new_info.index_size * start,
|
||||
&min, &max);
|
||||
|
||||
new_info.min_index = MIN2(new_info.min_index, min);
|
||||
new_info.max_index = MAX2(new_info.max_index, max);
|
||||
}
|
||||
free(data);
|
||||
|
||||
if (transfer)
|
||||
pipe_buffer_unmap(pipe, transfer);
|
||||
|
||||
/* Set the final instance count. */
|
||||
new_info.instance_count = end_instance - new_info.start_instance;
|
||||
|
||||
if (new_info.start_instance == ~0u || !new_info.instance_count)
|
||||
return;
|
||||
} else {
|
||||
/* Non-indexed multidraw.
|
||||
*
|
||||
* Keep the draw call indirect and compute minimums & maximums,
|
||||
* which will determine the user buffer bounds to upload, but
|
||||
* the driver will not look at these values because indirect != NULL.
|
||||
*
|
||||
* This efficiently processes the multidraw with the time complexity
|
||||
* equal to 1 draw call.
|
||||
*/
|
||||
new_info.start = ~0u;
|
||||
new_info.start_instance = ~0u;
|
||||
unsigned end_vertex = 0;
|
||||
unsigned end_instance = 0;
|
||||
|
||||
for (unsigned i = 0; i < draw_count; i++) {
|
||||
unsigned offset = i * indirect->stride / 4;
|
||||
unsigned start = data[offset + 2];
|
||||
unsigned count = data[offset + 0];
|
||||
unsigned start_instance = data[offset + 3];
|
||||
unsigned instance_count = data[offset + 1];
|
||||
|
||||
new_info.start = MIN2(new_info.start, start);
|
||||
new_info.start_instance = MIN2(new_info.start_instance,
|
||||
start_instance);
|
||||
|
||||
end_vertex = MAX2(end_vertex, start + count);
|
||||
end_instance = MAX2(end_instance, start_instance + instance_count);
|
||||
}
|
||||
|
||||
/* Set the final counts. */
|
||||
new_info.count = end_vertex - new_info.start;
|
||||
new_info.instance_count = end_instance - new_info.start_instance;
|
||||
|
||||
if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (new_info.index_size) {
|
||||
@@ -1211,7 +1366,8 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
|
||||
* We would have to break this drawing operation into several ones. */
|
||||
/* Use some heuristic to see if unrolling indices improves
|
||||
* performance. */
|
||||
if (!new_info.primitive_restart &&
|
||||
if (!info->indirect &&
|
||||
!new_info.primitive_restart &&
|
||||
num_vertices > new_info.count*2 &&
|
||||
num_vertices - new_info.count > 32 &&
|
||||
!u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
|
||||
|
@@ -2151,13 +2151,36 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
|
||||
{
|
||||
if (!su->getPredicate())
|
||||
return;
|
||||
|
||||
bld.setPosition(su, true);
|
||||
|
||||
for (unsigned i = 0; su->defExists(i); ++i) {
|
||||
ValueDef &def = su->def(i);
|
||||
|
||||
Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
|
||||
assert(su->cc == CC_NOT_P);
|
||||
mov->setPredicate(CC_P, su->getPredicate());
|
||||
Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
|
||||
|
||||
def.replace(uni->getDef(0), false);
|
||||
uni->setSrc(0, def.get());
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
|
||||
{
|
||||
processSurfaceCoordsNVE4(su);
|
||||
|
||||
if (su->op == OP_SULDP)
|
||||
if (su->op == OP_SULDP) {
|
||||
convertSurfaceFormat(su);
|
||||
insertOOBSurfaceOpResult(su);
|
||||
}
|
||||
|
||||
if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
|
||||
assert(su->getPredicate());
|
||||
@@ -2267,8 +2290,10 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
|
||||
|
||||
processSurfaceCoordsNVC0(su);
|
||||
|
||||
if (su->op == OP_SULDP)
|
||||
if (su->op == OP_SULDP) {
|
||||
convertSurfaceFormat(su);
|
||||
insertOOBSurfaceOpResult(su);
|
||||
}
|
||||
|
||||
if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
|
||||
const int dim = su->tex.target.getDim();
|
||||
@@ -2370,8 +2395,10 @@ NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
|
||||
{
|
||||
processSurfaceCoordsGM107(su);
|
||||
|
||||
if (su->op == OP_SULDP)
|
||||
if (su->op == OP_SULDP) {
|
||||
convertSurfaceFormat(su);
|
||||
insertOOBSurfaceOpResult(su);
|
||||
}
|
||||
|
||||
if (su->op == OP_SUREDP) {
|
||||
Value *def = su->getDef(0);
|
||||
|
@@ -172,6 +172,7 @@ private:
|
||||
void processSurfaceCoordsNVE4(TexInstruction *);
|
||||
void processSurfaceCoordsNVC0(TexInstruction *);
|
||||
void convertSurfaceFormat(TexInstruction *);
|
||||
void insertOOBSurfaceOpResult(TexInstruction *);
|
||||
Value *calculateSampleOffset(Value *sampleID);
|
||||
|
||||
protected:
|
||||
|
@@ -715,7 +715,6 @@ static void compute_emit_cs(struct r600_context *rctx,
|
||||
rctx->cmd_buf_is_compute = true;
|
||||
}
|
||||
|
||||
r600_need_cs_space(rctx, 0, true);
|
||||
if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
|
||||
r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
|
||||
current = rctx->cs_shader_state.shader->sel->current;
|
||||
@@ -742,16 +741,22 @@ static void compute_emit_cs(struct r600_context *rctx,
|
||||
}
|
||||
rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
|
||||
rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
|
||||
|
||||
evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
|
||||
r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
|
||||
|
||||
if (need_buf_const) {
|
||||
eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
|
||||
}
|
||||
r600_update_driver_const_buffers(rctx, true);
|
||||
|
||||
if (evergreen_emit_atomic_buffer_setup(rctx, current, combined_atomics, &atomic_used_mask)) {
|
||||
evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
|
||||
if (atomic_used_mask) {
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
||||
}
|
||||
}
|
||||
} else
|
||||
r600_need_cs_space(rctx, 0, true, 0);
|
||||
|
||||
/* Initialize all the compute-related registers.
|
||||
*
|
||||
|
@@ -109,7 +109,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
|
||||
|
||||
r600_need_cs_space(rctx,
|
||||
10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
|
||||
R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
|
||||
R600_MAX_PFP_SYNC_ME_DWORDS, FALSE, 0);
|
||||
|
||||
/* Flush the caches for the first copy only. */
|
||||
if (rctx->b.flags) {
|
||||
|
@@ -4030,7 +4030,6 @@ static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
|
||||
|
||||
if (!buffers || !buffers[idx].buffer) {
|
||||
pipe_resource_reference(&abuf->buffer, NULL);
|
||||
astate->enabled_mask &= ~(1 << i);
|
||||
continue;
|
||||
}
|
||||
buf = &buffers[idx];
|
||||
@@ -4038,7 +4037,6 @@ static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
|
||||
pipe_resource_reference(&abuf->buffer, buf->buffer);
|
||||
abuf->buffer_offset = buf->buffer_offset;
|
||||
abuf->buffer_size = buf->buffer_size;
|
||||
astate->enabled_mask |= (1 << i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4868,20 +4866,15 @@ static void cayman_write_count_to_gds(struct r600_context *rctx,
|
||||
radeon_emit(cs, reloc);
|
||||
}
|
||||
|
||||
bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
|
||||
struct r600_pipe_shader *cs_shader,
|
||||
struct r600_shader_atomic *combined_atomics,
|
||||
uint8_t *atomic_used_mask_p)
|
||||
void evergreen_emit_atomic_buffer_setup_count(struct r600_context *rctx,
|
||||
struct r600_pipe_shader *cs_shader,
|
||||
struct r600_shader_atomic *combined_atomics,
|
||||
uint8_t *atomic_used_mask_p)
|
||||
{
|
||||
struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
|
||||
unsigned pkt_flags = 0;
|
||||
uint8_t atomic_used_mask = 0;
|
||||
int i, j, k;
|
||||
bool is_compute = cs_shader ? true : false;
|
||||
|
||||
if (is_compute)
|
||||
pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
|
||||
|
||||
for (i = 0; i < (is_compute ? 1 : EG_NUM_HW_STAGES); i++) {
|
||||
uint8_t num_atomic_stage;
|
||||
struct r600_pipe_shader *pshader;
|
||||
@@ -4914,8 +4907,25 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
|
||||
}
|
||||
}
|
||||
}
|
||||
*atomic_used_mask_p = atomic_used_mask;
|
||||
}
|
||||
|
||||
void evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
|
||||
bool is_compute,
|
||||
struct r600_shader_atomic *combined_atomics,
|
||||
uint8_t atomic_used_mask)
|
||||
{
|
||||
struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
|
||||
unsigned pkt_flags = 0;
|
||||
uint32_t mask;
|
||||
|
||||
if (is_compute)
|
||||
pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
|
||||
|
||||
mask = atomic_used_mask;
|
||||
if (!mask)
|
||||
return;
|
||||
|
||||
uint32_t mask = atomic_used_mask;
|
||||
while (mask) {
|
||||
unsigned atomic_index = u_bit_scan(&mask);
|
||||
struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
|
||||
@@ -4927,8 +4937,6 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
|
||||
else
|
||||
evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
|
||||
}
|
||||
*atomic_used_mask_p = atomic_used_mask;
|
||||
return true;
|
||||
}
|
||||
|
||||
void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
|
||||
@@ -4940,7 +4948,7 @@ void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
|
||||
struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
|
||||
uint32_t pkt_flags = 0;
|
||||
uint32_t event = EVENT_TYPE_PS_DONE;
|
||||
uint32_t mask = astate->enabled_mask;
|
||||
uint32_t mask;
|
||||
uint64_t dst_offset;
|
||||
unsigned reloc;
|
||||
|
||||
|
@@ -31,7 +31,7 @@
|
||||
|
||||
|
||||
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
|
||||
boolean count_draw_in)
|
||||
boolean count_draw_in, unsigned num_atomics)
|
||||
{
|
||||
/* Flush the DMA IB if it's not empty. */
|
||||
if (radeon_emitted(ctx->b.dma.cs, 0))
|
||||
@@ -61,6 +61,9 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
|
||||
num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
|
||||
}
|
||||
|
||||
/* add atomic counters, 8 pre + 8 post per counter + 16 post if any counters */
|
||||
num_dw += (num_atomics * 16) + (num_atomics ? 16 : 0);
|
||||
|
||||
/* Count in r600_suspend_queries. */
|
||||
num_dw += ctx->b.num_cs_dw_queries_suspend;
|
||||
|
||||
@@ -526,7 +529,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
|
||||
|
||||
r600_need_cs_space(rctx,
|
||||
10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
|
||||
3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
|
||||
3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE, 0);
|
||||
|
||||
/* Flush the caches for the first copy only. */
|
||||
if (rctx->b.flags) {
|
||||
|
@@ -446,8 +446,6 @@ struct r600_shader_state {
|
||||
};
|
||||
|
||||
struct r600_atomic_buffer_state {
|
||||
uint32_t enabled_mask;
|
||||
uint32_t dirty_mask;
|
||||
struct pipe_shader_buffer buffer[EG_MAX_ATOMIC_BUFFERS];
|
||||
};
|
||||
|
||||
@@ -773,7 +771,7 @@ void r600_context_gfx_flush(void *context, unsigned flags,
|
||||
struct pipe_fence_handle **fence);
|
||||
void r600_begin_new_cs(struct r600_context *ctx);
|
||||
void r600_flush_emit(struct r600_context *ctx);
|
||||
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
|
||||
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in, unsigned num_atomics);
|
||||
void r600_emit_pfp_sync_me(struct r600_context *rctx);
|
||||
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
|
||||
struct pipe_resource *dst, uint64_t dst_offset,
|
||||
@@ -1067,10 +1065,14 @@ void r600_delete_shader_selector(struct pipe_context *ctx,
|
||||
struct r600_pipe_shader_selector *sel);
|
||||
|
||||
struct r600_shader_atomic;
|
||||
bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
|
||||
struct r600_pipe_shader *cs_shader,
|
||||
void evergreen_emit_atomic_buffer_setup_count(struct r600_context *rctx,
|
||||
struct r600_pipe_shader *cs_shader,
|
||||
struct r600_shader_atomic *combined_atomics,
|
||||
uint8_t *atomic_used_mask_p);
|
||||
void evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
|
||||
bool is_compute,
|
||||
struct r600_shader_atomic *combined_atomics,
|
||||
uint8_t *atomic_used_mask_p);
|
||||
uint8_t atomic_used_mask);
|
||||
void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
|
||||
bool is_compute,
|
||||
struct r600_shader_atomic *combined_atomics,
|
||||
|
@@ -2085,8 +2085,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
|
||||
: (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]
|
||||
: info->mode;
|
||||
|
||||
if (rctx->b.chip_class >= EVERGREEN)
|
||||
evergreen_emit_atomic_buffer_setup(rctx, NULL, combined_atomics, &atomic_used_mask);
|
||||
if (rctx->b.chip_class >= EVERGREEN) {
|
||||
evergreen_emit_atomic_buffer_setup_count(rctx, NULL, combined_atomics, &atomic_used_mask);
|
||||
}
|
||||
|
||||
if (index_size) {
|
||||
index_offset += info->start * index_size;
|
||||
@@ -2172,7 +2173,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
|
||||
evergreen_setup_tess_constants(rctx, info, &num_patches);
|
||||
|
||||
/* Emit states. */
|
||||
r600_need_cs_space(rctx, has_user_indices ? 5 : 0, TRUE);
|
||||
r600_need_cs_space(rctx, has_user_indices ? 5 : 0, TRUE, util_bitcount(atomic_used_mask));
|
||||
r600_flush_emit(rctx);
|
||||
|
||||
mask = rctx->dirty_atoms;
|
||||
@@ -2180,6 +2181,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
|
||||
r600_emit_atom(rctx, rctx->atoms[u_bit_scan64(&mask)]);
|
||||
}
|
||||
|
||||
if (rctx->b.chip_class >= EVERGREEN) {
|
||||
evergreen_emit_atomic_buffer_setup(rctx, false, combined_atomics, atomic_used_mask);
|
||||
}
|
||||
|
||||
if (rctx->b.chip_class == CAYMAN) {
|
||||
/* Copied from radeonsi. */
|
||||
unsigned primgroup_size = 128; /* recommended without a GS */
|
||||
@@ -3284,7 +3289,7 @@ static void r600_set_active_query_state(struct pipe_context *ctx, boolean enable
|
||||
static void r600_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
|
||||
bool include_draw_vbo)
|
||||
{
|
||||
r600_need_cs_space((struct r600_context*)ctx, num_dw, include_draw_vbo);
|
||||
r600_need_cs_space((struct r600_context*)ctx, num_dw, include_draw_vbo, 0);
|
||||
}
|
||||
|
||||
/* keep this at the end of this file, please */
|
||||
|
@@ -114,6 +114,7 @@ static void si_init_compiler(struct si_screen *sscreen,
|
||||
sscreen->info.chip_class <= VI;
|
||||
|
||||
enum ac_target_machine_options tm_options =
|
||||
AC_TM_AUTO_WAITCNT_BEFORE_BARRIER |
|
||||
(sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) |
|
||||
(sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
|
||||
(sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
|
||||
|
@@ -37,7 +37,7 @@ extern "C" {
|
||||
struct pipe_screen *swr_create_screen(struct sw_winsys *winsys);
|
||||
|
||||
// arch-specific dll entry point
|
||||
PUBLIC struct pipe_screen *swr_create_screen_internal(struct sw_winsys *winsys);
|
||||
struct pipe_screen *swr_create_screen_internal(struct sw_winsys *winsys);
|
||||
|
||||
// cleanup for failed screen creation
|
||||
void swr_destroy_screen_internal(struct swr_screen **screen);
|
||||
|
@@ -1143,12 +1143,10 @@ swr_validate_env_options(struct swr_screen *screen)
|
||||
}
|
||||
|
||||
|
||||
PUBLIC
|
||||
struct pipe_screen *
|
||||
swr_create_screen_internal(struct sw_winsys *winsys)
|
||||
{
|
||||
struct swr_screen *screen = CALLOC_STRUCT(swr_screen);
|
||||
memset(screen, 0, sizeof(struct swr_screen));
|
||||
|
||||
if (!screen)
|
||||
return NULL;
|
||||
|
@@ -585,6 +585,8 @@ v3d_get_device_info(struct v3d_screen *screen)
|
||||
uint32_t minor = (ident1.value >> 0) & 0xf;
|
||||
screen->devinfo.ver = major * 10 + minor;
|
||||
|
||||
screen->devinfo.vpm_size = (ident1.value >> 28 & 0xf) * 1024;
|
||||
|
||||
switch (screen->devinfo.ver) {
|
||||
case 33:
|
||||
case 41:
|
||||
|
@@ -306,6 +306,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
|
||||
}
|
||||
}
|
||||
|
||||
cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
|
||||
vcm.number_of_16_vertex_batches_for_binning =
|
||||
v3d->prog.cs->prog_data.vs->vcm_cache_size;
|
||||
vcm.number_of_16_vertex_batches_for_rendering =
|
||||
v3d->prog.vs->prog_data.vs->vcm_cache_size;
|
||||
}
|
||||
|
||||
cl_emit(&job->bcl, GL_SHADER_STATE, state) {
|
||||
state.address = cl_address(job->indirect.bo, shader_rec_offset);
|
||||
state.number_of_attribute_arrays = num_elements_to_emit;
|
||||
|
@@ -222,6 +222,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
|
||||
attr.coordinate_shader_vpm_offset = 0;
|
||||
attr.vertex_shader_vpm_offset = 0;
|
||||
}
|
||||
|
||||
vc4_bo_unreference(&bo);
|
||||
}
|
||||
|
||||
cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {
|
||||
|
@@ -121,7 +121,8 @@ vc4_fence_server_sync(struct pipe_context *pctx,
|
||||
struct vc4_context *vc4 = vc4_context(pctx);
|
||||
struct vc4_fence *fence = vc4_fence(pfence);
|
||||
|
||||
sync_accumulate("vc4", &vc4->in_fence_fd, fence->fd);
|
||||
if (fence->fd >= 0)
|
||||
sync_accumulate("vc4", &vc4->in_fence_fd, fence->fd);
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -142,8 +143,12 @@ vc4_fence_context_init(struct vc4_context *vc4)
|
||||
/* Since we initialize the in_fence_fd to -1 (no wait necessary),
|
||||
* we also need to initialize our in_syncobj as signaled.
|
||||
*/
|
||||
return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
|
||||
&vc4->in_syncobj);
|
||||
if (vc4->screen->has_syncobj) {
|
||||
return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
|
||||
&vc4->in_syncobj);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@@ -38,6 +38,7 @@
|
||||
#include "vc4_context.h"
|
||||
#include "vc4_qpu.h"
|
||||
#include "vc4_qir.h"
|
||||
#include "mesa/state_tracker/st_glsl_types.h"
|
||||
|
||||
static struct qreg
|
||||
ntq_get_src(struct vc4_compile *c, nir_src src, int i);
|
||||
@@ -50,6 +51,12 @@ type_size(const struct glsl_type *type)
|
||||
return glsl_count_attribute_slots(type, false);
|
||||
}
|
||||
|
||||
static int
|
||||
uniforms_type_size(const struct glsl_type *type)
|
||||
{
|
||||
return st_glsl_storage_type_size(type, false);
|
||||
}
|
||||
|
||||
static void
|
||||
resize_qreg_array(struct vc4_compile *c,
|
||||
struct qreg **regs,
|
||||
@@ -1685,7 +1692,7 @@ static void
|
||||
ntq_setup_uniforms(struct vc4_compile *c)
|
||||
{
|
||||
nir_foreach_variable(var, &c->s->uniforms) {
|
||||
uint32_t vec4_count = type_size(var->type);
|
||||
uint32_t vec4_count = uniforms_type_size(var->type);
|
||||
unsigned vec4_size = 4 * sizeof(float);
|
||||
|
||||
declare_uniform_range(c, var->data.driver_location * vec4_size,
|
||||
@@ -2469,9 +2476,13 @@ vc4_shader_state_create(struct pipe_context *pctx,
|
||||
*/
|
||||
s = cso->ir.nir;
|
||||
|
||||
NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size,
|
||||
NIR_PASS_V(s, nir_lower_io, nir_var_all & ~nir_var_uniform,
|
||||
type_size,
|
||||
(nir_lower_io_options)0);
|
||||
} else {
|
||||
NIR_PASS_V(s, nir_lower_io, nir_var_uniform,
|
||||
uniforms_type_size,
|
||||
(nir_lower_io_options)0);
|
||||
} else {
|
||||
assert(cso->type == PIPE_SHADER_IR_TGSI);
|
||||
|
||||
if (vc4_debug & VC4_DEBUG_TGSI) {
|
||||
|
@@ -614,7 +614,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
|
||||
}
|
||||
|
||||
so->texture_p0 =
|
||||
(VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
|
||||
(VC4_SET_FIELD((rsc->slices[0].offset +
|
||||
cso->u.tex.first_layer *
|
||||
rsc->cube_map_stride) >> 12, VC4_TEX_P0_OFFSET) |
|
||||
VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) |
|
||||
VC4_SET_FIELD(so->force_first_level ?
|
||||
cso->u.tex.last_level :
|
||||
|
@@ -26,8 +26,12 @@
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
#if !defined(ANDROID) || ANDROID_API_LEVEL >= 26
|
||||
/* Android's libc began supporting shm in Oreo */
|
||||
#define HAVE_SHM
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/shm.h>
|
||||
#endif
|
||||
|
||||
#include "pipe/p_compiler.h"
|
||||
#include "pipe/p_format.h"
|
||||
@@ -83,6 +87,7 @@ dri_sw_is_displaytarget_format_supported( struct sw_winsys *ws,
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
#ifdef HAVE_SHM
|
||||
static char *
|
||||
alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)
|
||||
{
|
||||
@@ -101,6 +106,7 @@ alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)
|
||||
|
||||
return addr;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct sw_displaytarget *
|
||||
dri_sw_displaytarget_create(struct sw_winsys *winsys,
|
||||
@@ -131,8 +137,11 @@ dri_sw_displaytarget_create(struct sw_winsys *winsys,
|
||||
size = dri_sw_dt->stride * nblocksy;
|
||||
|
||||
dri_sw_dt->shmid = -1;
|
||||
|
||||
#ifdef HAVE_SHM
|
||||
if (ws->lf->put_image_shm)
|
||||
dri_sw_dt->data = alloc_shm(dri_sw_dt, size);
|
||||
#endif
|
||||
|
||||
if(!dri_sw_dt->data)
|
||||
dri_sw_dt->data = align_malloc(size, alignment);
|
||||
@@ -156,8 +165,10 @@ dri_sw_displaytarget_destroy(struct sw_winsys *ws,
|
||||
struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);
|
||||
|
||||
if (dri_sw_dt->shmid >= 0) {
|
||||
#ifdef HAVE_SHM
|
||||
shmdt(dri_sw_dt->data);
|
||||
shmctl(dri_sw_dt->shmid, IPC_RMID, 0);
|
||||
#endif
|
||||
} else {
|
||||
align_free(dri_sw_dt->data);
|
||||
}
|
||||
|
@@ -176,6 +176,8 @@ kms_sw_displaytarget_create(struct sw_winsys *ws,
|
||||
|
||||
list_inithead(&kms_sw_dt->planes);
|
||||
kms_sw_dt->ref_count = 1;
|
||||
kms_sw_dt->mapped = MAP_FAILED;
|
||||
kms_sw_dt->ro_mapped = MAP_FAILED;
|
||||
|
||||
kms_sw_dt->format = format;
|
||||
|
||||
@@ -262,7 +264,7 @@ kms_sw_displaytarget_map(struct sw_winsys *ws,
|
||||
|
||||
prot = (flags == PIPE_TRANSFER_READ) ? PROT_READ : (PROT_READ | PROT_WRITE);
|
||||
void **ptr = (flags == PIPE_TRANSFER_READ) ? &kms_sw_dt->ro_mapped : &kms_sw_dt->mapped;
|
||||
if (!*ptr) {
|
||||
if (*ptr == MAP_FAILED) {
|
||||
void *tmp = mmap(0, kms_sw_dt->size, prot, MAP_SHARED,
|
||||
kms_sw->fd, map_req.offset);
|
||||
if (tmp == MAP_FAILED)
|
||||
@@ -332,6 +334,8 @@ kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd,
|
||||
FREE(kms_sw_dt);
|
||||
return NULL;
|
||||
}
|
||||
kms_sw_dt->mapped = MAP_FAILED;
|
||||
kms_sw_dt->ro_mapped = MAP_FAILED;
|
||||
kms_sw_dt->size = lseek_ret;
|
||||
kms_sw_dt->ref_count = 1;
|
||||
kms_sw_dt->handle = handle;
|
||||
@@ -368,10 +372,14 @@ kms_sw_displaytarget_unmap(struct sw_winsys *ws,
|
||||
DEBUG_PRINT("KMS-DEBUG: unmapped buffer %u (was %p)\n", kms_sw_dt->handle, kms_sw_dt->mapped);
|
||||
DEBUG_PRINT("KMS-DEBUG: unmapped buffer %u (was %p)\n", kms_sw_dt->handle, kms_sw_dt->ro_mapped);
|
||||
|
||||
munmap(kms_sw_dt->mapped, kms_sw_dt->size);
|
||||
kms_sw_dt->mapped = NULL;
|
||||
munmap(kms_sw_dt->ro_mapped, kms_sw_dt->size);
|
||||
kms_sw_dt->ro_mapped = NULL;
|
||||
if (kms_sw_dt->mapped != MAP_FAILED) {
|
||||
munmap(kms_sw_dt->mapped, kms_sw_dt->size);
|
||||
kms_sw_dt->mapped = MAP_FAILED;
|
||||
}
|
||||
if (kms_sw_dt->ro_mapped != MAP_FAILED) {
|
||||
munmap(kms_sw_dt->ro_mapped, kms_sw_dt->size);
|
||||
kms_sw_dt->ro_mapped = MAP_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
static struct sw_displaytarget *
|
||||
|
@@ -19,9 +19,6 @@
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
if HAVE_SHARED_GLAPI
|
||||
SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
|
||||
endif
|
||||
|
||||
SUBDIRS =
|
||||
|
||||
@@ -181,7 +178,7 @@ GL_LIBS = \
|
||||
$(LIBDRM_LIBS) \
|
||||
libglx.la \
|
||||
$(top_builddir)/src/mapi/glapi/libglapi.la \
|
||||
$(SHARED_GLAPI_LIB) \
|
||||
$(top_builddir)/src/mapi/shared-glapi/libglapi.la \
|
||||
$(GL_LIB_DEPS)
|
||||
|
||||
GL_LDFLAGS = \
|
||||
|
@@ -152,7 +152,7 @@ static const struct extension_info known_glx_extensions[] = {
|
||||
{ GLX(ATI_pixel_format_float), VER(0,0), N, N, N, N },
|
||||
{ GLX(INTEL_swap_event), VER(0,0), Y, N, N, N },
|
||||
{ GLX(MESA_copy_sub_buffer), VER(0,0), Y, N, N, N },
|
||||
{ GLX(MESA_multithread_makecurrent),VER(0,0), Y, N, Y, N },
|
||||
{ GLX(MESA_multithread_makecurrent),VER(0,0), Y, N, N, Y },
|
||||
{ GLX(MESA_query_renderer), VER(0,0), Y, N, N, Y },
|
||||
{ GLX(MESA_swap_control), VER(0,0), Y, N, N, Y },
|
||||
{ GLX(NV_float_buffer), VER(0,0), N, N, N, N },
|
||||
|
@@ -21,7 +21,9 @@
|
||||
|
||||
noinst_PROGRAMS += \
|
||||
tools/aubinator \
|
||||
tools/aubinator_error_decode
|
||||
tools/aubinator_error_decode \
|
||||
tools/error2aub
|
||||
|
||||
|
||||
tools_aubinator_SOURCES = \
|
||||
tools/aubinator.c \
|
||||
@@ -59,3 +61,23 @@ tools_aubinator_error_decode_LDADD = \
|
||||
tools_aubinator_error_decode_CFLAGS = \
|
||||
$(AM_CFLAGS) \
|
||||
$(ZLIB_CFLAGS)
|
||||
|
||||
|
||||
tools_error2aub_SOURCES = \
|
||||
tools/gen_context.h \
|
||||
tools/gen8_context.h \
|
||||
tools/gen10_context.h \
|
||||
tools/aub_write.h \
|
||||
tools/aub_write.c \
|
||||
tools/error2aub.c
|
||||
|
||||
tools_error2aub_CFLAGS = \
|
||||
$(AM_CFLAGS) \
|
||||
$(ZLIB_CFLAGS)
|
||||
|
||||
tools_error2aub_LDADD = \
|
||||
dev/libintel_dev.la \
|
||||
$(PTHREAD_LIBS) \
|
||||
$(DLOPEN_LIBS) \
|
||||
$(ZLIB_LIBS) \
|
||||
-lm
|
||||
|
@@ -75,18 +75,6 @@ brw_blorp_surface_info_init(struct blorp_context *blorp,
|
||||
if (format == ISL_FORMAT_UNSUPPORTED)
|
||||
format = surf->surf->format;
|
||||
|
||||
if (format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
|
||||
/* Unfortunately, ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as
|
||||
* a render target, which would prevent us from blitting to 24-bit
|
||||
* depth. The miptree consists of 32 bits per pixel, arranged as 24-bit
|
||||
* depth values interleaved with 8 "don't care" bits. Since depth
|
||||
* values don't require any blending, it doesn't matter how we interpret
|
||||
* the bit pattern as long as we copy the right amount of data, so just
|
||||
* map it as 8-bit BGRA.
|
||||
*/
|
||||
format = ISL_FORMAT_B8G8R8A8_UNORM;
|
||||
}
|
||||
|
||||
info->surf = *surf->surf;
|
||||
info->addr = surf->addr;
|
||||
|
||||
|
@@ -776,6 +776,14 @@ blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
    * grid of samples with in a pixel. Sample number layout shows the
    * rectangular grid of samples roughly corresponding to the real sample
    * locations with in a pixel.
    *
    * In the case of 2x MSAA, the layout of sample indices is reversed from
    * the layout of sample numbers:
    *
    * sample index layout :  ---------    sample number layout :  ---------
    *                        | 0 | 1 |                            | 1 | 0 |
    *                        ---------                            ---------
    *
    * In case of 4x MSAA, layout of sample indices matches the layout of
    * sample numbers:
    *           ---------
@@ -819,7 +827,9 @@ blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
                                     key->x_scale * key->y_scale));
   sample = nir_f2i32(b, sample);

   if (tex_samples == 8) {
   if (tex_samples == 2) {
      sample = nir_isub(b, nir_imm_int(b, 1), sample);
   } else if (tex_samples == 8) {
      sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),
                                    nir_ishl(b, sample, nir_imm_int(b, 2))),
                        nir_imm_int(b, 0xf));
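
For reference, the added 8x MSAA path packs the index-to-sample-number mapping into the constant 0x64210573, four bits per sample, while the 2x path simply swaps indices 0 and 1. A minimal standalone sketch of that arithmetic (the helper and test below are illustrative, not part of the patch):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* CPU-side illustration of the remap the NIR code emits: for 2x MSAA the
 * sample number is (1 - index); for 8x MSAA the mapping {3, 7, 5, 0, 1, 2,
 * 4, 6} is packed as nibbles into 0x64210573 and extracted with a shift
 * and mask. */
static uint32_t
remap_sample_index(uint32_t index, unsigned tex_samples)
{
   if (tex_samples == 2)
      return 1 - index;
   if (tex_samples == 8)
      return (0x64210573u >> (index * 4)) & 0xf;
   return index;
}

int
main(void)
{
   /* Same table as map_8x in gen6_set_sample_maps() further down. */
   static const uint32_t map_8x[8] = {3, 7, 5, 0, 1, 2, 4, 6};

   for (uint32_t i = 0; i < 8; i++)
      assert(remap_sample_index(i, 8) == map_8x[i]);
   assert(remap_sample_index(0, 2) == 1 && remap_sample_index(1, 2) == 0);

   printf("sample remap tables verified\n");
   return 0;
}
```
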
@@ -984,14 +994,14 @@ convert_color(struct nir_builder *b, nir_ssa_def *color,
   nir_ssa_def *value;

   if (key->dst_format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
      /* The destination image is bound as R32_UNORM but the data needs to be
      /* The destination image is bound as R32_UINT but the data needs to be
       * in R24_UNORM_X8_TYPELESS. The bottom 24 are the actual data and the
       * top 8 need to be zero. We can accomplish this by simply multiplying
       * by a factor to scale things down.
       */
      float factor = (float)((1 << 24) - 1) / (float)UINT32_MAX;
      value = nir_fmul(b, nir_fsat(b, nir_channel(b, color, 0)),
                       nir_imm_float(b, factor));
      unsigned factor = (1 << 24) - 1;
      value = nir_fsat(b, nir_channel(b, color, 0));
      value = nir_f2i32(b, nir_fmul(b, value, nir_imm_float(b, factor)));
   } else if (key->dst_format == ISL_FORMAT_L8_UNORM_SRGB) {
      value = nir_format_linear_to_srgb(b, nir_channel(b, color, 0));
   } else if (key->dst_format == ISL_FORMAT_R8G8B8_UNORM_SRGB) {
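
The replacement shader math clamps the red channel to [0, 1], scales it by (1 << 24) - 1, and converts it to an integer, so the low 24 bits carry the depth value and the top 8 bits of the R32_UINT destination stay zero. A minimal CPU-side sketch of the same arithmetic (the helper name is hypothetical):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the shader path: clamp to [0, 1], scale by 2^24 - 1, convert
 * to an integer. The result always fits in the low 24 bits, leaving the
 * upper 8 bits of the R32_UINT destination zero. */
static uint32_t
pack_depth24(float depth)
{
   const float factor = (float)((1u << 24) - 1);
   if (depth < 0.0f) depth = 0.0f;
   if (depth > 1.0f) depth = 1.0f;
   return (uint32_t)(depth * factor);
}

int
main(void)
{
   assert(pack_depth24(0.0f) == 0u);
   assert(pack_depth24(1.0f) == 0xffffffu);   /* top 8 bits are zero */
   assert(pack_depth24(2.0f) == 0xffffffu);   /* saturation */
   printf("0.5 -> 0x%06x\n", pack_depth24(0.5f));
   return 0;
}
```
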
@@ -1976,7 +1986,7 @@ try_blorp_blit(struct blorp_batch *batch,
         isl_format_rgbx_to_rgba(params->dst.view.format);
   } else if (params->dst.view.format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
      wm_prog_key->dst_format = params->dst.view.format;
      params->dst.view.format = ISL_FORMAT_R32_UNORM;
      params->dst.view.format = ISL_FORMAT_R32_UINT;
   } else if (params->dst.view.format == ISL_FORMAT_A4B4G4R4_UNORM) {
      params->dst.view.swizzle =
         isl_swizzle_compose(params->dst.view.swizzle,
@@ -2240,6 +2250,17 @@ blorp_blit(struct blorp_batch *batch,
      }
   }

   /* ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as a render target,
    * which requires shader math to render to it. Blitting Z24X8 to Z24X8
    * is fairly common though, so we'd like to avoid it. Since we don't need
    * to blend depth values, we can simply pick a renderable format with the
    * right number of bits-per-pixel, like 8-bit BGRA.
    */
   if (dst_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS &&
       src_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
      src_format = dst_format = ISL_FORMAT_B8G8R8A8_UNORM;
   }

   brw_blorp_surface_info_init(batch->blorp, &params.src, src_surf, src_level,
                               src_layer, src_format, false);
   brw_blorp_surface_info_init(batch->blorp, &params.dst, dst_surf, dst_level,
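
Both this comment and the earlier one in brw_blorp_surface_info_init rest on the same observation: a Z24X8 texel and a B8G8R8A8 texel are both 32 bits, so a raw copy through the renderable color format leaves the depth payload untouched. A trivial sanity check of that reasoning (plain C, illustrative only):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   /* A Z24X8 texel: 24 bits of depth in the low bits, 8 "don't care" bits
    * on top. Both Z24X8 and B8G8R8A8 are 32 bits per pixel, so a blit that
    * copies raw texels through the color format cannot lose depth data as
    * long as no blending or format conversion is applied. */
   uint32_t z24x8 = (0xABu << 24) | 0x123456u;   /* padding 0xAB, depth 0x123456 */
   uint32_t bgra  = z24x8;                       /* the blit copies all 32 bits */

   assert((bgra & 0x00ffffffu) == 0x123456u);    /* depth payload intact */
   printf("depth bits after round trip: 0x%06x\n", bgra & 0x00ffffffu);
   return 0;
}
```
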
@@ -42,10 +42,10 @@ prefix##0YOffset = 0.5;
 * c 1
 */
#define GEN_SAMPLE_POS_2X(prefix) \
prefix##0XOffset = 0.25; \
prefix##0YOffset = 0.25; \
prefix##1XOffset = 0.75; \
prefix##1YOffset = 0.75;
prefix##0XOffset = 0.75; \
prefix##0YOffset = 0.75; \
prefix##1XOffset = 0.25; \
prefix##1YOffset = 0.25;

/**
 * Sample positions:
@@ -5115,6 +5115,25 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
      }
   }

   if (devinfo->gen < 6) {
      /* From the G45 PRM, Volume 4 Page 361:
       *
       *    "Operand Alignment Rule: With the exceptions listed below, a
       *     source/destination operand in general should be aligned to even
       *     256-bit physical register with a region size equal to two 256-bit
       *     physical registers."
       *
       * Normally we enforce this by allocating virtual registers to the
       * even-aligned class. But we need to handle payload registers.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
             inst->size_read(i) > REG_SIZE) {
            max_width = MIN2(max_width, 8);
         }
      }
   }

   /* From the IVB PRMs:
    * "When an instruction is SIMD32, the low 16 bits of the execution mask
    *  are applied for both halves of the SIMD32 instruction. If different
@@ -6321,6 +6340,7 @@ fs_visitor::optimize()
   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(lower_simd_width);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }
@@ -713,18 +713,6 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
   nir_validate_shader(*producer);
   nir_validate_shader(*consumer);

   const bool p_is_scalar =
      compiler->scalar_stage[(*producer)->info.stage];
   const bool c_is_scalar =
      compiler->scalar_stage[(*consumer)->info.stage];

   if (p_is_scalar && c_is_scalar) {
      NIR_PASS_V(*producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
      NIR_PASS_V(*consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
      *producer = brw_nir_optimize(*producer, compiler, p_is_scalar);
      *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar);
   }

   NIR_PASS_V(*producer, nir_remove_dead_variables, nir_var_shader_out);
   NIR_PASS_V(*consumer, nir_remove_dead_variables, nir_var_shader_in);

@@ -741,7 +729,12 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
      NIR_PASS_V(*consumer, nir_lower_indirect_derefs,
                 brw_nir_no_indirect_mask(compiler, (*consumer)->info.stage));

      const bool p_is_scalar =
         compiler->scalar_stage[(*producer)->info.stage];
      *producer = brw_nir_optimize(*producer, compiler, p_is_scalar);

      const bool c_is_scalar =
         compiler->scalar_stage[(*consumer)->info.stage];
      *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar);
   }
}
@@ -590,7 +590,7 @@ handle_memtrace_reg_write(uint32_t *p)
   uint32_t pphwsp_addr = context_descriptor & 0xfffff000;
   struct gen_batch_decode_bo pphwsp_bo = get_ggtt_batch_bo(NULL, pphwsp_addr);
   uint32_t *context = (uint32_t *)((uint8_t *)pphwsp_bo.map +
                                    (pphwsp_bo.addr - pphwsp_addr) +
                                    (pphwsp_addr - pphwsp_bo.addr) +
                                    pphwsp_size);

   uint32_t ring_buffer_head = context[5];
@@ -601,7 +601,7 @@ handle_memtrace_reg_write(uint32_t *p)
   struct gen_batch_decode_bo ring_bo = get_ggtt_batch_bo(NULL,
                                                          ring_buffer_start);
   assert(ring_bo.size > 0);
   void *commands = (uint8_t *)ring_bo.map + (ring_bo.addr - ring_buffer_start);
   void *commands = (uint8_t *)ring_bo.map + (ring_buffer_start - ring_bo.addr);

   if (context_descriptor & 0x100 /* ppgtt */) {
      batch_ctx.get_bo = get_ppgtt_batch_bo;
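
Both hunks fix the same sign error: the byte offset of a GPU address inside a mapped BO is (address - bo.addr), not the reverse, and that offset is what gets added to the CPU-side map pointer. A minimal sketch of the corrected arithmetic (toy types, not the aubinator structures):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for a decoded BO: a base GPU address plus a CPU mapping of
 * the same bytes. */
struct toy_bo {
   uint64_t addr;   /* GPU address where the BO starts */
   uint64_t size;
   void *map;       /* CPU pointer to the BO contents */
};

/* The offset of a GPU address inside the BO is (addr - bo->addr); adding it
 * to the CPU map gives the matching CPU pointer. The pre-fix code computed
 * (bo->addr - addr), which is only right when the two addresses coincide. */
static void *
gtt_addr_to_ptr(const struct toy_bo *bo, uint64_t addr)
{
   assert(addr >= bo->addr && addr < bo->addr + bo->size);
   return (uint8_t *)bo->map + (addr - bo->addr);
}

int
main(void)
{
   uint8_t storage[4096] = {0};
   storage[0x500] = 0xaa;

   struct toy_bo bo = { .addr = 0x100000, .size = sizeof(storage), .map = storage };
   uint8_t *p = gtt_addr_to_ptr(&bo, 0x100500);

   assert(*p == 0xaa);
   printf("GPU address 0x100500 -> map offset 0x%zx\n", (size_t)(p - storage));
   return 0;
}
```
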
@@ -205,7 +205,7 @@ main(int argc, char *argv[])
      BO_TYPE_UNKNOWN = 0,
      BO_TYPE_BATCH,
      BO_TYPE_USER,
   } bo_type;
   } bo_type = BO_TYPE_UNKNOWN;
   uint64_t bo_addr;

   char *line = NULL;
@@ -340,18 +340,16 @@ try_lower_tex_ycbcr(struct anv_pipeline_layout *layout,
   if (binding->immutable_samplers == NULL)
      return false;

   unsigned texture_index = tex->texture_index;
   assert(tex->texture_index == 0);
   unsigned array_index = 0;
   if (deref->deref_type != nir_deref_type_var) {
      assert(deref->deref_type == nir_deref_type_array);
      nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
      if (!const_index)
         return false;
      size_t hw_binding_size =
         anv_descriptor_set_binding_layout_get_hw_size(binding);
      texture_index += MIN2(const_index->u32[0], hw_binding_size - 1);
      array_index = MIN2(const_index->u32[0], binding->array_size - 1);
   }
   const struct anv_sampler *sampler =
      binding->immutable_samplers[texture_index];
   const struct anv_sampler *sampler = binding->immutable_samplers[array_index];

   if (sampler->conversion == NULL)
      return false;
@@ -496,7 +496,6 @@ bo_alloc_internal(struct brw_bufmgr *bufmgr,
                  uint32_t stride)
{
   struct brw_bo *bo;
   unsigned int page_size = getpagesize();
   int ret;
   struct bo_cache_bucket *bucket;
   bool alloc_from_cache;
@@ -522,12 +521,12 @@ bo_alloc_internal(struct brw_bufmgr *bufmgr,
    * allocation up.
    */
   if (bucket == NULL) {
      bo_size = size;
      if (bo_size < page_size)
         bo_size = page_size;
      unsigned int page_size = getpagesize();
      bo_size = size == 0 ? page_size : ALIGN(size, page_size);
   } else {
      bo_size = bucket->size;
   }
   assert(bo_size);

   mtx_lock(&bufmgr->lock);
   /* Get a buffer out of the cache if available */
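
For sizes that do not fall into a cache bucket, the rewritten path rounds every request up to a whole number of pages (and still gives a zero-byte request one page), whereas the old code only bumped sub-page sizes and left, say, a 5000-byte request at 5000 bytes. A small sketch of the new rounding (the helper name is hypothetical; Mesa's ALIGN() macro performs the same round-up):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same rounding the fixed code performs via ALIGN(): round size up to a
 * multiple of the page size, and give zero-byte requests one full page,
 * since GEM buffer objects are allocated in whole pages anyway. */
static uint64_t
round_bo_size(uint64_t size, uint64_t page_size)
{
   if (size == 0)
      return page_size;
   return (size + page_size - 1) / page_size * page_size;
}

int
main(void)
{
   const uint64_t page = 4096;

   assert(round_bo_size(0, page) == 4096);
   assert(round_bo_size(100, page) == 4096);
   /* The old code left a 5000-byte request at 5000 bytes; the fixed code
    * asks for two full pages. */
   assert(round_bo_size(5000, page) == 8192);
   assert(round_bo_size(8192, page) == 8192);
   printf("all sizes page-aligned\n");
   return 0;
}
```
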
@@ -695,7 +695,7 @@ brw_initialize_context_constants(struct brw_context *brw)
   /* ARB_viewport_array, OES_viewport_array */
   if (devinfo->gen >= 6) {
      ctx->Const.MaxViewports = GEN6_NUM_VIEWPORTS;
      ctx->Const.ViewportSubpixelBits = 0;
      ctx->Const.ViewportSubpixelBits = 8;

      /* Cast to float before negating because MaxViewportWidth is unsigned.
       */
@@ -38,13 +38,13 @@
/**
 * 1x MSAA has a single sample at the center: (0.5, 0.5) -> (0x8, 0x8).
 *
 * 2x MSAA sample positions are (0.25, 0.25) and (0.75, 0.75):
 * 2x MSAA sample positions are (0.75, 0.75) and (0.25, 0.25):
 *   4 c
 * 4 0
 * c 1
 * 4 1
 * c 0
 */
static const uint32_t
brw_multisample_positions_1x_2x = 0x0088cc44;
brw_multisample_positions_1x_2x = 0x008844cc;

/**
 * Sample positions:
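
The packed constant appears to hold one byte per sample, each nibble a sub-pixel offset in 1/16ths of a pixel (0x4 = 0.25, 0x8 = 0.5, 0xc = 0.75), so the fix swaps which position sample 0 and sample 1 receive. A small decode sketch under that assumption (field layout inferred from the surrounding code and comments, not from hardware documentation):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed layout: byte 0 is 2x sample 0, byte 1 is 2x sample 1, byte 2 is
 * the single 1x sample; each nibble is an offset in 1/16ths of a pixel. */
static float
nibble_to_offset(uint32_t nibble)
{
   return nibble / 16.0f;
}

int
main(void)
{
   const uint32_t positions = 0x008844cc;   /* value from the fixed code */

   uint32_t s0  = (positions >> 0) & 0xff;  /* 2x sample 0: 0xcc */
   uint32_t s1  = (positions >> 8) & 0xff;  /* 2x sample 1: 0x44 */
   uint32_t s1x = (positions >> 16) & 0xff; /* 1x sample:   0x88 */

   assert(nibble_to_offset(s0 & 0xf) == 0.75f && nibble_to_offset(s0 >> 4) == 0.75f);
   assert(nibble_to_offset(s1 & 0xf) == 0.25f && nibble_to_offset(s1 >> 4) == 0.25f);
   assert(nibble_to_offset(s1x & 0xf) == 0.5f && nibble_to_offset(s1x >> 4) == 0.5f);

   printf("2x sample 0 at (0.75, 0.75), sample 1 at (0.25, 0.25)\n");
   return 0;
}
```
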
@@ -68,10 +68,10 @@ gen6_get_sample_position(struct gl_context *ctx,
 * index layout in case of 2X and 4x MSAA, but they are different in
 * case of 8X MSAA.
 *
 * 2X MSAA sample index / number layout
 *           ---------
 *           | 0 | 1 |
 *           ---------
 * 8X MSAA sample index layout    8x MSAA sample number layout
 *           ---------                      ---------
 *           | 0 | 1 |                      | 1 | 0 |
 *           ---------                      ---------
 *
 * 4X MSAA sample index / number layout
 *           ---------
@@ -107,7 +107,7 @@ gen6_get_sample_position(struct gl_context *ctx,
void
gen6_set_sample_maps(struct gl_context *ctx)
{
   uint8_t map_2x[2] = {0, 1};
   uint8_t map_2x[2] = {1, 0};
   uint8_t map_4x[4] = {0, 1, 2, 3};
   uint8_t map_8x[8] = {3, 7, 5, 0, 1, 2, 4, 6};
   uint8_t map_16x[16] = { 15, 10, 9, 7, 4, 1, 3, 13,
@@ -7,7 +7,7 @@ Name: gl
Description: Mesa OpenGL library
Requires.private: @GL_PC_REQ_PRIV@
Version: @PACKAGE_VERSION@
Libs: -L${libdir} -l@GL_LIB@
Libs: -L${libdir} -l@GL_PKGCONF_LIB@
Libs.private: @GL_PC_LIB_PRIV@
Cflags: -I${includedir} @GL_PC_CFLAGS@
glx_tls: @GLX_TLS@
@@ -1229,7 +1229,7 @@ void st_init_extensions(struct pipe_screen *screen,
      screen->is_format_supported(screen, PIPE_FORMAT_R8G8B8A8_UNORM,
                                  PIPE_TEXTURE_2D, 0, 0,
                                  PIPE_BIND_SAMPLER_VIEW) &&
      screen->is_format_supported(screen, PIPE_FORMAT_B8G8R8A8_SRGB,
      screen->is_format_supported(screen, PIPE_FORMAT_R8G8B8A8_SRGB,
                                  PIPE_TEXTURE_2D, 0, 0,
                                  PIPE_BIND_SAMPLER_VIEW) &&
      screen->is_format_supported(screen, PIPE_FORMAT_R16_UNORM,
@@ -120,6 +120,10 @@ TODO: document the other workarounds.
            <option name="allow_glsl_extension_directive_midshader" value="true" />
        </application>

        <application name="Metro 2033 Redux / Metro Last Night Redux" executable="metro">
            <option name="allow_glsl_extension_directive_midshader" value="true" />
        </application>

        <application name="Worms W.M.D" executable="Worms W.M.Dx64">
            <option name="allow_higher_compat_version" value="true" />
        </application>