Update version to 18.2.0-rc3

Signed-off-by: Andres Gomez <agomez@igalia.com>
radv: Allow ETC2 on RAVEN and VEGA10 instead of all GFX9.
2018-08-15 14:53:50 +03:00 · 2018-08-14 23:52:11 +03:00 · 2018-08-14 23:51:14 +03:00 · 2018-08-14 23:50:05 +03:00 · 2018-08-13 12:45:28 +03:00 · 2018-08-13 12:44:46 +03:00
45 changed files with 493 additions and 106 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-18.2.0-devel
+18.2.0-rc3
--- a/bin/install_megadrivers.py
+++ b/bin/install_megadrivers.py
@@ -43,13 +43,15 @@ def main():
    master = os.path.join(to, os.path.basename(args.megadriver))

    if not os.path.exists(to):
+        if os.path.lexists(to):
+            os.unlink(to)
        os.makedirs(to)
    shutil.copy(args.megadriver, master)

    for driver in args.drivers:
        abs_driver = os.path.join(to, driver)

-        if os.path.exists(abs_driver):
+        if os.path.lexists(abs_driver):
            os.unlink(abs_driver)
        print('installing {} to {}'.format(args.megadriver, abs_driver))
        os.link(master, abs_driver)
@@ -60,7 +62,7 @@ def main():

            name, ext = os.path.splitext(driver)
            while ext != '.so':
-                if os.path.exists(name):
+                if os.path.lexists(name):
                    os.unlink(name)
                os.symlink(driver, name)
                name, ext = os.path.splitext(name)
--- a/configure.ac
+++ b/configure.ac
@@ -1503,15 +1503,15 @@ fi
 AC_ARG_WITH([gl-lib-name],
  [AS_HELP_STRING([--with-gl-lib-name@<:@=NAME@:>@],
    [specify GL library name @<:@default=GL@:>@])],
-  [GL_LIB=$withval],
-  [GL_LIB="$DEFAULT_GL_LIB_NAME"])
+  [AC_MSG_ERROR([--with-gl-lib-name is no longer supported. Rename the library manually if needed.])],
+  [])
 AC_ARG_WITH([osmesa-lib-name],
  [AS_HELP_STRING([--with-osmesa-lib-name@<:@=NAME@:>@],
    [specify OSMesa library name @<:@default=OSMesa@:>@])],
-  [OSMESA_LIB=$withval],
-  [OSMESA_LIB=OSMesa])
-AS_IF([test "x$GL_LIB" = xyes], [GL_LIB="$DEFAULT_GL_LIB_NAME"])
-AS_IF([test "x$OSMESA_LIB" = xyes], [OSMESA_LIB=OSMesa])
+  [AC_MSG_ERROR([--with-osmesa-lib-name is no longer supported. Rename the library manually if needed.])],
+  [])
+GL_LIB="$DEFAULT_GL_LIB_NAME"
+OSMESA_LIB=OSMesa

 dnl
 dnl Mangled Mesa support
@@ -1523,6 +1523,9 @@ AC_ARG_ENABLE([mangling],
  [enable_mangling=no]
 )
 if test "x${enable_mangling}" = "xyes" ; then
+  if test "x$enable_libglvnd" = xyes; then
+    AC_MSG_ERROR([Conflicting options --enable-mangling and --enable-libglvnd.])
+  fi
  DEFINES="${DEFINES} -DUSE_MGL_NAMESPACE"
  GL_LIB="Mangled${GL_LIB}"
  OSMESA_LIB="Mangled${OSMESA_LIB}"
@@ -1530,6 +1533,15 @@ fi
 AC_SUBST([GL_LIB])
 AC_SUBST([OSMESA_LIB])

+dnl HACK when building glx + glvnd we ship gl.pc, despite that glvnd should do it
+dnl Thus we need to use GL as a DSO name.
+if test "x$enable_libglvnd" = xyes -a "x$enable_glx" != xno; then
+  GL_PKGCONF_LIB="GL"
+else
+  GL_PKGCONF_LIB="$GL_LIB"
+fi
+AC_SUBST([GL_PKGCONF_LIB])
+
 # Check for libdrm
 PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
                  [have_libdrm=yes], [have_libdrm=no])
@@ -1658,6 +1670,8 @@ xxlib | xgallium-xlib)
 xdri)
    # DRI-based GLX

+    require_dri_shared_libs_and_glapi "GLX"
+
    # find the DRI deps for libGL
    dri_modules="x11 xext xdamage >= $XDAMAGE_REQUIRED xfixes x11-xcb xcb xcb-glx >= $XCBGLX_REQUIRED"

--- a/meson.build
+++ b/meson.build
@@ -989,7 +989,7 @@ if cc.links('''
      freelocale(loc);
      return 0;
    }''',
-    extra_args : pre_args,
+    args : pre_args,
    name : 'strtod has locale support')
  pre_args += '-DHAVE_STRTOD_L'
 endif
--- a/src/amd/Android.mk
+++ b/src/amd/Android.mk
@@ -27,4 +27,6 @@ include $(LOCAL_PATH)/Makefile.sources

 include $(LOCAL_PATH)/Android.addrlib.mk
 include $(LOCAL_PATH)/Android.common.mk
+ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),)
 include $(LOCAL_PATH)/vulkan/Android.mk
+endif
--- a/src/amd/vulkan/Android.mk
+++ b/src/amd/vulkan/Android.mk
@@ -62,6 +62,7 @@ LOCAL_SRC_FILES := \
 	$(VULKAN_FILES)

 LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU   # instructs LLVM to declare LLVMInitializeAMDGPU* functions
+LOCAL_CFLAGS += -DVK_USE_PLATFORM_ANDROID_KHR

 $(call mesa-build-with-llvm)

@@ -140,6 +141,7 @@ LOCAL_SRC_FILES := \
 	$(VULKAN_ANDROID_FILES)

 LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU   # instructs LLVM to declare LLVMInitializeAMDGPU* functions
+LOCAL_CFLAGS += -DVK_USE_PLATFORM_ANDROID_KHR

 $(call mesa-build-with-llvm)

--- a/src/amd/vulkan/Makefile.am
+++ b/src/amd/vulkan/Makefile.am
@@ -124,7 +124,7 @@ VULKAN_LIB_DEPS += \
 endif

 if HAVE_PLATFORM_ANDROID
-AM_CPPFLAGS += $(ANDROID_CPPFLAGS)
+AM_CPPFLAGS += $(ANDROID_CPPFLAGS) -DVK_USE_PLATFORM_ANDROID_KHR
 AM_CFLAGS += $(ANDROID_CFLAGS)
 VULKAN_LIB_DEPS += $(ANDROID_LIBS)
 VULKAN_SOURCES += $(VULKAN_ANDROID_FILES)
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -105,7 +105,7 @@ EXTENSIONS = [
    Extension('VK_EXT_sampler_filter_minmax',             1, 'device->rad_info.chip_class >= CIK'),
    Extension('VK_EXT_shader_viewport_index_layer',       1, True),
    Extension('VK_EXT_shader_stencil_export',             1, True),
-    Extension('VK_EXT_vertex_attribute_divisor',          1, True),
+    Extension('VK_EXT_vertex_attribute_divisor',          2, True),
    Extension('VK_AMD_draw_indirect_count',               1, True),
    Extension('VK_AMD_gcn_shader',                        1, True),
    Extension('VK_AMD_rasterization_order',               1, 'device->has_out_of_order_rast'),
--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@@ -612,7 +612,8 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
 	}

 	if (desc->layout == VK_FORMAT_LAYOUT_ETC &&
-	    physical_device->rad_info.chip_class < GFX9 &&
+	    physical_device->rad_info.family != CHIP_VEGA10 &&
+	    physical_device->rad_info.family != CHIP_RAVEN &&
 	    physical_device->rad_info.family != CHIP_STONEY) {
 		out_properties->linearTilingFeatures = linear;
 		out_properties->optimalTilingFeatures = tiled;
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1991,8 +1991,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 			uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];

 			if (divisor) {
-				buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.instance_id,
-				                            ctx->abi.start_instance, "");
+				buffer_index = ctx->abi.instance_id;

 				if (divisor != 1) {
 					buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
@@ -2007,8 +2006,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 						MAX2(1, ctx->shader_info->vs.vgpr_comp_cnt);
 				}
 			} else {
-				buffer_index = ctx->ac.i32_0;
+				unreachable("Invalid vertex attribute divisor of 0.");
 			}
+
+			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, "");
 		} else
 			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
 			                            ctx->abi.base_vertex, "");
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -528,6 +528,16 @@
    <field name="number of attribute arrays" size="5" start="0" type="uint"/>
  </packet>

+  <packet code="71" name="VCM Cache Size" min_ver="41">
+    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
+    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
+  </packet>
+
+  <packet code="73" name="VCM Cache Size" max_ver="33">
+    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
+    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
+  </packet>
+
  <packet code="73" name="Transform Feedback Buffer" min_ver="41">
    <field name="Buffer Address" size="32" start="32" type="address"/>
    <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -27,13 +27,14 @@
 #include <stdint.h>

 /**
- * Struct for tracking features of the V3D chip. This is where we'll store
- * boolean flags for features in a specific version, but for now it's just the
- * version
+ * Struct for tracking features of the V3D chip across driver and compiler.
 */
 struct v3d_device_info {
        /** Simple V3D version: major * 10 + minor */
        uint8_t ver;
+
+        /** Size of the VPM, in bytes. */
+        int vpm_size;
 };

 #endif
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -462,6 +462,7 @@ struct choose_scoreboard {
        int last_magic_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
+        int last_thrsw_tick;
        bool tlb_locked;
 };

@@ -1095,10 +1096,16 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
 }

 static bool
-valid_thrsw_sequence(struct v3d_compile *c,
+valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
 {
+        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
+        if (scoreboard->last_thrsw_tick + 3 >
+            scoreboard->tick - instructions_in_sequence) {
+                return false;
+        }
+
        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                /* No scheduling SFU when the result would land in the other
                 * thread.  The simulator complains for safety, though it
@@ -1159,7 +1166,8 @@ emit_thrsw(struct v3d_compile *c,
                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

-                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
+                if (!valid_thrsw_sequence(c, scoreboard,
+                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }
@@ -1173,7 +1181,9 @@ emit_thrsw(struct v3d_compile *c,
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
+                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
+                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
@@ -1475,6 +1485,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
+        scoreboard.last_thrsw_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -648,6 +648,9 @@ struct v3d_vs_prog_data {

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;
+
+        /* Value to be programmed in VCM_CACHE_SIZE. */
+        uint8_t vcm_cache_size;
 };

 struct v3d_fs_prog_data {
@@ -928,7 +931,7 @@ VIR_A_ALU2(OR)
 VIR_A_ALU2(XOR)
 VIR_A_ALU2(VADD)
 VIR_A_ALU2(VSUB)
-VIR_A_ALU2(STVPMV)
+VIR_A_NODST_2(STVPMV)
 VIR_A_ALU1(NOT)
 VIR_A_ALU1(NEG)
 VIR_A_ALU1(FLAPUSH)
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -452,6 +452,16 @@ vir_emit_def(struct v3d_compile *c, struct qinst *inst)
 {
        assert(inst->dst.file == QFILE_NULL);

+        /* If we're emitting an instruction that's a def, it had better be
+         * writing a register.
+         */
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
+                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
+                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
+                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
+        }
+
        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
@@ -746,10 +756,28 @@ uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

-        /* Input/output segment size are in 8x32-bit multiples. */
+        /* Input/output segment size are in sectors (8 rows of 32 bits per
+         * channel).
+         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;

+        /* Compute VCM cache size.  We set up our program to take up less than
+         * half of the VPM, so that any set of bin and render programs won't
+         * run out of space.  We need space for at least one input segment,
+         * and then allocate the rest to output segments (one for the current
+         * program, the rest to VCM).  The valid range of the VCM cache size
+         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
+         * batches.
+         */
+        assert(c->devinfo->vpm_size);
+        int sector_size = 16 * sizeof(uint32_t) * 8;
+        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
+        int half_vpm = vpm_size_in_sectors / 2;
+        int vpm_output_batches = half_vpm - prog_data->vpm_input_size;
+        assert(vpm_output_batches >= 2);
+        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
+
        return v3d_return_qpu_insts(c, final_assembly_size);
 }

--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -94,6 +94,15 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                                }
                        }

+                        /* Refuse to spill a ldvary's dst, because that means
+                         * that ldvary's r5 would end up being used across a
+                         * thrsw.
+                         */
+                        if (inst->qpu.sig.ldvary) {
+                                assert(inst->dst.file == QFILE_TEMP);
+                                BITSET_CLEAR(c->spillable, inst->dst.index);
+                        }
+
                        if (inst->is_last_thrsw)
                                started_last_seg = true;

@@ -102,7 +111,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
-                         * final LDTMU from that TMU setup.  We can't
+                         * final LDTMU or TMUWT from that TMU setup.  We can't
                         * spill/fill any temps during that time, because that
                         * involves inserting a new TMU setup/LDTMU sequence.
                         */
@@ -110,6 +119,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                            is_last_ldtmu(inst, block))
                                in_tmu_operation = false;

+                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+                            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
+                                in_tmu_operation = false;
+
                        if (v3d_qpu_writes_tmu(&inst->qpu))
                                in_tmu_operation = true;
                }
@@ -206,6 +219,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
                                     inst->dst);
                        v3d_emit_spill_tmua(c, spill_offset);
                        vir_emit_thrsw(c);
+                        vir_TMUWT(c);
                        c->spills++;
                }

--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -1928,6 +1928,11 @@ ast_expression::do_hir(exec_list *instructions,

      error_emitted = op[0]->type->is_error() || op[1]->type->is_error();

+      if (error_emitted) {
+         result = ir_rvalue::error_value(ctx);
+         break;
+      }
+
      type = arithmetic_result_type(op[0], op[1], false, state, & loc);

      ir_rvalue *temp_rhs;
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -201,6 +201,17 @@ resize_callback(struct wl_egl_window *wl_win, void *data)
   struct dri2_egl_display *dri2_dpy =
      dri2_egl_display(dri2_surf->base.Resource.Display);

+   /* Update the surface size as soon as native window is resized; from user
+    * pov, this makes the effect that resize is done inmediately after native
+    * window resize, without requiring to wait until the first draw.
+    *
+    * A more detailed and lengthy explanation can be found at
+    * https://lists.freedesktop.org/archives/mesa-dev/2018-June/196474.html
+    */
+   if (!dri2_surf->back) {
+      dri2_surf->base.Width = wl_win->width;
+      dri2_surf->base.Height = wl_win->height;
+   }
   dri2_dpy->flush->invalidate(dri2_surf->dri_drawable);
 }

@@ -258,6 +269,9 @@ dri2_wl_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
      goto cleanup_surf;
   }

+   dri2_surf->base.Width = window->width;
+   dri2_surf->base.Height = window->height;
+
   visual_idx = dri2_wl_visual_idx_from_config(dri2_dpy, config);
   assert(visual_idx != -1);

@@ -577,8 +591,8 @@ update_buffers(struct dri2_egl_surface *dri2_surf)
   struct dri2_egl_display *dri2_dpy =
      dri2_egl_display(dri2_surf->base.Resource.Display);

-   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
-       dri2_surf->base.Height != dri2_surf->wl_win->height) {
+   if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
+       dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {

      dri2_wl_release_buffers(dri2_surf);

@@ -1632,8 +1646,8 @@ swrast_update_buffers(struct dri2_egl_surface *dri2_surf)
   if (dri2_surf->back)
      return 0;

-   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
-       dri2_surf->base.Height != dri2_surf->wl_win->height) {
+   if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
+       dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {

      dri2_wl_release_buffers(dri2_surf);

--- a/src/egl/drivers/dri2/platform_x11_dri3.c
+++ b/src/egl/drivers/dri2/platform_x11_dri3.c
@@ -107,12 +107,17 @@ static const struct loader_dri3_vtable egl_dri3_vtable = {
 static EGLBoolean
 dri3_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
 {
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   struct dri3_egl_surface *dri3_surf = dri3_egl_surface(surf);
+   xcb_drawable_t drawable = dri3_surf->loader_drawable.drawable;

   (void) drv;

   loader_dri3_drawable_fini(&dri3_surf->loader_drawable);

+   if (surf->Type == EGL_PBUFFER_BIT)
+      xcb_free_pixmap (dri2_dpy->conn, drawable);
+
   dri2_fini_surface(surf);
   free(surf);

--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -1131,6 +1131,31 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
   mgr->dirty_real_vb_mask = 0;
 }

+static void
+u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
+                               unsigned *indirect_data, unsigned stride,
+                               unsigned draw_count)
+{
+   assert(info->index_size);
+   info->indirect = NULL;
+
+   for (unsigned i = 0; i < draw_count; i++) {
+      unsigned offset = i * stride / 4;
+
+      info->count = indirect_data[offset + 0];
+      info->instance_count = indirect_data[offset + 1];
+
+      if (!info->count || !info->instance_count)
+         continue;
+
+      info->start = indirect_data[offset + 2];
+      info->index_bias = indirect_data[offset + 3];
+      info->start_instance = indirect_data[offset + 4];
+
+      u_vbuf_draw_vbo(mgr, info);
+   }
+}
+
 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
 {
   struct pipe_context *pipe = mgr->pipe;
@@ -1160,33 +1185,163 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)

   new_info = *info;

-   /* Fallback. We need to know all the parameters. */
+   /* Handle indirect (multi)draws. */
   if (new_info.indirect) {
-      struct pipe_transfer *transfer = NULL;
-      int *data;
+      const struct pipe_draw_indirect_info *indirect = new_info.indirect;
+      unsigned draw_count = 0;

-      if (new_info.index_size) {
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
-                                      new_info.indirect->offset, 20,
-                                      PIPE_TRANSFER_READ, &transfer);
-         new_info.index_bias = data[3];
-         new_info.start_instance = data[4];
-      }
-      else {
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
-                                      new_info.indirect->offset, 16,
-                                      PIPE_TRANSFER_READ, &transfer);
-         new_info.start_instance = data[3];
+      /* Get the number of draws. */
+      if (indirect->indirect_draw_count) {
+         pipe_buffer_read(pipe, indirect->indirect_draw_count,
+                          indirect->indirect_draw_count_offset,
+                          4, &draw_count);
+      } else {
+         draw_count = indirect->draw_count;
      }

-      new_info.count = data[0];
-      new_info.instance_count = data[1];
-      new_info.start = data[2];
-      pipe_buffer_unmap(pipe, transfer);
-      new_info.indirect = NULL;
-
-      if (!new_info.count)
+      if (!draw_count)
         return;
+
+      unsigned data_size = (draw_count - 1) * indirect->stride +
+                           (new_info.index_size ? 20 : 16);
+      unsigned *data = malloc(data_size);
+      if (!data)
+         return; /* report an error? */
+
+      /* Read the used buffer range only once, because the read can be
+       * uncached.
+       */
+      pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
+                       data);
+
+      if (info->index_size) {
+         /* Indexed multidraw. */
+         unsigned index_bias0 = data[3];
+         bool index_bias_same = true;
+
+         /* If we invoke the translate path, we have to split the multidraw. */
+         if (incompatible_vb_mask ||
+             mgr->ve->incompatible_elem_mask) {
+            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
+                                           indirect->stride, draw_count);
+            free(data);
+            return;
+         }
+
+         /* See if index_bias is the same for all draws. */
+         for (unsigned i = 1; i < draw_count; i++) {
+            if (data[i * indirect->stride / 4 + 3] != index_bias0) {
+               index_bias_same = false;
+               break;
+            }
+         }
+
+         /* Split the multidraw if index_bias is different. */
+         if (!index_bias_same) {
+            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
+                                           indirect->stride, draw_count);
+            free(data);
+            return;
+         }
+
+         /* If we don't need to use the translate path and index_bias is
+          * the same, we can process the multidraw with the time complexity
+          * equal to 1 draw call (except for the index range computation).
+          * We only need to compute the index range covering all draw calls
+          * of the multidraw.
+          *
+          * The driver will not look at these values because indirect != NULL.
+          * These values determine the user buffer bounds to upload.
+          */
+         new_info.index_bias = index_bias0;
+         new_info.min_index = ~0u;
+         new_info.max_index = 0;
+         new_info.start_instance = ~0u;
+         unsigned end_instance = 0;
+
+         struct pipe_transfer *transfer = NULL;
+         const uint8_t *indices;
+
+         if (info->has_user_indices) {
+            indices = (uint8_t*)info->index.user;
+         } else {
+            indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
+                                                PIPE_TRANSFER_READ, &transfer);
+         }
+
+         for (unsigned i = 0; i < draw_count; i++) {
+            unsigned offset = i * indirect->stride / 4;
+            unsigned start = data[offset + 2];
+            unsigned count = data[offset + 0];
+            unsigned start_instance = data[offset + 4];
+            unsigned instance_count = data[offset + 1];
+
+            if (!count || !instance_count)
+               continue;
+
+            /* Update the ranges of instances. */
+            new_info.start_instance = MIN2(new_info.start_instance,
+                                           start_instance);
+            end_instance = MAX2(end_instance, start_instance + instance_count);
+
+            /* Update the index range. */
+            unsigned min, max;
+            new_info.count = count; /* only used by get_minmax_index */
+            u_vbuf_get_minmax_index_mapped(&new_info,
+                                           indices +
+                                           new_info.index_size * start,
+                                           &min, &max);
+
+            new_info.min_index = MIN2(new_info.min_index, min);
+            new_info.max_index = MAX2(new_info.max_index, max);
+         }
+         free(data);
+
+         if (transfer)
+            pipe_buffer_unmap(pipe, transfer);
+
+         /* Set the final instance count. */
+         new_info.instance_count = end_instance - new_info.start_instance;
+
+         if (new_info.start_instance == ~0u || !new_info.instance_count)
+            return;
+      } else {
+         /* Non-indexed multidraw.
+          *
+          * Keep the draw call indirect and compute minimums & maximums,
+          * which will determine the user buffer bounds to upload, but
+          * the driver will not look at these values because indirect != NULL.
+          *
+          * This efficiently processes the multidraw with the time complexity
+          * equal to 1 draw call.
+          */
+         new_info.start = ~0u;
+         new_info.start_instance = ~0u;
+         unsigned end_vertex = 0;
+         unsigned end_instance = 0;
+
+         for (unsigned i = 0; i < draw_count; i++) {
+            unsigned offset = i * indirect->stride / 4;
+            unsigned start = data[offset + 2];
+            unsigned count = data[offset + 0];
+            unsigned start_instance = data[offset + 3];
+            unsigned instance_count = data[offset + 1];
+
+            new_info.start = MIN2(new_info.start, start);
+            new_info.start_instance = MIN2(new_info.start_instance,
+                                           start_instance);
+
+            end_vertex = MAX2(end_vertex, start + count);
+            end_instance = MAX2(end_instance, start_instance + instance_count);
+         }
+
+         /* Set the final counts. */
+         new_info.count = end_vertex - new_info.start;
+         new_info.instance_count = end_instance - new_info.start_instance;
+
+         if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
+            return;
+      }
   }

   if (new_info.index_size) {
@@ -1211,7 +1366,8 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
          * We would have to break this drawing operation into several ones. */
         /* Use some heuristic to see if unrolling indices improves
          * performance. */
-         if (!new_info.primitive_restart &&
+         if (!info->indirect &&
+             !new_info.primitive_restart &&
             num_vertices > new_info.count*2 &&
             num_vertices - new_info.count > 32 &&
             !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2151,13 +2151,36 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
   }
 }

+void
+NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
+{
+   if (!su->getPredicate())
+      return;
+
+   bld.setPosition(su, true);
+
+   for (unsigned i = 0; su->defExists(i); ++i) {
+      ValueDef &def = su->def(i);
+
+      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+      assert(su->cc == CC_NOT_P);
+      mov->setPredicate(CC_P, su->getPredicate());
+      Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
+
+      def.replace(uni->getDef(0), false);
+      uni->setSrc(0, def.get());
+   }
+}
+
 void
 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
 {
   processSurfaceCoordsNVE4(su);

-   if (su->op == OP_SULDP)
+   if (su->op == OP_SULDP) {
      convertSurfaceFormat(su);
+      insertOOBSurfaceOpResult(su);
+   }

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      assert(su->getPredicate());
@@ -2267,8 +2290,10 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)

   processSurfaceCoordsNVC0(su);

-   if (su->op == OP_SULDP)
+   if (su->op == OP_SULDP) {
      convertSurfaceFormat(su);
+      insertOOBSurfaceOpResult(su);
+   }

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      const int dim = su->tex.target.getDim();
@@ -2370,8 +2395,10 @@ NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
 {
   processSurfaceCoordsGM107(su);

-   if (su->op == OP_SULDP)
+   if (su->op == OP_SULDP) {
      convertSurfaceFormat(su);
+      insertOOBSurfaceOpResult(su);
+   }

   if (su->op == OP_SUREDP) {
      Value *def = su->getDef(0);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -172,6 +172,7 @@ private:
   void processSurfaceCoordsNVE4(TexInstruction *);
   void processSurfaceCoordsNVC0(TexInstruction *);
   void convertSurfaceFormat(TexInstruction *);
+   void insertOOBSurfaceOpResult(TexInstruction *);
   Value *calculateSampleOffset(Value *sampleID);

 protected:
--- a/src/gallium/drivers/swr/swr_public.h
+++ b/src/gallium/drivers/swr/swr_public.h
@@ -37,7 +37,7 @@ extern "C" {
 struct pipe_screen *swr_create_screen(struct sw_winsys *winsys);

 // arch-specific dll entry point
-PUBLIC struct pipe_screen *swr_create_screen_internal(struct sw_winsys *winsys);
+struct pipe_screen *swr_create_screen_internal(struct sw_winsys *winsys);

 // cleanup for failed screen creation
 void swr_destroy_screen_internal(struct swr_screen **screen);
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -1143,12 +1143,10 @@ swr_validate_env_options(struct swr_screen *screen)
 }


-PUBLIC
 struct pipe_screen *
 swr_create_screen_internal(struct sw_winsys *winsys)
 {
   struct swr_screen *screen = CALLOC_STRUCT(swr_screen);
-   memset(screen, 0, sizeof(struct swr_screen));

   if (!screen)
      return NULL;
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -585,6 +585,8 @@ v3d_get_device_info(struct v3d_screen *screen)
        uint32_t minor = (ident1.value >> 0) & 0xf;
        screen->devinfo.ver = major * 10 + minor;

+        screen->devinfo.vpm_size = (ident1.value >> 28 & 0xf) * 1024;
+
        switch (screen->devinfo.ver) {
        case 33:
        case 41:
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -306,6 +306,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
                }
        }

+        cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
+                vcm.number_of_16_vertex_batches_for_binning =
+                        v3d->prog.cs->prog_data.vs->vcm_cache_size;
+                vcm.number_of_16_vertex_batches_for_rendering =
+                        v3d->prog.vs->prog_data.vs->vcm_cache_size;
+        }
+
        cl_emit(&job->bcl, GL_SHADER_STATE, state) {
                state.address = cl_address(job->indirect.bo, shader_rec_offset);
                state.number_of_attribute_arrays = num_elements_to_emit;
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -222,6 +222,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
                        attr.coordinate_shader_vpm_offset = 0;
                        attr.vertex_shader_vpm_offset = 0;
                }
+
+                vc4_bo_unreference(&bo);
        }

        cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {
--- a/src/gallium/drivers/vc4/vc4_fence.c
+++ b/src/gallium/drivers/vc4/vc4_fence.c
@@ -121,7 +121,8 @@ vc4_fence_server_sync(struct pipe_context *pctx,
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_fence *fence = vc4_fence(pfence);

-        sync_accumulate("vc4", &vc4->in_fence_fd, fence->fd);
+        if (fence->fd >= 0)
+                sync_accumulate("vc4", &vc4->in_fence_fd, fence->fd);
 }

 static int
@@ -142,8 +143,12 @@ vc4_fence_context_init(struct vc4_context *vc4)
        /* Since we initialize the in_fence_fd to -1 (no wait necessary),
         * we also need to initialize our in_syncobj as signaled.
         */
-        return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
-                                &vc4->in_syncobj);
+        if (vc4->screen->has_syncobj) {
+                return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                                        &vc4->in_syncobj);
+        } else {
+                return 0;
+        }
 }

 void
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -38,6 +38,7 @@
 #include "vc4_context.h"
 #include "vc4_qpu.h"
 #include "vc4_qir.h"
+#include "mesa/state_tracker/st_glsl_types.h"

 static struct qreg
 ntq_get_src(struct vc4_compile *c, nir_src src, int i);
@@ -50,6 +51,12 @@ type_size(const struct glsl_type *type)
   return glsl_count_attribute_slots(type, false);
 }

+static int
+uniforms_type_size(const struct glsl_type *type)
+{
+        return st_glsl_storage_type_size(type, false);
+}
+
 static void
 resize_qreg_array(struct vc4_compile *c,
                  struct qreg **regs,
@@ -1685,7 +1692,7 @@ static void
 ntq_setup_uniforms(struct vc4_compile *c)
 {
        nir_foreach_variable(var, &c->s->uniforms) {
-                uint32_t vec4_count = type_size(var->type);
+                uint32_t vec4_count = uniforms_type_size(var->type);
                unsigned vec4_size = 4 * sizeof(float);

                declare_uniform_range(c, var->data.driver_location * vec4_size,
@@ -2469,9 +2476,13 @@ vc4_shader_state_create(struct pipe_context *pctx,
                 */
                s = cso->ir.nir;

-                NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size,
+                NIR_PASS_V(s, nir_lower_io, nir_var_all & ~nir_var_uniform,
+                           type_size,
                           (nir_lower_io_options)0);
-        } else {
+                NIR_PASS_V(s, nir_lower_io, nir_var_uniform,
+                           uniforms_type_size,
+                           (nir_lower_io_options)0);
+       } else {
                assert(cso->type == PIPE_SHADER_IR_TGSI);

                if (vc4_debug & VC4_DEBUG_TGSI) {
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -614,7 +614,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
        }

        so->texture_p0 =
-                (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
+                (VC4_SET_FIELD((rsc->slices[0].offset +
+                                cso->u.tex.first_layer *
+                                rsc->cube_map_stride) >> 12, VC4_TEX_P0_OFFSET) |
                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) |
                 VC4_SET_FIELD(so->force_first_level ?
                               cso->u.tex.last_level :
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -26,8 +26,12 @@
 *
 **************************************************************************/

+#if !defined(ANDROID) || ANDROID_API_LEVEL >= 26
+/* Android's libc began supporting shm in Oreo */
+#define HAVE_SHM
 #include <sys/ipc.h>
 #include <sys/shm.h>
+#endif

 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
@@ -83,6 +87,7 @@ dri_sw_is_displaytarget_format_supported( struct sw_winsys *ws,
   return TRUE;
 }

+#ifdef HAVE_SHM
 static char *
 alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)
 {
@@ -101,6 +106,7 @@ alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)

   return addr;
 }
+#endif

 static struct sw_displaytarget *
 dri_sw_displaytarget_create(struct sw_winsys *winsys,
@@ -131,8 +137,11 @@ dri_sw_displaytarget_create(struct sw_winsys *winsys,
   size = dri_sw_dt->stride * nblocksy;

   dri_sw_dt->shmid = -1;
+
+#ifdef HAVE_SHM
   if (ws->lf->put_image_shm)
      dri_sw_dt->data = alloc_shm(dri_sw_dt, size);
+#endif

   if(!dri_sw_dt->data)
      dri_sw_dt->data = align_malloc(size, alignment);
@@ -156,8 +165,10 @@ dri_sw_displaytarget_destroy(struct sw_winsys *ws,
   struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);

   if (dri_sw_dt->shmid >= 0) {
+#ifdef HAVE_SHM
      shmdt(dri_sw_dt->data);
      shmctl(dri_sw_dt->shmid, IPC_RMID, 0);
+#endif
   } else {
      align_free(dri_sw_dt->data);
   }
--- a/src/glx/Makefile.am
+++ b/src/glx/Makefile.am
@@ -19,9 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.

-if HAVE_SHARED_GLAPI
-SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
-endif

 SUBDIRS =

@@ -181,7 +178,7 @@ GL_LIBS = \
 	$(LIBDRM_LIBS) \
 	libglx.la \
 	$(top_builddir)/src/mapi/glapi/libglapi.la \
-	$(SHARED_GLAPI_LIB) \
+	$(top_builddir)/src/mapi/shared-glapi/libglapi.la \
 	$(GL_LIB_DEPS)

 GL_LDFLAGS = \
--- a/src/glx/glxextensions.c
+++ b/src/glx/glxextensions.c
@@ -152,7 +152,7 @@ static const struct extension_info known_glx_extensions[] = {
   { GLX(ATI_pixel_format_float),      VER(0,0), N, N, N, N },
   { GLX(INTEL_swap_event),            VER(0,0), Y, N, N, N },
   { GLX(MESA_copy_sub_buffer),        VER(0,0), Y, N, N, N },
-   { GLX(MESA_multithread_makecurrent),VER(0,0), Y, N, Y, N },
+   { GLX(MESA_multithread_makecurrent),VER(0,0), Y, N, N, Y },
   { GLX(MESA_query_renderer),         VER(0,0), Y, N, N, Y },
   { GLX(MESA_swap_control),           VER(0,0), Y, N, N, Y },
   { GLX(NV_float_buffer),             VER(0,0), N, N, N, N },
--- a/src/intel/Makefile.tools.am
+++ b/src/intel/Makefile.tools.am
@@ -21,7 +21,9 @@

 noinst_PROGRAMS += \
 	tools/aubinator \
-	tools/aubinator_error_decode
+	tools/aubinator_error_decode \
+	tools/error2aub
+

 tools_aubinator_SOURCES = \
 	tools/aubinator.c \
@@ -59,3 +61,23 @@ tools_aubinator_error_decode_LDADD = \
 tools_aubinator_error_decode_CFLAGS = \
 	$(AM_CFLAGS) \
 	$(ZLIB_CFLAGS)
+
+
+tools_error2aub_SOURCES = \
+	tools/gen_context.h \
+	tools/gen8_context.h \
+	tools/gen10_context.h \
+	tools/aub_write.h \
+	tools/aub_write.c \
+	tools/error2aub.c
+
+tools_error2aub_CFLAGS = \
+	$(AM_CFLAGS) \
+	$(ZLIB_CFLAGS)
+
+tools_error2aub_LDADD = \
+	dev/libintel_dev.la \
+	$(PTHREAD_LIBS) \
+	$(DLOPEN_LIBS) \
+	$(ZLIB_LIBS) \
+	-lm
--- a/src/intel/blorp/blorp.c
+++ b/src/intel/blorp/blorp.c
@@ -75,18 +75,6 @@ brw_blorp_surface_info_init(struct blorp_context *blorp,
   if (format == ISL_FORMAT_UNSUPPORTED)
      format = surf->surf->format;

-   if (format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
-      /* Unfortunately, ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as
-       * a render target, which would prevent us from blitting to 24-bit
-       * depth.  The miptree consists of 32 bits per pixel, arranged as 24-bit
-       * depth values interleaved with 8 "don't care" bits.  Since depth
-       * values don't require any blending, it doesn't matter how we interpret
-       * the bit pattern as long as we copy the right amount of data, so just
-       * map it as 8-bit BGRA.
-       */
-      format = ISL_FORMAT_B8G8R8A8_UNORM;
-   }
-
   info->surf = *surf->surf;
   info->addr = surf->addr;

--- a/src/intel/blorp/blorp_blit.c
+++ b/src/intel/blorp/blorp_blit.c
@@ -776,6 +776,14 @@ blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
       * grid of samples with in a pixel. Sample number layout shows the
       * rectangular grid of samples roughly corresponding to the real sample
       * locations with in a pixel.
+       *
+       * In the case of 2x MSAA, the layout of sample indices is reversed from
+       * the layout of sample numbers:
+       *
+       * sample index layout :  ---------    sample number layout :  ---------
+       *                        | 0 | 1 |                            | 1 | 0 |
+       *                        ---------                            ---------
+       *
       * In case of 4x MSAA, layout of sample indices matches the layout of
       * sample numbers:
       *           ---------
@@ -819,7 +827,9 @@ blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
                                            key->x_scale * key->y_scale));
      sample = nir_f2i32(b, sample);

-      if (tex_samples == 8) {
+      if (tex_samples == 2) {
+         sample = nir_isub(b, nir_imm_int(b, 1), sample);
+      } else if (tex_samples == 8) {
         sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),
                                       nir_ishl(b, sample, nir_imm_int(b, 2))),
                           nir_imm_int(b, 0xf));
@@ -984,14 +994,14 @@ convert_color(struct nir_builder *b, nir_ssa_def *color,
   nir_ssa_def *value;

   if (key->dst_format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
-      /* The destination image is bound as R32_UNORM but the data needs to be
+      /* The destination image is bound as R32_UINT but the data needs to be
       * in R24_UNORM_X8_TYPELESS.  The bottom 24 are the actual data and the
       * top 8 need to be zero.  We can accomplish this by simply multiplying
       * by a factor to scale things down.
       */
-      float factor = (float)((1 << 24) - 1) / (float)UINT32_MAX;
-      value = nir_fmul(b, nir_fsat(b, nir_channel(b, color, 0)),
-                          nir_imm_float(b, factor));
+      unsigned factor = (1 << 24) - 1;
+      value = nir_fsat(b, nir_channel(b, color, 0));
+      value = nir_f2i32(b, nir_fmul(b, value, nir_imm_float(b, factor)));
   } else if (key->dst_format == ISL_FORMAT_L8_UNORM_SRGB) {
      value = nir_format_linear_to_srgb(b, nir_channel(b, color, 0));
   } else if (key->dst_format == ISL_FORMAT_R8G8B8_UNORM_SRGB) {
@@ -1976,7 +1986,7 @@ try_blorp_blit(struct blorp_batch *batch,
         isl_format_rgbx_to_rgba(params->dst.view.format);
   } else if (params->dst.view.format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
      wm_prog_key->dst_format = params->dst.view.format;
-      params->dst.view.format = ISL_FORMAT_R32_UNORM;
+      params->dst.view.format = ISL_FORMAT_R32_UINT;
   } else if (params->dst.view.format == ISL_FORMAT_A4B4G4R4_UNORM) {
      params->dst.view.swizzle =
         isl_swizzle_compose(params->dst.view.swizzle,
@@ -2240,6 +2250,17 @@ blorp_blit(struct blorp_batch *batch,
      }
   }

+   /* ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as a render target,
+    * which requires shader math to render to it.  Blitting Z24X8 to Z24X8
+    * is fairly common though, so we'd like to avoid it.  Since we don't need
+    * to blend depth values, we can simply pick a renderable format with the
+    * right number of bits-per-pixel, like 8-bit BGRA.
+    */
+   if (dst_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS &&
+       src_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
+      src_format = dst_format = ISL_FORMAT_B8G8R8A8_UNORM;
+   }
+
   brw_blorp_surface_info_init(batch->blorp, &params.src, src_surf, src_level,
                               src_layer, src_format, false);
   brw_blorp_surface_info_init(batch->blorp, &params.dst, dst_surf, dst_level,
--- a/src/intel/common/gen_sample_positions.h
+++ b/src/intel/common/gen_sample_positions.h
@@ -42,10 +42,10 @@ prefix##0YOffset   = 0.5;
 * c   1
 */
 #define GEN_SAMPLE_POS_2X(prefix) \
-prefix##0XOffset   = 0.25; \
-prefix##0YOffset   = 0.25; \
-prefix##1XOffset   = 0.75; \
-prefix##1YOffset   = 0.75;
+prefix##0XOffset   = 0.75; \
+prefix##0YOffset   = 0.75; \
+prefix##1XOffset   = 0.25; \
+prefix##1YOffset   = 0.25;

 /**
 * Sample positions:
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5115,6 +5115,25 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
      }
   }

+   if (devinfo->gen < 6) {
+      /* From the G45 PRM, Volume 4 Page 361:
+       *
+       *    "Operand Alignment Rule: With the exceptions listed below, a
+       *     source/destination operand in general should be aligned to even
+       *     256-bit physical register with a region size equal to two 256-bit
+       *     physical registers."
+       *
+       * Normally we enforce this by allocating virtual registers to the
+       * even-aligned class.  But we need to handle payload registers.
+       */
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
+             inst->size_read(i) > REG_SIZE) {
+            max_width = MIN2(max_width, 8);
+         }
+      }
+   }
+
   /* From the IVB PRMs:
    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
    *   are applied for both halves of the SIMD32 instruction. If different
@@ -6321,6 +6340,7 @@ fs_visitor::optimize()
   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
+      OPT(lower_simd_width);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }
--- a/src/intel/tools/aubinator.c
+++ b/src/intel/tools/aubinator.c
@@ -590,7 +590,7 @@ handle_memtrace_reg_write(uint32_t *p)
   uint32_t pphwsp_addr = context_descriptor & 0xfffff000;
   struct gen_batch_decode_bo pphwsp_bo = get_ggtt_batch_bo(NULL, pphwsp_addr);
   uint32_t *context = (uint32_t *)((uint8_t *)pphwsp_bo.map +
-                                    (pphwsp_bo.addr - pphwsp_addr) +
+                                    (pphwsp_addr - pphwsp_bo.addr) +
                                    pphwsp_size);

   uint32_t ring_buffer_head = context[5];
@@ -601,7 +601,7 @@ handle_memtrace_reg_write(uint32_t *p)
   struct gen_batch_decode_bo ring_bo = get_ggtt_batch_bo(NULL,
                                                          ring_buffer_start);
   assert(ring_bo.size > 0);
-   void *commands = (uint8_t *)ring_bo.map + (ring_bo.addr - ring_buffer_start);
+   void *commands = (uint8_t *)ring_bo.map + (ring_buffer_start - ring_bo.addr);

   if (context_descriptor & 0x100 /* ppgtt */) {
      batch_ctx.get_bo = get_ppgtt_batch_bo;
--- a/src/intel/tools/error2aub.c
+++ b/src/intel/tools/error2aub.c
@@ -205,7 +205,7 @@ main(int argc, char *argv[])
      BO_TYPE_UNKNOWN = 0,
      BO_TYPE_BATCH,
      BO_TYPE_USER,
-   } bo_type;
+   } bo_type = BO_TYPE_UNKNOWN;
   uint64_t bo_addr;

   char *line = NULL;
--- a/src/mesa/drivers/dri/i965/brw_multisample_state.h
+++ b/src/mesa/drivers/dri/i965/brw_multisample_state.h
@@ -38,13 +38,13 @@
 /**
 * 1x MSAA has a single sample at the center: (0.5, 0.5) -> (0x8, 0x8).
 *
- * 2x MSAA sample positions are (0.25, 0.25) and (0.75, 0.75):
+ * 2x MSAA sample positions are (0.75, 0.75) and (0.25, 0.25):
 *   4 c
- * 4 0
- * c   1
+ * 4 1
+ * c   0
 */
 static const uint32_t
-brw_multisample_positions_1x_2x = 0x0088cc44;
+brw_multisample_positions_1x_2x = 0x008844cc;

 /**
 * Sample positions:
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -68,10 +68,10 @@ gen6_get_sample_position(struct gl_context *ctx,
 * index layout in case of 2X and 4x MSAA, but they are different in
 * case of 8X MSAA.
 *
- * 2X MSAA sample index / number layout
- *           ---------
- *           | 0 | 1 |
- *           ---------
+ * 8X MSAA sample index layout    8x MSAA sample number layout
+ *           ---------                      ---------
+ *           | 0 | 1 |                      | 1 | 0 |
+ *           ---------                      ---------
 *
 * 4X MSAA sample index / number layout
 *           ---------
@@ -107,7 +107,7 @@ gen6_get_sample_position(struct gl_context *ctx,
 void
 gen6_set_sample_maps(struct gl_context *ctx)
 {
-   uint8_t map_2x[2] = {0, 1};
+   uint8_t map_2x[2] = {1, 0};
   uint8_t map_4x[4] = {0, 1, 2, 3};
   uint8_t map_8x[8] = {3, 7, 5, 0, 1, 2, 4, 6};
   uint8_t map_16x[16] = { 15, 10, 9, 7, 4, 1, 3, 13,
--- a/src/mesa/gl.pc.in
+++ b/src/mesa/gl.pc.in
@@ -7,7 +7,7 @@ Name: gl
 Description: Mesa OpenGL library
 Requires.private: @GL_PC_REQ_PRIV@
 Version: @PACKAGE_VERSION@
-Libs: -L${libdir} -l@GL_LIB@
+Libs: -L${libdir} -l@GL_PKGCONF_LIB@
 Libs.private: @GL_PC_LIB_PRIV@
 Cflags: -I${includedir} @GL_PC_CFLAGS@
 glx_tls: @GLX_TLS@
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -1229,7 +1229,7 @@ void st_init_extensions(struct pipe_screen *screen,
       screen->is_format_supported(screen, PIPE_FORMAT_R8G8B8A8_UNORM,
                                   PIPE_TEXTURE_2D, 0, 0,
                                   PIPE_BIND_SAMPLER_VIEW) &&
-       screen->is_format_supported(screen, PIPE_FORMAT_B8G8R8A8_SRGB,
+       screen->is_format_supported(screen, PIPE_FORMAT_R8G8B8A8_SRGB,
                                   PIPE_TEXTURE_2D, 0, 0,
                                   PIPE_BIND_SAMPLER_VIEW) &&
       screen->is_format_supported(screen, PIPE_FORMAT_R16_UNORM,
--- a/src/util/drirc
+++ b/src/util/drirc
@@ -120,6 +120,10 @@ TODO: document the other workarounds.
            <option name="allow_glsl_extension_directive_midshader" value="true" />
        </application>

+        <application name="Metro 2033 Redux / Metro Last Night Redux" executable="metro">
+            <option name="allow_glsl_extension_directive_midshader" value="true" />
+        </application>
+
        <application name="Worms W.M.D" executable="Worms W.M.Dx64">
            <option name="allow_higher_compat_version" value="true" />
        </application>