Compare commits


51 Commits

Author SHA1 Message Date
Dylan Baker
4c8bd415b4 VERSION: bump for 19.3.0 final 2019-12-12 11:21:58 -08:00
Dylan Baker
9e8aaa6f18 docs: add release notes for 19.3.0 2019-12-12 11:21:43 -08:00
Dylan Baker
a857bc66dc Revert "egl: move #include of local headers out of Khronos headers"
This reverts commit 87efb9f3a4.

This is breaking the Qt build, so it needs to go until these symbols can
make their way into upstream Khronos.
2019-12-12 09:24:42 -08:00
Dylan Baker
e0da018907 Revert "egl: avoid local modifications for eglext.h Khronos standard header file"
This reverts commit 2a497735ec.

This patch is built on the previous patch, which needs to be reverted.
2019-12-12 09:23:54 -08:00
Lionel Landwerlin
ce856a7392 anv: fix incorrect VMA alignment for CCS main surfaces
Maybe a finer way of dealing with this requirement would be to increase
the number of pdevice->memory.types[] to add a category for special
alignment cases.

Meanwhile this fixes the problem of CCS surface alignment and it's
probably not going to cause issues given the size of our address
space.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 6af8a4acc4 ("anv: Add aux-map translation for gen12+")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit 5fdea9f401)
2019-12-12 09:22:54 -08:00
Samuel Pitoiset
3a58a73661 ac/nir: fix out-of-bound access when loading constants from global
Global load/store instructions can't know if the offset is
out of bounds because they don't use descriptors (no range).

Fix this by clamping the offset for arrays that are indexed
with a non-constant offset that's greater than or equal to the
array size.

This fixes VM faults and GPU hangs with Dead Rising 4.

Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2148
Fixes: 71a6794200 ("ac/nir: Enable nir_opt_large_constants")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit a0f1a5fa05)
2019-12-12 09:22:54 -08:00
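
The clamping described above can be pictured independently of the LLVM builder
code (the actual hunk is in the ac_nir_to_llvm.c diff further down); a minimal
C sketch of the idea, with invented names, might be:

#include <stdint.h>

/* Minimal sketch only: clamp a non-constant index so that base + offset can
 * never reach past the declared range of the constant data.  The real patch
 * builds the equivalent compare + select with the LLVM IR builder. */
static uint8_t
load_constant_clamped(const uint8_t *constant_data,
                      uint32_t base, uint32_t range, uint32_t offset)
{
   if (offset >= range)                 /* dynamic index past the array */
      offset = range ? range - 1 : 0;   /* clamp instead of faulting */

   return constant_data[base + offset];
}
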
Pierre-Eric Pelloux-Prayer
1452cf672c radeonsi: use gfx9.surf_offset to compute texture offset
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2177
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit ff0f108666)
2019-12-12 09:22:54 -08:00
Mauro Rossi
88b2a8ba3f android: radeonsi: fix build after vl refactoring (v2)
The vl functions moved from radeonsi to gallium/auxiliary/vl have left
the Android build of radeonsi in a broken state.

The libmesa_galliumvl static library is needed to build radeonsi,
the gallium_dri build rules are reworked to avoid multiple symbols,
and the libmesa_galliumvl static dependency is needed in radeonsi.

Here is the changelog:
- android: gallium/auxiliary: add libmesa_galliumvl static
- android: gallium_dri: move libmesa_gallium to static to prevent multiple symbols
- android: radeonsi: fix build after vl refactoring

Fixes the following build error:

external/mesa/src/gallium/drivers/radeonsi/si_uvd.c:47:
error: undefined reference to 'vl_video_buffer_create_as_resource'
clang.real: error: linker command failed with exit code 1 (use -v to see invocation)

Fixes: 86e60bc ("radeonsi: remove si_vid_join_surfaces and use combined planar allocations")
Signed-off-by: Mauro Rossi <issor.oruam@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 96aef08dc6)
Conflicts Resolved by Dylan Baker

Conflicts:
	src/gallium/targets/dri/Android.mk

Panfrost is not enabled for android in 19.3, and the series is a bit
bigger than I'd like to pull into the stable branch for a .0 release
2019-12-11 16:41:11 -08:00
Jason Ekstrand
3f50741bc2 anv: Don't leak when set_tiling fails
Fixes: a44744e01d "anv: Require a dedicated allocation for..."
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
(cherry picked from commit 0a36fafa95)
Conflicts resolved by Dylan Baker

Conflicts:
	src/intel/vulkan/anv_device.c
2019-12-11 16:34:38 -08:00
Nanley Chery
58395e5293 iris: Fix import of multi-planar surfaces with modifiers
Multi-planar surfaces are allowed to have modifiers. Don't require
DRM_FORMAT_MOD_INVALID in order to create a surface for each plane
defined by the format.

Fixes: 246eebba4a ("iris: Export and import surfaces with modifiers that have aux data")
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 21376cffb3)
2019-12-11 15:49:41 -08:00
James Xiong
ae06960627 iris: try to set the specified tiling when importing a dmabuf
When importing a dmabuf with a specified tiling, the dmabuf user
should always try to set the tiling mode because: 1) the exporter
can set tiling AFTER exporting/importing; 2) a dmabuf could be
exported from a kernel driver other than i915, in which case the
dmabuf user and exporter need to set tiling separately.

This patch fixes a problem when running vkmark under weston with
iris on ICL, where it crashed to the console with the following
assert. i965 doesn't have this problem, as it always tries to set
the specified tiling mode.

weston: ../src/gallium/drivers/iris/iris_resource.c:990: iris_resource_from_handle: Assertion `res->bo->tiling_mode == isl_tiling_to_i915_tiling(res->surf.tiling)' failed.

Signed-off-by: James Xiong <james.xiong@intel.com>
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com>
(cherry picked from commit b6d45e7f74)
2019-12-11 15:49:35 -08:00
Nanley Chery
7b2ef16086 gallium: Store the image format in winsys_handle
This format will be used to properly handle planar images with modifiers
in iris.

Fixes: 246eebba4a ("iris: Export and import surfaces with modifiers that have aux data")
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 51ee8fff9b)
2019-12-11 15:46:50 -08:00
Bas Nieuwenhuizen
d4dad580e5 radv: Fix RGBX Android<->Vulkan format correspondence.
This is correct per the Vulkan spec format equivalence table.

Fixes: f36b52740a "radv/android: Add android hardware buffer queries."
Reviewed-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 2e44bfc14f)
2019-12-11 15:46:20 -08:00
Dylan Baker
4a3b4ccf6b meson/broadcom: libbroadcom_cle also needs zlib
Fixes: 1ae8018a6a
       ("meson: Add support for the vc4 driver.")
Reviewed-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit d0eebda990)
2019-12-11 15:46:15 -08:00
Dylan Baker
a637f36b61 meson/broadcom: libbroadcom_cle needs expat headers
Fixes: 1ae8018a6a
       ("meson: Add support for the vc4 driver.")
Reviewed-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 85a9698ac3)
2019-12-11 15:46:10 -08:00
Lionel Landwerlin
843629708e anv: fix missing gen12 handling
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 181be14d43 ("anv: Build for gen12")
Reviewed-by: Eric Engestrom <eric.engestrom@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit dcfe1903c3)
2019-12-10 09:14:38 -08:00
Pierre-Eric Pelloux-Prayer
01d53f7ac0 radeonsi: fix multi plane buffers creation
When using 3 planes, the sequence produces this chain:
  plane0 -> plane2
This commit fixes this to produce:
  plane0 -> plane1 -> plane2

Fixes: 86e60bc265 ("radeonsi: remove si_vid_join_surfaces and use combined planar allocations")
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2193
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit e3e91cebcd)
2019-12-10 09:14:34 -08:00
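
A chain like the one described above typically goes wrong when each new plane
is linked to the first plane instead of to the previously created one; a
hypothetical sketch of the corrected chaining loop (names invented for
illustration, not the radeonsi code):

struct plane_resource {
   struct plane_resource *next;
   /* ... per-plane surface data ... */
};

/* Hypothetical sketch: link each newly created plane to the previous one so
 * that three planes yield plane0 -> plane1 -> plane2, not plane0 -> plane2. */
static void
link_planes(struct plane_resource **planes, unsigned num_planes)
{
   struct plane_resource *last = planes[0];

   for (unsigned i = 1; i < num_planes; i++) {
      last->next = planes[i];   /* append to the tail, not to planes[0] */
      last = planes[i];
   }
   last->next = NULL;
}
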
Alyssa Rosenzweig
166a3ae3c8 gallium/util: Support POLYGON in u_stream_outputs_for_vertices
u_decomposed_prims_for_vertices cannot support POLYGON, but POLYGON is
trivial to support as a special case directly (since we already have the
number of vertices).

Fixes aborts in Panfrost in apps using GL_POLYGON.

Fixes: e881aa8c12 ("gallium/util: Add u_stream_outputs_for_vertices helper")
Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit a37822f5f7)
2019-12-10 09:14:28 -08:00
Jason Ekstrand
c5e203ff50 anv: Re-emit all compute state on pipeline switch
It's a very odd case to hit in the real world.  However, there are some
CTS tests which switch back and forth between dispatch and clear without
changing the pipeline.

Fixes: bc612536eb "anv: Emit a dummy MEDIA_VFE_STATE before switching..."
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
(cherry picked from commit 0f60aa4037)
2019-12-10 09:14:21 -08:00
Fritz Koenig
22d1e495da freedreno: reorder format check
With the addition of the planar formats helper, the
planar formats no longer have a valid block.bits field.
Calling util_format_get_blocksize therefore asserts.

Reorder the check to see if the format is supported
before doing the query to get the blocksize.

Fixes: 20f132e5ef ("gallium/util: add planar format layouts and helpers")

Signed-off-by: Fritz Koenig <frkoenig@google.com>
Reviewed-by: Rob Clark <robdclark@chromium.org>
(cherry picked from commit c496d44284)
2019-12-10 09:14:14 -08:00
Nanley Chery
a67289631f gallium/dri2: Fix creation of multi-planar modifier images
The commit noted below assumed and enforced that DRM_MOD_INVALID was the
only valid modifier for multi-planar imported images. Due to that, it
required that modifier on multi-planar images in order to:

   1. Allow multiple planes.
   2. Perform YUV format lowering and extent adjustments.
   3. Use buffer_index to correctly map the given planes.

Fix these issues by removing or updating the code built on that
assumption.

Fixes: 2066966c10 ("gallium/dri2: Support creating multi-planar modifier images")
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit d5c857837a)
2019-12-10 09:13:56 -08:00
Timothy Arceri
6adf4fe26d glsl/nir: iterate the system values list when adding varyings
Iterate the system values list when adding varyings to the program
resource list in the NIR linker. This is needed to avoid CTS
regressions when using the NIR to build the GLSL resource list in
an upcoming series. Presumably it also fixes a bug with the current
ARB_gl_spirv support.

Fixes: ffdb44d3a0 ("nir/linker: Add inputs/outputs to the program resource list")

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
(cherry picked from commit 1abca2b3c8)
2019-12-10 09:13:46 -08:00
Ian Romanick
fe136a943d intel/compiler: Fix 'comparison is always true' warning
Without looking at the assembly or something, I'm not sure what the
compiler does here.  The brw_reg_type enum is marked packed, so I'm
guessing that it gets represented as a uint8_t.  That's the only reason
I can think of that comparing with -1 would always be true.

This patch adds the same cast that exists in brw_hw_type_to_reg_type.
It might be better to add a #define outside the enum for
BRW_REGISTER_TYPE_INVALID as (enum brw_reg_type)-1.

src/intel/compiler/brw_eu_compact.c: In function ‘has_immediate’:
src/intel/compiler/brw_eu_compact.c:1515:20: warning: comparison is always true due to limited range of data type [-Wtype-limits]
 1515 |       return *type != -1;
      |                    ^~
src/intel/compiler/brw_eu_compact.c:1518:20: warning: comparison is always true due to limited range of data type [-Wtype-limits]
 1518 |       return *type != -1;
      |                    ^~

Reviewed-by: Eric Engestrom <eric.engestrom@intel.com>
CID: 1455194
Fixes: 12d3b11908 ("intel/compiler: Add instruction compaction support on Gen12")
Cc: @mattst88
(cherry picked from commit 668635abd2)
2019-12-10 09:13:05 -08:00
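
The warning is easy to reproduce outside Mesa with any packed enum, and the
cast-based fix the message describes looks the same in isolation; a
stand-alone illustration (assuming GCC's __attribute__((packed)) and
-Wtype-limits):

/* Stand-alone illustration, not the Mesa source: a packed enum may be
 * represented as an unsigned 8-bit type, so comparing it with -1 is always
 * true and -Wtype-limits flags it. */
enum reg_type {
   REG_TYPE_A,
   REG_TYPE_B,
} __attribute__((packed));

static int
has_valid_type_bad(enum reg_type t)
{
   return t != -1;                  /* warning: comparison is always true */
}

static int
has_valid_type_fixed(enum reg_type t)
{
   return t != (enum reg_type)-1;   /* cast first, as brw_hw_type_to_reg_type does */
}
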
Rob Clark
b2d5d0aae1 nir/lower_clip: Fix incorrect driver loc for clipdist outputs
Somehow adjusting maxloc based on existing outputs got lost, resulting
in the clipdist varying clobbering the position varying.  This caused a
shader that had no position output in freedreno/ir3, which triggers GPU
hangs in neverball.

Fixes: d0f746b645 ("nir: Save nir_variable pointers in nir_lower_clip_vs rather than locs.")
Signed-off-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
(cherry picked from commit 372ed42d22)
2019-12-10 09:13:00 -08:00
Dylan Baker
e8635ce28e cherry-ignore: update for 19.3-rc7 2019-12-04 13:41:07 -08:00
Lionel Landwerlin
fb6db6b5bb intel/perf: fix improper pointer access
This expression was unused by the macro, which is probably why it
didn't register during compilation.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Mark Janes <mark.a.janes@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit ddacd3d43b)
2019-12-04 13:41:07 -08:00
Lionel Landwerlin
c90f4e9508 intel/perf: simplify the processing of OA reports
This is a more accurate description of what happens when processing the
OA reports.

Previously we only had a somewhat difficult-to-parse state machine
tracking the context ID.

All we really need to decide whether the delta between 2 reports
(r0 & r1) should be accumulated in the query result is:

   * whether r0 is tagged with the context ID relevant to us

   * if r0 is not tagged with our context ID and r1 is: does r0 have an
     invalid context ID? If not, then we're in a case where i915 has
     resubmitted the same context for execution through the execlist
     submission port

v2: Update comment (Ken)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 8c0b058263)
2019-12-04 13:41:07 -08:00
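
Expressed as code, the two bullet points above reduce to a small predicate; a
hypothetical C sketch (struct, field and constant names invented here, not the
intel/perf API):

#include <stdbool.h>
#include <stdint.h>

#define CTX_ID_INVALID 0xffffffffu   /* placeholder "no context" value */

struct oa_report {
   uint32_t ctx_id;
   /* ... timestamp, counters ... */
};

/* Hypothetical sketch of the rule above: accumulate the delta between r0 and
 * r1 when r0 belongs to our context, or when r1 is ours and r0 still carries
 * a valid context ID (i915 resubmitting the same context through the execlist
 * submission port). */
static bool
should_accumulate(const struct oa_report *r0, const struct oa_report *r1,
                  uint32_t our_ctx_id)
{
   if (r0->ctx_id == our_ctx_id)
      return true;

   return r1->ctx_id == our_ctx_id && r0->ctx_id != CTX_ID_INVALID;
}
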
Lionel Landwerlin
1de3548668 intel/perf: take into account that reports read can be fairly old
If we read the OA reports late enough after the query happens, we can
get a timestamp in the report that is significantly in the past
compared to the start timestamp of the query. The current code must
deal with the wraparound of the timestamp value (every ~6 minutes). So
if the difference is greater than half that wraparound period, consider
that we're probably dealing with an old report, and make the caller
aware it should read more reports when they're available.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Mark Janes <mark.a.janes@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit b364e920bf)
2019-12-04 13:41:07 -08:00
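
Because the report timestamp wraps, "significantly in the past" has to be
judged modulo the wrap period; a minimal sketch of the half-period test
described above (the 32-bit timestamp width is an assumption made for
illustration):

#include <stdbool.h>
#include <stdint.h>

/* Minimal sketch: with a wrapping 32-bit timestamp, a modulo delta larger
 * than half the wrap period is more plausibly a report that predates the
 * query start than one from far in the future, so treat it as old and keep
 * reading. */
static bool
report_is_old(uint32_t query_start_ts, uint32_t report_ts)
{
   uint32_t delta = report_ts - query_start_ts;   /* modulo-2^32 arithmetic */

   return delta > UINT32_MAX / 2;
}
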
Lionel Landwerlin
4399795fbd intel/perf: set read buffer len to 0 to identify empty buffer
We always add an empty buffer to the list when creating the query.
Let's set the len appropriately so that we can recognize it when we
read OA reports up to the end of a query.

We were using a 0 timestamp value associated with the empty buffer
and incorrectly assuming this was a valid value. In turn, that led to
not reading enough reports and resulted in deltas being added to our
counter values that should have been discarded because they were
flagged for a different context.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Mark Janes <mark.a.janes@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 9d0a5c817c)
2019-12-04 13:41:07 -08:00
Lionel Landwerlin
d362ba77ce intel/perf: fix invalid hw_id in query results
Accumulation happens between 2 reports; it can be between a start/end
report from another context. So only consider updating the hw_id of
the results when it's not already valid and we have a valid value
to put in there.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 41b54b5faf ("i965: move OA accumulation code to intel/perf")
Reviewed-by: Mark Janes <mark.a.janes@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit acea59dbf8)
2019-12-04 13:41:07 -08:00
Dylan Baker
9b189cb9b1 VERSION: bump version for 19.3-rc6 2019-12-04 13:14:01 -08:00
Daniel Schürmann
15791ca8f9 aco: fix a couple of value numbering issues
Fixes: 3a20ef4a32 'aco: refactor value numbering'

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
2019-12-04 08:18:46 +00:00
Jason Ekstrand
f0aa6a7535 anv: Set up SBE_SWIZ properly for gl_Viewport
gl_Viewport is also in the VUE header so we need to whack the read
offset to 0 and emit a default (no overrides) SBE_SWIZ entry in that
case as well.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
(cherry picked from commit b1f37688ba)
2019-12-03 10:46:25 -08:00
Jordan Justen
f6ac7d9a5b iris: Allow max dynamic pool size of 2GB for gen12
Reworks:
 * Adjust comment to list the state packets that curro found to be
   affected.

Fixes: 8125d7960b ("intel/dev: Add preliminary device info for Tigerlake")
Cc: 19.3 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
(cherry picked from commit e277009d8d)
2019-12-03 10:46:25 -08:00
Rhys Perry
f9e8f6bad8 nir/lower_io_to_vector: don't create arrays when not needed
Some backends require that there are no array varyings.

If there were no arrays in the input shader, the pass shouldn't have to
create new ones.

Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2103
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2167
Fixes: bcd14756ee ('nir/lower_io_to_vector: add flat mode')
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
(cherry picked from commit 5404b7aaa3)
2019-12-03 10:46:09 -08:00
Rhys Perry
f7d100caad radv: set writes_memory for global memory stores/atomics
Fixes: 13ab63bb62 ('radv: Implement VK_EXT_buffer_device_address.')
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 35fab1ba33)
2019-12-03 10:24:23 -08:00
Daniel Schürmann
0fa0b5fc3a aco: don't split live-ranges of linear VGPRs
Fixes: 93c8ebfa78 'aco: Initial commit of independent AMD compiler'

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
(cherry picked from commit 8861a82be7)
2019-12-03 10:24:08 -08:00
Rhys Perry
bf03a4311b aco: add v_nop inbetween exec write and VMEM/DS/FLAT
LLVM and the proprietary compiler seem to do this.

Fixes: b01847bd9 ("aco/gfx10: Fix mitigation of VMEMtoScalarWriteHazard.")
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
(cherry picked from commit a9fc81b098)
2019-12-03 10:24:04 -08:00
Rhys Perry
967043eb68 aco: fix i2i64
Fixes: 93c8ebfa ('aco: Initial commit of independent AMD compiler')
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
(cherry picked from commit 11f43caaec)
2019-12-03 10:23:56 -08:00
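
The fix visible in the aco instruction-selection diff further down replaces a
zero-extend with a sign-extend; restated in plain C, the difference is just:

#include <stdint.h>

/* Plain-C restatement of the bug: building the 64-bit value as {src, 0}
 * zero-extends, while i2i64 needs sign extension: the high 32 bits must be
 * a copy of the sign bit (an arithmetic shift of src by 31). */
static uint64_t
i2i64_wrong(uint32_t src)
{
   return (uint64_t)src;                            /* high half forced to 0 */
}

static uint64_t
i2i64_correct(uint32_t src)
{
   uint32_t high = (uint32_t)((int32_t)src >> 31);  /* 0 or 0xffffffff */

   return ((uint64_t)high << 32) | src;
}
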
Rhys Perry
f4a4cce590 aco: propagate p_wqm on an image_sample's coordinate p_create_vector
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2156
Fixes: 93c8ebfa78 ('aco: Initial commit of independent AMD compiler')
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
(cherry picked from commit ff70ccad16)
2019-12-03 10:23:51 -08:00
Christian Gmeiner
4f026b2a05 etnaviv: remove dead code
ptiled is always NULL so the if statement is useless.

CoverityID: 1415572
Fixes: b962776530 ("etnaviv: rework compatible render base")
CC: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jonathan Marek <jonathan@marek.ca>
Signed-off-by: Christian Gmeiner <christian.gmeiner@gmail.com>
(cherry picked from commit 1be220833c)
2019-12-03 10:23:46 -08:00
Jonathan Gray
d32a34a3f3 i965: update Makefile.sources for perf changes
brw_performance_query_metrics.h was removed in
134e750e16 and
brw_performance_query.h was removed in
8ae6667992.

Remove the references to these files from Makefile.sources.

Signed-off-by: Jonathan Gray <jsg@jsg.id.au>
Fixes: 134e750e16 ("i965: extract performance query metrics")
Fixes: 8ae6667992 ("intel/perf: move query_object into perf")
Reviewed-by: Eric Engestrom <eric.engestrom@intel.com>
(cherry picked from commit 34dda0ca65)
2019-12-03 10:23:40 -08:00
Boris Brezillon
8e3c4caf74 panfrost: Make sure we reset the damage region of RTs at flush time
We must reset the damage info of our render targets here even though a
damage reset normally happens when the DRI layer swaps buffers. That's
because there can be implicit flushes the GL app is not aware of, and
those might impact the damage region: if part of the damaged portion
is drawn during those implicit flushes, you have to reload those areas
before the next draws are pushed, and since the driver can't easily know
what's been modified by the draws it flushed, the easiest solution is
to reload everything.

Reported-by: Carsten Haitzler <raster@rasterman.com>
Fixes: 65ae86b854 ("panfrost: Add support for KHR_partial_update()")
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
(cherry picked from commit c6e2096c47)
2019-12-03 10:23:35 -08:00
Boris Brezillon
e11d9cd9ed gallium: Fix the ->set_damage_region() implementation
The BACK_LEFT attachment can be outdated when the user calls
KHR_partial_update() (->lastStamp != ->texture_stamp), leading to a
damage region update on the wrong pipe_resource object.
When we're in that case, let's delay the ->set_damage_region() call
until the attachments are updated.

Reported-by: Carsten Haitzler <raster@rasterman.com>
Fixes: 492ffbed63 ("st/dri2: Implement DRI2bufferDamageExtension")
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit b196e1a8cf)
2019-12-03 10:23:29 -08:00
Bas Nieuwenhuizen
0ca8b506a4 radv: Fix timeline semaphore refcounting.
Was totally broken ...

Removed two if (point) {} checks because point is always non-NULL and we
were already relying on that for the counting, since we NULL our
references to semaphores without an active point earlier.

Fixes: 4aa75bb3bd "radv: Add wait-before-submit support for timelines."
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2137
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 48fc65413c)
2019-12-03 10:23:24 -08:00
Jonathan Gray
a260645345 winsys/amdgpu: avoid double simple_mtx_unlock()
Calling pthread_mutex_unlock() on an unlocked mutex is documented by POSIX
as undefined behaviour.  On OpenBSD, pthread_mutex_unlock() will call
abort(3) if this happens.

This occurs in amdgpu_winsys_create() after
cb446dc0fa ("winsys/amdgpu: Add amdgpu_screen_winsys").

Signed-off-by: Jonathan Gray <jsg@jsg.id.au>
Cc: 19.2 19.3 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 3fe3bde4f2)
2019-12-03 10:23:20 -08:00
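
The hazard is not specific to the winsys; with plain pthreads the rule is
simply that every path through a critical section unlocks exactly once. A
small stand-alone illustration (hypothetical function, not the amdgpu code):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t dev_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical illustration: unlocking an already-unlocked mutex is undefined
 * behaviour per POSIX (and abort()s on OpenBSD), so make each path own its
 * single unlock instead of unlocking again after a shared error label. */
static bool
create_screen(bool *out_created)
{
   pthread_mutex_lock(&dev_mutex);

   if (!out_created) {
      pthread_mutex_unlock(&dev_mutex);   /* error path: unlock once here... */
      return false;                       /* ...and return, don't fall through */
   }

   *out_created = true;
   pthread_mutex_unlock(&dev_mutex);      /* success path: the only other unlock */
   return true;
}
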
Bas Nieuwenhuizen
5ba4fb857d radv: Unify max_descriptor_set_size.
They were out of sync. Besides syncing them, let's ensure they never
diverge again.

Fixes: 8d2654a419 "radv: Support VK_EXT_inline_uniform_block."
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 4cde0e04e3)
2019-12-03 10:23:16 -08:00
Kenneth Graunke
553de940de drirc: Set vs_position_always_invariant for Shadow of Mordor on Intel
When drawing the main character in Shadow of Mordor, the game appears
to draw Talion with one vertex shader, and the Wraith with another.
If the compiler optimizes those in different ways that lead to slight
imprecisions, the resulting positions may not line up, leading to
Z-fighting as the game decides which of the two is in front.

brw_nir_opt_peephole_ffma looks at usages of multiply adds across the
entire shader, and may make different decisions between the two, leading
to such imprecisions and Z-fighting.  This started happening recently
after a NIR change to eliminate unnecessary MOVs (7025dbe7), but that
change simply exposed the existing problem.

Improves performance on Skylake GT4e by 1.22945% +/- 0.398672% (n=3),
likely due to the fixed rendering.

Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/1985
Fixes: 7025dbe794 ("nir: Skip emitting no-op movs from the builder.")
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
(cherry picked from commit 51cc380894)
2019-12-03 10:23:12 -08:00
Kenneth Graunke
f63c3ecaa0 driconf, glsl: Add a vs_position_always_invariant option
Many applications use multi-pass rendering and require their vertex
shader position to be computed the same way each time.  Optimizations
may consider, say, fusing a multiply-add based on global usage of an
expression in a shader.  But a second shader with the same expression
may have different code, causing that optimization to make the other
choice the second time around.

The correct solution is for applications to mark their VS outputs
'invariant', indicating they need multiple shaders to compute that
output in the same manner.  However, most applications fail to do so.

So, we add a new driconf option - vs_position_always_invariant - which
forces the gl_Position output in vertex shaders to be marked invariant.

Fixes: 7025dbe794 ("nir: Skip emitting no-op movs from the builder.")
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
(cherry picked from commit 9b577f2a88)
2019-12-03 10:23:03 -08:00
Samuel Pitoiset
d438ccdedf radv/gfx10: fix implementation of exclusive scans
This implementation is loosely based on ROCm.
https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/ockl/src/wfredscan.cl

This fixes dEQP-VK.subgroups.arithmetic.*.subgroupexclusive* on GFX10.

Fixes: 227c29a80d ("amd/common/gfx10: implement scan & reduce operations")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit c9aa843961)
Conflicts resolved by Dylan Baker
2019-12-03 10:22:47 -08:00
Samuel Pitoiset
19573e4374 radv: fix enabling sample shading with SampleID/SamplePosition
When a fragment shader includes an input variable decorated with
SampleId or SamplePosition, sample shading should be enabled
because minSampleShadingFactor is expected to be 1.0.

Cc: 19.2, 19.3 <mesa-stable@lists.freedesktop.org>
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 86a5fbfd4a)
2019-11-27 09:47:14 -08:00
58 changed files with 3690 additions and 246 deletions

View File

@@ -1 +1 @@
19.3.0-rc5
19.3.0

View File

@@ -1,6 +1,7 @@
# This is reverted shortly after landing
4432a2d14d80081d062f7939a950d65ea3a16eed
# This was manually backported
# These were manually backported
21be5c8edd3ad156f6cbfbceb96e7939716d9f2c
4b392ced2d744fccffe95490ff57e6b41033c266
b6905438514ae4de0b7f85c861e3d811ddaadda9

3138
docs/relnotes/19.3.0.html Normal file

File diff suppressed because it is too large.

View File

@@ -1362,6 +1362,20 @@ EGLAPI EGLuint64NV EGLAPIENTRY eglGetSystemTimeNV (void);
#define EGL_NATIVE_SURFACE_TIZEN 0x32A1
#endif /* EGL_TIZEN_image_native_surface */
#ifndef EGL_EXT_image_flush_external
#define EGL_EXT_image_flush_external 1
#define EGL_IMAGE_EXTERNAL_FLUSH_EXT 0x32A2
typedef EGLBoolean (EGLAPIENTRYP PFNEGLIMAGEFLUSHEXTERNALEXTPROC) (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLIMAGEINVALIDATEEXTERNALEXTPROC) (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglImageFlushExternalEXT (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglImageInvalidateExternalEXT (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
#endif
#endif /* EGL_EXT_image_flush_external */
#include <EGL/eglmesaext.h>
#include <EGL/eglextchromium.h>
#ifdef __cplusplus
}
#endif

View File

@@ -53,17 +53,6 @@ typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCVALUESCHROMIUMPROC)
#endif
#endif
#ifndef EGL_EXT_image_flush_external
#define EGL_EXT_image_flush_external 1
#define EGL_IMAGE_EXTERNAL_FLUSH_EXT 0x32A2
typedef EGLBoolean (EGLAPIENTRYP PFNEGLIMAGEFLUSHEXTERNALEXTPROC) (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLIMAGEINVALIDATEEXTERNALEXTPROC) (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglImageFlushExternalEXT (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglImageInvalidateExternalEXT (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list);
#endif
#endif /* EGL_EXT_image_flush_external */
#ifdef __cplusplus
}
#endif

View File

@@ -392,7 +392,7 @@ void insert_NOPs_gfx8_9(Program* program)
}
}
void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
void handle_instruction_gfx10(Program *program, NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& old_instructions,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
@@ -403,6 +403,9 @@ void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
instr->format == Format::SCRATCH || instr->format == Format::DS) {
/* Remember all SGPRs that are read by the VMEM instruction */
mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
ctx.sgprs_read_by_VMEM.set(exec);
if (program->wave_size == 64)
ctx.sgprs_read_by_VMEM.set(exec_hi);
} else if (instr->isSALU() || instr->format == Format::SMEM) {
/* Check if SALU writes an SGPR that was previously read by the VALU */
if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
@@ -535,7 +538,7 @@ void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
}
}
void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
void handle_block_gfx10(Program *program, NOP_ctx_gfx10& ctx, Block& block)
{
if (block.instructions.empty())
return;
@@ -544,7 +547,7 @@ void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
instructions.reserve(block.instructions.size());
for (aco_ptr<Instruction>& instr : block.instructions) {
handle_instruction_gfx10(ctx, instr, block.instructions, instructions);
handle_instruction_gfx10(program, ctx, instr, block.instructions, instructions);
instructions.emplace_back(std::move(instr));
}
@@ -569,7 +572,7 @@ void mitigate_hazards_gfx10(Program *program)
for (unsigned b : program->blocks[idx].linear_preds)
loop_block_ctx.join(all_ctx[b]);
handle_block_gfx10(loop_block_ctx, program->blocks[idx]);
handle_block_gfx10(program, loop_block_ctx, program->blocks[idx]);
/* We only need to continue if the loop header context changed */
if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
@@ -584,7 +587,7 @@ void mitigate_hazards_gfx10(Program *program)
for (unsigned b : block.linear_preds)
ctx.join(all_ctx[b]);
handle_block_gfx10(ctx, block);
handle_block_gfx10(program, ctx, block);
}
}

View File

@@ -1976,8 +1976,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_i2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 32) {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
if (src.regClass() == s1) {
Temp high = bld.sopc(aco_opcode::s_ashr_i32, bld.def(s1, scc), src, Operand(31u));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else if (src.regClass() == v1) {
Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
@@ -6572,11 +6576,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
}
}
if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
std::vector<Operand> args;
if (has_offset)
args.emplace_back(Operand(offset));
@@ -6592,7 +6591,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
if (has_lod)
args.emplace_back(lod);
Operand arg;
Temp arg;
if (args.size() > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
unsigned size = 0;
@@ -6604,12 +6603,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
Temp tmp = bld.tmp(rc);
vec->definitions[0] = Definition(tmp);
ctx->block->instructions.emplace_back(std::move(vec));
arg = Operand(tmp);
arg = tmp;
} else {
assert(args[0].isTemp());
arg = Operand(as_vgpr(ctx, args[0].getTemp()));
arg = as_vgpr(ctx, args[0].getTemp());
}
/* we don't need the bias, sample index, compare value or offset to be
* computed in WQM but if the p_create_vector copies the coordinates, then it
* needs to be in WQM */
if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
//FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
@@ -6741,7 +6748,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
}
tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
tex->operands[0] = arg;
tex->operands[0] = Operand(arg);
tex->operands[1] = Operand(resource);
tex->operands[2] = Operand(sampler);
tex->dim = dim;

View File

@@ -130,8 +130,7 @@ struct InstrPred {
return false;
}
}
if (a->format == Format::PSEUDO_BRANCH)
return false;
if (a->isVOP3()) {
VOP3A_instruction* a3 = static_cast<VOP3A_instruction*>(a);
VOP3A_instruction* b3 = static_cast<VOP3A_instruction*>(b);
@@ -147,7 +146,8 @@ struct InstrPred {
if (a->isDPP()) {
DPP_instruction* aDPP = static_cast<DPP_instruction*>(a);
DPP_instruction* bDPP = static_cast<DPP_instruction*>(b);
return aDPP->dpp_ctrl == bDPP->dpp_ctrl &&
return aDPP->pass_flags == bDPP->pass_flags &&
aDPP->dpp_ctrl == bDPP->dpp_ctrl &&
aDPP->bank_mask == bDPP->bank_mask &&
aDPP->row_mask == bDPP->row_mask &&
aDPP->bound_ctrl == bDPP->bound_ctrl &&
@@ -156,6 +156,7 @@ struct InstrPred {
aDPP->neg[0] == bDPP->neg[0] &&
aDPP->neg[1] == bDPP->neg[1];
}
switch (a->format) {
case Format::VOPC: {
/* Since the results depend on the exec mask, these shouldn't
@@ -191,7 +192,7 @@ struct InstrPred {
/* this is fine since they are only used for vertex input fetches */
MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
return aM->can_reorder == bM->can_reorder &&
return aM->can_reorder && bM->can_reorder &&
aM->barrier == bM->barrier &&
aM->dfmt == bM->dfmt &&
aM->nfmt == bM->nfmt &&
@@ -208,6 +209,10 @@ struct InstrPred {
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
return false;
case Format::DS: {
/* we already handle potential issue with permute/swizzle above */
@@ -276,6 +281,10 @@ void process_block(vn_ctx& ctx, Block& block)
op.setTemp(it->second);
}
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper)
ctx.exec_id++;
if (instr->definitions.empty()) {
new_instructions.emplace_back(std::move(instr));
continue;
@@ -288,10 +297,6 @@ void process_block(vn_ctx& ctx, Block& block)
ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
}
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper)
ctx.exec_id++;
instr->pass_flags = ctx.exec_id;
std::pair<expr_set::iterator, bool> res = ctx.expr_values.emplace(instr.get(), block.index);
@@ -303,6 +308,7 @@ void process_block(vn_ctx& ctx, Block& block)
if (dominates(ctx, res.first->second, block.index)) {
for (unsigned i = 0; i < instr->definitions.size(); i++) {
assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass());
assert(instr->definitions[i].isTemp());
ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp();
}
} else {

View File

@@ -759,11 +759,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
/* count variables to be moved and check war_hint */
bool war_hint = false;
for (unsigned j = reg_lo; j <= reg_hi; j++) {
if (reg_file[j] != 0)
bool linear_vgpr = false;
for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
if (reg_file[j] != 0) {
k++;
/* we cannot split live ranges of linear vgprs */
if (ctx.assignments[reg_file[j]].second & (1 << 6))
linear_vgpr = true;
}
war_hint |= ctx.war_hint[j];
}
if (linear_vgpr || (war_hint && !best_war_hint))
continue;
/* count operands in wrong positions */
for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
@@ -775,7 +782,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
k += instr->operands[j].size();
}
bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
if (k > num_moves || (!aligned && k == num_moves) || (war_hint && !best_war_hint))
if (k > num_moves || (!aligned && k == num_moves))
continue;
best_pos = reg_lo;
@@ -961,7 +968,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
handle_live_in = [&](Temp val, Block *block) -> Temp {
std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
if (preds.size() == 0 && block->index != 0) {
if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) {
renames[block->index][val.id()] = val;
return val;
}

View File

@@ -3953,8 +3953,43 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
{
LLVMValueRef result, tmp;
if (ctx->chip_class >= GFX10) {
result = inclusive ? src : identity;
if (inclusive) {
result = src;
} else if (ctx->chip_class >= GFX10) {
/* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
LLVMValueRef active, tmp1, tmp2;
LLVMValueRef tid = ac_get_thread_id(ctx);
tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
if (maxprefix > 32) {
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
LLVMConstInt(ctx->i32, 32, false), "");
tmp2 = LLVMBuildSelect(ctx->builder, active,
ac_build_readlane(ctx, src,
LLVMConstInt(ctx->i32, 31, false)),
tmp2, "");
active = LLVMBuildOr(ctx->builder, active,
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
LLVMBuildAnd(ctx->builder, tid,
LLVMConstInt(ctx->i32, 0x1f, false), ""),
LLVMConstInt(ctx->i32, 0x10, false), ""), "");
src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
} else if (maxprefix > 16) {
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
LLVMConstInt(ctx->i32, 16, false), "");
src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
}
result = src;
} else if (ctx->chip_class >= GFX8) {
src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
result = src;
} else {
if (!inclusive)
src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
@@ -3984,33 +4019,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
return result;
if (ctx->chip_class >= GFX10) {
/* dpp_row_bcast{15,31} are not supported on gfx10. */
LLVMBuilderRef builder = ctx->builder;
LLVMValueRef tid = ac_get_thread_id(ctx);
LLVMValueRef cc;
/* TODO-GFX10: Can we get better code-gen by putting this into
* a branch so that LLVM generates EXEC mask manipulations? */
if (inclusive)
tmp = result;
else
tmp = ac_build_alu_op(ctx, result, src, op);
tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
tmp = ac_build_alu_op(ctx, result, tmp, op);
cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
result = LLVMBuildSelect(builder, cc, tmp, result, "");
LLVMValueRef active;
tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
LLVMBuildAnd(ctx->builder, tid,
LLVMConstInt(ctx->i32, 16, false), ""),
ctx->i32_0, "");
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
result = ac_build_alu_op(ctx, result, tmp, op);
if (maxprefix <= 32)
return result;
if (inclusive)
tmp = result;
else
tmp = ac_build_alu_op(ctx, result, src, op);
tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
tmp = ac_build_alu_op(ctx, result, tmp, op);
cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
LLVMConstInt(ctx->i32, 32, false), "");
result = LLVMBuildSelect(builder, cc, tmp, result, "");
tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid,
LLVMConstInt(ctx->i32, 32, false), "");
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}

View File

@@ -3713,11 +3713,21 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
break;
}
case nir_intrinsic_load_constant: {
unsigned base = nir_intrinsic_base(instr);
unsigned range = nir_intrinsic_range(instr);
LLVMValueRef offset = get_src(ctx, instr->src[0]);
LLVMValueRef base = LLVMConstInt(ctx->ac.i32,
nir_intrinsic_base(instr),
false);
offset = LLVMBuildAdd(ctx->ac.builder, offset, base, "");
offset = LLVMBuildAdd(ctx->ac.builder, offset,
LLVMConstInt(ctx->ac.i32, base, false), "");
/* Clamp the offset to avoid out-of-bound access because global
* instructions can't handle them.
*/
LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false);
LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
offset, size, "");
offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, "");
LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data,
offset);
LLVMTypeRef comp_type =

View File

@@ -392,8 +392,8 @@ vk_format_from_android(unsigned android_format, unsigned android_usage)
{
switch (android_format) {
case AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM:
return VK_FORMAT_R8G8B8A8_UNORM;
case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM:
return VK_FORMAT_R8G8B8A8_UNORM;
case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM:
return VK_FORMAT_R8G8B8_UNORM;
case AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM:

View File

@@ -1097,6 +1097,24 @@ void radv_GetPhysicalDeviceFeatures2(
return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
}
static size_t
radv_max_descriptor_set_size()
{
/* make sure that the entire descriptor set is addressable with a signed
* 32-bit int. So the sum of all limits scaled by descriptor size has to
* be at most 2 GiB. the combined image & samples object count as one of
* both. This limit is for the pipeline layout, not for the set layout, but
* there is no set limit, so we just set a pipeline limit. I don't think
* any app is going to hit this soon. */
return ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS
- MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
}
void radv_GetPhysicalDeviceProperties(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties* pProperties)
@@ -1104,18 +1122,7 @@ void radv_GetPhysicalDeviceProperties(
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
VkSampleCountFlags sample_counts = 0xf;
/* make sure that the entire descriptor set is addressable with a signed
* 32-bit int. So the sum of all limits scaled by descriptor size has to
* be at most 2 GiB. the combined image & samples object count as one of
* both. This limit is for the pipeline layout, not for the set layout, but
* there is no set limit, so we just set a pipeline limit. I don't think
* any app is going to hit this soon. */
size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
size_t max_descriptor_set_size = radv_max_descriptor_set_size();
VkPhysicalDeviceLimits limits = {
.maxImageDimension1D = (1 << 14),
@@ -1394,13 +1401,7 @@ void radv_GetPhysicalDeviceProperties2(
properties->robustBufferAccessUpdateAfterBind = false;
properties->quadDivergentImplicitLod = false;
size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS -
MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
size_t max_descriptor_set_size = radv_max_descriptor_set_size();
properties->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
properties->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
properties->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
@@ -3855,8 +3856,7 @@ radv_finalize_timelines(struct radv_device *device,
pthread_mutex_lock(&wait_sems[i]->timeline.mutex);
struct radv_timeline_point *point =
radv_timeline_find_point_at_least_locked(device, &wait_sems[i]->timeline, wait_values[i]);
if (point)
--point->wait_count;
point->wait_count -= 2;
pthread_mutex_unlock(&wait_sems[i]->timeline.mutex);
}
}
@@ -3865,11 +3865,9 @@ radv_finalize_timelines(struct radv_device *device,
pthread_mutex_lock(&signal_sems[i]->timeline.mutex);
struct radv_timeline_point *point =
radv_timeline_find_point_at_least_locked(device, &signal_sems[i]->timeline, signal_values[i]);
if (point) {
signal_sems[i]->timeline.highest_submitted =
MAX2(signal_sems[i]->timeline.highest_submitted, point->value);
point->wait_count--;
}
signal_sems[i]->timeline.highest_submitted =
MAX2(signal_sems[i]->timeline.highest_submitted, point->value);
point->wait_count -= 2;
radv_timeline_trigger_waiters_locked(&signal_sems[i]->timeline, processing_list);
pthread_mutex_unlock(&signal_sems[i]->timeline.mutex);
}
@@ -5458,8 +5456,6 @@ radv_timeline_wait_locked(struct radv_device *device,
if (!point)
return VK_SUCCESS;
point->wait_count++;
pthread_mutex_unlock(&timeline->mutex);
bool success = device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, abs_timeout);

View File

@@ -1101,15 +1101,32 @@ radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
int ps_iter_samples = 1;
uint32_t mask = 0xffff;
if (vkms)
if (vkms) {
ms->num_samples = vkms->rasterizationSamples;
else
ms->num_samples = 1;
if (vkms)
ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
if (vkms && !vkms->sampleShadingEnable && pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
ps_iter_samples = ms->num_samples;
/* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
*
* "Sample shading is enabled for a graphics pipeline:
*
* - If the interface of the fragment shader entry point of the
* graphics pipeline includes an input variable decorated
* with SampleId or SamplePosition. In this case
* minSampleShadingFactor takes the value 1.0.
* - Else if the sampleShadingEnable member of the
* VkPipelineMultisampleStateCreateInfo structure specified
* when creating the graphics pipeline is set to VK_TRUE. In
* this case minSampleShadingFactor takes the value of
* VkPipelineMultisampleStateCreateInfo::minSampleShading.
*
* Otherwise, sample shading is considered disabled."
*/
if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
ps_iter_samples = ms->num_samples;
} else {
ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
}
} else {
ms->num_samples = 1;
}
const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =

View File

@@ -151,6 +151,15 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
((wrmask >> (i * 4)) & 0xf) << comp;
}
static void
set_writes_memory(const nir_shader *nir, struct radv_shader_info *info)
{
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
}
static void
gather_intrinsic_store_deref_info(const nir_shader *nir,
const nir_intrinsic_instr *instr,
@@ -308,10 +317,7 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_image_deref_atomic_xor ||
instr->intrinsic == nir_intrinsic_image_deref_atomic_exchange ||
instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) {
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
set_writes_memory(nir, info);
}
break;
}
@@ -326,17 +332,28 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_ssbo_atomic_comp_swap:
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
set_writes_memory(nir, info);
break;
case nir_intrinsic_load_deref:
gather_intrinsic_load_deref_info(nir, instr, info);
break;
case nir_intrinsic_store_deref:
gather_intrinsic_store_deref_info(nir, instr, info);
/* fallthrough */
case nir_intrinsic_deref_atomic_add:
case nir_intrinsic_deref_atomic_imin:
case nir_intrinsic_deref_atomic_umin:
case nir_intrinsic_deref_atomic_imax:
case nir_intrinsic_deref_atomic_umax:
case nir_intrinsic_deref_atomic_and:
case nir_intrinsic_deref_atomic_or:
case nir_intrinsic_deref_atomic_xor:
case nir_intrinsic_deref_atomic_exchange:
case nir_intrinsic_deref_atomic_comp_swap: {
if (nir_src_as_deref(instr->src[0])->mode & (nir_var_mem_global | nir_var_mem_ssbo))
set_writes_memory(nir, info);
break;
}
default:
break;
}

View File

@@ -58,6 +58,6 @@ libbroadcom_cle = static_library(
'v3d_decoder.c',
include_directories : [inc_common, inc_broadcom],
c_args : [c_vis_args, no_override_init_args],
dependencies : [dep_libdrm, dep_valgrind],
dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib],
build_by_default : false,
)

View File

@@ -1435,6 +1435,9 @@ builtin_variable_generator::add_varying(int slot, const glsl_type *type,
void
builtin_variable_generator::generate_varyings()
{
struct gl_shader_compiler_options *options =
&state->ctx->Const.ShaderCompilerOptions[state->stage];
/* gl_Position and gl_PointSize are not visible from fragment shaders. */
if (state->stage != MESA_SHADER_FRAGMENT) {
add_varying(VARYING_SLOT_POS, vec4_t, GLSL_PRECISION_HIGH, "gl_Position");
@@ -1526,6 +1529,9 @@ builtin_variable_generator::generate_varyings()
var->data.sample = fields[i].sample;
var->data.patch = fields[i].patch;
var->init_interface_type(per_vertex_out_type);
var->data.invariant = fields[i].location == VARYING_SLOT_POS &&
options->PositionAlwaysInvariant;
}
}
}

View File

@@ -34,32 +34,11 @@
*/
static bool
add_interface_variables(const struct gl_context *cts,
struct gl_shader_program *prog,
struct set *resource_set,
unsigned stage, GLenum programInterface)
add_vars_from_list(const struct gl_context *ctx,
struct gl_shader_program *prog, struct set *resource_set,
const struct exec_list *var_list, unsigned stage,
GLenum programInterface)
{
const struct exec_list *var_list = NULL;
struct gl_linked_shader *sh = prog->_LinkedShaders[stage];
if (!sh)
return true;
nir_shader *nir = sh->Program->nir;
assert(nir);
switch (programInterface) {
case GL_PROGRAM_INPUT:
var_list = &nir->inputs;
break;
case GL_PROGRAM_OUTPUT:
var_list = &nir->outputs;
break;
default:
assert("!Should not get here");
break;
}
nir_foreach_variable(var, var_list) {
if (var->data.how_declared == nir_var_hidden)
continue;
@@ -108,6 +87,38 @@ add_interface_variables(const struct gl_context *cts,
return true;
}
static bool
add_interface_variables(const struct gl_context *ctx,
struct gl_shader_program *prog,
struct set *resource_set,
unsigned stage, GLenum programInterface)
{
struct gl_linked_shader *sh = prog->_LinkedShaders[stage];
if (!sh)
return true;
nir_shader *nir = sh->Program->nir;
assert(nir);
switch (programInterface) {
case GL_PROGRAM_INPUT: {
bool result = add_vars_from_list(ctx, prog, resource_set,
&nir->inputs, stage, programInterface);
result &= add_vars_from_list(ctx, prog, resource_set, &nir->system_values,
stage, programInterface);
return result;
}
case GL_PROGRAM_OUTPUT:
return add_vars_from_list(ctx, prog, resource_set, &nir->outputs, stage,
programInterface);
default:
assert("!Should not get here");
break;
}
return false;
}
/* TODO: as we keep adding features, this method is becoming more and more
* similar to its GLSL counterpart at linker.cpp. Eventually it would be good
* to check if they could be refactored, and reduce code duplication somehow

View File

@@ -316,6 +316,17 @@ nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables, bool use_vars,
if (!ucp_enables)
return false;
/* find clipvertex/position outputs: */
nir_foreach_variable(var, &shader->outputs) {
int loc = var->data.driver_location;
/* keep track of last used driver-location.. we'll be
* appending CLIP_DIST0/CLIP_DIST1 after last existing
* output:
*/
maxloc = MAX2(maxloc, loc);
}
nir_builder_init(&b, impl);
/* NIR should ensure that, even in case of loops/if-else, there

View File

@@ -184,7 +184,10 @@ get_flat_type(const nir_shader *shader, nir_variable *old_vars[MAX_SLOTS][4],
if (num_vars <= 1)
return NULL;
return glsl_array_type(glsl_vector_type(base, 4), slots, 0);
if (slots == 1)
return glsl_vector_type(base, 4);
else
return glsl_array_type(glsl_vector_type(base, 4), slots, 0);
}
static bool
@@ -340,6 +343,9 @@ build_array_deref_of_new_var_flat(nir_shader *shader,
deref = nir_build_deref_array(b, deref, index);
}
if (!glsl_type_is_array(deref->type))
return deref;
bool vs_in = shader->info.stage == MESA_SHADER_VERTEX &&
new_var->data.mode == nir_var_shader_in;
return nir_build_deref_array(

View File

@@ -100,8 +100,6 @@ def generateHeader(functions):
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <EGL/eglmesaext.h>
#include <EGL/eglextchromium.h>
#include "glvnd/libeglabi.h"
""".lstrip("\n"))

View File

@@ -33,8 +33,6 @@
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <EGL/eglmesaext.h>
#include <EGL/eglextchromium.h>
#ifdef __cplusplus
extern "C" {

View File

@@ -77,3 +77,14 @@ LOCAL_GENERATED_SOURCES += $(MESA_GEN_NIR_H)
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
# Build libmesa_galliumvl used by radeonsi
include $(CLEAR_VARS)
LOCAL_SRC_FILES := \
$(VL_SOURCES)
LOCAL_MODULE := libmesa_galliumvl
include $(GALLIUM_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)

View File

@@ -38,6 +38,7 @@ DRI_CONF_SECTION_END
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false")
DRI_CONF_GLSL_ZERO_INIT("false")
DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false")
DRI_CONF_ALLOW_RGB10_CONFIGS("true")
DRI_CONF_ALLOW_FP16_CONFIGS("false")
DRI_CONF_SECTION_END

View File

@@ -338,7 +338,14 @@ u_stream_outputs_for_vertices(enum pipe_prim_type primitive, unsigned nr)
/* Extraneous vertices don't contribute to stream outputs */
u_trim_pipe_prim(primitive, &nr);
/* Consider how many primitives are actually generated */
/* Polygons are special, since they are a single primitive with many
* vertices. In this case, we just have an output for each vertex (after
* trimming) */
if (primitive == PIPE_PRIM_POLYGON)
return nr;
/* Normally, consider how many primitives are actually generated */
unsigned prims = u_decomposed_prims_for_vertices(primitive, nr);
/* One output per vertex after decomposition */

View File

@@ -498,7 +498,6 @@ etna_resource_from_handle(struct pipe_screen *pscreen,
struct etna_resource *rsc;
struct etna_resource_level *level;
struct pipe_resource *prsc;
struct pipe_resource *ptiled = NULL;
DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, "
"nr_samples=%u, usage=%u, bind=%x, flags=%x",
@@ -572,8 +571,6 @@ etna_resource_from_handle(struct pipe_screen *pscreen,
fail:
etna_resource_destroy(pscreen, prsc);
if (ptiled)
etna_resource_destroy(pscreen, ptiled);
return NULL;
}

View File

@@ -60,9 +60,9 @@ fd4_screen_is_format_supported(struct pipe_screen *pscreen,
}
if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
(fd4_pipe2tex(format) != (enum a4xx_tex_fmt)~0) &&
(target == PIPE_BUFFER ||
util_format_get_blocksize(format) != 12) &&
(fd4_pipe2tex(format) != (enum a4xx_tex_fmt)~0)) {
util_format_get_blocksize(format) != 12)) {
retval |= PIPE_BIND_SAMPLER_VIEW;
}


@@ -76,9 +76,9 @@ fd5_screen_is_format_supported(struct pipe_screen *pscreen,
}
if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) &&
(fd5_pipe2tex(format) != (enum a5xx_tex_fmt)~0) &&
(target == PIPE_BUFFER ||
util_format_get_blocksize(format) != 12) &&
(fd5_pipe2tex(format) != (enum a5xx_tex_fmt)~0)) {
util_format_get_blocksize(format) != 12)) {
retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
}


@@ -82,9 +82,9 @@ fd6_screen_is_format_supported(struct pipe_screen *pscreen,
}
if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) &&
(fd6_pipe2tex(format) != (enum a6xx_tex_fmt)~0) &&
(target == PIPE_BUFFER ||
util_format_get_blocksize(format) != 12) &&
(fd6_pipe2tex(format) != (enum a6xx_tex_fmt)~0)) {
util_format_get_blocksize(format) != 12)) {
retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
}


@@ -1294,7 +1294,8 @@ iris_bo_get_tiling(struct iris_bo *bo, uint32_t *tiling_mode,
}
struct iris_bo *
iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd)
iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd,
uint32_t tiling, uint32_t stride)
{
uint32_t handle;
struct iris_bo *bo;
@@ -1345,9 +1346,15 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd)
if (gen_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
goto err;
bo->tiling_mode = get_tiling.tiling_mode;
bo->swizzle_mode = get_tiling.swizzle_mode;
/* XXX stride is unknown */
if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) {
bo->tiling_mode = get_tiling.tiling_mode;
bo->swizzle_mode = get_tiling.swizzle_mode;
/* XXX stride is unknown */
} else {
if (bo_set_tiling_internal(bo, tiling, stride)) {
goto err;
}
}
out:
mtx_unlock(&bufmgr->lock);
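To make the new parameters concrete, here is a sketch of the two call patterns implied by this hunk and by the iris_resource_from_handle hunk further down (drawn from the diff itself, not from separate documentation):

/* Modifier known, e.g. I915_FORMAT_MOD_Y_TILED:
 *   iris_bo_import_dmabuf(bufmgr, prime_fd, I915_TILING_Y, stride);
 *   if GET_TILING already reports Y tiling nothing changes; otherwise the
 *   requested tiling and stride are applied via bo_set_tiling_internal().
 * Modifier unknown (DRM_FORMAT_MOD_INVALID):
 *   iris_bo_import_dmabuf(bufmgr, prime_fd, I915_TILING_LAST + 1, stride);
 *   any tiling value above I915_TILING_LAST means "keep whatever the kernel
 *   reports", preserving the old behaviour.
 */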
@@ -1660,6 +1667,7 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd, bool bo_reuse)
STATIC_ASSERT(IRIS_MEMZONE_SHADER_START == 0ull);
const uint64_t _4GB = 1ull << 32;
const uint64_t _2GB = 1ul << 31;
/* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */
const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE;
@@ -1669,9 +1677,16 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd, bool bo_reuse)
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE],
IRIS_MEMZONE_SURFACE_START,
_4GB_minus_1 - IRIS_MAX_BINDERS * IRIS_BINDER_SIZE);
/* TODO: Why does limiting to 2GB help some state items on gen12?
* - CC Viewport Pointer
* - Blend State Pointer
* - Color Calc State Pointer
*/
const uint64_t dynamic_pool_size =
(devinfo->gen >= 12 ? _2GB : _4GB_minus_1) - IRIS_BORDER_COLOR_POOL_SIZE;
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC],
IRIS_MEMZONE_DYNAMIC_START + IRIS_BORDER_COLOR_POOL_SIZE,
_4GB_minus_1 - IRIS_BORDER_COLOR_POOL_SIZE);
dynamic_pool_size);
/* Leave the last 4GB out of the high vma range, so that no state
* base address + size can overflow 48 bits.


@@ -352,7 +352,8 @@ int iris_hw_context_set_priority(struct iris_bufmgr *bufmgr,
void iris_destroy_hw_context(struct iris_bufmgr *bufmgr, uint32_t ctx_id);
int iris_bo_export_dmabuf(struct iris_bo *bo, int *prime_fd);
struct iris_bo *iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd);
struct iris_bo *iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd,
uint32_t tiling, uint32_t stride);
uint32_t iris_bo_export_gem_handle(struct iris_bo *bo);


@@ -960,12 +960,21 @@ iris_resource_from_handle(struct pipe_screen *pscreen,
struct gen_device_info *devinfo = &screen->devinfo;
struct iris_bufmgr *bufmgr = screen->bufmgr;
struct iris_resource *res = iris_alloc_resource(pscreen, templ);
const struct isl_drm_modifier_info *mod_inf =
isl_drm_modifier_get_info(whandle->modifier);
uint32_t tiling;
if (!res)
return NULL;
switch (whandle->type) {
case WINSYS_HANDLE_TYPE_FD:
res->bo = iris_bo_import_dmabuf(bufmgr, whandle->handle);
if (mod_inf)
tiling = isl_tiling_to_i915_tiling(mod_inf->tiling);
else
tiling = I915_TILING_LAST + 1;
res->bo = iris_bo_import_dmabuf(bufmgr, whandle->handle,
tiling, whandle->stride);
break;
case WINSYS_HANDLE_TYPE_SHARED:
res->bo = iris_bo_gem_create_from_name(bufmgr, "winsys image",
@@ -979,12 +988,14 @@ iris_resource_from_handle(struct pipe_screen *pscreen,
res->offset = whandle->offset;
uint64_t modifier = whandle->modifier;
if (modifier == DRM_FORMAT_MOD_INVALID) {
modifier = tiling_to_modifier(res->bo->tiling_mode);
if (mod_inf == NULL) {
mod_inf =
isl_drm_modifier_get_info(tiling_to_modifier(res->bo->tiling_mode));
}
res->mod_info = isl_drm_modifier_get_info(modifier);
assert(res->mod_info);
assert(mod_inf);
res->external_format = whandle->format;
res->mod_info = mod_inf;
isl_surf_usage_flags_t isl_usage = pipe_bind_to_isl_usage(templ->bind);
@@ -995,7 +1006,8 @@ iris_resource_from_handle(struct pipe_screen *pscreen,
if (templ->target == PIPE_BUFFER) {
res->surf.tiling = ISL_TILING_LINEAR;
} else {
if (whandle->modifier == DRM_FORMAT_MOD_INVALID || whandle->plane == 0) {
/* Create a surface for each plane specified by the external format. */
if (whandle->plane < util_format_get_num_planes(whandle->format)) {
UNUSED const bool isl_surf_created_successfully =
isl_surf_init(&screen->isl_dev, &res->surf,
.dim = target_to_isl_surf_dim(templ->target),
@@ -1173,6 +1185,8 @@ iris_resource_get_handle(struct pipe_screen *pscreen,
whandle->stride = res->surf.row_pitch_B;
bo = res->bo;
}
whandle->format = res->external_format;
whandle->modifier =
res->mod_info ? res->mod_info->modifier
: tiling_to_modifier(res->bo->tiling_mode);


@@ -162,6 +162,13 @@ struct iris_resource {
uint16_t has_hiz;
} aux;
/**
* For external surfaces, this is the format that was used to create or import
* the surface. For internal surfaces, this will always be
* PIPE_FORMAT_NONE.
*/
enum pipe_format external_format;
/**
* For external surfaces, this is the DRM format modifier that was used to
* create or import the surface. For internal surfaces, this will always


@@ -933,6 +933,25 @@ panfrost_batch_submit(struct panfrost_batch *batch)
if (ret)
fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret);
/* We must reset the damage info of our render targets here even
* though a damage reset normally happens when the DRI layer swaps
* buffers. That's because there can be implicit flushes the GL
* app is not aware of, and those might impact the damage region: if
* part of the damaged portion is drawn during those implicit flushes,
* you have to reload those areas before the next draws are pushed, and
* since the driver can't easily know what's been modified by the draws
* it flushed, the easiest solution is to reload everything.
*/
for (unsigned i = 0; i < batch->key.nr_cbufs; i++) {
struct panfrost_resource *res;
if (!batch->key.cbufs[i])
continue;
res = pan_resource(batch->key.cbufs[i]->texture);
panfrost_resource_reset_damage(res);
}
out:
panfrost_freeze_batch(batch);
panfrost_free_batch(batch);


@@ -48,7 +48,7 @@
#include "pan_util.h"
#include "pan_tiling.h"
static void
void
panfrost_resource_reset_damage(struct panfrost_resource *pres)
{
/* We set the damage extent to the full resource size but keep the


@@ -135,6 +135,9 @@ void
panfrost_blit_wallpaper(struct panfrost_context *ctx,
struct pipe_box *box);
void
panfrost_resource_reset_damage(struct panfrost_resource *pres);
void
panfrost_resource_set_damage_region(struct pipe_screen *screen,
struct pipe_resource *res,


@@ -40,7 +40,9 @@ LOCAL_C_INCLUDES := \
$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \
$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir
LOCAL_STATIC_LIBRARIES := libmesa_amd_common
LOCAL_STATIC_LIBRARIES := \
libmesa_amd_common \
libmesa_galliumvl
LOCAL_SHARED_LIBRARIES := libdrm_radeon
LOCAL_MODULE := libmesa_pipe_radeonsi


@@ -199,7 +199,8 @@ static unsigned si_texture_get_offset(struct si_screen *sscreen,
/* Each texture is an array of slices. Each slice is an array
* of mipmap levels. */
return box->z * tex->surface.u.gfx9.surf_slice_size +
return tex->surface.u.gfx9.surf_offset +
box->z * tex->surface.u.gfx9.surf_slice_size +
tex->surface.u.gfx9.offset[level] +
(box->y / tex->surface.blk_h *
tex->surface.u.gfx9.surf_pitch +
@@ -1721,10 +1722,12 @@ struct pipe_resource *si_texture_create(struct pipe_screen *screen,
tex->plane_index = i;
tex->num_planes = num_planes;
if (!last_plane)
if (!plane0) {
plane0 = last_plane = tex;
else
} else {
last_plane->buffer.b.b.next = &tex->buffer.b.b;
last_plane = tex;
}
}
return (struct pipe_resource *)plane0;
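To see why adding gfx9.surf_offset matters, consider a worked example with invented numbers (purely illustrative): for a plane whose surface starts at surf_offset = 0x20000 within a shared planar allocation, with blk_w = blk_h = 1, bpe = 2, surf_pitch = 1024, level 0 at offset 0 and box = (x=4, y=2, z=0), the mapped offset is 0x20000 + 0 + 0 + (2 * 1024 + 4) * 2 = 0x21008; before the fix the leading 0x20000 was missing, so transfers addressed the wrong region of the buffer.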


@@ -228,6 +228,7 @@ struct st_config_options
bool allow_glsl_builtin_variable_redeclaration;
bool allow_higher_compat_version;
bool glsl_zero_init;
bool vs_position_always_invariant;
bool force_glsl_abs_sqrt;
bool allow_glsl_cross_stage_interpolation_mismatch;
bool allow_glsl_layout_qualifier_on_function_parameters;


@@ -49,6 +49,12 @@ struct winsys_handle
*/
unsigned offset;
/**
* Input to resource_from_handle.
* Output from resource_get_handle.
*/
uint64_t format;
/**
* Input to resource_from_handle.
* Output from resource_get_handle.


@@ -547,6 +547,7 @@ dri2_allocate_textures(struct dri_context *ctx,
whandle.handle = buf->name;
whandle.stride = buf->pitch;
whandle.offset = 0;
whandle.format = format;
whandle.modifier = DRM_FORMAT_MOD_INVALID;
if (screen->can_share_buffer)
whandle.type = WINSYS_HANDLE_TYPE_SHARED;
@@ -777,18 +778,12 @@ dri2_create_image_from_winsys(__DRIscreen *_screen,
for (i = num_handles - 1; i >= 0; i--) {
struct pipe_resource *tex;
if (whandle[i].modifier == DRM_FORMAT_MOD_INVALID) {
templ.width0 = width >> map->planes[i].width_shift;
templ.height0 = height >> map->planes[i].height_shift;
if (is_yuv)
templ.format = dri2_get_pipe_format_for_dri_format(map->planes[i].dri_format);
else
templ.format = map->pipe_format;
} else {
templ.width0 = width;
templ.height0 = height;
templ.width0 = width >> map->planes[i].width_shift;
templ.height0 = height >> map->planes[i].height_shift;
if (is_yuv)
templ.format = dri2_get_pipe_format_for_dri_format(map->planes[i].dri_format);
else
templ.format = map->pipe_format;
}
assert(templ.format != PIPE_FORMAT_NONE);
tex = pscreen->resource_from_handle(pscreen,
@@ -826,6 +821,7 @@ dri2_create_image_from_name(__DRIscreen *_screen,
memset(&whandle, 0, sizeof(whandle));
whandle.type = WINSYS_HANDLE_TYPE_SHARED;
whandle.handle = name;
whandle.format = map->pipe_format;
whandle.modifier = DRM_FORMAT_MOD_INVALID;
whandle.stride = pitch * util_format_get_blocksize(map->pipe_format);
@@ -844,8 +840,13 @@ dri2_create_image_from_name(__DRIscreen *_screen,
}
static unsigned
dri2_get_modifier_num_planes(uint64_t modifier)
dri2_get_modifier_num_planes(uint64_t modifier, int fourcc)
{
const struct dri2_format_mapping *map = dri2_get_mapping_by_fourcc(fourcc);
if (!map)
return 0;
switch (modifier) {
case I915_FORMAT_MOD_Y_TILED_CCS:
return 2;
@@ -867,8 +868,8 @@ dri2_get_modifier_num_planes(uint64_t modifier)
/* FD_FORMAT_MOD_QCOM_TILED is not in drm_fourcc.h */
case I915_FORMAT_MOD_X_TILED:
case I915_FORMAT_MOD_Y_TILED:
return 1;
case DRM_FORMAT_MOD_INVALID:
return map->nplanes;
default:
return 0;
}
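For reference, how the reworked helper behaves for a few inputs (a sketch; the NV12 plane count is an assumption about the fourcc mapping table, the rest follows from the switch above):

/* dri2_get_modifier_num_planes(DRM_FORMAT_MOD_INVALID, DRM_FORMAT_NV12)          -> 2 (map->nplanes)
 * dri2_get_modifier_num_planes(I915_FORMAT_MOD_Y_TILED_CCS, DRM_FORMAT_XRGB8888) -> 2
 * dri2_get_modifier_num_planes(I915_FORMAT_MOD_X_TILED, DRM_FORMAT_XRGB8888)     -> 1
 * dri2_get_modifier_num_planes(DRM_FORMAT_MOD_INVALID, unknown fourcc)           -> 0 (no mapping)
 * dri2_get_modifier_num_planes(unrecognized modifier, DRM_FORMAT_XRGB8888)       -> 0
 */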
@@ -886,15 +887,13 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
__DRIimage *img = NULL;
unsigned err = __DRI_IMAGE_ERROR_SUCCESS;
int i, expected_num_fds;
uint64_t mod_planes = dri2_get_modifier_num_planes(modifier);
int num_handles = dri2_get_modifier_num_planes(modifier, fourcc);
if (!map || (modifier != DRM_FORMAT_MOD_INVALID && mod_planes == 0)) {
if (!map || num_handles == 0) {
err = __DRI_IMAGE_ERROR_BAD_MATCH;
goto exit;
}
int num_handles = mod_planes > 0 ? mod_planes : map->nplanes;
switch (fourcc) {
case DRM_FORMAT_YUYV:
case DRM_FORMAT_UYVY:
@@ -914,7 +913,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
for (i = 0; i < num_handles; i++) {
int fdnum = i >= num_fds ? 0 : i;
int index = mod_planes > 0 ? i : map->planes[i].buffer_index;
int index = i >= map->nplanes ? i : map->planes[i].buffer_index;
if (fds[fdnum] < 0) {
err = __DRI_IMAGE_ERROR_BAD_ALLOC;
goto exit;
@@ -924,6 +923,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
whandles[i].handle = (unsigned)fds[fdnum];
whandles[i].stride = (unsigned)strides[index];
whandles[i].offset = (unsigned)offsets[index];
whandles[i].format = map->pipe_format;
whandles[i].modifier = modifier;
whandles[i].plane = index;
}
@@ -1314,6 +1314,7 @@ dri2_from_names(__DRIscreen *screen, int width, int height, int format,
whandle.handle = names[0];
whandle.stride = strides[0];
whandle.offset = offsets[0];
whandle.format = map->pipe_format;
whandle.modifier = DRM_FORMAT_MOD_INVALID;
img = dri2_create_image_from_winsys(screen, width, height, map,
@@ -1411,7 +1412,7 @@ dri2_query_dma_buf_format_modifier_attribs(__DRIscreen *_screen,
{
switch (attrib) {
case __DRI_IMAGE_FORMAT_MODIFIER_ATTRIB_PLANE_COUNT: {
uint64_t mod_planes = dri2_get_modifier_num_planes(modifier);
uint64_t mod_planes = dri2_get_modifier_num_planes(modifier, fourcc);
if (mod_planes > 0)
*value = mod_planes;
return mod_planes > 0;
@@ -1879,8 +1880,6 @@ static void
dri2_set_damage_region(__DRIdrawable *dPriv, unsigned int nrects, int *rects)
{
struct dri_drawable *drawable = dri_drawable(dPriv);
struct pipe_resource *resource = drawable->textures[ST_ATTACHMENT_BACK_LEFT];
struct pipe_screen *screen = resource->screen;
struct pipe_box *boxes = NULL;
if (nrects) {
@@ -1894,8 +1893,25 @@ dri2_set_damage_region(__DRIdrawable *dPriv, unsigned int nrects, int *rects)
}
}
screen->set_damage_region(screen, resource, nrects, boxes);
FREE(boxes);
FREE(drawable->damage_rects);
drawable->damage_rects = boxes;
drawable->num_damage_rects = nrects;
/* Only apply the damage region if the BACK_LEFT texture is up-to-date. */
if (drawable->texture_stamp == drawable->dPriv->lastStamp &&
(drawable->texture_mask & (1 << ST_ATTACHMENT_BACK_LEFT))) {
struct pipe_screen *screen = drawable->screen->base.screen;
struct pipe_resource *resource;
if (drawable->stvis.samples > 1)
resource = drawable->msaa_textures[ST_ATTACHMENT_BACK_LEFT];
else
resource = drawable->textures[ST_ATTACHMENT_BACK_LEFT];
screen->set_damage_region(screen, resource,
drawable->num_damage_rects,
drawable->damage_rects);
}
}
static __DRI2bufferDamageExtension dri2BufferDamageExtension = {


@@ -92,6 +92,18 @@ dri_st_framebuffer_validate(struct st_context_iface *stctx,
}
} while (lastStamp != drawable->dPriv->lastStamp);
/* Flush the pending set_damage_region request. */
struct pipe_screen *pscreen = screen->base.screen;
if (new_mask & (1 << ST_ATTACHMENT_BACK_LEFT) &&
pscreen->set_damage_region) {
struct pipe_resource *resource = textures[ST_ATTACHMENT_BACK_LEFT];
pscreen->set_damage_region(pscreen, resource,
drawable->num_damage_rects,
drawable->damage_rects);
}
if (!out)
return true;
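Taken together, the dri2_set_damage_region and dri_st_framebuffer_validate hunks describe a deferral; a rough sketch of the resulting flow, as read from this diff:

/* dri2_set_damage_region(dPriv, nrects, rects)
 *   -> stores the boxes in drawable->damage_rects / num_damage_rects and only
 *      forwards them to pipe_screen::set_damage_region immediately when the
 *      BACK_LEFT attachment is already up to date.
 * dri_st_framebuffer_validate(...)
 *   -> once a new BACK_LEFT texture is validated, replays the stored boxes via
 *      pscreen->set_damage_region(pscreen, resource, num_damage_rects, damage_rects).
 * dri_destroy_buffer(...)
 *   -> frees the stored boxes.
 */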
@@ -197,6 +209,7 @@ dri_destroy_buffer(__DRIdrawable * dPriv)
/* Notify the st manager that this drawable is no longer valid */
stapi->destroy_drawable(stapi, &drawable->base);
FREE(drawable->damage_rects);
FREE(drawable);
}


@@ -52,6 +52,9 @@ struct dri_drawable
unsigned old_w;
unsigned old_h;
struct pipe_box *damage_rects;
unsigned int num_damage_rects;
struct pipe_resource *textures[ST_ATTACHMENT_COUNT];
struct pipe_resource *msaa_textures[ST_ATTACHMENT_COUNT];
unsigned int texture_mask, texture_stamp;


@@ -84,6 +84,8 @@ dri_fill_st_options(struct dri_screen *screen)
options->allow_higher_compat_version =
driQueryOptionb(optionCache, "allow_higher_compat_version");
options->glsl_zero_init = driQueryOptionb(optionCache, "glsl_zero_init");
options->vs_position_always_invariant =
driQueryOptionb(optionCache, "vs_position_always_invariant");
options->force_glsl_abs_sqrt =
driQueryOptionb(optionCache, "force_glsl_abs_sqrt");
options->allow_glsl_cross_stage_interpolation_mismatch =


@@ -57,7 +57,8 @@ endif
LOCAL_STATIC_LIBRARIES += \
libfreedreno_drm \
libfreedreno_ir3 \
libpanfrost_shared \
libmesa_gallium \
libpanfrost_shared
ifeq ($(USE_LIBBACKTRACE),true)
LOCAL_SHARED_LIBRARIES += libbacktrace
@@ -75,7 +76,6 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libmesa_nir \
libmesa_dri_common \
libmesa_megadriver_stub \
libmesa_gallium \
libmesa_pipe_loader \
libmesa_util \
libmesa_loader


@@ -326,7 +326,6 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
aws = util_hash_table_get(dev_tab, dev);
if (aws) {
pipe_reference(NULL, &aws->reference);
simple_mtx_unlock(&dev_tab_mutex);
/* Release the device handle, because we don't need it anymore.
* This function is returning an existing winsys instance, which


@@ -1512,10 +1512,10 @@ has_immediate(const struct gen_device_info *devinfo, const brw_inst *inst,
{
if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
*type = brw_inst_src0_type(devinfo, inst);
return *type != -1;
return *type != (enum brw_reg_type)-1;
} else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
*type = brw_inst_src1_type(devinfo, inst);
return *type != -1;
return *type != (enum brw_reg_type)-1;
}
return false;


@@ -71,6 +71,8 @@
#define MAP_READ (1 << 0)
#define MAP_WRITE (1 << 1)
#define OA_REPORT_INVALID_CTX_ID (0xffffffff)
/**
* Periodic OA samples are read() into these buffer structures via the
* i915 perf kernel interface and appended to the
@@ -1137,7 +1139,9 @@ gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
{
int i, idx = 0;
result->hw_id = start[2];
if (result->hw_id == OA_REPORT_INVALID_CTX_ID &&
start[2] != OA_REPORT_INVALID_CTX_ID)
result->hw_id = start[2];
result->reports_accumulated++;
switch (query->oa_format) {
@@ -1175,7 +1179,7 @@ void
gen_perf_query_result_clear(struct gen_perf_query_result *result)
{
memset(result, 0, sizeof(*result));
result->hw_id = 0xffffffff; /* invalid */
result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */
}
static void
@@ -1456,8 +1460,8 @@ get_free_sample_buf(struct gen_perf_context *perf_ctx)
exec_node_init(&buf->link);
buf->refcount = 0;
buf->len = 0;
}
buf->len = 0;
return buf;
}
@@ -1974,7 +1978,8 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx,
exec_list_get_tail(&perf_ctx->sample_buffers);
struct oa_sample_buf *tail_buf =
exec_node_data(struct oa_sample_buf, tail_node, link);
uint32_t last_timestamp = tail_buf->last_timestamp;
uint32_t last_timestamp =
tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp;
while (1) {
struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
@@ -1989,12 +1994,13 @@ read_oa_samples_until(struct gen_perf_context *perf_ctx,
exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);
if (len < 0) {
if (errno == EAGAIN)
return ((last_timestamp - start_timestamp) >=
if (errno == EAGAIN) {
return ((last_timestamp - start_timestamp) < INT32_MAX &&
(last_timestamp - start_timestamp) >=
(end_timestamp - start_timestamp)) ?
OA_READ_STATUS_FINISHED :
OA_READ_STATUS_UNFINISHED;
else {
} else {
DBG("Error reading i915 perf samples: %m\n");
}
} else
@@ -2210,6 +2216,17 @@ discard_all_queries(struct gen_perf_context *perf_ctx)
}
}
/* Looks for the validity bit of context ID (dword 2) of an OA report. */
static bool
oa_report_ctx_id_valid(const struct gen_device_info *devinfo,
const uint32_t *report)
{
assert(devinfo->gen >= 8);
if (devinfo->gen == 8)
return (report[0] & (1 << 25)) != 0;
return (report[0] & (1 << 16)) != 0;
}
/**
* Accumulate raw OA counter values based on deltas between pairs of
* OA reports.
@@ -2237,7 +2254,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
uint32_t *last;
uint32_t *end;
struct exec_node *first_samples_node;
bool in_ctx = true;
bool last_report_ctx_match = true;
int out_duration = 0;
assert(query->oa.map != NULL);
@@ -2266,7 +2283,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
first_samples_node = query->oa.samples_head->next;
foreach_list_typed_from(struct oa_sample_buf, buf, link,
&perf_ctx.sample_buffers,
&perf_ctx->sample_buffers,
first_samples_node)
{
int offset = 0;
@@ -2283,6 +2300,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
switch (header->type) {
case DRM_I915_PERF_RECORD_SAMPLE: {
uint32_t *report = (uint32_t *)(header + 1);
bool report_ctx_match = true;
bool add = true;
/* Ignore reports that come before the start marker.
@@ -2311,35 +2329,30 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
* of OA counters while any other context is active.
*/
if (devinfo->gen >= 8) {
if (in_ctx && report[2] != query->oa.result.hw_id) {
DBG("i915 perf: Switch AWAY (observed by ID change)\n");
in_ctx = false;
/* Consider that the current report matches our context only if
* the report says the report ID is valid.
*/
report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
report[2] == start[2];
if (report_ctx_match)
out_duration = 0;
} else if (in_ctx == false && report[2] == query->oa.result.hw_id) {
DBG("i915 perf: Switch TO\n");
in_ctx = true;
/* From experimentation in IGT, we found that the OA unit
* might label some report as "idle" (using an invalid
* context ID), right after a report for a given context.
* Deltas generated by those reports actually belong to the
* previous context, even though they're not labelled as
* such.
*
* We didn't *really* Switch AWAY in the case that we e.g.
* saw a single periodic report while idle...
*/
if (out_duration >= 1)
add = false;
} else if (in_ctx) {
assert(report[2] == query->oa.result.hw_id);
DBG("i915 perf: Continuation IN\n");
} else {
assert(report[2] != query->oa.result.hw_id);
DBG("i915 perf: Continuation OUT\n");
add = false;
else
out_duration++;
}
/* Only add the delta between <last, report> if the last report
* was clearly identified as our context, or if we have at most
* 1 report without a matching ID.
*
* The OA unit will sometimes label reports with an invalid
* context ID when i915 rewrites the execlist submit register
* with the same context as the one currently running. This
* happens when i915 wants to notify the HW of ringbuffer tail
* register update. We have to consider this report as part of
* our context as the 3d pipeline behind the OACS unit is still
* processing the operations started at the previous execlist
* submission.
*/
add = last_report_ctx_match && out_duration < 2;
}
if (add) {
@@ -2349,6 +2362,7 @@ accumulate_oa_reports(struct gen_perf_context *perf_ctx,
}
last = report;
last_report_ctx_match = report_ctx_match;
break;
}


@@ -345,6 +345,9 @@ VkResult anv_ResetCommandBuffer(
case 11: \
gen11_##func(__VA_ARGS__); \
break; \
case 12: \
gen12_##func(__VA_ARGS__); \
break; \
default: \
assert(!"Unknown hardware generation"); \
}


@@ -3000,6 +3000,14 @@ VkResult anv_DeviceWaitIdle(
bool
anv_vma_alloc(struct anv_device *device, struct anv_bo *bo)
{
const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
const struct gen_device_info *devinfo = &pdevice->info;
/* Gen12 CCS surface addresses need to be 64K aligned. We have no way of
* telling what this allocation is for so pick the largest alignment.
*/
const uint32_t vma_alignment =
devinfo->gen >= 12 ? (64 * 1024) : (4 * 1024);
if (!(bo->flags & EXEC_OBJECT_PINNED))
return true;
@@ -3009,7 +3017,8 @@ anv_vma_alloc(struct anv_device *device, struct anv_bo *bo)
if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS &&
device->vma_hi_available >= bo->size) {
uint64_t addr = util_vma_heap_alloc(&device->vma_hi, bo->size, 4096);
uint64_t addr =
util_vma_heap_alloc(&device->vma_hi, bo->size, vma_alignment);
if (addr) {
bo->offset = gen_canonical_address(addr);
assert(addr == gen_48b_address(bo->offset));
@@ -3018,7 +3027,8 @@ anv_vma_alloc(struct anv_device *device, struct anv_bo *bo)
}
if (bo->offset == 0 && device->vma_lo_available >= bo->size) {
uint64_t addr = util_vma_heap_alloc(&device->vma_lo, bo->size, 4096);
uint64_t addr =
util_vma_heap_alloc(&device->vma_lo, bo->size, vma_alignment);
if (addr) {
bo->offset = gen_canonical_address(addr);
assert(addr == gen_48b_address(bo->offset));
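A small numeric sketch of the effect (addresses invented for illustration only): on gen12 every pinned BO now receives a 64 KiB-aligned address, so util_vma_heap_alloc(&device->vma_lo, 4096, 64 * 1024) might return 0x00200000 where the previous 4 KiB-aligned call could have returned 0x00201000; the coarser alignment satisfies the CCS main-surface requirement at the cost of some address-space slack.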
@@ -3267,9 +3277,10 @@ VkResult anv_AllocateMemory(
i915_tiling);
if (ret) {
anv_bo_cache_release(device, &device->bo_cache, mem->bo);
return vk_errorf(device->instance, NULL,
VK_ERROR_OUT_OF_DEVICE_MEMORY,
"failed to set BO tiling: %m");
result = vk_errorf(device->instance, NULL,
VK_ERROR_OUT_OF_DEVICE_MEMORY,
"failed to set BO tiling: %m");
goto fail;
}
}
}


@@ -3893,6 +3893,13 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
vfe.NumberofURBEntries = 2;
vfe.URBEntryAllocationSize = 2;
}
/* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
* invalid. Set the compute pipeline to dirty to force a re-emit of the
* pipeline in case we get back-to-back dispatch calls with the same
* pipeline and a PIPELINE_SELECT in between.
*/
cmd_buffer->state.compute.pipeline_dirty = true;
}
#endif


@@ -369,8 +369,8 @@ emit_3dstate_sbe(struct anv_pipeline *pipeline)
if (input_index < 0)
continue;
/* gl_Layer is stored in the VUE header */
if (attr == VARYING_SLOT_LAYER) {
/* gl_Viewport and gl_Layer are stored in the VUE header */
if (attr == VARYING_SLOT_VIEWPORT || attr == VARYING_SLOT_LAYER) {
urb_entry_read_offset = 0;
continue;
}


@@ -35,9 +35,7 @@ i965_FILES = \
brw_object_purgeable.c \
brw_pipe_control.c \
brw_pipe_control.h \
brw_performance_query.h \
brw_performance_query.c \
brw_performance_query_metrics.h \
brw_program.c \
brw_program.h \
brw_program_binary.c \


@@ -98,6 +98,7 @@ DRI_CONF_BEGIN
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_GLSL_ZERO_INIT("false")
DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false")
DRI_CONF_ALLOW_RGB10_CONFIGS("false")
DRI_CONF_ALLOW_RGB565_CONFIGS("true")
DRI_CONF_ALLOW_FP16_CONFIGS("false")
@@ -2798,6 +2799,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
screen->compiler->constant_buffer_0_is_relative = devinfo->gen < 8 ||
!(screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION);
screen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].PositionAlwaysInvariant = driQueryOptionb(&screen->optionCache, "vs_position_always_invariant");
screen->compiler->supports_pull_constants = true;
screen->has_exec_fence =


@@ -3193,6 +3193,9 @@ struct gl_shader_compiler_options
/** Clamp UBO and SSBO block indices so they don't go out-of-bounds. */
GLboolean ClampBlockIndicesToArrayBounds;
/** (driconf) Force gl_Position to be considered invariant */
GLboolean PositionAlwaysInvariant;
const struct nir_shader_compiler_options *NirOptions;
};


@@ -754,6 +754,8 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoSat =
!screen->get_param(screen, PIPE_CAP_VERTEX_SHADER_SATURATE);
ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].PositionAlwaysInvariant = options->vs_position_always_invariant;
if (ctx->Const.GLSLVersion < 400) {
for (i = 0; i < MESA_SHADER_STAGES; i++)
ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true;


@@ -550,4 +550,14 @@ TODO: document the other workarounds.
<option name="gles_emulate_bgra" value="true" />
</application>
</device>
<device driver="i965">
<application name="Middle Earth: Shadow of Mordor" executable="ShadowOfMordor">
<option name="vs_position_always_invariant" value="true" />
</application>
</device>
<device driver="iris">
<application name="Middle Earth: Shadow of Mordor" executable="ShadowOfMordor">
<option name="vs_position_always_invariant" value="true" />
</application>
</device>
</driconf>


@@ -279,6 +279,11 @@ DRI_CONF_OPT_BEGIN_B(glsl_zero_init, def) \
DRI_CONF_DESC(en,gettext("Force uninitialized variables to default to zero")) \
DRI_CONF_OPT_END
#define DRI_CONF_VS_POSITION_ALWAYS_INVARIANT(def) \
DRI_CONF_OPT_BEGIN_B(vs_position_always_invariant, def) \
DRI_CONF_DESC(en,gettext("Force the vertex shader's gl_Position output to be considered 'invariant'")) \
DRI_CONF_OPT_END
#define DRI_CONF_ALLOW_RGB10_CONFIGS(def) \
DRI_CONF_OPT_BEGIN_B(allow_rgb10_configs, def) \
DRI_CONF_DESC(en,gettext("Allow exposure of visuals and fbconfigs with rgb10a2 formats")) \