Bump version for -rc3

cherry-ignore: Update for 19.3-rc3 cycle
egl: fix _EGL_NATIVE_PLATFORM fallback
2019-11-13 11:17:01 -08:00 · 2019-11-11 11:44:26 -08:00 · 2019-11-11 11:44:26 -08:00 · 2019-11-11 11:44:26 -08:00 · 2019-11-11 11:44:26 -08:00 · 2019-11-11 11:44:26 -08:00
52 changed files with 573 additions and 271 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-19.3.0-devel
+19.3.0-rc3
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,2 @@
+# This is reverted shortly after landing
+4432a2d14d80081d062f7939a950d65ea3a16eed
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -92,7 +92,7 @@ is_revert_nomination()
 }

 # Use the last branchpoint as our limit for the search
-latest_branchpoint=`git merge-base origin/master HEAD`
+latest_branchpoint=`git merge-base upstream/master HEAD`

 # List all the commits between day 1 and the branch point...
 git log --reverse --pretty=%H $latest_branchpoint > already_landed
@@ -103,7 +103,7 @@ git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_bra
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked

 # Grep for potential candidates
-git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\<fixes\>\|\<broken by\>\|This reverts commit' $latest_branchpoint..origin/master |\
+git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\<fixes\>\|\<broken by\>\|This reverts commit' $latest_branchpoint..upstream/master |\
 while read sha
 do
 	# Check to see whether the patch is on the ignore list.
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@@ -16,3 +16,5 @@ VK_INTEL_performance_query on Intel.
 Meson support for windows using MSVC and MinGW
 scons has been deprecated for non windows
 Initial Intel gen12 (Tigerlake) support on anvil and iris
+New compiler backend "ACO" for RADV (RADV_PERFTEST=aco)
+VK_EXT_shader_demote_to_helper_invocation on RADV/ACO.
--- a/src/amd/Makefile.sources
+++ b/src/amd/Makefile.sources
@@ -85,6 +85,7 @@ ACO_FILES = \
 	compiler/aco_register_allocation.cpp \
 	compiler/aco_live_var_analysis.cpp \
 	compiler/aco_lower_bool_phis.cpp \
+	compiler/aco_lower_to_cssa.cpp \
 	compiler/aco_lower_to_hw_instr.cpp \
 	compiler/aco_optimizer.cpp \
 	compiler/aco_opt_value_numbering.cpp \
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -114,6 +114,11 @@ unsigned
 ac_get_tbuffer_format(enum chip_class chip_class,
 		      unsigned dfmt, unsigned nfmt)
 {
+	// Some games try to access vertex buffers without a valid format.
+	// This is a game bug, but we should still handle it gracefully.
+	if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
+		return V_008F0C_IMG_FORMAT_INVALID;
+
 	if (chip_class >= GFX10) {
 		unsigned format;
 		switch (dfmt) {
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -317,6 +317,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*

      uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf->dfmt, mtbuf->nfmt);
      uint32_t encoding = (0b111010 << 26);
+      assert(img_format <= 0x7F);
      assert(!mtbuf->dlc || ctx.chip_class >= GFX10);
      encoding |= (mtbuf->dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
      encoding |= (mtbuf->glc ? 1 : 0) << 14;
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1263,13 +1263,13 @@ setup_isel_context(Program* program,
   } else if (program->chip_class >= GFX8) {
      program->physical_sgprs = 800;
      program->sgpr_alloc_granule = 15;
+      if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
+         program->sgpr_limit = 94; /* workaround hardware bug */
+      else
         program->sgpr_limit = 102;
   } else {
      program->physical_sgprs = 512;
      program->sgpr_alloc_granule = 7;
-      if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
-         program->sgpr_limit = 94; /* workaround hardware bug */
-      else
      program->sgpr_limit = 104;
   }
   /* TODO: we don't have to allocate VCC if we don't need it */
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -172,11 +172,11 @@ bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int movin
   }
 }

-bool can_reorder(Instruction* candidate, bool allow_smem)
+bool can_reorder(Instruction* candidate)
 {
   switch (candidate->format) {
   case Format::SMEM:
-      return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
+      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
@@ -200,7 +200,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
-   bool can_reorder_cur = can_reorder(current, false);
+   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
@@ -224,6 +224,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
@@ -231,7 +232,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
-      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
+      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
@@ -239,6 +240,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
+      if (candidate->isVMEM())
+         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
@@ -264,6 +267,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
+         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

@@ -280,6 +284,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
+         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

@@ -323,12 +328,14 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
+   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
@@ -369,7 +376,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
         }
      }

-      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
+      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
@@ -380,8 +387,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

-      if (is_dependency)
+      if (is_dependency) {
+         can_reorder_cur &= can_reorder_candidate;
         continue;
+      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
@@ -392,6 +401,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
+         if (candidate->isVMEM())
+            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
@@ -400,6 +411,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
+         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

@@ -440,7 +452,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
-   bool can_reorder_cur = can_reorder(current, false);
+   /* initially true as we don't pull other VMEM instructions
+    * through the current instruction */
+   bool can_reorder_vmem = true;
+   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
@@ -467,9 +482,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break when encountering another VMEM instruction, logical_start or barriers */
-      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
+      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
@@ -487,10 +503,11 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
      bool part_of_clause = false;
      if (candidate->isVMEM()) {
         bool same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
+         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
-         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
+         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
@@ -522,6 +539,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
+         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

@@ -555,6 +574,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
+         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

@@ -605,12 +626,16 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
+   // TODO: differentiate between loads and stores (load-load can always reorder)
+   can_reorder_vmem = true;
+   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
@@ -623,7 +648,11 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
         break;

      /* check if candidate depends on current */
-      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
+      bool is_dependency = false;
+      if (candidate->format == Format::SMEM)
+         is_dependency = !can_reorder_smem && !can_reorder_candidate;
+      if (candidate->isVMEM())
+         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
@@ -645,6 +674,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
+         /* update flag whether we can reorder other memory instructions */
+         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
@@ -652,7 +685,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
+
      } else if (candidate->isVMEM()) {
+         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
@@ -681,6 +716,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
+         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1291,9 +1291,9 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }
-   /* older generations need element size = 16 bytes. element size removed in GFX9 */
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   if (ctx.program->chip_class <= GFX8)
-      rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
                     private_segment_buffer, Operand(-1u),
@@ -1530,12 +1530,12 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
               /* spill vgpr */
               ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
               uint32_t spill_slot = vgpr_slot[spill_id];
-               bool add_offset = ctx.program->config->scratch_bytes_per_wave + vgpr_spill_slots * 4 > 4096;
-               unsigned base_offset = add_offset ? 0 : ctx.program->config->scratch_bytes_per_wave;
+               bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
+               unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;

               /* check if the scratch resource descriptor already exists */
               if (scratch_rsrc == Temp()) {
-                  unsigned offset = ctx.program->config->scratch_bytes_per_wave - base_offset;
+                  unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
                  scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
                                                       last_top_level_block_idx == block.index ?
                                                       instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
@@ -1544,37 +1544,21 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
               }

               unsigned offset = base_offset + spill_slot * 4;
-               aco_opcode opcode;
+               aco_opcode opcode = aco_opcode::buffer_store_dword;
               assert((*it)->operands[0].isTemp());
               Temp temp = (*it)->operands[0].getTemp();
               assert(temp.type() == RegType::vgpr && !temp.is_linear());
-               switch (temp.size()) {
-               case 1: opcode = aco_opcode::buffer_store_dword; break;
-               case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
-               case 6: temp = bld.tmp(v3); /* fallthrough */
-               case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
-               case 8: temp = bld.tmp(v4); /* fallthrough */
-               case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
-               default: {
+               if (temp.size() > 1) {
                  Instruction* split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
                  split->operands[0] = Operand(temp);
                  for (unsigned i = 0; i < temp.size(); i++)
                     split->definitions[i] = bld.def(v1);
                  bld.insert(split);
-                  opcode = aco_opcode::buffer_store_dword;
                  for (unsigned i = 0; i < temp.size(); i++)
                     bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
-                  continue;
-               }
-               }
-
-               if ((*it)->operands[0].size() > 4) {
-                  Temp temp2 = bld.pseudo(aco_opcode::p_split_vector, bld.def(temp.regClass()), Definition(temp), (*it)->operands[0]);
-                  bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp2, offset, false);
-                  offset += temp.size() * 4;
-               }
+               } else {
                  bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false);
-
+               }
            } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
               ctx.program->config->spilled_sgprs += (*it)->operands[0].size();

@@ -1615,12 +1599,12 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
            if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
               /* reload vgpr */
               uint32_t spill_slot = vgpr_slot[spill_id];
-               bool add_offset = ctx.program->config->scratch_bytes_per_wave + vgpr_spill_slots * 4 > 4096;
-               unsigned base_offset = add_offset ? 0 : ctx.program->config->scratch_bytes_per_wave;
+               bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
+               unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;

               /* check if the scratch resource descriptor already exists */
               if (scratch_rsrc == Temp()) {
-                  unsigned offset = ctx.program->config->scratch_bytes_per_wave - base_offset;
+                  unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
                  scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
                                                       last_top_level_block_idx == block.index ?
                                                       instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
@@ -1629,35 +1613,20 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
               }

               unsigned offset = base_offset + spill_slot * 4;
-               aco_opcode opcode;
+               aco_opcode opcode = aco_opcode::buffer_load_dword;
               Definition def = (*it)->definitions[0];
-               switch (def.size()) {
-               case 1: opcode = aco_opcode::buffer_load_dword; break;
-               case 2: opcode = aco_opcode::buffer_load_dwordx2; break;
-               case 6: def = bld.def(v3); /* fallthrough */
-               case 3: opcode = aco_opcode::buffer_load_dwordx3; break;
-               case 8: def = bld.def(v4); /* fallthrough */
-               case 4: opcode = aco_opcode::buffer_load_dwordx4; break;
-               default: {
+               if (def.size() > 1) {
                  Instruction* vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
                  vec->definitions[0] = def;
-                  opcode = aco_opcode::buffer_load_dword;
                  for (unsigned i = 0; i < def.size(); i++) {
                     Temp tmp = bld.tmp(v1);
                     vec->operands[i] = Operand(tmp);
                     bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false);
                  }
                  bld.insert(vec);
-                  continue;
-               }
-               }
-
+               } else {
                  bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false);
-               if ((*it)->definitions[0].size() > 4) {
-                  Temp temp2 = bld.mubuf(opcode, bld.def(def.regClass()), Operand(), scratch_rsrc, scratch_offset, offset + def.size() * 4, false);
-                  bld.pseudo(aco_opcode::p_create_vector, (*it)->definitions[0], def.getTemp(), temp2);
               }
-
            } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
               uint32_t spill_slot = sgpr_slot[spill_id];
               reload_in_loop[spill_slot / 64] = block.loop_nest_depth > 0;
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -25,6 +25,7 @@
 * IN THE SOFTWARE.
 */

+#include "dirent.h"
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/audit.h>
@@ -47,7 +48,6 @@
 #include "radv_shader.h"
 #include "radv_cs.h"
 #include "util/disk_cache.h"
-#include "util/strtod.h"
 #include "vk_util.h"
 #include <xf86drm.h>
 #include <amdgpu.h>
@@ -682,7 +682,6 @@ VkResult radv_CreateInstance(
 					 VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
 	instance->engineVersion = engine_version;

-	_mesa_locale_init();
 	glsl_type_singleton_init_or_ref();

 	VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -713,7 +712,6 @@ void radv_DestroyInstance(
 	VG(VALGRIND_DESTROY_MEMPOOL(instance));

 	glsl_type_singleton_decref();
-	_mesa_locale_fini();

 	driDestroyOptionCache(&instance->dri_options);
 	driDestroyOptionInfo(&instance->available_dri_options);
@@ -2069,25 +2067,61 @@ bool radv_sc_read(int fd, void *buf, size_t size, bool timeout)
 	}
 }

+/* Close every open fd of this process except those in keep_fds. Returns
+ * false if /proc/self/fd cannot be opened, true otherwise. */
+static bool radv_close_all_fds(const int *keep_fds, int keep_fd_count)
+{
+	DIR *fd_dir = opendir("/proc/self/fd");
+	if (!fd_dir)
+		return false;
+
+	/* The directory stream's own fd must stay open while we iterate. */
+	const int self_fd = dirfd(fd_dir);
+	struct dirent *entry;
+
+	while ((entry = readdir(fd_dir)) != NULL) {
+		/* Skip names starting with '.', i.e. the "." and ".." entries. */
+		if (entry->d_name[0] == '.')
+			continue;
+
+		const int fd = atoi(entry->d_name);
+		bool keep = (fd == self_fd);
+
+		for (int i = 0; i < keep_fd_count && !keep; ++i)
+			keep = (keep_fds[i] == fd);
+
+		if (!keep)
+			close(fd);
+	}
+
+	closedir(fd_dir);
+	return true;
+}
+
 static void run_secure_compile_device(struct radv_device *device, unsigned process,
-				      int *fd_secure_input, int *fd_secure_output)
+				      int fd_secure_input, int fd_secure_output)
 {
 	enum radv_secure_compile_type sc_type;
-	if (install_seccomp_filter() == -1) {
+
+	const int needed_fds[] = {
+		fd_secure_input,
+		fd_secure_output,
+	};
+	if (!radv_close_all_fds(needed_fds, ARRAY_SIZE(needed_fds)) || install_seccomp_filter() == -1) {
 		sc_type = RADV_SC_TYPE_INIT_FAILURE;
 	} else {
 		sc_type = RADV_SC_TYPE_INIT_SUCCESS;
-		device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input[0];
-		device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output[1];
+		device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input;
+		device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output;
 	}

-	write(fd_secure_output[1], &sc_type, sizeof(sc_type));
+	write(fd_secure_output, &sc_type, sizeof(sc_type));

 	if (sc_type == RADV_SC_TYPE_INIT_FAILURE)
 		goto secure_compile_exit;

 	while (true) {
-		radv_sc_read(fd_secure_input[0], &sc_type, sizeof(sc_type), false);
+		radv_sc_read(fd_secure_input, &sc_type, sizeof(sc_type), false);

 		if (sc_type == RADV_SC_TYPE_COMPILE_PIPELINE) {
 			struct radv_pipeline *pipeline;
@@ -2100,20 +2134,20 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce

 			/* Read pipeline layout */
 			struct radv_pipeline_layout layout;
-			sc_read = radv_sc_read(fd_secure_input[0], &layout, sizeof(struct radv_pipeline_layout), true);
-			sc_read &= radv_sc_read(fd_secure_input[0], &layout.num_sets, sizeof(uint32_t), true);
+			sc_read = radv_sc_read(fd_secure_input, &layout, sizeof(struct radv_pipeline_layout), true);
+			sc_read &= radv_sc_read(fd_secure_input, &layout.num_sets, sizeof(uint32_t), true);
 			if (!sc_read)
 				goto secure_compile_exit;

 			for (uint32_t set = 0; set < layout.num_sets; set++) {
 				uint32_t layout_size;
-				sc_read &= radv_sc_read(fd_secure_input[0], &layout_size, sizeof(uint32_t), true);
+				sc_read &= radv_sc_read(fd_secure_input, &layout_size, sizeof(uint32_t), true);
 				if (!sc_read)
 					goto secure_compile_exit;

 				layout.set[set].layout = malloc(layout_size);
 				layout.set[set].layout->layout_size = layout_size;
-				sc_read &= radv_sc_read(fd_secure_input[0], layout.set[set].layout,
+				sc_read &= radv_sc_read(fd_secure_input, layout.set[set].layout,
 							layout.set[set].layout->layout_size, true);
 			}

@@ -2121,16 +2155,16 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce

 			/* Read pipeline key */
 			struct radv_pipeline_key key;
-			sc_read &= radv_sc_read(fd_secure_input[0], &key, sizeof(struct radv_pipeline_key), true);
+			sc_read &= radv_sc_read(fd_secure_input, &key, sizeof(struct radv_pipeline_key), true);

 			/* Read pipeline create flags */
 			VkPipelineCreateFlags flags;
-			sc_read &= radv_sc_read(fd_secure_input[0], &flags, sizeof(VkPipelineCreateFlags), true);
+			sc_read &= radv_sc_read(fd_secure_input, &flags, sizeof(VkPipelineCreateFlags), true);

 			/* Read stage and shader information */
 			uint32_t num_stages;
 			const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, };
-			sc_read &= radv_sc_read(fd_secure_input[0], &num_stages, sizeof(uint32_t), true);
+			sc_read &= radv_sc_read(fd_secure_input, &num_stages, sizeof(uint32_t), true);
 			if (!sc_read)
 				goto secure_compile_exit;

@@ -2138,33 +2172,33 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce

 				/* Read stage */
 				gl_shader_stage stage;
-				sc_read &= radv_sc_read(fd_secure_input[0], &stage, sizeof(gl_shader_stage), true);
+				sc_read &= radv_sc_read(fd_secure_input, &stage, sizeof(gl_shader_stage), true);

 				VkPipelineShaderStageCreateInfo *pStage = calloc(1, sizeof(VkPipelineShaderStageCreateInfo));

 				/* Read entry point name */
 				size_t name_size;
-				sc_read &= radv_sc_read(fd_secure_input[0], &name_size, sizeof(size_t), true);
+				sc_read &= radv_sc_read(fd_secure_input, &name_size, sizeof(size_t), true);
 				if (!sc_read)
 					goto secure_compile_exit;

 				char *ep_name = malloc(name_size);
-				sc_read &= radv_sc_read(fd_secure_input[0], ep_name, name_size, true);
+				sc_read &= radv_sc_read(fd_secure_input, ep_name, name_size, true);
 				pStage->pName = ep_name;

 				/* Read shader module */
 				size_t module_size;
-				sc_read &= radv_sc_read(fd_secure_input[0], &module_size, sizeof(size_t), true);
+				sc_read &= radv_sc_read(fd_secure_input, &module_size, sizeof(size_t), true);
 				if (!sc_read)
 					goto secure_compile_exit;

 				struct radv_shader_module *module = malloc(module_size);
-				sc_read &= radv_sc_read(fd_secure_input[0], module, module_size, true);
+				sc_read &= radv_sc_read(fd_secure_input, module, module_size, true);
 				pStage->module = radv_shader_module_to_handle(module);

 				/* Read specialization info */
 				bool has_spec_info;
-				sc_read &= radv_sc_read(fd_secure_input[0], &has_spec_info, sizeof(bool), true);
+				sc_read &= radv_sc_read(fd_secure_input, &has_spec_info, sizeof(bool), true);
 				if (!sc_read)
 					goto secure_compile_exit;

@@ -2172,21 +2206,21 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
 					VkSpecializationInfo *specInfo = malloc(sizeof(VkSpecializationInfo));
 					pStage->pSpecializationInfo = specInfo;

-					sc_read &= radv_sc_read(fd_secure_input[0], &specInfo->dataSize, sizeof(size_t), true);
+					sc_read &= radv_sc_read(fd_secure_input, &specInfo->dataSize, sizeof(size_t), true);
 					if (!sc_read)
 						goto secure_compile_exit;

 					void *si_data = malloc(specInfo->dataSize);
-					sc_read &= radv_sc_read(fd_secure_input[0], si_data, specInfo->dataSize, true);
+					sc_read &= radv_sc_read(fd_secure_input, si_data, specInfo->dataSize, true);
 					specInfo->pData = si_data;

-					sc_read &= radv_sc_read(fd_secure_input[0], &specInfo->mapEntryCount, sizeof(uint32_t), true);
+					sc_read &= radv_sc_read(fd_secure_input, &specInfo->mapEntryCount, sizeof(uint32_t), true);
 					if (!sc_read)
 						goto secure_compile_exit;

 					VkSpecializationMapEntry *mapEntries = malloc(sizeof(VkSpecializationMapEntry) * specInfo->mapEntryCount);
 					for (uint32_t j = 0; j < specInfo->mapEntryCount; j++) {
-						sc_read &= radv_sc_read(fd_secure_input[0], &mapEntries[j], sizeof(VkSpecializationMapEntry), true);
+						sc_read &= radv_sc_read(fd_secure_input, &mapEntries[j], sizeof(VkSpecializationMapEntry), true);
 						if (!sc_read)
 							goto secure_compile_exit;
 					}
@@ -2222,7 +2256,7 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
 			vk_free(&device->alloc, pipeline);

 			sc_type = RADV_SC_TYPE_COMPILE_PIPELINE_FINISHED;
-			write(fd_secure_output[1], &sc_type, sizeof(sc_type));
+			write(fd_secure_output, &sc_type, sizeof(sc_type));

 		} else if (sc_type == RADV_SC_TYPE_DESTROY_DEVICE) {
 			goto secure_compile_exit;
@@ -2230,10 +2264,8 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
 	}

 secure_compile_exit:
-	close(fd_secure_input[1]);
-	close(fd_secure_input[0]);
-	close(fd_secure_output[1]);
-	close(fd_secure_output[0]);
+	close(fd_secure_input);
+	close(fd_secure_output);
 	_exit(0);
 }

@@ -2278,7 +2310,7 @@ static VkResult fork_secure_compile_device(struct radv_device *device)
 	for (unsigned process = 0; process < sc_threads; process++) {
 		if ((device->sc_state->secure_compile_processes[process].sc_pid = fork()) == 0) {
 			device->sc_state->secure_compile_thread_counter = process;
-			run_secure_compile_device(device, process, fd_secure_input[process], fd_secure_output[process]);
+			run_secure_compile_device(device, process, fd_secure_input[process][0], fd_secure_output[process][1]);
 		} else {
 			if (device->sc_state->secure_compile_processes[process].sc_pid == -1)
 				return VK_ERROR_INITIALIZATION_FAILED;
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -4646,10 +4646,10 @@ radv_secure_compile(struct radv_pipeline *pipeline,

 	/* Do an early exit if all cache entries are already there. */
 	bool may_need_copy_shader = pStages[MESA_SHADER_GEOMETRY];
-	void *main_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[0], 20);
+	void *main_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[0], NULL);
 	void *copy_entry = NULL;
 	if (may_need_copy_shader)
-		copy_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[1], 20);
+		copy_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[1], NULL);

 	bool has_all_cache_entries = main_entry && (!may_need_copy_shader || copy_entry);
 	free(main_entry);
@@ -5065,6 +5065,19 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
 	assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
 }

+static struct radv_pipeline_key
+radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
+				   const VkComputePipelineCreateInfo *pCreateInfo)
+{
+	struct radv_pipeline_key key; /* compute pipeline key, returned by value */
+	memset(&key, 0, sizeof(key)); /* every field not set below stays zero */
+	/* 'pipeline' is not read here; only pCreateInfo->flags feeds the key. */
+	if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
+		key.optimisations_disabled = 1;
+
+	return key;
+}
+
 static VkResult radv_compute_pipeline_create(
 	VkDevice                                    _device,
 	VkPipelineCache                             _cache,
@@ -5098,13 +5111,16 @@ static VkResult radv_compute_pipeline_create(

 	pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;

+	struct radv_pipeline_key key =
+		radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
+
 	if (radv_device_use_secure_compile(device->instance)) {
-		result = radv_secure_compile(pipeline, device, &(struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags, 1);
+		result = radv_secure_compile(pipeline, device, &key, pStages, pCreateInfo->flags, 1);
 		*pPipeline = radv_pipeline_to_handle(pipeline);

 		return result;
 	} else {
-		radv_create_shaders(pipeline, device, cache, &(struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks);
+		radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks);
 	}

 	pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -1582,7 +1582,7 @@ static bool radv_amdgpu_wait_syncobj(struct radeon_winsys *_ws, const uint32_t *
 					 &tmp);
 	if (ret == 0) {
 		return true;
-	} else if (ret == -1 && errno == ETIME) {
+	} else if (ret == -ETIME) {
 		return false;
 	} else {
 		fprintf(stderr, "amdgpu: radv_amdgpu_wait_syncobj failed!\nerrno: %d\n", errno);
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -301,8 +301,8 @@ class Variable(Value):
      # constant.  If we want to support names that have numeric or
      # punctuation characters, we can me the first assertion more flexible.
      assert self.var_name.isalpha()
-      assert self.var_name is not 'True'
-      assert self.var_name is not 'False'
+      assert self.var_name != 'True'
+      assert self.var_name != 'False'

      self.is_constant = m.group('const') is not None
      self.cond = m.group('cond')
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -5152,6 +5152,7 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
   }

   /* Set shader info defaults */
+   if (stage == MESA_SHADER_GEOMETRY)
      b->shader->info.gs.invocations = 1;

   /* Parse rounding mode execution modes. This has to happen earlier than
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -138,15 +138,6 @@ _eglNativePlatformDetectNativeDisplay(void *nativeDisplay)
      if (first_pointer == gbm_create_device)
         return _EGL_PLATFORM_DRM;
 #endif
-
-#ifdef HAVE_X11_PLATFORM
-      /* If not matched to any other platform, fallback to x11. */
-      return _EGL_PLATFORM_X11;
-#endif
-
-#ifdef HAVE_HAIKU_PLATFORM
-      return _EGL_PLATFORM_HAIKU;
-#endif
   }

   return _EGL_INVALID_PLATFORM;
--- a/src/egl/meson.build
+++ b/src/egl/meson.build
@@ -1,4 +1,4 @@
-# Copyright Â© 2017 Intel Corporation
+# Copyright © 2017-2019 Intel Corporation

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -149,6 +149,7 @@ if not with_glvnd
 else
  egl_lib_name = 'EGL_mesa'
  egl_lib_version = '0.0.0'
+  deps_for_egl += dep_glvnd
  files_egl += [g_egldispatchstubs_h, g_egldispatchstubs_c]
  files_egl += files('main/eglglvnd.c', 'main/egldispatchstubs.c')
  install_data(
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -39,7 +39,6 @@
 #include "compiler/glsl_types.h"
 #include "util/debug.h"
 #include "util/disk_cache.h"
-#include "util/strtod.h"
 #include "vk_format.h"
 #include "vk_util.h"

@@ -431,7 +430,6 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
      return vk_error(instance, result);
   }

-   _mesa_locale_init();
   glsl_type_singleton_init_or_ref();

   VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -457,7 +455,6 @@ tu_DestroyInstance(VkInstance _instance,
   VG(VALGRIND_DESTROY_MEMPOOL(instance));

   glsl_type_singleton_decref();
-   _mesa_locale_fini();

   vk_debug_report_instance_destroy(&instance->debug_report_callbacks);

--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -470,10 +470,6 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
 	case PIPE_SHADER_FRAGMENT:
 	case PIPE_SHADER_VERTEX:
 		break;
-	case PIPE_SHADER_GEOMETRY:
-		if (is_a6xx(screen))
-			break;
-		return 0;
 	case PIPE_SHADER_COMPUTE:
 		if (has_compute(screen))
 			break;
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -136,6 +136,7 @@ enum {
 #define IRIS_DIRTY_VF_STATISTICS            (1ull << 57)
 #define IRIS_DIRTY_PMA_FIX                  (1ull << 58)
 #define IRIS_DIRTY_DEPTH_BOUNDS             (1ull << 59)
+#define IRIS_DIRTY_RENDER_BUFFER            (1ull << 60)

 #define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_CS | \
                                    IRIS_DIRTY_SAMPLER_STATES_CS | \
@@ -151,7 +152,8 @@ enum {
                                 IRIS_DIRTY_BINDINGS_TES | \
                                 IRIS_DIRTY_BINDINGS_GS  | \
                                 IRIS_DIRTY_BINDINGS_FS  | \
-                                 IRIS_DIRTY_BINDINGS_CS)
+                                 IRIS_DIRTY_BINDINGS_CS  | \
+                                 IRIS_DIRTY_RENDER_BUFFER)

 /**
 * Non-orthogonal state (NOS) dependency flags.
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -3023,31 +3023,14 @@ iris_set_framebuffer_state(struct pipe_context *ctx,
   /* Render target change */
   ice->state.dirty |= IRIS_DIRTY_BINDINGS_FS;

+   ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
+
   ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_FRAMEBUFFER];

   if (GEN_GEN == 8)
      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
-
-#if GEN_GEN == 11
-   // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
-   // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
-
-   /* The PIPE_CONTROL command description says:
-    *
-    *   "Whenever a Binding Table Index (BTI) used by a Render Target Message
-    *    points to a different RENDER_SURFACE_STATE, SW must issue a Render
-    *    Target Cache Flush by enabling this bit. When render target flush
-    *    is set due to new association of BTI, PS Scoreboard Stall bit must
-    *    be set in this packet."
-    */
-   // XXX: does this need to happen at 3DSTATE_BTP_PS time?
-   iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
-                                "workaround: RT BTI change [draw]",
-                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
-#endif
 }

 /**
@@ -5297,6 +5280,24 @@ iris_upload_dirty_render_state(struct iris_context *ice,
      }
   }

+   if (GEN_GEN >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
+      // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
+      // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
+
+      /* The PIPE_CONTROL command description says:
+       *
+       *   "Whenever a Binding Table Index (BTI) used by a Render Target
+       *    Message points to a different RENDER_SURFACE_STATE, SW must issue a
+       *    Render Target Cache Flush by enabling this bit. When render target
+       *    flush is set due to new association of BTI, PS Scoreboard Stall bit
+       *    must be set in this packet."
+       */
+      // XXX: does this need to happen at 3DSTATE_BTP_PS time?
+      iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
+                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
+   }
+
   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) {
         iris_populate_binding_table(ice, batch, stage, false);
@@ -5508,7 +5509,7 @@ iris_upload_dirty_render_state(struct iris_context *ice,
             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
            cl.NonPerspectiveBarycentricEnable = true;

-         cl.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
+         cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
         cl.MaximumVPIndex = ice->state.num_viewports - 1;
      }
      iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -122,6 +122,8 @@ private:
   void emitSAM();
   void emitRAM();

+   void emitPSETP();
+
   void emitMOV();
   void emitS2R();
   void emitCS2R();
@@ -690,6 +692,31 @@ CodeEmitterGM107::emitRAM()
 * predicate/cc
 ******************************************************************************/

+void
+CodeEmitterGM107::emitPSETP()
+{
+   // Emits AND/OR/XOR; emitInstruction calls this when def(0) is in FILE_PREDICATE.
+   emitInsn(0x50900000);
+   // The field written at 0x18 selects the operation: 0 = AND, 1 = OR, 2 = XOR.
+   switch (insn->op) {
+   case OP_AND: emitField(0x18, 3, 0); break;
+   case OP_OR:  emitField(0x18, 3, 1); break;
+   case OP_XOR: emitField(0x18, 3, 2); break;
+   default:
+      assert(!"unexpected operation");
+      break;
+   }
+   // Operands: src(1) and src(0), each with an emitINV call, then def(0).
+   // emitINV (0x2a);
+   emitPRED(0x27); // TODO: support 3-arg
+   emitINV (0x20, insn->src(1));
+   emitPRED(0x1d, insn->src(1));
+   emitINV (0x0f, insn->src(0));
+   emitPRED(0x0c, insn->src(0));
+   emitPRED(0x03, insn->def(0));
+   emitPRED(0x00);
+}
+
 /*******************************************************************************
 * movement / conversion
 ******************************************************************************/
@@ -3557,7 +3584,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
   case OP_AND:
   case OP_OR:
   case OP_XOR:
-      emitLOP();
+      switch (insn->def(0).getFile()) {
+      case FILE_GPR: emitLOP(); break;
+      case FILE_PREDICATE: emitPSETP(); break;
+      default:
+         assert(!"invalid bool op");
+      }
      break;
   case OP_NOT:
      emitNOT();
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1591,6 +1591,12 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
      if (insn.getOpcode() == TGSI_OPCODE_STORE &&
          dst.getFile() != TGSI_FILE_MEMORY) {
         info->io.globalAccess |= 0x2;
+
+         if (dst.getFile() == TGSI_FILE_INPUT) {
+            // TODO: Handle indirect somehow?
+            const int i = dst.getIndex(0);
+            info->in[i].mask |= 1;
+         }
      }

      if (dst.getFile() == TGSI_FILE_OUTPUT) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1802,6 +1802,9 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless
 {
   uint32_t base = slot * NVC0_SU_INFO__STRIDE;

+   // We don't upload surface info for bindless for GM107+
+   assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);
+
   if (ptr) {
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
      if (bindless)
@@ -2204,7 +2207,7 @@ getDestType(const ImgType type) {
 }

 void
-NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
+NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
 {
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int width = format->bits[0] + format->bits[1] +
@@ -2223,21 +2226,38 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
   if (width < 32)
      untypedDst[0] = bld.getSSA();

+   if (loaded && loaded[0]) {
+      for (int i = 0; i < 4; i++) {
+         if (loaded[i])
+            typedDst[i] = loaded[i]->getDef(0);
+      }
+   } else {
      for (int i = 0; i < 4; i++) {
         typedDst[i] = su->getDef(i);
      }
+   }

   // Set the untyped dsts as the su's destinations
+   if (loaded && loaded[0]) {
+      for (int i = 0; i < 4; i++)
+         if (loaded[i])
+            loaded[i]->setDef(0, untypedDst[i]);
+   } else {
      for (int i = 0; i < 4; i++)
         su->setDef(i, untypedDst[i]);

      bld.setPosition(su, true);
+   }

   // Unpack each component into the typed dsts
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;
+
+      if (loaded && loaded[0])
+         bld.setPosition(loaded[i], true);
+
      if (i >= format->components) {
         if (format->type == FLOAT ||
             format->type == UNORM ||
@@ -2308,7 +2328,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
   processSurfaceCoordsNVE4(su);

   if (su->op == OP_SULDP) {
-      convertSurfaceFormat(su);
+      convertSurfaceFormat(su, NULL);
      insertOOBSurfaceOpResult(su);
   }

@@ -2421,7 +2441,7 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
   processSurfaceCoordsNVC0(su);

   if (su->op == OP_SULDP) {
-      convertSurfaceFormat(su);
+      convertSurfaceFormat(su, NULL);
      insertOOBSurfaceOpResult(su);
   }

@@ -2463,14 +2483,16 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
   }
 }

-void
-NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
+TexInstruction *
+NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])
 {
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
-   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+   const bool array = su->tex.target.isArray() || su->tex.target.isCube();
+   const int arg = dim + array;
   Value *ind = su->getIndirectR();
   Value *handle;
+   Instruction *pred = NULL, *pred2d = NULL;
   int pos = 0;

   bld.setPosition(su, false);
@@ -2489,19 +2511,38 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
      assert(pos == 0);
      break;
   }
+
+   if (dim == 2 && !array) {
+      // This might be a 2d slice of a 3d texture, try to load the z
+      // coordinate in.
+      Value *v;
+      if (!su->tex.bindless)
+         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+      else
+         v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));
+      Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));
+      pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                         TYPE_U32, bld.mkImm(0), is_3d);
+
+      bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));
+      su->moveSources(dim, 1);
+      su->setSrc(dim, v);
+      su->tex.target = nv50_ir::TEX_TARGET_3D;
+      pos++;
+   }
+
   if (su->tex.bindless)
-      handle = ind;
+      handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));
   else
      handle = loadTexHandle(ind, slot + 32);
+
   su->setSrc(arg + pos, handle);

   // The address check doesn't make sense here. The format check could make
   // sense but it's a bit of a pain.
-   if (su->tex.bindless)
-      return;
-
+   if (!su->tex.bindless) {
      // prevent read fault when the image is not actually bound
-   CmpInstruction *pred =
+      pred =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, bld.mkImm(0),
                   loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
@@ -2517,39 +2558,106 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
                   loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
                   pred->getDef(0));
      }
+   }
+
+   // Now we have "pred" which (optionally) contains whether to do the surface
+   // op at all, and a "pred2d" which indicates that, in case of doing the
+   // surface op, we have to create a 2d and 3d version, conditioned on pred2d.
+   TexInstruction *su2d = NULL;
+   if (pred2d) {
+      su2d = cloneForward(func, su)->asTex();
+      for (unsigned i = 0; su->defExists(i); ++i)
+         su2d->setDef(i, bld.getSSA());
+      su2d->moveSources(dim + 1, -1);
+      su2d->tex.target = nv50_ir::TEX_TARGET_2D;
+   }
+   if (pred2d && pred) {
+      Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,
+                                      bld.getSSA(1, FILE_PREDICATE),
+                                      pred->getDef(0), pred2d->getDef(0));
+      pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
+      pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);
+      su->setPredicate(CC_P, pred3d->getDef(0));
+      pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),
+                         pred->getDef(0), pred2d->getDef(0));
+      pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
+   } else if (pred) {
      su->setPredicate(CC_NOT_P, pred->getDef(0));
+   } else if (pred2d) {
+      su->setPredicate(CC_NOT_P, pred2d->getDef(0));
+   }
+   if (su2d) {
+      su2d->setPredicate(CC_P, pred2d->getDef(0));
+      bld.insert(su2d);
+
+      // Create a UNION so that RA assigns the same registers
+      bld.setPosition(su, true);
+      for (unsigned i = 0; su->defExists(i); ++i) {
+         assert(i < 4);
+
+         ValueDef &def = su->def(i);
+         ValueDef &def2 = su2d->def(i);
+         Instruction *mov = NULL;
+
+         if (pred) {
+            mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+            mov->setPredicate(CC_P, pred->getDef(0));
+         }
+
+         Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
+                                      bld.getSSA(),
+                                      NULL, def2.get());
+         def.replace(uni->getDef(0), false);
+         uni->setSrc(0, def.get());
+         if (mov)
+            uni->setSrc(2, mov->getDef(0));
+      }
+   } else if (pred) {
+      // Create a UNION so that RA assigns the same registers
+      bld.setPosition(su, true);
+      for (unsigned i = 0; su->defExists(i); ++i) {
+         assert(i < 4);
+
+         ValueDef &def = su->def(i);
+
+         Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+         mov->setPredicate(CC_P, pred->getDef(0));
+
+         Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
+                                      bld.getSSA(),
+                                      NULL, mov->getDef(0));
+         def.replace(uni->getDef(0), false);
+         uni->setSrc(0, def.get());
+      }
+   }
+
+   return su2d;
 }

 void
 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
 {
-   processSurfaceCoordsGM107(su);
+   // processSurfaceCoords also takes care of fixing up the outputs and
+   // union'ing them with 0 as necessary. Additionally it may create a second
+   // surface which needs some of the similar fixups.
+
+   Instruction *loaded[4] = {};
+   TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);

   if (su->op == OP_SULDP) {
-      convertSurfaceFormat(su);
-      insertOOBSurfaceOpResult(su);
+      convertSurfaceFormat(su, loaded);
   }

   if (su->op == OP_SUREDP) {
-      Value *def = su->getDef(0);
-
      su->op = OP_SUREDB;
-
-      // There may not be a predicate in the bindless case.
-      if (su->getPredicate()) {
-         su->setDef(0, bld.getSSA());
-
-         bld.setPosition(su, true);
-
-         // make sure to initialize dst value when the atomic operation is not
-         // performed
-         Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
-
-         assert(su->cc == CC_NOT_P);
-         mov->setPredicate(CC_P, su->getPredicate());
-
-         bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
   }
+
+   // If we fixed up the type of the regular surface load instruction, we also
+   // have to fix up the copy.
+   if (su2) {
+      su2->op = su->op;
+      su2->dType = su->dType;
+      su2->sType = su->sType;
   }
 }

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -171,10 +171,10 @@ private:
   Value *loadMsInfo32(Value *ptr, uint32_t off);

   void adjustCoordinatesMS(TexInstruction *);
-   void processSurfaceCoordsGM107(TexInstruction *);
+   TexInstruction *processSurfaceCoordsGM107(TexInstruction *, Instruction *[4]);
   void processSurfaceCoordsNVE4(TexInstruction *);
   void processSurfaceCoordsNVC0(TexInstruction *);
-   void convertSurfaceFormat(TexInstruction *);
+   void convertSurfaceFormat(TexInstruction *, Instruction **);
   void insertOOBSurfaceOpResult(TexInstruction *);
   Value *calculateSampleOffset(Value *sampleID);

--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -1433,7 +1433,15 @@ gm107_create_image_handle(struct pipe_context *pipe,

   nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

-   return 0x100000000ULL | tic->id;
+   // Compute handle. This will include the TIC as well as some additional
+   // info regarding the bound 3d surface layer, if applicable.
+   uint64_t handle = 0x100000000ULL | tic->id;
+   struct nv04_resource *res = nv04_resource(view->resource);
+   if (res->base.target == PIPE_TEXTURE_3D) {
+      handle |= 1 << 11;
+      handle |= view->u.tex.first_layer << (11 + 16);
+   }
+   return handle;

 fail:
   FREE(tic);
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -903,6 +903,10 @@ static void si_disk_cache_create(struct si_screen *sscreen)
 	/* These flags affect shader compilation. */
 	#define ALL_FLAGS (DBG(SI_SCHED) | DBG(GISEL))
 	uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
+	/* Reserve left-most bit for tgsi/nir selector */
+	assert(!(shader_debug_flags & (1u << 31)));
+	shader_debug_flags |= (uint32_t)
+		((sscreen->options.enable_nir & 0x1) << 31);

 	/* Add the high bits of 32-bit addresses, which affects
 	 * how 32-bit addresses are expanded to 64 bits.
@@ -1026,6 +1030,13 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
 		return NULL;
 	}

+	{
+#define OPT_BOOL(name, dflt, description) \
+		sscreen->options.name = \
+			driQueryOptionb(config->options, "radeonsi_"#name);
+#include "si_debug_options.h"
+	}
+
 	si_disk_cache_create(sscreen);

 	/* Determine the number of shader compiler threads. */
@@ -1146,13 +1157,6 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
 	sscreen->commutative_blend_add =
 		driQueryOptionb(config->options, "radeonsi_commutative_blend_add");

-	{
-#define OPT_BOOL(name, dflt, description) \
-		sscreen->options.name = \
-			driQueryOptionb(config->options, "radeonsi_"#name);
-#include "si_debug_options.h"
-	}
-
 	sscreen->use_ngg = sscreen->info.chip_class >= GFX10 &&
 			   sscreen->info.family != CHIP_NAVI14 &&
 			   !(sscreen->debug_flags & DBG(NO_NGG));
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -1231,6 +1231,14 @@ swr_update_derived(struct pipe_context *pipe,
         util_viewport_zmin_zmax(state, rasterizer->clip_halfz,
                                 &vp->minZ, &vp->maxZ);

+         if (rasterizer->depth_clip_near) {
+            vp->minZ = 0.0f;
+         }
+
+         if (rasterizer->depth_clip_far) {
+            vp->maxZ = 1.0f;
+         }
+
         vpm->m00[i] = state->scale[0];
         vpm->m11[i] = state->scale[1];
         vpm->m22[i] = state->scale[2];
--- a/src/gallium/drivers/zink/zink_context.c
+++ b/src/gallium/drivers/zink/zink_context.c
@@ -488,9 +488,10 @@ get_render_pass(struct zink_context *ctx)
   struct zink_render_pass_state state;

   for (int i = 0; i < fb->nr_cbufs; i++) {
-      struct zink_resource *cbuf = zink_resource(fb->cbufs[i]->texture);
-      state.rts[i].format = cbuf->format;
-      state.rts[i].samples = cbuf->base.nr_samples > 0 ? cbuf->base.nr_samples : VK_SAMPLE_COUNT_1_BIT;
+      struct pipe_resource *res = fb->cbufs[i]->texture;
+      state.rts[i].format = zink_get_format(screen, fb->cbufs[i]->format);
+      state.rts[i].samples = res->nr_samples > 0 ? res->nr_samples :
+                                                   VK_SAMPLE_COUNT_1_BIT;
   }
   state.num_cbufs = fb->nr_cbufs;

@@ -993,6 +994,25 @@ get_gfx_program(struct zink_context *ctx)
   return ctx->curr_program;
 }

+static bool
+line_width_needed(enum pipe_prim_type reduced_prim,
+                  VkPolygonMode polygon_mode)
+{ /* Tells zink_draw_vbo whether to run its line-width handling for this draw. */
+   switch (reduced_prim) {
+   case PIPE_PRIM_POINTS:
+      return false;
+   /* Line primitives always need a line width. */
+   case PIPE_PRIM_LINES:
+      return true;
+   /* Triangles need one only when the polygon mode is VK_POLYGON_MODE_LINE. */
+   case PIPE_PRIM_TRIANGLES:
+      return polygon_mode == VK_POLYGON_MODE_LINE;
+   /* Any other value is treated as unreachable. */
+   default:
+      unreachable("unexpected reduced prim");
+   }
+}
+
 static void
 zink_draw_vbo(struct pipe_context *pctx,
              const struct pipe_draw_info *dinfo)
@@ -1156,7 +1176,7 @@ zink_draw_vbo(struct pipe_context *pctx,
      vkCmdSetScissor(batch->cmdbuf, 0, 1, &fb_scissor);
   }

-   if (reduced_prim == PIPE_PRIM_LINES) {
+   if (line_width_needed(reduced_prim, rast_state->hw_state.polygon_mode)) {
      if (screen->feats.wideLines || ctx->line_width == 1.0f)
         vkCmdSetLineWidth(batch->cmdbuf, ctx->line_width);
      else
@@ -1294,6 +1314,10 @@ blit_native(struct zink_context *ctx, const struct pipe_blit_info *info)
   zink_batch_reference_resoure(batch, src);
   zink_batch_reference_resoure(batch, dst);

+   if (src->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL)
+      zink_resource_barrier(batch->cmdbuf, src, src->aspect,
+                            VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
   if (dst->layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
      zink_resource_barrier(batch->cmdbuf, dst, dst->aspect,
                            VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
--- a/src/gallium/drivers/zink/zink_resource.c
+++ b/src/gallium/drivers/zink/zink_resource.c
@@ -137,6 +137,7 @@ resource_create(struct pipe_screen *pscreen,

      VkImageCreateInfo ici = {};
      ici.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
+      ici.flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;

      switch (templ->target) {
      case PIPE_TEXTURE_1D:
@@ -146,7 +147,7 @@ resource_create(struct pipe_screen *pscreen,

      case PIPE_TEXTURE_CUBE:
      case PIPE_TEXTURE_CUBE_ARRAY:
-         ici.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
+         ici.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
         /* fall-through */
      case PIPE_TEXTURE_2D:
      case PIPE_TEXTURE_2D_ARRAY:
@@ -157,7 +158,7 @@ resource_create(struct pipe_screen *pscreen,
      case PIPE_TEXTURE_3D:
         ici.imageType = VK_IMAGE_TYPE_3D;
         if (templ->bind & PIPE_BIND_RENDER_TARGET)
-            ici.flags = VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
+            ici.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
         break;

      case PIPE_BUFFER:
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -125,6 +125,8 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
      return 1;

   case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
+      return 0; /* TODO: re-enable after implementing nir_texop_txd */
+
   case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
   case PIPE_CAP_VERTEX_SHADER_SATURATE:
      return 1;
@@ -284,7 +286,7 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
      return 0;

   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
-      return 1;
+      return 0;

   case PIPE_CAP_NIR_COMPACT_ARRAYS:
      return 1;
@@ -549,7 +551,7 @@ static const VkFormat formats[PIPE_FORMAT_COUNT] = {
   [PIPE_FORMAT_Z32_FLOAT] = VK_FORMAT_D32_SFLOAT,
   [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = VK_FORMAT_D32_SFLOAT_S8_UINT,
   [PIPE_FORMAT_Z16_UNORM] = VK_FORMAT_D16_UNORM,
-   [PIPE_FORMAT_X8Z24_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32,
+   [PIPE_FORMAT_Z24X8_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32,
   [PIPE_FORMAT_Z24_UNORM_S8_UINT] = VK_FORMAT_D24_UNORM_S8_UINT,

   // compressed formats
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -940,7 +940,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
      whandles[i].stride = (unsigned)strides[index];
      whandles[i].offset = (unsigned)offsets[index];
      whandles[i].modifier = modifier;
-      whandles[i].plane = i;
+      whandles[i].plane = index;
   }

   img = dri2_create_image_from_winsys(_screen, width, height, use, map,
--- a/src/gallium/targets/graw-gdi/meson.build
+++ b/src/gallium/targets/graw-gdi/meson.build
@@ -32,6 +32,7 @@ libgraw_gdi = shared_library(
  dependencies : [
    dep_ws2_32, idep_mesautil, driver_swrast,
  ],
+  name_prefix : host_machine.system() == 'windows' ? '' : 'lib',  # otherwise mingw will create libgraw.dll
 )

 libgraw = libgraw_gdi
--- a/src/gallium/targets/graw-null/meson.build
+++ b/src/gallium/targets/graw-null/meson.build
@@ -32,6 +32,7 @@ libgraw_null = shared_library(
  include_directories : inc_common,
  link_with : libgallium,
  dependencies : idep_mesautil,
+  name_prefix : host_machine.system() == 'windows' ? '' : 'lib',  # otherwise mingw will create libgraw_null.dll
 )

 libgraw = libgraw_null
--- a/src/gallium/targets/osmesa/meson.build
+++ b/src/gallium/targets/osmesa/meson.build
@@ -58,6 +58,7 @@ libosmesa = shared_library(
    dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind,
    driver_swrast, driver_swr,
  ],
+  name_prefix : host_machine.system() == 'windows' ? '' : 'lib',  # otherwise mingw will create libosmesa.dll
  soversion : host_machine.system() == 'windows' ? '' : '8',
  version : '8.0.0',
  install : true,
--- a/src/gallium/targets/pipe-loader/meson.build
+++ b/src/gallium/targets/pipe-loader/meson.build
@@ -47,6 +47,15 @@ endif

 pipe_loader_install_dir = join_paths(get_option('libdir'), 'gallium-pipe')

+_kmsro_targets = [
+   driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
+   driver_panfrost, driver_lima,
+]
+
+if with_gallium_v3d
+   _kmsro_targets += [idep_xmlconfig, dep_expat]
+endif
+
 pipe_loaders = [
  [with_gallium_i915, 'i915', driver_i915, []],
  [with_gallium_nouveau, 'nouveau', driver_nouveau, []],
@@ -54,7 +63,7 @@ pipe_loaders = [
  [with_gallium_r600, 'r600', driver_r600, []],
  [with_gallium_radeonsi, 'radeonsi', [driver_radeonsi, idep_xmlconfig], []],
  [with_gallium_freedreno, 'msm', driver_freedreno, []],
-  [with_gallium_panfrost, 'kmsro', [driver_kmsro, driver_panfrost], []],
+  [with_gallium_kmsro, 'kmsro', _kmsro_targets, []],
  [with_gallium_svga, 'vmwgfx', driver_svga, []],
  [with_gallium_softpipe, 'swrast', [driver_swrast, driver_swr], [libwsw, libws_null]],
 ]
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -736,8 +736,7 @@ namespace brw {
      src_reg
      fix_byte_src(const src_reg &src) const
      {
-         if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
-             type_sz(src.type) != 1)
+         if (shader->devinfo->gen < 11 || type_sz(src.type) != 1)
            return src;

         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1505,8 +1505,15 @@ generate_code(struct brw_codegen *p,
   bool debug_flag = INTEL_DEBUG &
      intel_debug_flag_for_shader_stage(nir->info.stage);
   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
+
+   /* `send_count` explicitly does not include spills or fills, as we'd
+    * like to use it as a metric for intentional memory access or other
+    * shared function use.  Otherwise, subtle changes to scheduling or
+    * register allocation could cause it to fluctuate wildly - and that
+    * effect is already counted in spill/fill counts.
+    */
   int spill_count = 0, fill_count = 0;
-   int loop_count = 0;
+   int loop_count = 0, send_count = 0;

   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
      struct brw_reg src[3], dst;
@@ -1746,6 +1753,7 @@ generate_code(struct brw_codegen *p,
            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
         } else {
            generate_math1_gen4(p, inst, dst, src[0]);
+            send_count++;
         }
         break;

@@ -1759,6 +1767,7 @@ generate_code(struct brw_codegen *p,
            generate_math_gen6(p, inst, dst, src[0], src[1]);
         } else {
            generate_math2_gen4(p, inst, dst, src[0], src[1]);
+            send_count++;
         }
         break;

@@ -1775,14 +1784,17 @@ generate_code(struct brw_codegen *p,
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(p, prog_data, nir->info.stage,
                      inst, dst, src[0], src[1], src[2]);
+         send_count++;
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
         break;

      case VS_OPCODE_URB_WRITE:
         generate_vs_urb_write(p, inst);
+         send_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
@@ -1797,10 +1809,12 @@ generate_code(struct brw_codegen *p,

      case VS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
         break;

      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -1809,14 +1823,17 @@ generate_code(struct brw_codegen *p,

      case GS_OPCODE_URB_WRITE:
         generate_gs_urb_write(p, inst);
+         send_count++;
         break;

      case GS_OPCODE_URB_WRITE_ALLOCATE:
         generate_gs_urb_write_allocate(p, inst);
+         send_count++;
         break;

      case GS_OPCODE_SVB_WRITE:
         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
         break;

      case GS_OPCODE_SVB_SET_DST_INDEX:
@@ -1825,6 +1842,7 @@ generate_code(struct brw_codegen *p,

      case GS_OPCODE_THREAD_END:
         generate_gs_thread_end(p, inst);
+         send_count++;
         break;

      case GS_OPCODE_SET_WRITE_OFFSET:
@@ -1837,6 +1855,7 @@ generate_code(struct brw_codegen *p,

      case GS_OPCODE_FF_SYNC:
         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+         send_count++;
         break;

      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
@@ -1866,12 +1885,14 @@ generate_code(struct brw_codegen *p,
      case SHADER_OPCODE_SHADER_TIME_ADD:
         brw_shader_time_add(p, src[0],
                             prog_data->base.binding_table.shader_time_start);
+         send_count++;
         break;

      case VEC4_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                            !inst->dst.is_null(), inst->header_size);
+         send_count++;
         break;

      case VEC4_OPCODE_UNTYPED_SURFACE_READ:
@@ -1879,16 +1900,19 @@ generate_code(struct brw_codegen *p,
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                  src[2].ud);
+         send_count++;
         break;

      case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
                                   src[2].ud, inst->header_size);
+         send_count++;
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0);
+         send_count++;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
@@ -2068,10 +2092,12 @@ generate_code(struct brw_codegen *p,

      case TCS_OPCODE_URB_WRITE:
         generate_tcs_urb_write(p, inst, src[0]);
+         send_count++;
         break;

      case VEC4_OPCODE_URB_READ:
         generate_vec4_urb_read(p, inst, dst, src[0]);
+         send_count++;
         break;

      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
@@ -2113,15 +2139,18 @@ generate_code(struct brw_codegen *p,

      case TCS_OPCODE_RELEASE_INPUT:
         generate_tcs_release_input(p, dst, src[0], src[1]);
+         send_count++;
         break;

      case TCS_OPCODE_THREAD_END:
         generate_tcs_thread_end(p, inst);
+         send_count++;
         break;

      case SHADER_OPCODE_BARRIER:
         brw_barrier(p, src[0]);
         brw_WAIT(p);
+         send_count++;
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
@@ -2188,9 +2217,9 @@ generate_code(struct brw_codegen *p,
            sha1buf);

      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
-                     "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
+                     "spills:fills, %d sends. Compacted %d to %d bytes (%.0f%%)\n",
            stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
-            spill_count, fill_count, before_size, after_size,
+            spill_count, fill_count, send_count, before_size, after_size,
            100.0f * (before_size - after_size) / before_size);

      /* overriding the shader makes disasm_info invalid */
@@ -2205,10 +2234,11 @@ generate_code(struct brw_codegen *p,

   compiler->shader_debug_log(log_data,
                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
-                              "%d:%d spills:fills, compacted %d to %d bytes.",
+                              "%d:%d spills:fills, %d sends, "
+                              "compacted %d to %d bytes.",
                              stage_abbrev, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
-                              fill_count, before_size, after_size);
+                              fill_count, send_count, before_size, after_size);
   if (stats) {
      stats->dispatch_width = 0;
      stats->instructions = before_size / 16;
--- a/src/intel/dev/gen_device_info.c
+++ b/src/intel/dev/gen_device_info.c
@@ -1043,7 +1043,8 @@ static const struct gen_device_info gen_device_info_ehl_2x4 = {
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3,           \
   .simulator_id = 22,                                          \
   .urb.size = (_gt) == 1 ? 512 : 1024,                         \
-   .num_subslices = _dual_subslices
+   .num_subslices = _dual_subslices,                            \
+   .num_eu_per_subslice = 16

 #define dual_subslices(args...) { args, }

--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -532,9 +532,11 @@ anv_block_pool_expand_range(struct anv_block_pool *pool,
   if (use_softpin) {
      gem_handle = anv_gem_create(pool->device, newbo_size);
      map = anv_gem_mmap(pool->device, gem_handle, 0, newbo_size, 0);
-      if (map == MAP_FAILED)
+      if (map == MAP_FAILED) {
+         anv_gem_close(pool->device, gem_handle);
         return vk_errorf(pool->device->instance, pool->device,
                          VK_ERROR_MEMORY_MAP_FAILED, "gem mmap failed: %m");
+      }
      assert(center_bo_offset == 0);
   } else {
      /* Just leak the old map until we destroy the pool.  We can't munmap it
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -32,7 +32,6 @@
 #include "drm-uapi/drm_fourcc.h"

 #include "anv_private.h"
-#include "util/strtod.h"
 #include "util/debug.h"
 #include "util/build_id.h"
 #include "util/disk_cache.h"
@@ -792,7 +791,6 @@ VkResult anv_CreateInstance(
   instance->pipeline_cache_enabled =
      env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);

-   _mesa_locale_init();
   glsl_type_singleton_init_or_ref();

   VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -831,7 +829,6 @@ void anv_DestroyInstance(
   vk_debug_report_instance_destroy(&instance->debug_report_callbacks);

   glsl_type_singleton_decref();
-   _mesa_locale_fini();

   driDestroyOptionCache(&instance->dri_options);
   driDestroyOptionInfo(&instance->available_dri_options);
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -2216,12 +2216,15 @@ compute_pipeline_create(

   pipeline->blend_state.map = NULL;

-   result = anv_reloc_list_init(&pipeline->batch_relocs,
-                                pAllocator ? pAllocator : &device->alloc);
+   const VkAllocationCallbacks *alloc =
+      pAllocator ? pAllocator : &device->alloc;
+
+   result = anv_reloc_list_init(&pipeline->batch_relocs, alloc);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pipeline);
      return result;
   }
+   pipeline->batch.alloc = alloc;
   pipeline->batch.next = pipeline->batch.start = pipeline->batch_data;
   pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data);
   pipeline->batch.relocs = &pipeline->batch_relocs;
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -94,12 +94,7 @@ VkResult genX(CreateQueryPool)(
      uint64s_per_slot += 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
-      uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
-      uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
-      uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
-      uint64s_per_slot++; /* 64bit marker */
-      uint64s_per_slot++; /* availability */
-      uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
+      uint64s_per_slot = 72; /* 576 bytes, see layout below */
      break;
   }
   default:
@@ -179,54 +174,51 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
 }

 /**
- * VK_INTEL_performance_query layout:
+ * VK_INTEL_performance_query layout (576 bytes) :
 *
 * ------------------------------
- * |       end MI_RPC (256b)    |
+ * |       availability (8b)    |
 * |----------------------------|
- * |     begin MI_RPC (256b)    |
- * |----------------------------|
- * | begin perfcntr 1 & 2 (16b) |
- * |----------------------------|
- * |  end perfcntr 1 & 2 (16b)  |
+ * |         marker (8b)        |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * |  end RPSTAT register (4b)  |
 * |----------------------------|
- * |         marker (8b)        |
+ * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
- * |       availability (8b)    |
+ * |  end perfcntr 1 & 2 (16b)  |
+ * |----------------------------|
+ * |          Unused (8b)       |
+ * |----------------------------|
+ * |     begin MI_RPC (256b)    |
+ * |----------------------------|
+ * |       end MI_RPC (256b)    |
 * ------------------------------
 */

 static uint32_t
-intel_perf_mi_rpc_offset(bool end)
+intel_perf_marker_offset(void)
 {
-   return end ? 0 : 256;
-}
-
-static uint32_t
-intel_perf_counter(bool end)
-{
-   uint32_t offset = 512;
-   offset += end ? 2 * sizeof(uint64_t) : 0;
-   return offset;
+   return 8;
 }

 static uint32_t
 intel_perf_rpstart_offset(bool end)
 {
-   uint32_t offset = intel_perf_counter(false) +
-      4 * sizeof(uint64_t);
-   offset += end ? sizeof(uint32_t) : 0;
-   return offset;
+   return 16 + (end ? sizeof(uint32_t) : 0);
 }

 static uint32_t
-intel_perf_marker_offset(void)
+intel_perf_counter(bool end)
 {
-   return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
+   return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
+}
+
+static uint32_t
+intel_perf_mi_rpc_offset(bool end)
+{
+   return 64 + (end ? 256 : 0);
 }

 static void
@@ -251,10 +243,6 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
 static bool
 query_is_available(struct anv_query_pool *pool, uint32_t query)
 {
-   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
-      return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
-                                    pool->stride - 8);
-   } else
   return *(volatile uint64_t *)query_slot(pool, query);
 }

--- a/src/intel/vulkan/tests/block_pool_grow_first.c
+++ b/src/intel/vulkan/tests/block_pool_grow_first.c
@@ -27,7 +27,11 @@

 int main(int argc, char **argv)
 {
-   struct anv_instance instance;
+   struct anv_instance instance = {
+      .physicalDevice = {
+         .use_softpin = true,
+      },
+   };
   struct anv_device device = {
      .instance = &instance,
   };
--- a/src/intel/vulkan/tests/block_pool_no_free.c
+++ b/src/intel/vulkan/tests/block_pool_no_free.c
@@ -111,7 +111,7 @@ static void validate_monotonic(int32_t **blocks)

 static void run_test()
 {
-   struct anv_instance instance;
+   struct anv_instance instance = { };
   struct anv_device device = {
      .instance = &instance,
   };
--- a/src/intel/vulkan/tests/state_pool.c
+++ b/src/intel/vulkan/tests/state_pool.c
@@ -36,7 +36,7 @@

 int main(int argc, char **argv)
 {
-   struct anv_instance instance;
+   struct anv_instance instance = { };
   struct anv_device device = {
      .instance = &instance,
   };
--- a/src/intel/vulkan/tests/state_pool_free_list_only.c
+++ b/src/intel/vulkan/tests/state_pool_free_list_only.c
@@ -35,7 +35,7 @@

 int main(int argc, char **argv)
 {
-   struct anv_instance instance;
+   struct anv_instance instance = { };
   struct anv_device device = {
      .instance = &instance,
   };
--- a/src/intel/vulkan/tests/state_pool_no_free.c
+++ b/src/intel/vulkan/tests/state_pool_no_free.c
@@ -56,7 +56,7 @@ static void *alloc_states(void *_job)

 static void run_test()
 {
-   struct anv_instance instance;
+   struct anv_instance instance = { };
   struct anv_device device = {
      .instance = &instance,
   };
--- a/src/intel/vulkan/tests/state_pool_padding.c
+++ b/src/intel/vulkan/tests/state_pool_padding.c
@@ -27,7 +27,11 @@

 int main(int argc, char **argv)
 {
-   struct anv_instance instance;
+   struct anv_instance instance = {
+      .physicalDevice = {
+         .use_softpin = true,
+      },
+   };
   struct anv_device device = {
      .instance = &instance,
   };
--- a/src/mesa/drivers/osmesa/meson.build
+++ b/src/mesa/drivers/osmesa/meson.build
@@ -36,6 +36,8 @@ libosmesa = shared_library(
  link_whole : libglapi_static,
  link_with : [libmesa_classic, osmesa_link_with],
  dependencies : [dep_thread, dep_selinux],
+  name_prefix : host_machine.system() == 'windows' ? '' : 'lib',  # otherwise mingw will create libosmesa.dll
+  soversion : host_machine.system() == 'windows' ? '' : '8',
  version : '8.0.0',
  install : true,
 )
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -350,6 +350,12 @@ clear_bufferiv(struct gl_context *ctx, GLenum buffer, GLint drawbuffer,
      _mesa_update_state( ctx );
   }

+   if (!no_error && ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+      _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
+                  "glClearBufferiv(incomplete framebuffer)");
+      return;
+   }
+
   switch (buffer) {
   case GL_STENCIL:
      /* Page 264 (page 280 of the PDF) of the OpenGL 3.0 spec says:
@@ -686,6 +692,12 @@ clear_bufferfi(struct gl_context *ctx, GLenum buffer, GLint drawbuffer,
                     drawbuffer);
         return;
      }
+
+      if (ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+         _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
+                     "glClearBufferfi(incomplete framebuffer)");
+         return;
+      }
   }

   if (ctx->RasterDiscard)
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -325,6 +325,7 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
   cso_set_stream_outputs(cso, 0, NULL, NULL);
   cso_set_sample_mask(cso, ~0);
   cso_set_min_samples(cso, 1);
+   st->clear.raster.multisample = st->state.fb_num_samples > 1;
   cso_set_rasterizer(cso, &st->clear.raster);

   /* viewport state: viewport matching window dims */