Compare commits

...

50 Commits

Author SHA1 Message Date
Dylan Baker
3d9c678772 Bump version for -rc3 2019-11-13 11:17:01 -08:00
Dylan Baker
bb08c0f04d cherry-ignore: Update for 19.3-rc3 cycle 2019-11-11 11:44:26 -08:00
Eric Engestrom
3c1b3656b9 egl: fix _EGL_NATIVE_PLATFORM fallback
When the X11 or Haiku platforms were compiled in, they would bypass the
`_EGL_NATIVE_PLATFORM` fallback by always returning themselves instead.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Eric Engestrom <eric.engestrom@intel.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 86d3a346f1)
2019-11-11 11:44:26 -08:00
Erik Faye-Lund
05be725923 zink: correct depth-stencil format
When using packed vulkan-formats on little-endian systems, we need to
swap the components for the gallium formats. And since Zink isn't
big-endian safe yet, little-endian is the only endianness we care about
right now.

This fixes a bunch of piglit tests, among others:
- spec@arb_depth_texture@depth-level-clamp
- spec@arb_depth_texture@depthstencil-render-miplevels * d=z24
- spec@arb_depth_texture@fbo-depth-gl_depth_component24-blit
- spec@arb_depth_texture@fbo-depth-gl_depth_component24-copypixels
- spec@arb_depth_texture@fbo-depth-gl_depth_component24-drawpixels
- spec@arb_depth_texture@fbo-depth-gl_depth_component24-readpixels

Signed-off-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Fixes: 8d46e35d16 ("zink: introduce opengl over vulkan")
(cherry picked from commit b4d47e21d7)
2019-11-11 11:44:26 -08:00
Caio Marcelo de Oliveira Filho
8608c460d1 spirv: Don't leak GS initialization to other stages
The stage-specific fields of shader_info are in a union.  We've
likely been lucky that this value was either overwritten or ignored by
other stages.  The recent change in shader_info layout in commit
84a1a2578d ("compiler: pack shader_info from 160 bytes to 96 bytes")
made this issue visible.

Fixes: cf2257069c ("nir/spirv: Set a default number of invocations for geometry shaders")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 087ecd9ca5)
2019-11-11 11:44:26 -08:00
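To illustrate the hazard described above, here is a minimal C sketch (hypothetical field names, not Mesa's actual shader_info layout): writing one stage's union member silently clobbers the bytes another stage later reads.

    #include <stdio.h>

    struct info {
       union {
          struct { unsigned invocations; } gs;  /* geometry-only field */
          struct { unsigned depth_layout; } fs; /* fragment-only field */
       };
    };

    int main(void)
    {
       struct info i = {0};
       i.gs.invocations = 1;              /* a GS default written unconditionally... */
       printf("%u\n", i.fs.depth_layout); /* ...shows up as 1 when read as another stage */
       return 0;
    }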
Timur Kristóf
2f66f619c8 ac: Handle invalid GFX10 format correctly in ac_get_tbuffer_format.
It happens that some games try to access a vertex buffer without
a valid format. This case was incorrectly handled by
ac_get_tbuffer_format which made ACO emit an invalid instruction.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Cc: 19.3 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 911a826141)
2019-11-11 11:44:26 -08:00
Prodea Alexandru-Liviu
2bd7416823 Meson: Remove lib prefix from graw and osmesa when building with MinGW.
Also remove the version suffix from osmesa swrast on Windows.

v2: Make sure we don't remove lib prefix on *nix platforms.

Signed-off-by: Prodea Alexandru-Liviu <liviuprodea@yahoo.com>
Reviewed-by: Eric Engestrom <eric.engestrom@intel.com>

Cc: "19.3" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 1a05811936)
2019-11-11 11:44:26 -08:00
Daniel Schürmann
0aef18324a aco: workaround Tonga/Iceland hardware bug
The workaround got accidentally moved to the wrong place

Fixes: 08d510010b aco: increase accuracy of SGPR limits

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit a47e232ccd)
2019-11-11 11:44:26 -08:00
Lepton Wu
3211308e88 gallium: dri2: Use index as plane number.
This fixes wrong colors when playing video under an Android + virgl
configuration.

Fixes: 2decad495f ("gallium/dri2: Support images with multiple planes for modifiers")
Reviewed-by: Eric Engestrom <eric.engestrom@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Lepton Wu <lepton@chromium.org>
(cherry picked from commit 5a40e153fd)
2019-11-11 11:44:26 -08:00
Erik Faye-Lund
f7d76ad80e zink: disable fragment-shader texture-lod
We don't support nir_texop_txd, which is required by this cap. So let's
disable it for now.

Signed-off-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Fixes: 8d46e35d16 ("zink: introduce opengl over vulkan")
(cherry picked from commit b385ad0c75)
2019-11-11 11:44:26 -08:00
Duncan Hopkins
52005416a0 zink: make sure src image is transfer-src-optimal
Fixes: d2bb63c8d4 ("zink: Use optimal layout instead of general. Reduces valid layer warnings. Fixes RADV image noise.")
(cherry picked from commit aa64b6dc7f)
2019-11-11 11:44:26 -08:00
Erik Faye-Lund
0b670a919c zink: do not advertise coherent mapping
We do not support them yet, so let's not pretend.

Signed-off-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Fixes: 8d46e35d16 ("zink: introduce opengl over vulkan")
(cherry picked from commit a32a92f53a)
2019-11-11 11:44:26 -08:00
Erik Faye-Lund
b90f5a9ea0 zink: always allow mutating the format
There's no good way to know if a texture-view will be created, so we
just have to accept it for all resources.

Signed-off-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Fixes: 8d46e35d16 ("zink: introduce opengl over vulkan")
(cherry picked from commit ca87a53b46)
2019-11-11 11:44:26 -08:00
Erik Faye-Lund
2ea5038045 zink: use actual format for render-pass
We should use the format derived from the image-view here, not from the
image itself. Otherwise, we'll end up with incompatible render-passes.

Signed-off-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Fixes: 8d46e35d16 ("zink: introduce opengl over vulkan")
(cherry picked from commit f3a72fd61c)
2019-11-11 11:44:26 -08:00
Kristian H. Kristensen
5ca2bb392f freedreno/a6xx: Disable geometry shaders for release
Signed-off-by: Kristian H. Kristensen <hoegsberg@google.com>
Acked-by: Rob Clark <robdclark@gmail.com>
2019-11-11 11:44:26 -08:00
Pierre-Eric Pelloux-Prayer
01f6321c09 radeonsi: fix shader disk cache key
Use unsigned values, otherwise sign extension will produce a 64-bit value
where the 32 left-most bits are 1.

Fixes: 307e5cc8fd ("radeonsi: tell the shader disk cache what IR is used")
2019-11-11 11:44:26 -08:00
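The failure mode is plain C sign extension; a self-contained demo (not radeonsi's actual code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       int32_t  flags = (int32_t)0x80000000;       /* bit 31 set, e.g. an IR flag */
       uint64_t bad   = (uint64_t)flags;           /* converts via the signed value: sign-extends */
       uint64_t good  = (uint64_t)(uint32_t)flags; /* zero-extends */
       printf("%016llx\n", (unsigned long long)bad);  /* ffffffff80000000 */
       printf("%016llx\n", (unsigned long long)good); /* 0000000080000000 */
       return 0;
    }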
Dylan Baker
15342abc5b Bump VERSION to 19.3.0-rc2 2019-11-06 09:08:45 -08:00
Dylan Baker
08501e77af meson: Add dep_glvnd to egl deps when building with glvnd
Otherwise, if glvnd is not installed systemwide but only in a prefix, its
headers won't be found. Compilation previously happened to work because
when the headers are in /usr/include/, another dependency provides the
necessary -I arguments.

Fixes: 035ec7a2bb
       ("meson: Add support for EGL glvnd")
Acked-by: Eric Engestrom <eric@engestrom.ch>
(cherry picked from commit 5d085ad052)
2019-11-05 09:17:38 -08:00
Paulo Zanoni
49af89a0b9 intel/compiler: remove the operand restriction for src1 on GLK
Commit 5847de6e9a implemented a restriction that applies to ICL, but
wrongly marked it as also applying to GLK. Reviewers of MR !1125
pointed this out, and the commit history shows removal of GLK from
parts of the patch, but it turns out there was still a left-over GLK
check in the code.

This code was breaking some of the i8vec2 tests on GLK, for example:
  dEQP-VK.subgroups.arithmetic.compute.subgroupadd_i8vec2

Removing the GLK check solves the issue for GLK. I don't see a reason
why implementing this restriction would actually break GLK, so there's
still more to investigate here, since this bug may be affecting ICL+,
but let's apply the real GLK fix while we analyze and discuss the
other possible issues.

Fixes: 5847de6e9a ("intel/compiler: don't use byte operands for src1
on ICL")
BSpec: 3017
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
(cherry picked from commit b57383a944)
2019-11-05 09:17:38 -08:00
Daniel Schürmann
dd4b73ad38 aco: fix accidental reordering of instructions when scheduling
Fixes: 8678699918 "aco: implement VGPR spilling"

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
(cherry picked from commit efe737fc4f)
2019-11-05 09:17:38 -08:00
Daniel Schürmann
a8faeff399 aco: only use single-dword loads/stores for spilling
Fixes: 8678699918 "aco: implement VGPR spilling"

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
(cherry picked from commit 5c7dcb15e0)
2019-11-05 09:17:38 -08:00
Daniel Schürmann
0d846243a0 aco: fix immediate offset for spills if scratch is used
Fixes: 8678699918 "aco: implement VGPR spilling"

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
(cherry picked from commit d97c0bdd55)
2019-11-05 09:17:38 -08:00
Lionel Landwerlin
bc5357bf33 anv: Properly handle host query reset of performance queries
The host query reset entry point didn't use the availability offset
for performance queries.

To fix this, reorder the availability of performance queries to match
other queries.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 2b5f30b1d9 ("anv: implement VK_INTEL_performance_query")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit ee6fbb95a7)
2019-11-05 09:17:38 -08:00
Kenneth Graunke
5cee7ad873 iris: Fix "Force Zero RTA Index Enable" setting again
In 2ca0d913ea, we began updating cso_fb->layers to the actual layer
count, rather than 0.  This fixed cases where we were setting "Force
Zero RTA Index Enable" even when doing layered rendering.  Sadly, it
also broke the check entirely: cso_fb->layers is now 1 for non-layered
cases, but the Force Zero RTA Index check was still comparing for 0.

Fixes: 2ca0d913ea ("iris: Fix framebuffer layer count")
(cherry picked from commit fc7b748086)
2019-11-05 09:17:38 -08:00
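Spelled out as a sketch (not the literal iris code): the old check could only be true when layers was 0, which no longer occurs after the earlier fix, so the plausible repair is to treat a single layer as non-layered.

    /* old: never fires now that non-layered framebuffers report layers == 1 */
    sf.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
    /* fixed (illustrative): a single layer means non-layered rendering */
    sf.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;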
Dylan Baker
184d39301d nir: correct use of identity check in python
Python has the identity operator `is` and the equality operator `==`.
Using `is` with strings sometimes works in CPython because small strings
are interned, but it is not guaranteed to work.

Fixes: 96c4b135e3
       ("nir/algebraic: Don't put quotes around floating point literals")
Reviewed-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit 717606f9f3)
2019-11-05 09:17:38 -08:00
Samuel Pitoiset
9bca129bb4 radv: fix compute pipeline keys when optimizations are disabled
If an app first creates a compute pipeline with
VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT set, then re-creates it
without that flag, the driver should re-compile the compute shader.
Otherwise, it will return the unoptimized one.

Fixes: ce188813bf ("radv: add initial support for VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
(cherry picked from commit 9ab27647ff)
2019-11-05 09:17:38 -08:00
Lionel Landwerlin
6daaf66f66 mesa: check draw buffer completeness on glClearBufferfi/glClearBufferiv
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 88d665830f)
2019-11-05 09:17:38 -08:00
Bas Nieuwenhuizen
4d21f802b5 radv: Close all unnecessary fds in secure compile.
The seccomp filter allows read/write, so let us make sure nobody can
do anything with the other open fds.

Fixes: cff53da374 "radv: enable secure compile support"
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
(cherry picked from commit 8efb8f55a6)
2019-11-05 09:17:38 -08:00
Daniel Schürmann
090469173c docs/relnotes/new_features.txt: Add note about ACO
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
2019-11-01 15:29:35 +01:00
Jan Zielinski
59bc14186e gallium/swr: Fix depth values for blit scenario 2019-10-31 22:15:46 +00:00
Erik Faye-Lund
5032575b94 zink: emit line-width when using polygon line-mode
When switching this to dynamic state, I forgot that this also needs to
be emitted when we use a polygon-mode set to lines.

Signed-off-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Fixes: 6d30abb4f1 ("zink: use dynamic state for line-width")
(cherry picked from commit b7674829a1)
2019-10-31 15:13:51 -07:00
Alyssa Rosenzweig
b981ca4d7e pipe-loader: Build kmsro loader with all kmsro targets
Build failure reported by i965 CI, triggered by building dynamic
pipeloaders with kmsro drivers (besides 'frost). At this point, there's
no reason to actually do that -- mesa CI didn't mind -- but let's not
break the build.

v2: Simplify script. Add extra dependencies for v3d.

Fixes: afb0d08cb0 ("pipe-loader: Default to kmsro if probe fails")
Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reported-by: Clayton Craft <clayton.a.craft@intel.com>
Tested-by: Clayton Craft <clayton.a.craft@intel.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
(cherry picked from commit bf15318991)
2019-10-31 15:13:47 -07:00
Jason Ekstrand
3544a01121 anv: Set the batch allocator for compute pipelines
Otherwise relocations just up and crash.

Fixes: a3153162a9 "anv: Delay allocation of relocation lists"
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
(cherry picked from commit 9ef198c59a)
2019-10-31 15:13:43 -07:00
Jason Ekstrand
bb9d1ed2bd anv/tests: Zero-initialize instances
Some of the tests were actually relying on some of those uninitialized
bits being non-zero.  In particular, a couple of them want use_softpin = true.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
(cherry picked from commit 9076e9f375)
2019-10-31 15:13:35 -07:00
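The underlying C pitfall, as a minimal sketch (hypothetical struct, not the actual anv_instance):

    #include <stdbool.h>

    struct fake_instance { bool use_softpin; unsigned flags; };

    void test(void)
    {
       struct fake_instance a;       /* a.use_softpin is stack garbage */
       struct fake_instance b = {0}; /* b.use_softpin is false, deterministically */
       (void)a; (void)b;
    }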
Jason Ekstrand
5f8e0c715e anv: Fix a potential BO handle leak
Fixes: 731c4adcf9 "anv/allocator: Add support for non-userptr"
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
(cherry picked from commit bb257e1852)
2019-10-31 15:13:31 -07:00
Pierre-Eric Pelloux-Prayer
f0104d8fef mesa: enable msaa in clear_with_quad if needed
If the DrawBuffer sample count is > 1 and msaa is enabled, we must also
enable msaa when clearing it.

Fixes: ea5b7de138 ("radeonsi: make gl_SampleMaskIn = 0x1 when MSAA is disabled")
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/1991

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Tested-by: Witold Baryluk <witold.baryluk@gmail.com>
(cherry picked from commit 8a723282e3)
2019-10-31 15:13:26 -07:00
Bas Nieuwenhuizen
cb66ea7780 radv: Fix disk_cache_get size argument.
Got some int->pointer warnings, and 20 is not a valid pointer...

Fixes: 2e3a635ee6 "radv: Add an early exit in the secure compile if we already have the cache entries."
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
(cherry picked from commit 6ced684e27)
2019-10-31 15:13:22 -07:00
Bas Nieuwenhuizen
75886fafaa anv: Remove _mesa_locale_init/fini calls.
The resulting locale is not used for Vulkan, and it is not reference
counted, which causes issues when multiple instances are created.

CC: 19.2 19.3 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 3e86d553a4)
2019-10-31 15:13:13 -07:00
Bas Nieuwenhuizen
b3fd30921a turnip: Remove _mesa_locale_init/fini calls.
The resulting locale is not used for Vulkan, and it is not reference
counted, which causes issues when multiple instances are created.

CC: 19.2 19.3 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 72f858fc07)
2019-10-31 15:13:08 -07:00
Bas Nieuwenhuizen
ea886e49be radv: Remove _mesa_locale_init/fini calls.
The resulting locale is not used for Vulkan, and it is not reference
counted, which causes issues when multiple instances are created.

CC: 19.2 19.3 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit 344ba56b0f)
2019-10-31 15:13:05 -07:00
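For context, the failure mode of a non-reference-counted global locale, as an assumed sketch (not the actual _mesa_locale_* implementation):

    #include <locale.h>

    static locale_t mesa_locale; /* one global shared by every instance */

    void locale_init(void) { mesa_locale = newlocale(LC_CTYPE_MASK, "C", (locale_t)0); }
    void locale_fini(void) { freelocale(mesa_locale); }

    /* With two VkInstances:
     *   locale_init(); locale_init();  -- second call leaks the first locale
     *   locale_fini();                 -- frees the locale the other instance may still use
     *   locale_fini();                 -- double free
     * Vulkan never uses this locale, so the calls can simply be dropped. */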
Pierre-Eric Pelloux-Prayer
307e5cc8fd radeonsi: tell the shader disk cache what IR is used
Until 8bef4df196, the IR (TGSI or NIR) was part of the disk_cache driver_flags.
This commit restores that feature, to avoid crashing when switching from
one IR to the other.

As radeonsi's default is TGSI, I used "driver_flags & 0x8000000 = 0" for TGSI
to keep the same driver_flags.

Fixes: 8bef4df196 ("radeonsi: add si_debug_options for convenient adding/removing of options")

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
(cherry picked from commit 2afeed3010)
2019-10-31 15:13:00 -07:00
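A sketch of the scheme the message describes (the bit value is quoted from the message; the helper and option plumbing are assumptions):

    #include <stdbool.h>
    #include <stdint.h>

    #define SI_DISK_CACHE_NIR_BIT (1ull << 27) /* 0x8000000, as quoted above */

    static uint64_t si_disk_cache_driver_flags(bool use_nir)
    {
       /* TGSI is the default, so its bit stays 0 and existing TGSI cache
        * entries keep the same key; NIR entries hash differently. */
       return use_nir ? SI_DISK_CACHE_NIR_BIT : 0;
    }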
Mauro Rossi
0b8836cb23 android: aco: fix Lower to CSSA
Fixes the following build error:

external/mesa/src/amd/compiler/aco_spill.cpp:1768:
error: undefined reference to 'aco::lower_to_cssa(aco::Program*, aco::live&, radv_nir_compiler_options const*)'

Fixes: 0b8216b ("aco: Lower to CSSA")
Signed-off-by: Mauro Rossi <issor.oruam@gmail.com>
(cherry picked from commit d688e4166c)
2019-10-31 15:12:56 -07:00
Jordan Justen
39e9739a3b iris/gen11+: Move flush for render target change
When starting a BLORP operation, we do the BTI-change flush.  However,
when ending it and transitioning back to regular drawing, we change the
render target again - without a set_framebuffer_state() call.  We need
to do the BTI flush there too.  BLORP flags IRIS_DIRTY_RENDER_BUFFER
now, which will cause the next draw to get the BTI flush again.

(explanation of fix by Ken)

Fixes: 2b956a093a ("iris: totally untested icelake support")
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit bb0c5c487e)
2019-10-31 15:12:52 -07:00
Jordan Justen
de705da8a6 iris: Add IRIS_DIRTY_RENDER_BUFFER state flag
Fixes: 2b956a093a ("iris: totally untested icelake support")
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit a2c3c65a31)
2019-10-31 15:12:45 -07:00
Ian Romanick
640747a298 intel/compiler: Report the number of non-spill/fill SEND messages on vec4 too
This makes shader-db's report.py work on Haswell and earlier platforms.
The problem is that the script would detect the "sends" output for
scalar shaders and expect it in vec4 shaders too.  When it didn't find
it, the script would fail with:

    Traceback (most recent call last):
      File "./report.py", line 351, in <module>
        main()
      File "./report.py", line 182, in main
        before_count = before[p][m]
    KeyError: 'sends'

Fixes: f192741ddd ("intel/compiler: Report the number of non-spill/fill SEND messages")

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 7b3f38ef69)
2019-10-31 15:12:41 -07:00
Bas Nieuwenhuizen
9df4763440 radv: Fix timeout handling in syncobj wait.
libdrm returns -errno instead of the raw ioctl return of -1.

Fixes: 1c3cda7d27 "radv: Add syncobj signal/reset/wait to winsys."
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit ec770085c2)
2019-10-31 15:12:36 -07:00
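The pattern the fix implies, sketched under the assumption that the winsys calls drmSyncobjWait (error handling here is illustrative): a check for ret == -1 && errno == ETIME never sees the timeout, because libdrm hands back the negative errno itself.

    #include <errno.h>
    #include <stdint.h>
    #include <xf86drm.h>

    int wait_syncobjs(int fd, uint32_t *handles, unsigned count, int64_t abs_timeout_ns)
    {
       int ret = drmSyncobjWait(fd, handles, count, abs_timeout_ns, 0, NULL);
       if (ret == -ETIME) /* libdrm returns -errno, not -1 with errno set */
          return 1;       /* timed out */
       return ret < 0 ? -1 : 0;
    }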
Ilia Mirkin
2b1b7afb5c nv50/ir: mark STORE destination inputs as used
Observed an issue when looking at the code generated by the
image-vertex-attrib-input-output piglit test. Even though the test
itself worked fine (due to TIC 0 being used for the image), this needs
to be fixed.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit 1b9d1e13d8)
2019-10-31 15:12:32 -07:00
Lionel Landwerlin
084926926c intel/dev: set default num_eu_per_subslice on gen12
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 8125d7960b ("intel/dev: Add preliminary device info for Tigerlake")
Acked-by: Jason Ekstrand <jason@jlekstrand.net>
(cherry picked from commit e02c181bfd)
2019-10-31 15:12:27 -07:00
Ilia Mirkin
1beee9dd9f gm107/ir: fix loading z offset for layered 3d image bindings
Unfortunately we don't know if a particular load is a real 2d image (as
would be a cube face or 2d array element), or a layer of a 3d image.
Since we pass in the TIC reference, the instruction's type has to match
what's in the TIC (experimentally). In order to properly support
bindless images, this also can't be done by looking at the current
bindings and generating appropriate code.

As a result all plain 2d loads are converted into a pair of 2d/3d loads,
with appropriate predicates to ensure only one of those actually
executes, and the values are all merged in.

This goes somewhat against the current flow, so for GM107 we do the OOB
handling directly in the surface processing logic. Perhaps the other
gens should do something similar, but that is left to another change.

This fixes dEQP tests like image_load_store.3d.*_single_layer and GL-CTS
tests like shader_image_load_store.non-layered_binding without breaking
anything else.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "20.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 869e32593a)
2019-10-31 08:57:33 -07:00
Dylan Baker
20512e9ddb VERSION: bump to rc1 2019-10-30 14:58:09 -07:00
52 changed files with 573 additions and 271 deletions

View File

@@ -1 +1 @@
-19.3.0-devel
+19.3.0-rc3

bin/.cherry-ignore (new file, +2)
View File

@@ -0,0 +1,2 @@
+# This is reverted shortly after landing
+4432a2d14d80081d062f7939a950d65ea3a16eed

View File

@@ -92,7 +92,7 @@ is_revert_nomination()
 }
 # Use the last branchpoint as our limit for the search
-latest_branchpoint=`git merge-base origin/master HEAD`
+latest_branchpoint=`git merge-base upstream/master HEAD`
 # List all the commits between day 1 and the branch point...
 git log --reverse --pretty=%H $latest_branchpoint > already_landed

@@ -103,7 +103,7 @@ git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_bra
 sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
 # Grep for potential candidates
-git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\<fixes\>\|\<broken by\>\|This reverts commit' $latest_branchpoint..origin/master |\
+git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\<fixes\>\|\<broken by\>\|This reverts commit' $latest_branchpoint..upstream/master |\
 while read sha
 do
 # Check to see whether the patch is on the ignore list.

View File

@@ -16,3 +16,5 @@ VK_INTEL_performance_query on Intel.
 Meson support for windows using MSVC and MinGW
 scons has been deprecated for non windows
 Initial Intel gen12 (Tigerlake) support on anvil and iris
+New compiler backend "ACO" for RADV (RADV_PERFTEST=aco)
+VK_EXT_shader_demote_to_helper_invocation on RADV/ACO.

View File

@@ -85,6 +85,7 @@ ACO_FILES = \
 compiler/aco_register_allocation.cpp \
 compiler/aco_live_var_analysis.cpp \
 compiler/aco_lower_bool_phis.cpp \
+compiler/aco_lower_to_cssa.cpp \
 compiler/aco_lower_to_hw_instr.cpp \
 compiler/aco_optimizer.cpp \
 compiler/aco_opt_value_numbering.cpp \

View File

@@ -114,6 +114,11 @@ unsigned
 ac_get_tbuffer_format(enum chip_class chip_class,
 unsigned dfmt, unsigned nfmt)
 {
+// Some games try to access vertex buffers without a valid format.
+// This is a game bug, but we should still handle it gracefully.
+if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
+return V_008F0C_IMG_FORMAT_INVALID;
 if (chip_class >= GFX10) {
 unsigned format;
 switch (dfmt) {

View File

@@ -317,6 +317,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
 uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf->dfmt, mtbuf->nfmt);
 uint32_t encoding = (0b111010 << 26);
+assert(img_format <= 0x7F);
 assert(!mtbuf->dlc || ctx.chip_class >= GFX10);
 encoding |= (mtbuf->dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
 encoding |= (mtbuf->glc ? 1 : 0) << 14;

View File

@@ -1263,14 +1263,14 @@ setup_isel_context(Program* program,
 } else if (program->chip_class >= GFX8) {
 program->physical_sgprs = 800;
 program->sgpr_alloc_granule = 15;
-program->sgpr_limit = 102;
-} else {
-program->physical_sgprs = 512;
-program->sgpr_alloc_granule = 7;
 if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
 program->sgpr_limit = 94; /* workaround hardware bug */
 else
-program->sgpr_limit = 104;
+program->sgpr_limit = 102;
+} else {
+program->physical_sgprs = 512;
+program->sgpr_alloc_granule = 7;
+program->sgpr_limit = 104;
 }
 /* TODO: we don't have to allocate VCC if we don't need it */
 program->needs_vcc = true;

View File

@@ -172,11 +172,11 @@ bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int movin
 }
 }
-bool can_reorder(Instruction* candidate, bool allow_smem)
+bool can_reorder(Instruction* candidate)
 {
 switch (candidate->format) {
 case Format::SMEM:
-return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
+return static_cast<SMEM_instruction*>(candidate)->can_reorder;
 case Format::MUBUF:
 return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
 case Format::MIMG:

@@ -200,7 +200,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 int window_size = SMEM_WINDOW_SIZE;
 int max_moves = SMEM_MAX_MOVES;
 int16_t k = 0;
-bool can_reorder_cur = can_reorder(current, false);
+bool can_reorder_cur = can_reorder(current);
 /* don't move s_memtime/s_memrealtime */
 if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)

@@ -224,6 +224,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
 assert(candidate_idx >= 0);
 aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+bool can_reorder_candidate = can_reorder(candidate.get());
 /* break if we'd make the previous SMEM instruction stall */
 bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;

@@ -231,7 +232,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 break;
 /* break when encountering another MEM instruction, logical_start or barriers */
-if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
+if (!can_reorder_candidate && !can_reorder_cur)
 break;
 if (candidate->opcode == aco_opcode::p_logical_start)
 break;

@@ -239,6 +240,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 break;
 if (!can_move_instr(candidate, current, moving_interaction))
 break;
+if (candidate->isVMEM())
+break;
 register_pressure.update(register_demand[candidate_idx]);
 /* if current depends on candidate, add additional dependencies and continue */

@@ -264,6 +267,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 if (op.isTemp())
 ctx.depends_on[op.tempId()] = true;
 }
+can_reorder_cur &= can_reorder_candidate;
 continue;
 }

@@ -280,6 +284,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 if (op.isTemp())
 ctx.depends_on[op.tempId()] = true;
 }
+can_reorder_cur &= can_reorder_candidate;
 continue;
 }

@@ -323,12 +328,14 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 insert_idx = idx + 1;
 moving_interaction = barrier_none;
 moving_spill = false;
+can_reorder_cur = true;
 bool found_dependency = false;
 /* second, check if we have instructions after current to move up */
 for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
 assert(candidate_idx < (int) block->instructions.size());
 aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+bool can_reorder_candidate = can_reorder(candidate.get());
 if (candidate->opcode == aco_opcode::p_logical_end)
 break;

@@ -369,7 +376,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 }
 }
-if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
+if (!can_reorder_candidate && !can_reorder_cur)
 break;
 if (!found_dependency) {

@@ -380,8 +387,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 /* update register pressure */
 register_pressure.update(register_demand[candidate_idx - 1]);
-if (is_dependency)
+if (is_dependency) {
+can_reorder_cur &= can_reorder_candidate;
 continue;
+}
 assert(insert_idx != idx);
 // TODO: correctly calculate register pressure for this case

@@ -392,6 +401,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 register_pressure_unknown = true;
 }
 if (register_pressure_unknown) {
+if (candidate->isVMEM())
+break;
 for (const Definition& def : candidate->definitions) {
 if (def.isTemp())
 ctx.RAR_dependencies[def.tempId()] = true;
@@ -400,6 +411,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
 if (op.isTemp())
 ctx.RAR_dependencies[op.tempId()] = true;
 }
+can_reorder_cur &= can_reorder_candidate;
 continue;
 }

@@ -440,7 +452,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 int max_moves = VMEM_MAX_MOVES;
 int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
 int16_t k = 0;
-bool can_reorder_cur = can_reorder(current, false);
+/* initially true as we don't pull other VMEM instructions
+ * through the current instruction */
+bool can_reorder_vmem = true;
+bool can_reorder_smem = true;
 /* create the initial set of values which current depends on */
 std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);

@@ -467,9 +482,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
 assert(candidate_idx >= 0);
 aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+bool can_reorder_candidate = can_reorder(candidate.get());
 /* break when encountering another VMEM instruction, logical_start or barriers */
-if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
+if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
 break;
 if (candidate->opcode == aco_opcode::p_logical_start)
 break;

@@ -487,10 +503,11 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 bool part_of_clause = false;
 if (candidate->isVMEM()) {
 bool same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
+bool can_reorder = can_reorder_vmem || can_reorder_candidate;
 int grab_dist = clause_insert_idx - candidate_idx;
 /* We can't easily tell how much this will decrease the def-to-use
 * distances, so just use how far it will be moved as a heuristic. */
-part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
+part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
 }
 /* if current depends on candidate, add additional dependencies and continue */

@@ -522,6 +539,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 }
 }
 register_pressure_clause.update(register_demand[candidate_idx]);
+can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
 continue;
 }

@@ -555,6 +574,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 }
 }
 register_pressure_clause.update(register_demand[candidate_idx]);
+can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
 continue;
 }

@@ -605,12 +626,16 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 int insert_idx = idx;
 moving_interaction = barrier_none;
 moving_spill = false;
+// TODO: differentiate between loads and stores (load-load can always reorder)
+can_reorder_vmem = true;
+can_reorder_smem = true;
 bool found_dependency = false;
 /* second, check if we have instructions after current to move up */
 for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
 assert(candidate_idx < (int) block->instructions.size());
 aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
+bool can_reorder_candidate = can_reorder(candidate.get());
 if (candidate->opcode == aco_opcode::p_logical_end)
 break;

@@ -623,7 +648,11 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 break;
 /* check if candidate depends on current */
-bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
+bool is_dependency = false;
+if (candidate->format == Format::SMEM)
+is_dependency = !can_reorder_smem && !can_reorder_candidate;
+if (candidate->isVMEM())
+is_dependency = !can_reorder_vmem && !can_reorder_candidate;
 for (const Operand& op : candidate->operands) {
 if (op.isTemp() && ctx.depends_on[op.tempId()]) {
 is_dependency = true;

@@ -645,6 +674,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 if (op.isTemp())
 ctx.RAR_dependencies[op.tempId()] = true;
 }
+/* update flag whether we can reorder other memory instructions */
+can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
 if (!found_dependency) {
 insert_idx = candidate_idx;
 found_dependency = true;
@@ -652,7 +685,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 register_pressure = register_demand[insert_idx - 1];
 continue;
 }
 } else if (candidate->isVMEM()) {
+/* don't move up dependencies of other VMEM instructions */
 for (const Definition& def : candidate->definitions) {
 if (def.isTemp())
 ctx.depends_on[def.tempId()] = true;
@@ -681,6 +716,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
 if (op.isTemp())
 ctx.RAR_dependencies[op.tempId()] = true;
 }
+can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
+can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
 continue;
 }

View File

@@ -1291,9 +1291,9 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 }
-/* older generations need element size = 16 bytes. element size removed in GFX9 */
+/* older generations need element size = 4 bytes. element size removed in GFX9 */
 if (ctx.program->chip_class <= GFX8)
-rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
 private_segment_buffer, Operand(-1u),

@@ -1530,12 +1530,12 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
 /* spill vgpr */
 ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
 uint32_t spill_slot = vgpr_slot[spill_id];
-bool add_offset = ctx.program->config->scratch_bytes_per_wave + vgpr_spill_slots * 4 > 4096;
+bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
-unsigned base_offset = add_offset ? 0 : ctx.program->config->scratch_bytes_per_wave;
+unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
 /* check if the scratch resource descriptor already exists */
 if (scratch_rsrc == Temp()) {
-unsigned offset = ctx.program->config->scratch_bytes_per_wave - base_offset;
+unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
 scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
 last_top_level_block_idx == block.index ?
 instructions : ctx.program->blocks[last_top_level_block_idx].instructions,

@@ -1544,37 +1544,21 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
 }
 unsigned offset = base_offset + spill_slot * 4;
-aco_opcode opcode;
+aco_opcode opcode = aco_opcode::buffer_store_dword;
 assert((*it)->operands[0].isTemp());
 Temp temp = (*it)->operands[0].getTemp();
 assert(temp.type() == RegType::vgpr && !temp.is_linear());
-switch (temp.size()) {
-case 1: opcode = aco_opcode::buffer_store_dword; break;
-case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
-case 6: temp = bld.tmp(v3); /* fallthrough */
-case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
-case 8: temp = bld.tmp(v4); /* fallthrough */
-case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
-default: {
+if (temp.size() > 1) {
 Instruction* split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
 split->operands[0] = Operand(temp);
 for (unsigned i = 0; i < temp.size(); i++)
 split->definitions[i] = bld.def(v1);
 bld.insert(split);
-opcode = aco_opcode::buffer_store_dword;
 for (unsigned i = 0; i < temp.size(); i++)
 bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
-continue;
+} else {
+bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false);
 }
-}
-if ((*it)->operands[0].size() > 4) {
-Temp temp2 = bld.pseudo(aco_opcode::p_split_vector, bld.def(temp.regClass()), Definition(temp), (*it)->operands[0]);
-bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp2, offset, false);
-offset += temp.size() * 4;
-}
-bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false);
 } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
 ctx.program->config->spilled_sgprs += (*it)->operands[0].size();

@@ -1615,12 +1599,12 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
 if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
 /* reload vgpr */
 uint32_t spill_slot = vgpr_slot[spill_id];
-bool add_offset = ctx.program->config->scratch_bytes_per_wave + vgpr_spill_slots * 4 > 4096;
+bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
-unsigned base_offset = add_offset ? 0 : ctx.program->config->scratch_bytes_per_wave;
+unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
 /* check if the scratch resource descriptor already exists */
 if (scratch_rsrc == Temp()) {
-unsigned offset = ctx.program->config->scratch_bytes_per_wave - base_offset;
+unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
 scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
 last_top_level_block_idx == block.index ?
 instructions : ctx.program->blocks[last_top_level_block_idx].instructions,

@@ -1629,35 +1613,20 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
 }
 unsigned offset = base_offset + spill_slot * 4;
-aco_opcode opcode;
+aco_opcode opcode = aco_opcode::buffer_load_dword;
 Definition def = (*it)->definitions[0];
-switch (def.size()) {
-case 1: opcode = aco_opcode::buffer_load_dword; break;
-case 2: opcode = aco_opcode::buffer_load_dwordx2; break;
-case 6: def = bld.def(v3); /* fallthrough */
-case 3: opcode = aco_opcode::buffer_load_dwordx3; break;
-case 8: def = bld.def(v4); /* fallthrough */
-case 4: opcode = aco_opcode::buffer_load_dwordx4; break;
-default: {
+if (def.size() > 1) {
 Instruction* vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
 vec->definitions[0] = def;
-opcode = aco_opcode::buffer_load_dword;
 for (unsigned i = 0; i < def.size(); i++) {
 Temp tmp = bld.tmp(v1);
 vec->operands[i] = Operand(tmp);
 bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false);
 }
 bld.insert(vec);
-continue;
+} else {
+bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false);
 }
-}
-bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false);
-if ((*it)->definitions[0].size() > 4) {
-Temp temp2 = bld.mubuf(opcode, bld.def(def.regClass()), Operand(), scratch_rsrc, scratch_offset, offset + def.size() * 4, false);
-bld.pseudo(aco_opcode::p_create_vector, (*it)->definitions[0], def.getTemp(), temp2);
-}
 } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
 uint32_t spill_slot = sgpr_slot[spill_id];
 reload_in_loop[spill_slot / 64] = block.loop_nest_depth > 0;

View File

@@ -25,6 +25,7 @@
* IN THE SOFTWARE. * IN THE SOFTWARE.
*/ */
#include "dirent.h"
#include <errno.h> #include <errno.h>
#include <fcntl.h> #include <fcntl.h>
#include <linux/audit.h> #include <linux/audit.h>
@@ -47,7 +48,6 @@
#include "radv_shader.h" #include "radv_shader.h"
#include "radv_cs.h" #include "radv_cs.h"
#include "util/disk_cache.h" #include "util/disk_cache.h"
#include "util/strtod.h"
#include "vk_util.h" #include "vk_util.h"
#include <xf86drm.h> #include <xf86drm.h>
#include <amdgpu.h> #include <amdgpu.h>
@@ -682,7 +682,6 @@ VkResult radv_CreateInstance(
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
instance->engineVersion = engine_version; instance->engineVersion = engine_version;
_mesa_locale_init();
glsl_type_singleton_init_or_ref(); glsl_type_singleton_init_or_ref();
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -713,7 +712,6 @@ void radv_DestroyInstance(
VG(VALGRIND_DESTROY_MEMPOOL(instance)); VG(VALGRIND_DESTROY_MEMPOOL(instance));
glsl_type_singleton_decref(); glsl_type_singleton_decref();
_mesa_locale_fini();
driDestroyOptionCache(&instance->dri_options); driDestroyOptionCache(&instance->dri_options);
driDestroyOptionInfo(&instance->available_dri_options); driDestroyOptionInfo(&instance->available_dri_options);
@@ -2069,25 +2067,61 @@ bool radv_sc_read(int fd, void *buf, size_t size, bool timeout)
} }
} }
static bool radv_close_all_fds(const int *keep_fds, int keep_fd_count)
{
DIR *d;
struct dirent *dir;
d = opendir("/proc/self/fd");
if (!d)
return false;
int dir_fd = dirfd(d);
while ((dir = readdir(d)) != NULL) {
if (dir->d_name[0] == '.')
continue;
int fd = atoi(dir->d_name);
if (fd == dir_fd)
continue;
bool keep = false;
for (int i = 0; !keep && i < keep_fd_count; ++i)
if (keep_fds[i] == fd)
keep = true;
if (keep)
continue;
close(fd);
}
closedir(d);
return true;
}
static void run_secure_compile_device(struct radv_device *device, unsigned process, static void run_secure_compile_device(struct radv_device *device, unsigned process,
int *fd_secure_input, int *fd_secure_output) int fd_secure_input, int fd_secure_output)
{ {
enum radv_secure_compile_type sc_type; enum radv_secure_compile_type sc_type;
if (install_seccomp_filter() == -1) {
const int needed_fds[] = {
fd_secure_input,
fd_secure_output,
};
if (!radv_close_all_fds(needed_fds, ARRAY_SIZE(needed_fds)) || install_seccomp_filter() == -1) {
sc_type = RADV_SC_TYPE_INIT_FAILURE; sc_type = RADV_SC_TYPE_INIT_FAILURE;
} else { } else {
sc_type = RADV_SC_TYPE_INIT_SUCCESS; sc_type = RADV_SC_TYPE_INIT_SUCCESS;
device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input[0]; device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input;
device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output[1]; device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output;
} }
write(fd_secure_output[1], &sc_type, sizeof(sc_type)); write(fd_secure_output, &sc_type, sizeof(sc_type));
if (sc_type == RADV_SC_TYPE_INIT_FAILURE) if (sc_type == RADV_SC_TYPE_INIT_FAILURE)
goto secure_compile_exit; goto secure_compile_exit;
while (true) { while (true) {
radv_sc_read(fd_secure_input[0], &sc_type, sizeof(sc_type), false); radv_sc_read(fd_secure_input, &sc_type, sizeof(sc_type), false);
if (sc_type == RADV_SC_TYPE_COMPILE_PIPELINE) { if (sc_type == RADV_SC_TYPE_COMPILE_PIPELINE) {
struct radv_pipeline *pipeline; struct radv_pipeline *pipeline;
@@ -2100,20 +2134,20 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
/* Read pipeline layout */ /* Read pipeline layout */
struct radv_pipeline_layout layout; struct radv_pipeline_layout layout;
sc_read = radv_sc_read(fd_secure_input[0], &layout, sizeof(struct radv_pipeline_layout), true); sc_read = radv_sc_read(fd_secure_input, &layout, sizeof(struct radv_pipeline_layout), true);
sc_read &= radv_sc_read(fd_secure_input[0], &layout.num_sets, sizeof(uint32_t), true); sc_read &= radv_sc_read(fd_secure_input, &layout.num_sets, sizeof(uint32_t), true);
if (!sc_read) if (!sc_read)
goto secure_compile_exit; goto secure_compile_exit;
for (uint32_t set = 0; set < layout.num_sets; set++) { for (uint32_t set = 0; set < layout.num_sets; set++) {
uint32_t layout_size; uint32_t layout_size;
sc_read &= radv_sc_read(fd_secure_input[0], &layout_size, sizeof(uint32_t), true); sc_read &= radv_sc_read(fd_secure_input, &layout_size, sizeof(uint32_t), true);
if (!sc_read) if (!sc_read)
goto secure_compile_exit; goto secure_compile_exit;
layout.set[set].layout = malloc(layout_size); layout.set[set].layout = malloc(layout_size);
layout.set[set].layout->layout_size = layout_size; layout.set[set].layout->layout_size = layout_size;
sc_read &= radv_sc_read(fd_secure_input[0], layout.set[set].layout, sc_read &= radv_sc_read(fd_secure_input, layout.set[set].layout,
layout.set[set].layout->layout_size, true); layout.set[set].layout->layout_size, true);
} }
@@ -2121,16 +2155,16 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
/* Read pipeline key */ /* Read pipeline key */
struct radv_pipeline_key key; struct radv_pipeline_key key;
sc_read &= radv_sc_read(fd_secure_input[0], &key, sizeof(struct radv_pipeline_key), true); sc_read &= radv_sc_read(fd_secure_input, &key, sizeof(struct radv_pipeline_key), true);
/* Read pipeline create flags */ /* Read pipeline create flags */
VkPipelineCreateFlags flags; VkPipelineCreateFlags flags;
sc_read &= radv_sc_read(fd_secure_input[0], &flags, sizeof(VkPipelineCreateFlags), true); sc_read &= radv_sc_read(fd_secure_input, &flags, sizeof(VkPipelineCreateFlags), true);
/* Read stage and shader information */ /* Read stage and shader information */
uint32_t num_stages; uint32_t num_stages;
const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, };
-sc_read &= radv_sc_read(fd_secure_input[0], &num_stages, sizeof(uint32_t), true);
+sc_read &= radv_sc_read(fd_secure_input, &num_stages, sizeof(uint32_t), true);
if (!sc_read)
goto secure_compile_exit;
@@ -2138,33 +2172,33 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
/* Read stage */
gl_shader_stage stage;
-sc_read &= radv_sc_read(fd_secure_input[0], &stage, sizeof(gl_shader_stage), true);
+sc_read &= radv_sc_read(fd_secure_input, &stage, sizeof(gl_shader_stage), true);
VkPipelineShaderStageCreateInfo *pStage = calloc(1, sizeof(VkPipelineShaderStageCreateInfo));
/* Read entry point name */
size_t name_size;
-sc_read &= radv_sc_read(fd_secure_input[0], &name_size, sizeof(size_t), true);
+sc_read &= radv_sc_read(fd_secure_input, &name_size, sizeof(size_t), true);
if (!sc_read)
goto secure_compile_exit;
char *ep_name = malloc(name_size);
-sc_read &= radv_sc_read(fd_secure_input[0], ep_name, name_size, true);
+sc_read &= radv_sc_read(fd_secure_input, ep_name, name_size, true);
pStage->pName = ep_name;
/* Read shader module */
size_t module_size;
-sc_read &= radv_sc_read(fd_secure_input[0], &module_size, sizeof(size_t), true);
+sc_read &= radv_sc_read(fd_secure_input, &module_size, sizeof(size_t), true);
if (!sc_read)
goto secure_compile_exit;
struct radv_shader_module *module = malloc(module_size);
-sc_read &= radv_sc_read(fd_secure_input[0], module, module_size, true);
+sc_read &= radv_sc_read(fd_secure_input, module, module_size, true);
pStage->module = radv_shader_module_to_handle(module);
/* Read specialization info */
bool has_spec_info;
-sc_read &= radv_sc_read(fd_secure_input[0], &has_spec_info, sizeof(bool), true);
+sc_read &= radv_sc_read(fd_secure_input, &has_spec_info, sizeof(bool), true);
if (!sc_read)
goto secure_compile_exit;
@@ -2172,21 +2206,21 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
VkSpecializationInfo *specInfo = malloc(sizeof(VkSpecializationInfo));
pStage->pSpecializationInfo = specInfo;
-sc_read &= radv_sc_read(fd_secure_input[0], &specInfo->dataSize, sizeof(size_t), true);
+sc_read &= radv_sc_read(fd_secure_input, &specInfo->dataSize, sizeof(size_t), true);
if (!sc_read)
goto secure_compile_exit;
void *si_data = malloc(specInfo->dataSize);
-sc_read &= radv_sc_read(fd_secure_input[0], si_data, specInfo->dataSize, true);
+sc_read &= radv_sc_read(fd_secure_input, si_data, specInfo->dataSize, true);
specInfo->pData = si_data;
-sc_read &= radv_sc_read(fd_secure_input[0], &specInfo->mapEntryCount, sizeof(uint32_t), true);
+sc_read &= radv_sc_read(fd_secure_input, &specInfo->mapEntryCount, sizeof(uint32_t), true);
if (!sc_read)
goto secure_compile_exit;
VkSpecializationMapEntry *mapEntries = malloc(sizeof(VkSpecializationMapEntry) * specInfo->mapEntryCount);
for (uint32_t j = 0; j < specInfo->mapEntryCount; j++) {
-sc_read &= radv_sc_read(fd_secure_input[0], &mapEntries[j], sizeof(VkSpecializationMapEntry), true);
+sc_read &= radv_sc_read(fd_secure_input, &mapEntries[j], sizeof(VkSpecializationMapEntry), true);
if (!sc_read)
goto secure_compile_exit;
}
@@ -2222,7 +2256,7 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
vk_free(&device->alloc, pipeline);
sc_type = RADV_SC_TYPE_COMPILE_PIPELINE_FINISHED;
-write(fd_secure_output[1], &sc_type, sizeof(sc_type));
+write(fd_secure_output, &sc_type, sizeof(sc_type));
} else if (sc_type == RADV_SC_TYPE_DESTROY_DEVICE) {
goto secure_compile_exit;
@@ -2230,10 +2264,8 @@ static void run_secure_compile_device(struct radv_device *device, unsigned proce
}
secure_compile_exit:
-close(fd_secure_input[1]);
-close(fd_secure_input[0]);
-close(fd_secure_output[1]);
-close(fd_secure_output[0]);
+close(fd_secure_input);
+close(fd_secure_output);
_exit(0);
}
@@ -2278,7 +2310,7 @@ static VkResult fork_secure_compile_device(struct radv_device *device)
for (unsigned process = 0; process < sc_threads; process++) {
if ((device->sc_state->secure_compile_processes[process].sc_pid = fork()) == 0) {
device->sc_state->secure_compile_thread_counter = process;
-run_secure_compile_device(device, process, fd_secure_input[process], fd_secure_output[process]);
+run_secure_compile_device(device, process, fd_secure_input[process][0], fd_secure_output[process][1]);
} else {
if (device->sc_state->secure_compile_processes[process].sc_pid == -1)
return VK_ERROR_INITIALIZATION_FAILED;
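
Note: the change above narrows each child's view of the pipes from whole int[2] arrays down to the two descriptors it actually uses, the read end of its input pipe and the write end of its output pipe (fd_secure_input[process][0], fd_secure_output[process][1]). A minimal sketch of that pattern, with hypothetical names rather than RADV's actual helpers:

#include <unistd.h>

/* pipe(2) fills fd[0] with the read end and fd[1] with the write end. */
static void spawn_worker(void (*worker)(int in_fd, int out_fd))
{
   int to_child[2], from_child[2];

   if (pipe(to_child) < 0 || pipe(from_child) < 0)
      return;

   if (fork() == 0) {
      close(to_child[1]);    /* child never writes its own input pipe */
      close(from_child[0]);  /* child never reads its own output pipe */
      worker(to_child[0], from_child[1]);
      _exit(0);
   }

   close(to_child[0]);       /* parent keeps the opposite two ends */
   close(from_child[1]);
}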


@@ -4646,10 +4646,10 @@ radv_secure_compile(struct radv_pipeline *pipeline,
/* Do an early exit if all cache entries are already there. */
bool may_need_copy_shader = pStages[MESA_SHADER_GEOMETRY];
-void *main_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[0], 20);
+void *main_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[0], NULL);
void *copy_entry = NULL;
if (may_need_copy_shader)
-copy_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[1], 20);
+copy_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[1], NULL);
bool has_all_cache_entries = main_entry && (!may_need_copy_shader || copy_entry);
free(main_entry);
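
Note: in Mesa's util/disk_cache.h the third parameter of disk_cache_get() is a size_t * out-parameter for the returned blob's size, not a key length (cache keys are fixed 20-byte SHA-1 digests), so the literal 20 was being passed where a pointer is expected; NULL is the correct way to say the size is not needed. A hedged usage sketch:

size_t size = 0;
void *blob  = disk_cache_get(cache, key,  &size); /* size filled in if non-NULL */
void *blob2 = disk_cache_get(cache, key2, NULL);  /* caller ignores the size    */
/* disk_cache_get() returns malloc'ed memory, hence the free() above. */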
@@ -5065,6 +5065,19 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
}
+static struct radv_pipeline_key
+radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
+                                   const VkComputePipelineCreateInfo *pCreateInfo)
+{
+struct radv_pipeline_key key;
+memset(&key, 0, sizeof(key));
+if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
+key.optimisations_disabled = 1;
+
+return key;
+}
+
static VkResult radv_compute_pipeline_create(
VkDevice _device,
VkPipelineCache _cache,
@@ -5098,13 +5111,16 @@ static VkResult radv_compute_pipeline_create(
pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
+struct radv_pipeline_key key =
+radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
+
if (radv_device_use_secure_compile(device->instance)) {
-result = radv_secure_compile(pipeline, device, &(struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags, 1);
+result = radv_secure_compile(pipeline, device, &key, pStages, pCreateInfo->flags, 1);
*pPipeline = radv_pipeline_to_handle(pipeline);
return result;
} else {
-radv_create_shaders(pipeline, device, cache, &(struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks);
+radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks);
}
pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);


@@ -1582,7 +1582,7 @@ static bool radv_amdgpu_wait_syncobj(struct radeon_winsys *_ws, const uint32_t *
&tmp);
if (ret == 0) {
return true;
-} else if (ret == -1 && errno == ETIME) {
+} else if (ret == -ETIME) {
return false;
} else {
fprintf(stderr, "amdgpu: radv_amdgpu_wait_syncobj failed!\nerrno: %d\n", errno);
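
Note: unlike a raw syscall, libdrm's drmSyncobjWait() reports failure by returning a negated errno value rather than returning -1 and setting errno, which is what the corrected comparison relies on. Sketch of the convention, assuming the standard libdrm signature:

int ret = drmSyncobjWait(fd, handles, handle_count,
                         absolute_timeout_ns, 0 /* flags */,
                         NULL /* first_signaled */);
if (ret == 0)
   ; /* all requested syncobjs signaled */
else if (ret == -ETIME)
   ; /* wait timed out, expected and not an error */
else
   ; /* real failure, ret holds -errno */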


@@ -301,8 +301,8 @@ class Variable(Value):
# constant. If we want to support names that have numeric or
# punctuation characters, we can me the first assertion more flexible.
assert self.var_name.isalpha()
-assert self.var_name is not 'True'
-assert self.var_name is not 'False'
+assert self.var_name != 'True'
+assert self.var_name != 'False'
self.is_constant = m.group('const') is not None
self.cond = m.group('cond')


@@ -5152,7 +5152,8 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
}
/* Set shader info defaults */
-b->shader->info.gs.invocations = 1;
+if (stage == MESA_SHADER_GEOMETRY)
+b->shader->info.gs.invocations = 1;
/* Parse rounding mode execution modes. This has to happen earlier than
 * other changes in the execution modes since they can affect, for example,
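
Note: the new guard matters because the stage-specific members of shader_info live in a union, so an unconditional write to the geometry field scribbles over whatever another stage keeps in the same bytes. A simplified illustration, not the real struct layout:

struct example_shader_info {
   /* ...stage-independent fields... */
   union {
      struct { unsigned invocations; } gs;    /* geometry shaders */
      struct { unsigned local_size[3]; } cs;  /* compute shaders  */
   };
};
/* For a compute shader, writing info.gs.invocations = 1 would silently
 * overwrite cs.local_size[0], since both share storage. */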


@@ -138,15 +138,6 @@ _eglNativePlatformDetectNativeDisplay(void *nativeDisplay)
if (first_pointer == gbm_create_device)
return _EGL_PLATFORM_DRM;
#endif
-
-#ifdef HAVE_X11_PLATFORM
-/* If not matched to any other platform, fallback to x11. */
-return _EGL_PLATFORM_X11;
-#endif
-
-#ifdef HAVE_HAIKU_PLATFORM
-return _EGL_PLATFORM_HAIKU;
-#endif
}
return _EGL_INVALID_PLATFORM;
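
Note: with the unconditional X11/Haiku returns removed, detection can actually fail, which re-enables the caller's compile-time default. A sketch of the intended flow; caller and variable names here are assumptions:

_EGLPlatformType plat = _eglNativePlatformDetectNativeDisplay(native_display);
if (plat == _EGL_INVALID_PLATFORM)
   plat = _EGL_NATIVE_PLATFORM;  /* the build-configured fallback */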


@@ -1,4 +1,4 @@
-# Copyright © 2017 Intel Corporation
+# Copyright © 2017-2019 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -149,6 +149,7 @@ if not with_glvnd
else
egl_lib_name = 'EGL_mesa'
egl_lib_version = '0.0.0'
+deps_for_egl += dep_glvnd
files_egl += [g_egldispatchstubs_h, g_egldispatchstubs_c]
files_egl += files('main/eglglvnd.c', 'main/egldispatchstubs.c')
install_data(


@@ -39,7 +39,6 @@
#include "compiler/glsl_types.h"
#include "util/debug.h"
#include "util/disk_cache.h"
-#include "util/strtod.h"
#include "vk_format.h"
#include "vk_util.h"
@@ -431,7 +430,6 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
return vk_error(instance, result);
}
-_mesa_locale_init();
glsl_type_singleton_init_or_ref();
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -457,7 +455,6 @@ tu_DestroyInstance(VkInstance _instance,
VG(VALGRIND_DESTROY_MEMPOOL(instance));
glsl_type_singleton_decref();
-_mesa_locale_fini();
vk_debug_report_instance_destroy(&instance->debug_report_callbacks);


@@ -470,10 +470,6 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
case PIPE_SHADER_FRAGMENT:
case PIPE_SHADER_VERTEX:
break;
-case PIPE_SHADER_GEOMETRY:
-if (is_a6xx(screen))
-break;
-return 0;
case PIPE_SHADER_COMPUTE:
if (has_compute(screen))
break;


@@ -136,6 +136,7 @@ enum {
#define IRIS_DIRTY_VF_STATISTICS (1ull << 57)
#define IRIS_DIRTY_PMA_FIX (1ull << 58)
#define IRIS_DIRTY_DEPTH_BOUNDS (1ull << 59)
+#define IRIS_DIRTY_RENDER_BUFFER (1ull << 60)
#define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_CS | \
IRIS_DIRTY_SAMPLER_STATES_CS | \
@@ -151,7 +152,8 @@ enum {
IRIS_DIRTY_BINDINGS_TES | \
IRIS_DIRTY_BINDINGS_GS | \
IRIS_DIRTY_BINDINGS_FS | \
-IRIS_DIRTY_BINDINGS_CS)
+IRIS_DIRTY_BINDINGS_CS | \
+IRIS_DIRTY_RENDER_BUFFER)
/**
 * Non-orthogonal state (NOS) dependency flags.


@@ -3023,31 +3023,14 @@ iris_set_framebuffer_state(struct pipe_context *ctx,
/* Render target change */
ice->state.dirty |= IRIS_DIRTY_BINDINGS_FS;
+ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
if (GEN_GEN == 8)
ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
-
-#if GEN_GEN == 11
-// XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
-// XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
-
-/* The PIPE_CONTROL command description says:
- *
- * "Whenever a Binding Table Index (BTI) used by a Render Target Message
- * points to a different RENDER_SURFACE_STATE, SW must issue a Render
- * Target Cache Flush by enabling this bit. When render target flush
- * is set due to new association of BTI, PS Scoreboard Stall bit must
- * be set in this packet."
- */
-// XXX: does this need to happen at 3DSTATE_BTP_PS time?
-iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
-"workaround: RT BTI change [draw]",
-PIPE_CONTROL_RENDER_TARGET_FLUSH |
-PIPE_CONTROL_STALL_AT_SCOREBOARD);
-#endif
}
/**
@@ -5297,6 +5280,24 @@ iris_upload_dirty_render_state(struct iris_context *ice,
}
}
+if (GEN_GEN >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
+// XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
+// XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
+
+/* The PIPE_CONTROL command description says:
+ *
+ * "Whenever a Binding Table Index (BTI) used by a Render Target
+ * Message points to a different RENDER_SURFACE_STATE, SW must issue a
+ * Render Target Cache Flush by enabling this bit. When render target
+ * flush is set due to new association of BTI, PS Scoreboard Stall bit
+ * must be set in this packet."
+ */
+// XXX: does this need to happen at 3DSTATE_BTP_PS time?
+iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
+PIPE_CONTROL_RENDER_TARGET_FLUSH |
+PIPE_CONTROL_STALL_AT_SCOREBOARD);
+}
+
for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) {
iris_populate_binding_table(ice, batch, stage, false);
@@ -5508,7 +5509,7 @@ iris_upload_dirty_render_state(struct iris_context *ice,
BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
cl.NonPerspectiveBarycentricEnable = true;
-cl.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
+cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
cl.MaximumVPIndex = ice->state.num_viewports - 1;
}
iris_emit_merge(batch, cso_rast->clip, dynamic_clip,


@@ -122,6 +122,8 @@ private:
void emitSAM();
void emitRAM();
+void emitPSETP();
+
void emitMOV();
void emitS2R();
void emitCS2R();
@@ -690,6 +692,31 @@ CodeEmitterGM107::emitRAM()
 * predicate/cc
 ******************************************************************************/
+void
+CodeEmitterGM107::emitPSETP()
+{
+emitInsn(0x50900000);
+
+switch (insn->op) {
+case OP_AND: emitField(0x18, 3, 0); break;
+case OP_OR: emitField(0x18, 3, 1); break;
+case OP_XOR: emitField(0x18, 3, 2); break;
+default:
+assert(!"unexpected operation");
+break;
+}
+
+// emitINV (0x2a);
+emitPRED(0x27); // TODO: support 3-arg
+emitINV (0x20, insn->src(1));
+emitPRED(0x1d, insn->src(1));
+emitINV (0x0f, insn->src(0));
+emitPRED(0x0c, insn->src(0));
+emitPRED(0x03, insn->def(0));
+emitPRED(0x00);
+}
+
/*******************************************************************************
 * movement / conversion
 ******************************************************************************/
@@ -3557,7 +3584,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
case OP_AND:
case OP_OR:
case OP_XOR:
-emitLOP();
+switch (insn->def(0).getFile()) {
+case FILE_GPR: emitLOP(); break;
+case FILE_PREDICATE: emitPSETP(); break;
+default:
+assert(!"invalid bool op");
+}
break;
case OP_NOT:
emitNOT();


@@ -1591,6 +1591,12 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
if (insn.getOpcode() == TGSI_OPCODE_STORE &&
dst.getFile() != TGSI_FILE_MEMORY) {
info->io.globalAccess |= 0x2;
+
+if (dst.getFile() == TGSI_FILE_INPUT) {
+// TODO: Handle indirect somehow?
+const int i = dst.getIndex(0);
+info->in[i].mask |= 1;
+}
}
if (dst.getFile() == TGSI_FILE_OUTPUT) {


@@ -1802,6 +1802,9 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless
{
uint32_t base = slot * NVC0_SU_INFO__STRIDE;
+// We don't upload surface info for bindless for GM107+
+assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);
+
if (ptr) {
ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
if (bindless)
@@ -2204,7 +2207,7 @@ getDestType(const ImgType type) {
}
void
-NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
+NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
{
const TexInstruction::ImgFormatDesc *format = su->tex.format;
int width = format->bits[0] + format->bits[1] +
@@ -2223,21 +2226,38 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
if (width < 32)
untypedDst[0] = bld.getSSA();
-for (int i = 0; i < 4; i++) {
-typedDst[i] = su->getDef(i);
+if (loaded && loaded[0]) {
+for (int i = 0; i < 4; i++) {
+if (loaded[i])
+typedDst[i] = loaded[i]->getDef(0);
+}
+} else {
+for (int i = 0; i < 4; i++) {
+typedDst[i] = su->getDef(i);
+}
}
// Set the untyped dsts as the su's destinations
-for (int i = 0; i < 4; i++)
-su->setDef(i, untypedDst[i]);
+if (loaded && loaded[0]) {
+for (int i = 0; i < 4; i++)
+if (loaded[i])
+loaded[i]->setDef(0, untypedDst[i]);
+} else {
+for (int i = 0; i < 4; i++)
+su->setDef(i, untypedDst[i]);
bld.setPosition(su, true);
+}
// Unpack each component into the typed dsts
int bits = 0;
for (int i = 0; i < 4; bits += format->bits[i], i++) {
if (!typedDst[i])
continue;
+
+if (loaded && loaded[0])
+bld.setPosition(loaded[i], true);
+
if (i >= format->components) {
if (format->type == FLOAT ||
format->type == UNORM ||
@@ -2308,7 +2328,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
processSurfaceCoordsNVE4(su);
if (su->op == OP_SULDP) {
-convertSurfaceFormat(su);
+convertSurfaceFormat(su, NULL);
insertOOBSurfaceOpResult(su);
}
@@ -2421,7 +2441,7 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
processSurfaceCoordsNVC0(su);
if (su->op == OP_SULDP) {
-convertSurfaceFormat(su);
+convertSurfaceFormat(su, NULL);
insertOOBSurfaceOpResult(su);
}
@@ -2463,14 +2483,16 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
}
}
-void
-NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
+TexInstruction *
+NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])
{
const int slot = su->tex.r;
const int dim = su->tex.target.getDim();
-const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+const bool array = su->tex.target.isArray() || su->tex.target.isCube();
+const int arg = dim + array;
Value *ind = su->getIndirectR();
Value *handle;
+Instruction *pred = NULL, *pred2d = NULL;
int pos = 0;
bld.setPosition(su, false);
@@ -2489,67 +2511,153 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
assert(pos == 0);
break;
}
+
+if (dim == 2 && !array) {
+// This might be a 2d slice of a 3d texture, try to load the z
+// coordinate in.
+Value *v;
+if (!su->tex.bindless)
+v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+else
+v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));
+Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));
+pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+TYPE_U32, bld.mkImm(0), is_3d);
+
+bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));
+su->moveSources(dim, 1);
+su->setSrc(dim, v);
+su->tex.target = nv50_ir::TEX_TARGET_3D;
+pos++;
+}
+
if (su->tex.bindless)
-handle = ind;
+handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));
else
handle = loadTexHandle(ind, slot + 32);
su->setSrc(arg + pos, handle);
// The address check doesn't make sense here. The format check could make
// sense but it's a bit of a pain.
-if (su->tex.bindless)
-return;
-
-// prevent read fault when the image is not actually bound
-CmpInstruction *pred =
-bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
-TYPE_U32, bld.mkImm(0),
-loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
-if (su->op != OP_SUSTP && su->tex.format) {
-const TexInstruction::ImgFormatDesc *format = su->tex.format;
-int blockwidth = format->bits[0] + format->bits[1] +
-format->bits[2] + format->bits[3];
-assert(format->components != 0);
-// make sure that the format doesn't mismatch when it's not FMT_NONE
-bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
-TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
-loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
-pred->getDef(0));
-}
-su->setPredicate(CC_NOT_P, pred->getDef(0));
+if (!su->tex.bindless) {
+// prevent read fault when the image is not actually bound
+pred =
+bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+TYPE_U32, bld.mkImm(0),
+loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
+if (su->op != OP_SUSTP && su->tex.format) {
+const TexInstruction::ImgFormatDesc *format = su->tex.format;
+int blockwidth = format->bits[0] + format->bits[1] +
+format->bits[2] + format->bits[3];
+assert(format->components != 0);
+// make sure that the format doesn't mismatch when it's not FMT_NONE
+bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
+TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
+loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
+pred->getDef(0));
+}
+}
+
+// Now we have "pred" which (optionally) contains whether to do the surface
+// op at all, and a "pred2d" which indicates that, in case of doing the
+// surface op, we have to create a 2d and 3d version, conditioned on pred2d.
+TexInstruction *su2d = NULL;
+if (pred2d) {
+su2d = cloneForward(func, su)->asTex();
+for (unsigned i = 0; su->defExists(i); ++i)
+su2d->setDef(i, bld.getSSA());
+su2d->moveSources(dim + 1, -1);
+su2d->tex.target = nv50_ir::TEX_TARGET_2D;
+}
+if (pred2d && pred) {
+Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,
+bld.getSSA(1, FILE_PREDICATE),
+pred->getDef(0), pred2d->getDef(0));
+pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
+pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);
+su->setPredicate(CC_P, pred3d->getDef(0));
+pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),
+pred->getDef(0), pred2d->getDef(0));
+pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
+} else if (pred) {
+su->setPredicate(CC_NOT_P, pred->getDef(0));
+} else if (pred2d) {
+su->setPredicate(CC_NOT_P, pred2d->getDef(0));
+}
+if (su2d) {
+su2d->setPredicate(CC_P, pred2d->getDef(0));
+bld.insert(su2d);
+
+// Create a UNION so that RA assigns the same registers
+bld.setPosition(su, true);
+for (unsigned i = 0; su->defExists(i); ++i) {
+assert(i < 4);
+ValueDef &def = su->def(i);
+ValueDef &def2 = su2d->def(i);
+
+Instruction *mov = NULL;
+if (pred) {
+mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+mov->setPredicate(CC_P, pred->getDef(0));
+}
+
+Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
+bld.getSSA(),
+NULL, def2.get());
+def.replace(uni->getDef(0), false);
+uni->setSrc(0, def.get());
+if (mov)
+uni->setSrc(2, mov->getDef(0));
+}
+} else if (pred) {
+// Create a UNION so that RA assigns the same registers
+bld.setPosition(su, true);
+for (unsigned i = 0; su->defExists(i); ++i) {
+assert(i < 4);
+ValueDef &def = su->def(i);
+
+Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+mov->setPredicate(CC_P, pred->getDef(0));
+
+Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
+bld.getSSA(),
+NULL, mov->getDef(0));
+def.replace(uni->getDef(0), false);
+uni->setSrc(0, def.get());
+}
+}
+
+return su2d;
}
void
NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
{
-processSurfaceCoordsGM107(su);
+// processSurfaceCoords also takes care of fixing up the outputs and
+// union'ing them with 0 as necessary. Additionally it may create a second
+// surface which needs some of the similar fixups.
+Instruction *loaded[4] = {};
+TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);
if (su->op == OP_SULDP) {
-convertSurfaceFormat(su);
-insertOOBSurfaceOpResult(su);
+convertSurfaceFormat(su, loaded);
}
if (su->op == OP_SUREDP) {
-Value *def = su->getDef(0);
su->op = OP_SUREDB;
+}
-
-// There may not be a predicate in the bindless case.
-if (su->getPredicate()) {
-su->setDef(0, bld.getSSA());
-
-bld.setPosition(su, true);
-
-// make sure to initialize dst value when the atomic operation is not
-// performed
-Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
-assert(su->cc == CC_NOT_P);
-mov->setPredicate(CC_P, su->getPredicate());
-bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
-}
-}
+
+// If we fixed up the type of the regular surface load instruction, we also
+// have to fix up the copy.
+if (su2) {
+su2->op = su->op;
+su2->dType = su->dType;
+su2->sType = su->sType;
+}
}


@@ -171,10 +171,10 @@ private:
Value *loadMsInfo32(Value *ptr, uint32_t off);
void adjustCoordinatesMS(TexInstruction *);
-void processSurfaceCoordsGM107(TexInstruction *);
+TexInstruction *processSurfaceCoordsGM107(TexInstruction *, Instruction *[4]);
void processSurfaceCoordsNVE4(TexInstruction *);
void processSurfaceCoordsNVC0(TexInstruction *);
-void convertSurfaceFormat(TexInstruction *);
+void convertSurfaceFormat(TexInstruction *, Instruction **);
void insertOOBSurfaceOpResult(TexInstruction *);
Value *calculateSampleOffset(Value *sampleID);


@@ -1433,7 +1433,15 @@ gm107_create_image_handle(struct pipe_context *pipe,
nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
-return 0x100000000ULL | tic->id;
+// Compute handle. This will include the TIC as well as some additional
+// info regarding the bound 3d surface layer, if applicable.
+uint64_t handle = 0x100000000ULL | tic->id;
+struct nv04_resource *res = nv04_resource(view->resource);
+if (res->base.target == PIPE_TEXTURE_3D) {
+handle |= 1 << 11;
+handle |= view->u.tex.first_layer << (11 + 16);
+}
+return handle;
fail:
FREE(tic);
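
Note: this bit layout is the counterpart of the OP_SHR-by-11 / AND-with-2047 unpacking added to the GM107 lowering pass earlier in this series. My reading of the packing, not authoritative:

uint64_t tic   = handle & 2047;        /* bits [10:0]: TIC id           */
bool is_3d     = (handle >> 11) & 1;   /* bit 11: bound view is 3D      */
unsigned layer = handle >> (11 + 16);  /* bits [27+]: first bound layer */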


@@ -903,6 +903,10 @@ static void si_disk_cache_create(struct si_screen *sscreen)
/* These flags affect shader compilation. */
#define ALL_FLAGS (DBG(SI_SCHED) | DBG(GISEL))
uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
+/* Reserve left-most bit for tgsi/nir selector */
+assert(!(shader_debug_flags & (1u << 31)));
+shader_debug_flags |= (uint32_t)
+((sscreen->options.enable_nir & 0x1) << 31);
/* Add the high bits of 32-bit addresses, which affects
 * how 32-bit addresses are expanded to 64 bits.
@@ -1026,6 +1030,13 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
return NULL;
}
+{
+#define OPT_BOOL(name, dflt, description) \
+sscreen->options.name = \
+driQueryOptionb(config->options, "radeonsi_"#name);
+#include "si_debug_options.h"
+}
+
si_disk_cache_create(sscreen);
/* Determine the number of shader compiler threads. */
@@ -1146,13 +1157,6 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->commutative_blend_add =
driQueryOptionb(config->options, "radeonsi_commutative_blend_add");
-{
-#define OPT_BOOL(name, dflt, description) \
-sscreen->options.name = \
-driQueryOptionb(config->options, "radeonsi_"#name);
-#include "si_debug_options.h"
-}
-
sscreen->use_ngg = sscreen->info.chip_class >= GFX10 &&
sscreen->info.family != CHIP_NAVI14 &&
!(sscreen->debug_flags & DBG(NO_NGG));
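
Note: the second and third hunks read as a move rather than new logic; the driconf OPT_BOOL block has to run before si_disk_cache_create() now that the disk-cache key encodes options.enable_nir via the reserved bit 31 above. Ordering sketch, with parse_options as a hypothetical name:

parse_options(sscreen);         /* must set sscreen->options.enable_nir first */
si_disk_cache_create(sscreen);  /* ...because the cache key hashes that bit   */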


@@ -1231,6 +1231,14 @@ swr_update_derived(struct pipe_context *pipe,
util_viewport_zmin_zmax(state, rasterizer->clip_halfz,
&vp->minZ, &vp->maxZ);
+
+if (rasterizer->depth_clip_near) {
+vp->minZ = 0.0f;
+}
+
+if (rasterizer->depth_clip_far) {
+vp->maxZ = 1.0f;
+}
vpm->m00[i] = state->scale[0];
vpm->m11[i] = state->scale[1];
vpm->m22[i] = state->scale[2];


@@ -488,9 +488,10 @@ get_render_pass(struct zink_context *ctx)
struct zink_render_pass_state state;
for (int i = 0; i < fb->nr_cbufs; i++) {
-struct zink_resource *cbuf = zink_resource(fb->cbufs[i]->texture);
-state.rts[i].format = cbuf->format;
-state.rts[i].samples = cbuf->base.nr_samples > 0 ? cbuf->base.nr_samples : VK_SAMPLE_COUNT_1_BIT;
+struct pipe_resource *res = fb->cbufs[i]->texture;
+state.rts[i].format = zink_get_format(screen, fb->cbufs[i]->format);
+state.rts[i].samples = res->nr_samples > 0 ? res->nr_samples :
+VK_SAMPLE_COUNT_1_BIT;
}
state.num_cbufs = fb->nr_cbufs;
@@ -993,6 +994,25 @@ get_gfx_program(struct zink_context *ctx)
return ctx->curr_program;
}
+static bool
+line_width_needed(enum pipe_prim_type reduced_prim,
+                  VkPolygonMode polygon_mode)
+{
+switch (reduced_prim) {
+case PIPE_PRIM_POINTS:
+return false;
+
+case PIPE_PRIM_LINES:
+return true;
+
+case PIPE_PRIM_TRIANGLES:
+return polygon_mode == VK_POLYGON_MODE_LINE;
+
+default:
+unreachable("unexpected reduced prim");
+}
+}
+
static void
zink_draw_vbo(struct pipe_context *pctx,
const struct pipe_draw_info *dinfo)
@@ -1156,7 +1176,7 @@ zink_draw_vbo(struct pipe_context *pctx,
vkCmdSetScissor(batch->cmdbuf, 0, 1, &fb_scissor);
}
-if (reduced_prim == PIPE_PRIM_LINES) {
+if (line_width_needed(reduced_prim, rast_state->hw_state.polygon_mode)) {
if (screen->feats.wideLines || ctx->line_width == 1.0f)
vkCmdSetLineWidth(batch->cmdbuf, ctx->line_width);
else
@@ -1294,6 +1314,10 @@ blit_native(struct zink_context *ctx, const struct pipe_blit_info *info)
zink_batch_reference_resoure(batch, src);
zink_batch_reference_resoure(batch, dst);
+if (src->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL)
+zink_resource_barrier(batch->cmdbuf, src, src->aspect,
+VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
if (dst->layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
zink_resource_barrier(batch->cmdbuf, dst, dst->aspect,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
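
Note: core Vulkan requires both images of a blit to be in a transfer-appropriate layout (TRANSFER_SRC/DST_OPTIMAL, or GENERAL) when the command executes, so the source now gets the same barrier treatment the destination already had. The call these barriers guard looks like:

vkCmdBlitImage(cmdbuf,
               src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
               dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
               1, &region, VK_FILTER_NEAREST);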


@@ -137,6 +137,7 @@ resource_create(struct pipe_screen *pscreen,
VkImageCreateInfo ici = {};
ici.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
+ici.flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
switch (templ->target) {
case PIPE_TEXTURE_1D:
@@ -146,7 +147,7 @@ resource_create(struct pipe_screen *pscreen,
case PIPE_TEXTURE_CUBE:
case PIPE_TEXTURE_CUBE_ARRAY:
-ici.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
+ici.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
/* fall-through */
case PIPE_TEXTURE_2D:
case PIPE_TEXTURE_2D_ARRAY:
@@ -157,7 +158,7 @@ resource_create(struct pipe_screen *pscreen,
case PIPE_TEXTURE_3D:
ici.imageType = VK_IMAGE_TYPE_3D;
if (templ->bind & PIPE_BIND_RENDER_TARGET)
-ici.flags = VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
+ici.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
break;
case PIPE_BUFFER:


@@ -125,6 +125,8 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 1;
case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
+return 0; /* TODO: re-enable after implementing nir_texop_txd */
+
case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
case PIPE_CAP_VERTEX_SHADER_SATURATE:
return 1;
@@ -284,7 +286,7 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 0;
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
-return 1;
+return 0;
case PIPE_CAP_NIR_COMPACT_ARRAYS:
return 1;
@@ -549,7 +551,7 @@ static const VkFormat formats[PIPE_FORMAT_COUNT] = {
[PIPE_FORMAT_Z32_FLOAT] = VK_FORMAT_D32_SFLOAT,
[PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = VK_FORMAT_D32_SFLOAT_S8_UINT,
[PIPE_FORMAT_Z16_UNORM] = VK_FORMAT_D16_UNORM,
-[PIPE_FORMAT_X8Z24_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32,
+[PIPE_FORMAT_Z24X8_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32,
[PIPE_FORMAT_Z24_UNORM_S8_UINT] = VK_FORMAT_D24_UNORM_S8_UINT,
// compressed formats


@@ -940,7 +940,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen,
whandles[i].stride = (unsigned)strides[index];
whandles[i].offset = (unsigned)offsets[index];
whandles[i].modifier = modifier;
-whandles[i].plane = i;
+whandles[i].plane = index;
}
img = dri2_create_image_from_winsys(_screen, width, height, use, map,


@@ -32,6 +32,7 @@ libgraw_gdi = shared_library(
dependencies : [
dep_ws2_32, idep_mesautil, driver_swrast,
],
+name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libgraw.dll
)
libgraw = libgraw_gdi


@@ -32,6 +32,7 @@ libgraw_null = shared_library(
include_directories : inc_common,
link_with : libgallium,
dependencies : idep_mesautil,
+name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libgraw_null.dll
)
libgraw = libgraw_null


@@ -58,6 +58,7 @@ libosmesa = shared_library(
dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind,
driver_swrast, driver_swr,
],
+name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll
soversion : host_machine.system() == 'windows' ? '' : '8',
version : '8.0.0',
install : true,


@@ -47,6 +47,15 @@ endif
pipe_loader_install_dir = join_paths(get_option('libdir'), 'gallium-pipe')
+_kmsro_targets = [
+driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
+driver_panfrost, driver_lima,
+]
+
+if with_gallium_v3d
+_kmsro_targets += [idep_xmlconfig, dep_expat]
+endif
+
pipe_loaders = [
[with_gallium_i915, 'i915', driver_i915, []],
[with_gallium_nouveau, 'nouveau', driver_nouveau, []],
@@ -54,7 +63,7 @@ pipe_loaders = [
[with_gallium_r600, 'r600', driver_r600, []],
[with_gallium_radeonsi, 'radeonsi', [driver_radeonsi, idep_xmlconfig], []],
[with_gallium_freedreno, 'msm', driver_freedreno, []],
-[with_gallium_panfrost, 'kmsro', [driver_kmsro, driver_panfrost], []],
+[with_gallium_kmsro, 'kmsro', _kmsro_targets, []],
[with_gallium_svga, 'vmwgfx', driver_svga, []],
[with_gallium_softpipe, 'swrast', [driver_swrast, driver_swr], [libwsw, libws_null]],
]


@@ -736,8 +736,7 @@ namespace brw {
src_reg
fix_byte_src(const src_reg &src) const
{
-if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
-type_sz(src.type) != 1)
+if (shader->devinfo->gen < 11 || type_sz(src.type) != 1)
return src;
dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?


@@ -1505,8 +1505,15 @@ generate_code(struct brw_codegen *p,
bool debug_flag = INTEL_DEBUG &
intel_debug_flag_for_shader_stage(nir->info.stage);
struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
+
+/* `send_count` explicitly does not include spills or fills, as we'd
+ * like to use it as a metric for intentional memory access or other
+ * shared function use. Otherwise, subtle changes to scheduling or
+ * register allocation could cause it to fluctuate wildly - and that
+ * effect is already counted in spill/fill counts.
+ */
int spill_count = 0, fill_count = 0;
-int loop_count = 0;
+int loop_count = 0, send_count = 0;
foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
struct brw_reg src[3], dst;
@@ -1746,6 +1753,7 @@ generate_code(struct brw_codegen *p,
generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
} else {
generate_math1_gen4(p, inst, dst, src[0]);
+send_count++;
}
break;
@@ -1759,6 +1767,7 @@ generate_code(struct brw_codegen *p,
generate_math_gen6(p, inst, dst, src[0], src[1]);
} else {
generate_math2_gen4(p, inst, dst, src[0], src[1]);
+send_count++;
}
break;
@@ -1775,14 +1784,17 @@ generate_code(struct brw_codegen *p,
case SHADER_OPCODE_SAMPLEINFO:
generate_tex(p, prog_data, nir->info.stage,
inst, dst, src[0], src[1], src[2]);
+send_count++;
break;
case SHADER_OPCODE_GET_BUFFER_SIZE:
generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+send_count++;
break;
case VS_OPCODE_URB_WRITE:
generate_vs_urb_write(p, inst);
+send_count++;
break;
case SHADER_OPCODE_GEN4_SCRATCH_READ:
@@ -1797,10 +1809,12 @@ generate_code(struct brw_codegen *p,
case VS_OPCODE_PULL_CONSTANT_LOAD:
generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+send_count++;
break;
case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+send_count++;
break;
case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -1809,14 +1823,17 @@ generate_code(struct brw_codegen *p,
case GS_OPCODE_URB_WRITE:
generate_gs_urb_write(p, inst);
+send_count++;
break;
case GS_OPCODE_URB_WRITE_ALLOCATE:
generate_gs_urb_write_allocate(p, inst);
+send_count++;
break;
case GS_OPCODE_SVB_WRITE:
generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+send_count++;
break;
case GS_OPCODE_SVB_SET_DST_INDEX:
@@ -1825,6 +1842,7 @@ generate_code(struct brw_codegen *p,
case GS_OPCODE_THREAD_END:
generate_gs_thread_end(p, inst);
+send_count++;
break;
case GS_OPCODE_SET_WRITE_OFFSET:
@@ -1837,6 +1855,7 @@ generate_code(struct brw_codegen *p,
case GS_OPCODE_FF_SYNC:
generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+send_count++;
break;
case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
@@ -1866,12 +1885,14 @@ generate_code(struct brw_codegen *p,
case SHADER_OPCODE_SHADER_TIME_ADD:
brw_shader_time_add(p, src[0],
prog_data->base.binding_table.shader_time_start);
+send_count++;
break;
case VEC4_OPCODE_UNTYPED_ATOMIC:
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
!inst->dst.is_null(), inst->header_size);
+send_count++;
break;
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
@@ -1879,16 +1900,19 @@ generate_code(struct brw_codegen *p,
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
src[2].ud);
+send_count++;
break;
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
src[2].ud, inst->header_size);
+send_count++;
break;
case SHADER_OPCODE_MEMORY_FENCE:
brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0);
+send_count++;
break;
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
@@ -2068,10 +2092,12 @@ generate_code(struct brw_codegen *p,
case TCS_OPCODE_URB_WRITE:
generate_tcs_urb_write(p, inst, src[0]);
+send_count++;
break;
case VEC4_OPCODE_URB_READ:
generate_vec4_urb_read(p, inst, dst, src[0]);
+send_count++;
break;
case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
@@ -2113,15 +2139,18 @@ generate_code(struct brw_codegen *p,
case TCS_OPCODE_RELEASE_INPUT:
generate_tcs_release_input(p, dst, src[0], src[1]);
+send_count++;
break;
case TCS_OPCODE_THREAD_END:
generate_tcs_thread_end(p, inst);
+send_count++;
break;
case SHADER_OPCODE_BARRIER:
brw_barrier(p, src[0]);
brw_WAIT(p);
+send_count++;
break;
case SHADER_OPCODE_MOV_INDIRECT:
@@ -2188,9 +2217,9 @@ generate_code(struct brw_codegen *p,
sha1buf);
fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
-"spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
+"spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
-spill_count, fill_count, before_size, after_size,
+spill_count, fill_count, send_count, before_size, after_size,
100.0f * (before_size - after_size) / before_size);
/* overriding the shader makes disasm_info invalid */
@@ -2205,10 +2234,11 @@ generate_code(struct brw_codegen *p,
compiler->shader_debug_log(log_data,
"%s vec4 shader: %d inst, %d loops, %u cycles, "
-"%d:%d spills:fills, compacted %d to %d bytes.",
+"%d:%d spills:fills, %u sends, "
+"compacted %d to %d bytes.",
stage_abbrev, before_size / 16,
loop_count, cfg->cycle_count, spill_count,
-fill_count, before_size, after_size);
+fill_count, send_count, before_size, after_size);
if (stats) {
stats->dispatch_width = 0;
stats->instructions = before_size / 16;


@@ -1043,7 +1043,8 @@ static const struct gen_device_info gen_device_info_ehl_2x4 = {
.gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
.simulator_id = 22, \
.urb.size = (_gt) == 1 ? 512 : 1024, \
-.num_subslices = _dual_subslices
+.num_subslices = _dual_subslices, \
+.num_eu_per_subslice = 16
#define dual_subslices(args...) { args, }


@@ -532,9 +532,11 @@ anv_block_pool_expand_range(struct anv_block_pool *pool,
if (use_softpin) {
gem_handle = anv_gem_create(pool->device, newbo_size);
map = anv_gem_mmap(pool->device, gem_handle, 0, newbo_size, 0);
-if (map == MAP_FAILED)
+if (map == MAP_FAILED) {
+anv_gem_close(pool->device, gem_handle);
return vk_errorf(pool->device->instance, pool->device,
VK_ERROR_MEMORY_MAP_FAILED, "gem mmap failed: %m");
+}
assert(center_bo_offset == 0);
} else {
/* Just leak the old map until we destroy the pool. We can't munmap it


@@ -32,7 +32,6 @@
#include "drm-uapi/drm_fourcc.h"
#include "anv_private.h"
-#include "util/strtod.h"
#include "util/debug.h"
#include "util/build_id.h"
#include "util/disk_cache.h"
@@ -792,7 +791,6 @@ VkResult anv_CreateInstance(
instance->pipeline_cache_enabled =
env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);
-_mesa_locale_init();
glsl_type_singleton_init_or_ref();
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -831,7 +829,6 @@ void anv_DestroyInstance(
vk_debug_report_instance_destroy(&instance->debug_report_callbacks);
glsl_type_singleton_decref();
-_mesa_locale_fini();
driDestroyOptionCache(&instance->dri_options);
driDestroyOptionInfo(&instance->available_dri_options);


@@ -2216,12 +2216,15 @@ compute_pipeline_create(
pipeline->blend_state.map = NULL;
-result = anv_reloc_list_init(&pipeline->batch_relocs,
-pAllocator ? pAllocator : &device->alloc);
+const VkAllocationCallbacks *alloc =
+pAllocator ? pAllocator : &device->alloc;
+
+result = anv_reloc_list_init(&pipeline->batch_relocs, alloc);
if (result != VK_SUCCESS) {
vk_free2(&device->alloc, pAllocator, pipeline);
return result;
}
+pipeline->batch.alloc = alloc;
pipeline->batch.next = pipeline->batch.start = pipeline->batch_data;
pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data);
pipeline->batch.relocs = &pipeline->batch_relocs;


@@ -94,12 +94,7 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot += 4;
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
-uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
-uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
-uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
-uint64s_per_slot++; /* 64bit marker */
-uint64s_per_slot++; /* availability */
-uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
+uint64s_per_slot = 72; /* 576 bytes, see layout below */
break;
}
default:
@@ -179,54 +174,51 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
}
/**
- * VK_INTEL_performance_query layout:
+ * VK_INTEL_performance_query layout (576 bytes) :
 *
 * ------------------------------
-* | end MI_RPC (256b) |
-* |----------------------------|
-* | begin MI_RPC (256b) |
-* |----------------------------|
-* | begin perfcntr 1 & 2 (16b) |
-* |----------------------------|
-* | end perfcntr 1 & 2 (16b) |
+* | availability (8b) |
+* |----------------------------|
+* | marker (8b) |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * | end RPSTAT register (4b) |
 * |----------------------------|
-* | marker (8b) |
+* | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
-* | availability (8b) |
+* | end perfcntr 1 & 2 (16b) |
+* |----------------------------|
+* | Unused (8b) |
+* |----------------------------|
+* | begin MI_RPC (256b) |
+* |----------------------------|
+* | end MI_RPC (256b) |
 * ------------------------------
 */
static uint32_t
-intel_perf_mi_rpc_offset(bool end)
+intel_perf_marker_offset(void)
{
-return end ? 0 : 256;
+return 8;
}
static uint32_t
-intel_perf_counter(bool end)
+intel_perf_rpstart_offset(bool end)
{
-uint32_t offset = 512;
-offset += end ? 2 * sizeof(uint64_t) : 0;
-return offset;
+return 16 + (end ? sizeof(uint32_t) : 0);
}
static uint32_t
-intel_perf_rpstart_offset(bool end)
+intel_perf_counter(bool end)
{
-uint32_t offset = intel_perf_counter(false) +
-4 * sizeof(uint64_t);
-offset += end ? sizeof(uint32_t) : 0;
-return offset;
+return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
}
static uint32_t
-intel_perf_marker_offset(void)
+intel_perf_mi_rpc_offset(bool end)
{
-return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
+return 64 + (end ? 256 : 0);
}
static void
@@ -251,11 +243,7 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
-if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
-return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
-pool->stride - 8);
-} else
-return *(volatile uint64_t *)query_slot(pool, query);
+return *(volatile uint64_t *)query_slot(pool, query);
}
static VkResult
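
Note: checking the arithmetic of the new layout (mine, not from the patch): availability (8) + marker (8) + RPSTAT begin/end (4 + 4) + perfcntr begin/end (16 + 16) + 8 unused bytes = 64, and the two 256-byte MI_RPC reports follow at offset 64, so 64 + 512 = 576 bytes = 72 uint64s. That matches uint64s_per_slot = 72, the 64-byte alignment the old comment said MI_RPC reports need, and every helper above. Restated as constants:

#include <stdint.h>

enum {
   SLOT_AVAILABILITY  = 0,    /* 8 bytes          */
   SLOT_MARKER        = 8,    /* 8 bytes          */
   SLOT_RPSTAT_BEGIN  = 16,   /* 4 bytes          */
   SLOT_RPSTAT_END    = 20,   /* 4 bytes          */
   SLOT_COUNTER_BEGIN = 24,   /* 16 bytes         */
   SLOT_COUNTER_END   = 40,   /* 16 bytes + 8 pad */
   SLOT_MI_RPC_BEGIN  = 64,   /* 256 bytes        */
   SLOT_MI_RPC_END    = 320,  /* 256 bytes        */
   SLOT_SIZE          = 576,
};
_Static_assert(SLOT_SIZE == 72 * sizeof(uint64_t), "slot is 72 uint64s");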


@@ -27,7 +27,11 @@
int main(int argc, char **argv)
{
-struct anv_instance instance;
+struct anv_instance instance = {
+.physicalDevice = {
+.use_softpin = true,
+},
+};
struct anv_device device = {
.instance = &instance,
};


@@ -111,7 +111,7 @@ static void validate_monotonic(int32_t **blocks)
static void run_test()
{
-struct anv_instance instance;
+struct anv_instance instance = { };
struct anv_device device = {
.instance = &instance,
};


@@ -36,7 +36,7 @@
int main(int argc, char **argv)
{
-struct anv_instance instance;
+struct anv_instance instance = { };
struct anv_device device = {
.instance = &instance,
};


@@ -35,7 +35,7 @@
int main(int argc, char **argv)
{
-struct anv_instance instance;
+struct anv_instance instance = { };
struct anv_device device = {
.instance = &instance,
};


@@ -56,7 +56,7 @@ static void *alloc_states(void *_job)
static void run_test()
{
-struct anv_instance instance;
+struct anv_instance instance = { };
struct anv_device device = {
.instance = &instance,
};


@@ -27,7 +27,11 @@
int main(int argc, char **argv)
{
-struct anv_instance instance;
+struct anv_instance instance = {
+.physicalDevice = {
+.use_softpin = true,
+},
+};
struct anv_device device = {
.instance = &instance,
};


@@ -36,6 +36,8 @@ libosmesa = shared_library(
link_whole : libglapi_static,
link_with : [libmesa_classic, osmesa_link_with],
dependencies : [dep_thread, dep_selinux],
+name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll
+soversion : host_machine.system() == 'windows' ? '' : '8',
version : '8.0.0',
install : true,
)


@@ -350,6 +350,12 @@ clear_bufferiv(struct gl_context *ctx, GLenum buffer, GLint drawbuffer,
_mesa_update_state( ctx );
}
+if (!no_error && ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+_mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
+"glClearBufferiv(incomplete framebuffer)");
+return;
+}
+
switch (buffer) {
case GL_STENCIL:
/* Page 264 (page 280 of the PDF) of the OpenGL 3.0 spec says:
@@ -686,6 +692,12 @@ clear_bufferfi(struct gl_context *ctx, GLenum buffer, GLint drawbuffer,
drawbuffer);
return;
}
+
+if (ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+_mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
+"glClearBufferfi(incomplete framebuffer)");
+return;
+}
}
if (ctx->RasterDiscard)


@@ -325,6 +325,7 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
cso_set_stream_outputs(cso, 0, NULL, NULL);
cso_set_sample_mask(cso, ~0);
cso_set_min_samples(cso, 1);
+st->clear.raster.multisample = st->state.fb_num_samples > 1;
cso_set_rasterizer(cso, &st->clear.raster);
/* viewport state: viewport matching window dims */