Update version to 18.2.0-rc2

Signed-off-by: Andres Gomez <agomez@igalia.com>
meson: use correct keyword to fix a meson warning
2018-08-09 02:29:47 +03:00 · 2018-08-07 20:59:51 +03:00 · 2018-08-07 20:57:27 +03:00 · 2018-08-07 20:57:01 +03:00 · 2018-08-07 20:56:30 +03:00 · 2018-08-07 20:55:56 +03:00
21 changed files with 346 additions and 44 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-18.2.0-devel
+18.2.0-rc2
--- a/bin/install_megadrivers.py
+++ b/bin/install_megadrivers.py
@@ -43,13 +43,15 @@ def main():
    master = os.path.join(to, os.path.basename(args.megadriver))

    if not os.path.exists(to):
+        if os.path.lexists(to):
+            os.unlink(to)
        os.makedirs(to)
    shutil.copy(args.megadriver, master)

    for driver in args.drivers:
        abs_driver = os.path.join(to, driver)

-        if os.path.exists(abs_driver):
+        if os.path.lexists(abs_driver):
            os.unlink(abs_driver)
        print('installing {} to {}'.format(args.megadriver, abs_driver))
        os.link(master, abs_driver)
@@ -60,7 +62,7 @@ def main():

            name, ext = os.path.splitext(driver)
            while ext != '.so':
-                if os.path.exists(name):
+                if os.path.lexists(name):
                    os.unlink(name)
                os.symlink(driver, name)
                name, ext = os.path.splitext(name)
--- a/meson.build
+++ b/meson.build
@@ -989,7 +989,7 @@ if cc.links('''
      freelocale(loc);
      return 0;
    }''',
-    extra_args : pre_args,
+    args : pre_args,
    name : 'strtod has locale support')
  pre_args += '-DHAVE_STRTOD_L'
 endif
--- a/src/amd/Android.mk
+++ b/src/amd/Android.mk
@@ -27,4 +27,6 @@ include $(LOCAL_PATH)/Makefile.sources

 include $(LOCAL_PATH)/Android.addrlib.mk
 include $(LOCAL_PATH)/Android.common.mk
+ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),)
 include $(LOCAL_PATH)/vulkan/Android.mk
+endif
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -528,6 +528,16 @@
    <field name="number of attribute arrays" size="5" start="0" type="uint"/>
  </packet>

+  <packet code="71" name="VCM Cache Size" min_ver="41">
+    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
+    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
+  </packet>
+
+  <packet code="73" name="VCM Cache Size" max_ver="33">
+    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
+    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
+  </packet>
+
  <packet code="73" name="Transform Feedback Buffer" min_ver="41">
    <field name="Buffer Address" size="32" start="32" type="address"/>
    <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -27,13 +27,14 @@
 #include <stdint.h>

 /**
- * Struct for tracking features of the V3D chip. This is where we'll store
- * boolean flags for features in a specific version, but for now it's just the
- * version
+ * Struct for tracking features of the V3D chip across driver and compiler.
 */
 struct v3d_device_info {
        /** Simple V3D version: major * 10 + minor */
        uint8_t ver;
+
+        /** Size of the VPM, in bytes. */
+        int vpm_size;
 };

 #endif
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -462,6 +462,7 @@ struct choose_scoreboard {
        int last_magic_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
+        int last_thrsw_tick;
        bool tlb_locked;
 };

@@ -1095,10 +1096,16 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
 }

 static bool
-valid_thrsw_sequence(struct v3d_compile *c,
+valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
 {
+        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
+        if (scoreboard->last_thrsw_tick + 3 >
+            scoreboard->tick - instructions_in_sequence) {
+                return false;
+        }
+
        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                /* No scheduling SFU when the result would land in the other
                 * thread.  The simulator complains for safety, though it
@@ -1159,7 +1166,8 @@ emit_thrsw(struct v3d_compile *c,
                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

-                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
+                if (!valid_thrsw_sequence(c, scoreboard,
+                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }
@@ -1173,7 +1181,9 @@ emit_thrsw(struct v3d_compile *c,
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
+                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
+                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
@@ -1475,6 +1485,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
+        scoreboard.last_thrsw_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -648,6 +648,9 @@ struct v3d_vs_prog_data {

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;
+
+        /* Value to be programmed in VCM_CACHE_SIZE. */
+        uint8_t vcm_cache_size;
 };

 struct v3d_fs_prog_data {
@@ -928,7 +931,7 @@ VIR_A_ALU2(OR)
 VIR_A_ALU2(XOR)
 VIR_A_ALU2(VADD)
 VIR_A_ALU2(VSUB)
-VIR_A_ALU2(STVPMV)
+VIR_A_NODST_2(STVPMV)
 VIR_A_ALU1(NOT)
 VIR_A_ALU1(NEG)
 VIR_A_ALU1(FLAPUSH)
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -452,6 +452,16 @@ vir_emit_def(struct v3d_compile *c, struct qinst *inst)
 {
        assert(inst->dst.file == QFILE_NULL);

+        /* If we're emitting an instruction that's a def, it had better be
+         * writing a register.
+         */
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
+                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
+                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
+                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
+        }
+
        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
@@ -746,10 +756,28 @@ uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

-        /* Input/output segment size are in 8x32-bit multiples. */
+        /* Input/output segment size are in sectors (8 rows of 32 bits per
+         * channel).
+         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;

+        /* Compute VCM cache size.  We set up our program to take up less than
+         * half of the VPM, so that any set of bin and render programs won't
+         * run out of space.  We need space for at least one input segment,
+         * and then allocate the rest to output segments (one for the current
+         * program, the rest to VCM).  The valid range of the VCM cache size
+         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
+         * batches.
+         */
+        assert(c->devinfo->vpm_size);
+        int sector_size = 16 * sizeof(uint32_t) * 8;
+        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
+        int half_vpm = vpm_size_in_sectors / 2;
+        int vpm_output_batches = half_vpm - prog_data->vpm_input_size;
+        assert(vpm_output_batches >= 2);
+        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
+
        return v3d_return_qpu_insts(c, final_assembly_size);
 }

--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -94,6 +94,15 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                                }
                        }

+                        /* Refuse to spill a ldvary's dst, because that means
+                         * that ldvary's r5 would end up being used across a
+                         * thrsw.
+                         */
+                        if (inst->qpu.sig.ldvary) {
+                                assert(inst->dst.file == QFILE_TEMP);
+                                BITSET_CLEAR(c->spillable, inst->dst.index);
+                        }
+
                        if (inst->is_last_thrsw)
                                started_last_seg = true;

@@ -102,7 +111,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
-                         * final LDTMU from that TMU setup.  We can't
+                         * final LDTMU or TMUWT from that TMU setup.  We can't
                         * spill/fill any temps during that time, because that
                         * involves inserting a new TMU setup/LDTMU sequence.
                         */
@@ -110,6 +119,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                            is_last_ldtmu(inst, block))
                                in_tmu_operation = false;

+                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+                            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
+                                in_tmu_operation = false;
+
                        if (v3d_qpu_writes_tmu(&inst->qpu))
                                in_tmu_operation = true;
                }
@@ -206,6 +219,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
                                     inst->dst);
                        v3d_emit_spill_tmua(c, spill_offset);
                        vir_emit_thrsw(c);
+                        vir_TMUWT(c);
                        c->spills++;
                }

--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -1131,6 +1131,31 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
   mgr->dirty_real_vb_mask = 0;
 }

+static void
+u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
+                               unsigned *indirect_data, unsigned stride,
+                               unsigned draw_count)
+{
+   assert(info->index_size);
+   info->indirect = NULL;
+
+   for (unsigned i = 0; i < draw_count; i++) {
+      unsigned offset = i * stride / 4;
+
+      info->count = indirect_data[offset + 0];
+      info->instance_count = indirect_data[offset + 1];
+
+      if (!info->count || !info->instance_count)
+         continue;
+
+      info->start = indirect_data[offset + 2];
+      info->index_bias = indirect_data[offset + 3];
+      info->start_instance = indirect_data[offset + 4];
+
+      u_vbuf_draw_vbo(mgr, info);
+   }
+}
+
 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
 {
   struct pipe_context *pipe = mgr->pipe;
@@ -1160,33 +1185,163 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)

   new_info = *info;

-   /* Fallback. We need to know all the parameters. */
+   /* Handle indirect (multi)draws. */
   if (new_info.indirect) {
-      struct pipe_transfer *transfer = NULL;
-      int *data;
+      const struct pipe_draw_indirect_info *indirect = new_info.indirect;
+      unsigned draw_count = 0;

-      if (new_info.index_size) {
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
-                                      new_info.indirect->offset, 20,
-                                      PIPE_TRANSFER_READ, &transfer);
-         new_info.index_bias = data[3];
-         new_info.start_instance = data[4];
-      }
-      else {
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
-                                      new_info.indirect->offset, 16,
-                                      PIPE_TRANSFER_READ, &transfer);
-         new_info.start_instance = data[3];
+      /* Get the number of draws. */
+      if (indirect->indirect_draw_count) {
+         pipe_buffer_read(pipe, indirect->indirect_draw_count,
+                          indirect->indirect_draw_count_offset,
+                          4, &draw_count);
+      } else {
+         draw_count = indirect->draw_count;
      }

-      new_info.count = data[0];
-      new_info.instance_count = data[1];
-      new_info.start = data[2];
-      pipe_buffer_unmap(pipe, transfer);
-      new_info.indirect = NULL;
-
-      if (!new_info.count)
+      if (!draw_count)
         return;
+
+      unsigned data_size = (draw_count - 1) * indirect->stride +
+                           (new_info.index_size ? 20 : 16);
+      unsigned *data = malloc(data_size);
+      if (!data)
+         return; /* report an error? */
+
+      /* Read the used buffer range only once, because the read can be
+       * uncached.
+       */
+      pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
+                       data);
+
+      if (info->index_size) {
+         /* Indexed multidraw. */
+         unsigned index_bias0 = data[3];
+         bool index_bias_same = true;
+
+         /* If we invoke the translate path, we have to split the multidraw. */
+         if (incompatible_vb_mask ||
+             mgr->ve->incompatible_elem_mask) {
+            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
+                                           indirect->stride, draw_count);
+            free(data);
+            return;
+         }
+
+         /* See if index_bias is the same for all draws. */
+         for (unsigned i = 1; i < draw_count; i++) {
+            if (data[i * indirect->stride / 4 + 3] != index_bias0) {
+               index_bias_same = false;
+               break;
+            }
+         }
+
+         /* Split the multidraw if index_bias is different. */
+         if (!index_bias_same) {
+            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
+                                           indirect->stride, draw_count);
+            free(data);
+            return;
+         }
+
+         /* If we don't need to use the translate path and index_bias is
+          * the same, we can process the multidraw with the time complexity
+          * equal to 1 draw call (except for the index range computation).
+          * We only need to compute the index range covering all draw calls
+          * of the multidraw.
+          *
+          * The driver will not look at these values because indirect != NULL.
+          * These values determine the user buffer bounds to upload.
+          */
+         new_info.index_bias = index_bias0;
+         new_info.min_index = ~0u;
+         new_info.max_index = 0;
+         new_info.start_instance = ~0u;
+         unsigned end_instance = 0;
+
+         struct pipe_transfer *transfer = NULL;
+         const uint8_t *indices;
+
+         if (info->has_user_indices) {
+            indices = (uint8_t*)info->index.user;
+         } else {
+            indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
+                                                PIPE_TRANSFER_READ, &transfer);
+         }
+
+         for (unsigned i = 0; i < draw_count; i++) {
+            unsigned offset = i * indirect->stride / 4;
+            unsigned start = data[offset + 2];
+            unsigned count = data[offset + 0];
+            unsigned start_instance = data[offset + 4];
+            unsigned instance_count = data[offset + 1];
+
+            if (!count || !instance_count)
+               continue;
+
+            /* Update the ranges of instances. */
+            new_info.start_instance = MIN2(new_info.start_instance,
+                                           start_instance);
+            end_instance = MAX2(end_instance, start_instance + instance_count);
+
+            /* Update the index range. */
+            unsigned min, max;
+            new_info.count = count; /* only used by get_minmax_index */
+            u_vbuf_get_minmax_index_mapped(&new_info,
+                                           indices +
+                                           new_info.index_size * start,
+                                           &min, &max);
+
+            new_info.min_index = MIN2(new_info.min_index, min);
+            new_info.max_index = MAX2(new_info.max_index, max);
+         }
+         free(data);
+
+         if (transfer)
+            pipe_buffer_unmap(pipe, transfer);
+
+         /* Set the final instance count. */
+         new_info.instance_count = end_instance - new_info.start_instance;
+
+         if (new_info.start_instance == ~0u || !new_info.instance_count)
+            return;
+      } else {
+         /* Non-indexed multidraw.
+          *
+          * Keep the draw call indirect and compute minimums & maximums,
+          * which will determine the user buffer bounds to upload, but
+          * the driver will not look at these values because indirect != NULL.
+          *
+          * This efficiently processes the multidraw with the time complexity
+          * equal to 1 draw call.
+          */
+         new_info.start = ~0u;
+         new_info.start_instance = ~0u;
+         unsigned end_vertex = 0;
+         unsigned end_instance = 0;
+
+         for (unsigned i = 0; i < draw_count; i++) {
+            unsigned offset = i * indirect->stride / 4;
+            unsigned start = data[offset + 2];
+            unsigned count = data[offset + 0];
+            unsigned start_instance = data[offset + 3];
+            unsigned instance_count = data[offset + 1];
+
+            new_info.start = MIN2(new_info.start, start);
+            new_info.start_instance = MIN2(new_info.start_instance,
+                                           start_instance);
+
+            end_vertex = MAX2(end_vertex, start + count);
+            end_instance = MAX2(end_instance, start_instance + instance_count);
+         }
+
+         /* Set the final counts. */
+         new_info.count = end_vertex - new_info.start;
+         new_info.instance_count = end_instance - new_info.start_instance;
+
+         if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
+            return;
+      }
   }

   if (new_info.index_size) {
@@ -1211,7 +1366,8 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
          * We would have to break this drawing operation into several ones. */
         /* Use some heuristic to see if unrolling indices improves
          * performance. */
-         if (!new_info.primitive_restart &&
+         if (!info->indirect &&
+             !new_info.primitive_restart &&
             num_vertices > new_info.count*2 &&
             num_vertices - new_info.count > 32 &&
             !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2151,13 +2151,36 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
   }
 }

+void
+NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
+{
+   if (!su->getPredicate())
+      return;
+
+   bld.setPosition(su, true);
+
+   for (unsigned i = 0; su->defExists(i); ++i) {
+      ValueDef &def = su->def(i);
+
+      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+      assert(su->cc == CC_NOT_P);
+      mov->setPredicate(CC_P, su->getPredicate());
+      Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
+
+      def.replace(uni->getDef(0), false);
+      uni->setSrc(0, def.get());
+   }
+}
+
 void
 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
 {
   processSurfaceCoordsNVE4(su);

-   if (su->op == OP_SULDP)
+   if (su->op == OP_SULDP) {
      convertSurfaceFormat(su);
+      insertOOBSurfaceOpResult(su);
+   }

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      assert(su->getPredicate());
@@ -2267,8 +2290,10 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)

   processSurfaceCoordsNVC0(su);

-   if (su->op == OP_SULDP)
+   if (su->op == OP_SULDP) {
      convertSurfaceFormat(su);
+      insertOOBSurfaceOpResult(su);
+   }

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      const int dim = su->tex.target.getDim();
@@ -2370,8 +2395,10 @@ NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
 {
   processSurfaceCoordsGM107(su);

-   if (su->op == OP_SULDP)
+   if (su->op == OP_SULDP) {
      convertSurfaceFormat(su);
+      insertOOBSurfaceOpResult(su);
+   }

   if (su->op == OP_SUREDP) {
      Value *def = su->getDef(0);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -172,6 +172,7 @@ private:
   void processSurfaceCoordsNVE4(TexInstruction *);
   void processSurfaceCoordsNVC0(TexInstruction *);
   void convertSurfaceFormat(TexInstruction *);
+   void insertOOBSurfaceOpResult(TexInstruction *);
   Value *calculateSampleOffset(Value *sampleID);

 protected:
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -1148,7 +1148,6 @@ struct pipe_screen *
 swr_create_screen_internal(struct sw_winsys *winsys)
 {
   struct swr_screen *screen = CALLOC_STRUCT(swr_screen);
-   memset(screen, 0, sizeof(struct swr_screen));

   if (!screen)
      return NULL;
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -585,6 +585,8 @@ v3d_get_device_info(struct v3d_screen *screen)
        uint32_t minor = (ident1.value >> 0) & 0xf;
        screen->devinfo.ver = major * 10 + minor;

+        screen->devinfo.vpm_size = (ident1.value >> 28 & 0xf) * 1024;
+
        switch (screen->devinfo.ver) {
        case 33:
        case 41:
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -306,6 +306,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
                }
        }

+        cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
+                vcm.number_of_16_vertex_batches_for_binning =
+                        v3d->prog.cs->prog_data.vs->vcm_cache_size;
+                vcm.number_of_16_vertex_batches_for_rendering =
+                        v3d->prog.vs->prog_data.vs->vcm_cache_size;
+        }
+
        cl_emit(&job->bcl, GL_SHADER_STATE, state) {
                state.address = cl_address(job->indirect.bo, shader_rec_offset);
                state.number_of_attribute_arrays = num_elements_to_emit;
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -222,6 +222,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
                        attr.coordinate_shader_vpm_offset = 0;
                        attr.vertex_shader_vpm_offset = 0;
                }
+
+                vc4_bo_unreference(&bo);
        }

        cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {
--- a/src/gallium/drivers/vc4/vc4_fence.c
+++ b/src/gallium/drivers/vc4/vc4_fence.c
@@ -142,8 +142,12 @@ vc4_fence_context_init(struct vc4_context *vc4)
        /* Since we initialize the in_fence_fd to -1 (no wait necessary),
         * we also need to initialize our in_syncobj as signaled.
         */
-        return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
-                                &vc4->in_syncobj);
+        if (vc4->screen->has_syncobj) {
+                return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                                        &vc4->in_syncobj);
+        } else {
+                return 0;
+        }
 }

 void
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -26,8 +26,12 @@
 *
 **************************************************************************/

+#if !defined(ANDROID) || ANDROID_API_LEVEL >= 26
+/* Android's libc began supporting shm in Oreo */
+#define HAVE_SHM
 #include <sys/ipc.h>
 #include <sys/shm.h>
+#endif

 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
@@ -83,6 +87,7 @@ dri_sw_is_displaytarget_format_supported( struct sw_winsys *ws,
   return TRUE;
 }

+#ifdef HAVE_SHM
 static char *
 alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)
 {
@@ -101,6 +106,7 @@ alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)

   return addr;
 }
+#endif

 static struct sw_displaytarget *
 dri_sw_displaytarget_create(struct sw_winsys *winsys,
@@ -131,8 +137,11 @@ dri_sw_displaytarget_create(struct sw_winsys *winsys,
   size = dri_sw_dt->stride * nblocksy;

   dri_sw_dt->shmid = -1;
+
+#ifdef HAVE_SHM
   if (ws->lf->put_image_shm)
      dri_sw_dt->data = alloc_shm(dri_sw_dt, size);
+#endif

   if(!dri_sw_dt->data)
      dri_sw_dt->data = align_malloc(size, alignment);
@@ -156,8 +165,10 @@ dri_sw_displaytarget_destroy(struct sw_winsys *ws,
   struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);

   if (dri_sw_dt->shmid >= 0) {
+#ifdef HAVE_SHM
      shmdt(dri_sw_dt->data);
      shmctl(dri_sw_dt->shmid, IPC_RMID, 0);
+#endif
   } else {
      align_free(dri_sw_dt->data);
   }
--- a/src/intel/Makefile.tools.am
+++ b/src/intel/Makefile.tools.am
@@ -21,7 +21,9 @@

 noinst_PROGRAMS += \
 	tools/aubinator \
-	tools/aubinator_error_decode
+	tools/aubinator_error_decode \
+	tools/error2aub
+

 tools_aubinator_SOURCES = \
 	tools/aubinator.c \
@@ -59,3 +61,23 @@ tools_aubinator_error_decode_LDADD = \
 tools_aubinator_error_decode_CFLAGS = \
 	$(AM_CFLAGS) \
 	$(ZLIB_CFLAGS)
+
+
+tools_error2aub_SOURCES = \
+	tools/gen_context.h \
+	tools/gen8_context.h \
+	tools/gen10_context.h \
+	tools/aub_write.h \
+	tools/aub_write.c \
+	tools/error2aub.c
+
+tools_error2aub_CFLAGS = \
+	$(AM_CFLAGS) \
+	$(ZLIB_CFLAGS)
+
+tools_error2aub_LDADD = \
+	dev/libintel_dev.la \
+	$(PTHREAD_LIBS) \
+	$(DLOPEN_LIBS) \
+	$(ZLIB_LIBS) \
+	-lm
--- a/src/intel/tools/aubinator.c
+++ b/src/intel/tools/aubinator.c
@@ -590,7 +590,7 @@ handle_memtrace_reg_write(uint32_t *p)
   uint32_t pphwsp_addr = context_descriptor & 0xfffff000;
   struct gen_batch_decode_bo pphwsp_bo = get_ggtt_batch_bo(NULL, pphwsp_addr);
   uint32_t *context = (uint32_t *)((uint8_t *)pphwsp_bo.map +
-                                    (pphwsp_bo.addr - pphwsp_addr) +
+                                    (pphwsp_addr - pphwsp_bo.addr) +
                                    pphwsp_size);

   uint32_t ring_buffer_head = context[5];
@@ -601,7 +601,7 @@ handle_memtrace_reg_write(uint32_t *p)
   struct gen_batch_decode_bo ring_bo = get_ggtt_batch_bo(NULL,
                                                          ring_buffer_start);
   assert(ring_bo.size > 0);
-   void *commands = (uint8_t *)ring_bo.map + (ring_bo.addr - ring_buffer_start);
+   void *commands = (uint8_t *)ring_bo.map + (ring_buffer_start - ring_bo.addr);

   if (context_descriptor & 0x100 /* ppgtt */) {
      batch_ctx.get_bo = get_ppgtt_batch_bo;
Author	SHA1	Message	Date
Andres Gomez	4a25d8b623	Update version to 18.2.0-rc2 Signed-off-by: Andres Gomez <agomez@igalia.com>	2018-08-09 02:29:47 +03:00
Jon Turney	4a769c8850	meson: use correct keyword to fix a meson warning With a sufficiently recent meson, the following warning is produced: WARNING: Passed invalid keyword argument "extra_args". WARNING: This will become a hard error in the future. It seems that compiler.links(args:) is meant here. Signed-off-by: Jon Turney <jon.turney@dronecode.org.uk> Reviewed-and-Tested-by: Eric Engestrom <eric.engestrom@intel.com> Reviewed-by: Dylan Baker <dylan@pnwbakers.com> (cherry picked from commit `a48c0659e1`)	2018-08-07 20:59:51 +03:00
Eric Anholt	d39fb6d157	vc4: Fix a leak of the no-vertex-elements workaround BO. Fixes: `bd1925562a` ("vc4: Convert the driver to emitting the shader record using pack macros.") (cherry picked from commit `9507e03699`)	2018-08-07 20:57:27 +03:00
Eric Anholt	ed117c27e1	vc4: Fix context creation when syncobjs aren't supported. Noticed when trying to run current Mesa on rpi's downstream kernel. Fixes: `b0acc3a562` ("broadcom/vc4: Native fence fd support") (cherry picked from commit `86095e9bb1`)	2018-08-07 20:57:01 +03:00
Chad Versace	fdbbe4c50c	drisw: Fix build on Android Nougat, which lacks shm (v2) In commit `cf54bd5e8`, dri_sw_winsys.c began using <sys/shm.h> to support the new functions putImageShm, getImageShm in DRI_SWRastLoader. But Android began supporting System V shared memory only in Oreo. Nougat has no shm headers. Fix the build by ifdef'ing out the shm code on Nougat. Fixes: `cf54bd5e8` "drisw: use shared memory when possible" Reviewed-by: Dave Airlie <airlied@redhat.com> Cc: Marc-André Lureau <marcandre.lureau@gmail.com> (cherry picked from commit `aaa41cd297`)	2018-08-07 20:56:30 +03:00
Gert Wollny	3c3589a0ba	meson, install_megadrivers: Also remove stale symlinks os.path.exists doesn't return True for stale symlinks, but they are in the way later, when a link/file with the same name is to be created. For instance it is conceivable that the pointed to file is replaced by a file with a new name, and then the symlink is dead. To handle this, check specifically for all existing symlinks to be removed. (This bugged me for some time with a link libXvMCr600.so always being in the way of installing this file) v2: use only os.path.lexists and replace all instances of os.path.exists (Dylan Baker) v3: handle directory check correctly (Eric Engestrom) Fixes: `f7f1b30f81` ("meson: extend install_megadrivers script to handle symmlinking") Reviewed-by: Eric Engestrom <eric.engestrom@intel.com> (v2 minus dir check) Reviewed-by: Dylan Baker <dylan@pnwbakers.com> Signed-off-by: Gert Wollny <gert.wollny@collabora.com> (cherry picked from commit `7a46b2d641`)	2018-08-07 20:55:56 +03:00
Eric Anholt	37fa81f631	v3d: Emit the VCM_CACHE_SIZE packet. This is needed to ensure that we don't get blocked waiting for VPM space with bin/render overlapping. Cc: "18.2" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `1561e4984e`)	2018-08-07 20:55:09 +03:00
Eric Anholt	71aa72d695	v3d: Avoid spilling that breaks the r5 usage after a ldvary. Fixes bad rendering when forcing 2 spills in glxgears. Cc: "18.2" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `50a8713d4f`)	2018-08-07 20:54:41 +03:00
Eric Anholt	c8d41bc58d	v3d: Make sure that QPU instruction-has-a-dest matches VIR. Found when debugging register spilling -- we would try to spill the dest of a STVPMV, inserting spill code after entering the last segment. In fact, we were likely to choose to do this, given that the STVPMV "dest" temp was never read from, making it cheap to spill. Cc: "18.2" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `f2c0d310d6`)	2018-08-07 20:54:10 +03:00
Eric Anholt	c3b1a6d7fa	v3d: Wait for TMU writes to complete before continuing after a spill. The simulator complained that we had write responses outstanding at shader end. It seems that a TMU read does not guarantee that previous TMU writes by the thread have completed, which surprised me. Cc: "18.2" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `3f9cb2eb05`)	2018-08-07 20:53:42 +03:00
Eric Anholt	cce78368df	v3d: Make sure we don't emit a thrsw before the last one finished. Found while forcing some spilling, which creates a lot of short tmua->thrsw->ldtmu sequences. Cc: "18.2" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `ccbe33af5b`)	2018-08-07 20:52:48 +03:00
Lionel Landwerlin	b6e9ef1556	intel: aubinator: fix read the context/ring Up to now we've been lucky that the buffer returned was always exactly at the address we requested. Fixes: `144b40db54` ("intel: aubinator: drop the 1Tb GTT mapping") Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com> (cherry picked from commit `35955afa7a`)	2018-08-06 16:43:31 +03:00
Karol Herbst	c18ed873a5	nvc0/ir: return 0 in imageLoad on incomplete textures We already guarded all OP_SULDP against out of bound accesses, but we ended up just reusing whatever value was stored in the dest registers. Fixes CTS test shader_image_load_store.incomplete_textures v2: fix for loads not ending up with predicates (bindless_texture) v3: fix replacing the def Cc: <mesa-stable@lists.freedesktop.org> Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu> Signed-off-by: Karol Herbst <kherbst@redhat.com> (cherry picked from commit `c3325097be`)	2018-08-06 16:42:47 +03:00
Marek Olšák	88c36f4379	gallium/u_vbuf: handle indirect multidraws correctly and efficiently (v3) v2: need to do MAX{start+count} instead of MAX{count} added piglit tests v3: use malloc Cc: 18.2 <mesa-stable@lists.freedesktop.org> Reviewed-by: Eric Anholt <eric@anholt.net> (cherry picked from commit `0f79b2015b`)	2018-08-06 15:46:19 +03:00
Mauro Rossi	bbeb78620c	android: radv: build vulkan.radv conditionally to radeonsi A problem was reported with arm,arm64 targets build due to missing libLLVM shared library dependency with AOSP; to avoid this issue vulkan.radv is built conditionally only when radeonsi is in BOARD_GPU_DRIVERS Fixes: `0ca153f869` ("android: radv: enable build of vulkan.radv HAL module") Reported-by: John Stultz <john.stultz@linaro.org> Signed-off-by: Mauro Rossi <issor.oruam@gmail.com> Reviewed-by: Emil Velikov <emil.velikov@collabora.com> Cc: "18.2" <mesa-stable@lists.freedesktop.org> (cherry picked from commit `1c7a2433b2`)	2018-08-06 15:44:06 +03:00
Andres Gomez	9ddff68f6f	intel/tools: add error2aub creation into autotools Tarball distribution is done through "make distcheck". We include the meson targets also into autotools so they won't fail when building from the tarball. Fixes: `6a60beba40` ("intel/tools: Add an error state to aub translator") Cc: Jason Ekstrand <jason.ekstrand@intel.com> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Cc: Dylan Baker <dylan.c.baker@intel.com> Signed-off-by: Andres Gomez <agomez@igalia.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Dylan Baker <dylan@pnwbakers.com> (cherry picked from commit `2d4d139877`)	2018-08-02 21:21:22 +03:00
Vlad Golovkin	2e903df72f	swr: Remove unnecessary memset call Zeroing memory after calloc is not necessary. This also avoids a possible crash when allocation fails, because memset is called before checking screen for NULL. Fixes: `a29d63ecf7` "swr: refactor swr_create_screen to allow for proper cleanup on error" Reviewed-by: Eric Engestrom <eric.engestrom@intel.com> (cherry picked from commit `9d3a2394e4`)	2018-08-02 21:20:52 +03:00
Andres Gomez	cb542ac550	Update version to 18.2.0-rc1 Signed-off-by: Andres Gomez <agomez@igalia.com>	2018-08-02 18:28:04 +03:00