Compare commits
32 Commits
mesa-25.1. ... mesa-24.3.

| Author | SHA1 | Date |
|---|---|---|
| | 4c41bb9bef | |
| | 1bc37bb465 | |
| | e86386df89 | |
| | e839ff344e | |
| | 3e45c3eec2 | |
| | e7ebb97fdf | |
| | 97d974a3ad | |
| | 6c9587db99 | |
| | dc8e19aede | |
| | d185a4658e | |
| | e3f3e315af | |
| | 27b2c2b869 | |
| | a9f1c10a10 | |
| | 739c3615ce | |
| | 4a71355172 | |
| | dd14b60b49 | |
| | 349687c73a | |
| | 67bd351553 | |
| | eb34c059be | |
| | 6965aff4d1 | |
| | ea9b3f928d | |
| | 7994534fe9 | |
| | 1e792b0933 | |
| | 08955d2ee8 | |
| | 8f53de4a5d | |
| | baba2805ca | |
| | 7cef55b993 | |
| | b856d0d3cc | |
| | 1ab129ba70 | |
| | 7dc84d1c96 | |
| | 93d5d587f5 | |
| | 85ba713d76 | |
@@ -54,7 +54,7 @@ workflow:
     # pipeline for direct pushes that bypassed the CI
     - if: &is-direct-push $CI_PROJECT_NAMESPACE == "mesa" && $CI_PIPELINE_SOURCE == "push" && $GITLAB_USER_LOGIN != "marge-bot"
       variables:
-        JOB_PRIORITY: 40
+        JOB_PRIORITY: 70
 
 
   variables:
@@ -34,24 +34,6 @@ function get_current_minsec {
     printf "%02d:%02d" $((CURR_TIME/60)) $((CURR_TIME%60))
 }
 
-function error {
-    x_off 2>/dev/null
-    RED="\e[0;31m"
-    ENDCOLOR="\e[0m"
-    # we force the following to be not in a section
-    if [ -n "${CURRENT_SECTION:-}" ]; then
-        _section_end $CURRENT_SECTION
-    fi
-
-    CURR_MINSEC=$(get_current_minsec)
-    echo -e "\n${RED}[${CURR_MINSEC}] ERROR: $*${ENDCOLOR}\n"
-    x_restore
-}
-
-function trap_err {
-    error ${CURRENT_SECTION:-'unknown-section'}: ret code: $*
-}
-
 function _build_section_start {
     local section_params=$1
     shift
@@ -68,13 +50,13 @@ function _build_section_start {
 alias build_section_start="x_off; _build_section_start"
 
 function _section_start {
-    _build_section_start "[collapsed=true]" $*
+    build_section_start "[collapsed=true]" $*
     x_restore
 }
 alias section_start="x_off; _section_start"
 
 function _uncollapsed_section_start {
-    _build_section_start "" $*
+    build_section_start "" $*
     x_restore
 }
 alias uncollapsed_section_start="x_off; _uncollapsed_section_start"
@@ -87,7 +69,7 @@ function _build_section_end {
 alias build_section_end="x_off; _build_section_end"
 
 function _section_end {
-    _build_section_end $*
+    build_section_end $*
     x_restore
 }
 alias section_end="x_off; _section_end"
@@ -95,9 +77,10 @@ alias section_end="x_off; _section_end"
 function _section_switch {
     if [ -n "$CURRENT_SECTION" ]
     then
-        _build_section_end $CURRENT_SECTION
+        build_section_end $CURRENT_SECTION
+        x_off
     fi
-    _build_section_start "[collapsed=true]" $*
+    build_section_start "[collapsed=true]" $*
     x_restore
 }
 alias section_switch="x_off; _section_switch"
@@ -105,9 +88,10 @@ alias section_switch="x_off; _section_switch"
 function _uncollapsed_section_switch {
     if [ -n "$CURRENT_SECTION" ]
     then
-        _build_section_end $CURRENT_SECTION
+        build_section_end $CURRENT_SECTION
+        x_off
     fi
-    _build_section_start "" $*
+    build_section_start "" $*
     x_restore
 }
 alias uncollapsed_section_switch="x_off; _uncollapsed_section_switch"
@@ -116,8 +100,6 @@ export -f _x_store_state
 export -f _x_off
 export -f _x_restore
 export -f get_current_minsec
-export -f error
-export -f trap_err
 export -f _build_section_start
 export -f _section_start
 export -f _build_section_end
@@ -136,5 +118,27 @@ if [ -z "${RESULTS_DIR:-}" ]; then
     mkdir -p "${RESULTS_DIR}"
 fi
 
+function error {
+    x_off 2>/dev/null
+    RED="\e[0;31m"
+    ENDCOLOR="\e[0m"
+    # we force the following to be not in a section
+    if [ -n "${CURRENT_SECTION:-}" ]; then
+        section_end $CURRENT_SECTION
+        x_off
+    fi
+
+    CURR_MINSEC=$(get_current_minsec)
+    echo -e "\n${RED}[${CURR_MINSEC}] ERROR: $*${ENDCOLOR}\n"
+    x_restore
+}
+
+function trap_err {
+    error ${CURRENT_SECTION:-'unknown-section'}: ret code: $*
+}
+
+export -f error
+export -f trap_err
+
 set -E
 trap 'trap_err $?' ERR
@@ -370,7 +370,7 @@ yaml-toml-shell-test:
   after_script:
     # Keep the results path the same as baremetal and LAVA
     - mkdir -p "${JOB_FOLDER}"/results
-    - mv "${JOB_FOLDER}"/results results/
+    - mv "${JOB_FOLDER}"/results ./
     - !reference [default, after_script]
 
   artifacts:
.pick_status.json (new file, 2602 lines): file diff suppressed because it is too large.
@@ -1848,7 +1848,7 @@ endif
   dep_spirv_tools = dependency(
     'SPIRV-Tools',
     required : with_clover_spirv or with_clc,
-    version : '>= 2018.0'
+    version : '>= 2022.1'
   )
   if dep_spirv_tools.found()
     pre_args += '-DHAVE_SPIRV_TOOLS'
@@ -448,16 +448,17 @@ agx_nir_fs_epilog(nir_builder *b, const void *key_)
     * to the epilog, when sample shading is not used but blending is.
     */
    if (key->link.sample_shading) {
-      NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
-      NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);
-
       /* Lower the resulting discards. Done in agx_nir_lower_monolithic_msaa for
-       * the pixel shaded path.
+       * the pixel shaded path. Must be done before agx_nir_lower_to_per_sample
+       * to avoid duplicating tests.
        */
       if (key->blend.alpha_to_coverage) {
          NIR_PASS(_, b->shader, agx_nir_lower_sample_mask);
       }
 
+      NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
+      NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);
+
       /* Ensure the sample ID is preserved in register. We do this late since it
        * has to go in the last block, and the above passes might add control
        * flow when lowering.
@@ -1973,11 +1973,12 @@ emit_frag_end(struct v3d_compile *c)
          */
         if (c->output_position_index == -1 &&
             !(c->s->info.num_images || c->s->info.num_ssbos) &&
-            !c->s->info.fs.uses_discard &&
             !c->fs_key->sample_alpha_to_coverage &&
             c->output_sample_mask_index == -1 &&
             has_any_tlb_color_write) {
-                c->s->info.fs.early_fragment_tests = true;
+                c->s->info.fs.early_fragment_tests =
+                        !c->s->info.fs.uses_discard ||
+                        c->fs_key->can_earlyz_with_discard;
         }
 
         /* By default, Z buffer writes are implicit using the Z values produced
@@ -426,6 +426,7 @@ struct v3d_fs_key {
         bool msaa;
         bool sample_alpha_to_coverage;
         bool sample_alpha_to_one;
+        bool can_earlyz_with_discard;
         /* Mask of which color render targets are present. */
         uint8_t cbufs;
         uint8_t swap_color_rb;
@@ -584,8 +584,10 @@ nir_lower_mediump_vars(nir_shader *shader, nir_variable_mode modes)
             nir_variable *var = nir_deref_instr_get_variable(deref);
 
             /* If we have atomic derefs that we can't track, then don't lower any mediump. */
-            if (!var)
+            if (!var) {
+               ralloc_free(no_lower_set);
                return false;
+            }
 
             _mesa_set_add(no_lower_set, var);
             break;
@@ -825,6 +825,24 @@ assign_src(struct ra_ctx *ctx, struct ir3_register *src)
    interval->src = false;
 }
 
+static bool
+is_nontrivial_collect(struct ir3_instruction *collect)
+{
+   if (collect->opc != OPC_META_COLLECT) {
+      return false;
+   }
+
+   struct ir3_register *dst = collect->dsts[0];
+
+   foreach_src_n (src, src_n, collect) {
+      if (src->num != dst->num + src_n) {
+         return true;
+      }
+   }
+
+   return false;
+}
+
 static void
 handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
            struct ir3_register *dst)
@@ -861,10 +879,26 @@ handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
       free_space(ctx, physreg, size);
    }
 
+   dst->num = ra_physreg_to_num(physreg, dst->flags);
+
+   /* Non-trivial collects (i.e., ones that will introduce moves because the
+    * sources don't line-up with the destination) may cause source intervals to
+    * get implicitly moved when they are inserted as children of the destination
+    * interval. Since we don't support moving intervals in shared RA, this may
+    * cause illegal register allocations. Prevent this by creating a new
+    * top-level interval for the destination so that the source intervals will
+    * be left alone.
+    */
+   if (is_nontrivial_collect(instr)) {
+      dst->merge_set = NULL;
+      dst->interval_start = ctx->live->interval_offset;
+      dst->interval_end = dst->interval_start + reg_size(dst);
+      ctx->live->interval_offset = dst->interval_end;
+   }
+
    ra_update_affinity(reg_file_size(dst), dst, physreg);
    interval->physreg_start = physreg;
    interval->physreg_end = physreg + reg_size(dst);
-   dst->num = ra_physreg_to_num(physreg, dst->flags);
    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
    d("insert dst %u physreg %u", dst->name, physreg);
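The new helper above treats a collect as non-trivial when any source register is not already sitting at its destination slot, since only then would register allocation have to introduce moves. A standalone sketch of that check with plain integers (illustration only, not the ir3 types):

#include <stdbool.h>
#include <stddef.h>

/* A collect is non-trivial when source i does not already live at
 * dst_base + i, i.e. copies would be needed to gather the vector. */
static bool collect_is_nontrivial(unsigned dst_base, const unsigned *src_regs,
                                  size_t src_count)
{
   for (size_t i = 0; i < src_count; i++) {
      if (src_regs[i] != dst_base + i)
         return true;
   }
   return false;
}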
@@ -2514,6 +2514,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk,
                                         &cmd_buffer->descriptors[i].push_set.layout->vk);
       }
+      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->descriptors[i].push_set.mapped_ptr);
       memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
       cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
       cmd_buffer->descriptors[i].max_sets_bound = 0;
@@ -4630,8 +4631,8 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                       const struct tu_image_view *, pass->attachment_count);
    vk_multialloc_add(&ma, &cmd->state.clear_values, VkClearValue,
                      pRenderPassBegin->clearValueCount);
-   if (!vk_multialloc_alloc(&ma, &cmd->vk.pool->alloc,
-                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
+   if (ma.size && !vk_multialloc_alloc(&ma, &cmd->vk.pool->alloc,
+                                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
       vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
       return;
    }
@@ -61,6 +61,13 @@ etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned
    ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
 }
 
+struct etna_core_npu_info *
+etna_ml_get_core_info(struct etna_context *context) {
+   struct etna_screen *screen = context->screen;
+   struct etna_core_info *info = etna_gpu_get_core_info(screen->npu);
+   return &info->npu;
+}
+
 static bool
 needs_reshuffle(const struct pipe_ml_operation *poperation)
 {
@@ -237,7 +244,7 @@ etna_ml_subgraph_create(struct pipe_context *pcontext,
                         unsigned count)
 {
    struct etna_context *ctx = etna_context(pcontext);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    struct etna_ml_subgraph *subgraph;
    struct list_head operations;
    unsigned tensor_count;
@@ -358,7 +365,7 @@ void
 etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psubgraph, struct pipe_tensor *input)
 {
    struct etna_context *ctx = etna_context(pctx);
-   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
    struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
    struct etna_cmd_stream *stream = ctx->stream;
    static bool is_initialized = false;
@@ -7,7 +7,8 @@
 #define H_ETNA_ML
 
 #include "pipe/p_state.h"
+#include "util/u_inlines.h"
 #include "util/u_dynarray.h"
 #include "etnaviv_context.h"
 
 #define MAX_CONFIG_BOS 4
@@ -94,6 +95,8 @@ unsigned etna_ml_allocate_tensor(struct etna_ml_subgraph *subgraph);
 struct pipe_resource *etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx);
 unsigned etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx);
 
+struct etna_core_npu_info *etna_ml_get_core_info(struct etna_context *context);
+
 struct pipe_ml_subgraph *
 etna_ml_subgraph_create(struct pipe_context *context,
                         const struct pipe_ml_operation *operations,
@@ -515,8 +515,8 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
 static unsigned
 calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
 {
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
-   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
    unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;
@@ -590,8 +590,8 @@ calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *inp
 static unsigned
 calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
 {
-   unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
-   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
+   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
+   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
    unsigned input_width = operation->input_width;
    unsigned input_height = operation->input_height;
    unsigned input_channels = operation->input_channels;
@@ -639,9 +639,9 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned nn_core_version = ctx->screen->specs.nn_core_version;
-   unsigned oc_sram_size = ctx->screen->info->npu.on_chip_sram_size;
+   unsigned oc_sram_size = etna_ml_get_core_info(ctx)->on_chip_sram_size;
    struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                     sizeof(struct etna_nn_params),
                                     DRM_ETNA_GEM_CACHE_WC);
@@ -967,7 +967,7 @@ static unsigned
 write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
    struct pipe_context *pctx = subgraph->base.context;
-   unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -1047,7 +1047,7 @@ static unsigned
 write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
    struct pipe_context *pctx = subgraph->base.context;
-   unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -1134,7 +1134,7 @@ static unsigned
 write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
    struct pipe_context *pctx = subgraph->base.context;
-   unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
    unsigned cores_used = MIN2(output_channels, nn_core_count);
    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
@@ -1221,7 +1221,7 @@ calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_op
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned header_size = ALIGN(nn_core_count * 4, 64);
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
@@ -1245,8 +1245,8 @@ calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operatio
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
-   unsigned max_zrl_bits = ctx->screen->info->npu.nn_zrl_bits;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
+   unsigned max_zrl_bits = etna_ml_get_core_info(ctx)->nn_zrl_bits;
    unsigned header_size = ALIGN(nn_core_count * 4, 64);
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
@@ -1298,7 +1298,7 @@ create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_oper
 {
    struct pipe_context *context = subgraph->base.context;
    struct etna_context *ctx = etna_context(context);
-   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
+   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
    unsigned header_size = ALIGN(nn_core_count * 4, 64);
    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
@@ -394,7 +394,7 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
                         unsigned tp_core, unsigned tp_cores_used)
 {
    struct etna_context *ctx = etna_context(subgraph->base.context);
-   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
    struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                     sizeof(struct etna_tp_params),
                                     DRM_ETNA_GEM_CACHE_WC);
@@ -730,7 +730,7 @@ etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
       instruction->configs[0] = create_detranspose_config(subgraph, operation);
       break;
    case ETNA_ML_TP_RESHUFFLE: {
-      unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+      unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
       unsigned tp_cores_used;
 
       tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;
@@ -756,7 +756,7 @@ etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                           unsigned idx)
 {
    struct etna_context *ctx = etna_context(subgraph->base.context);
-   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
+   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
    struct etna_cmd_stream *stream = ctx->stream;
    bool more_than_one_tp_job = operation->configs[1] != NULL;
    bool parallel = DBG_ENABLED(ETNA_DBG_NPU_PARALLEL);
@@ -141,11 +141,9 @@ nvc0_resource_from_user_memory(struct pipe_screen *pipe,
                                const struct pipe_resource *templ,
                                void *user_memory)
 {
-   ASSERTED struct nouveau_screen *screen = nouveau_screen(pipe);
-
-   assert(screen->has_svm);
-   assert(templ->target == PIPE_BUFFER);
-
+   struct nouveau_screen *screen = nouveau_screen(pipe);
+   if (!screen->has_svm || templ->target != PIPE_BUFFER)
+      return NULL;
    return nouveau_buffer_create_from_user(pipe, templ, user_memory);
 }
 
@@ -649,6 +649,7 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
                         V3D_DIRTY_BLEND |
                         V3D_DIRTY_FRAMEBUFFER |
                         V3D_DIRTY_ZSA |
+                        V3D_DIRTY_OQ |
                         V3D_DIRTY_RASTERIZER |
                         V3D_DIRTY_SAMPLE_STATE |
                         V3D_DIRTY_FRAGTEX |
@@ -677,6 +678,10 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
         }
 
         key->swap_color_rb = v3d->swap_color_rb;
+        key->can_earlyz_with_discard = s->info.fs.uses_discard &&
+                (!v3d->zsa || !job->zsbuf || !v3d->zsa->base.depth_enabled ||
+                 !v3d->zsa->base.depth_writemask) &&
+                !(v3d->active_queries && v3d->current_oq);
 
         for (int i = 0; i < v3d->framebuffer.nr_cbufs; i++) {
                 struct pipe_surface *cbuf = v3d->framebuffer.cbufs[i];
@@ -548,9 +548,27 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
 
    assert(dest.nr < XE2_MAX_GRF);
 
-   if (devinfo->ver >= 10)
-      assert(!(src0.file == IMM &&
-               src2.file == IMM));
+   if (devinfo->ver <= 9) {
+      assert(src0.file != IMM && src2.file != IMM);
+   } else if (devinfo->ver <= 11) {
+      /* On Ice Lake, BFE and CSEL cannot have any immediate sources. */
+      assert((opcode != BRW_OPCODE_BFE && opcode != BRW_OPCODE_CSEL) ||
+             (src0.file != IMM && src2.file != IMM));
+
+      /* On Ice Lake, DP4A and MAD can only have one immediate source. */
+      assert((opcode != BRW_OPCODE_DP4A && opcode != BRW_OPCODE_MAD) ||
+             !(src0.file == IMM && src2.file == IMM));
+   } else {
+      /* Having two immediate sources is allowed, but this should have been
+       * converted to a regular ADD by brw_fs_opt_algebraic.
+       */
+      assert(opcode == BRW_OPCODE_ADD3 ||
+             !(src0.file == IMM && src2.file == IMM));
+   }
+
+   /* BFI2 cannot have any immediate sources on any platform. */
+   assert(opcode != BRW_OPCODE_BFI2 ||
+          (src0.file != IMM && src2.file != IMM));
 
    assert(src0.file == IMM || src0.nr < XE2_MAX_GRF);
    assert(src1.file != IMM && src1.nr < XE2_MAX_GRF);
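Restating the immediate-source rules the new assertions encode, as a standalone predicate (a sketch for illustration, not the brw encoder itself): no immediates at all before Gfx10; on Gfx10/Gfx11, BFE and CSEL take none and DP4A/MAD at most one; on newer parts two immediates are only expected for ADD3; and BFI2 never takes immediates.

#include <stdbool.h>

static bool alu3_imm_srcs_allowed(int gfx_ver, bool is_bfe_or_csel,
                                  bool is_dp4a_or_mad, bool is_add3,
                                  bool is_bfi2, bool src0_imm, bool src2_imm)
{
   if (is_bfi2 && (src0_imm || src2_imm))
      return false;                      /* BFI2: no immediates anywhere */
   if (gfx_ver <= 9)
      return !src0_imm && !src2_imm;     /* no immediate sources at all */
   if (gfx_ver <= 11) {
      if (is_bfe_or_csel && (src0_imm || src2_imm))
         return false;                   /* BFE/CSEL: no immediates */
      return !(is_dp4a_or_mad && src0_imm && src2_imm); /* at most one */
   }
   return is_add3 || !(src0_imm && src2_imm); /* two immediates only for ADD3 */
}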
@@ -825,9 +825,8 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
     * destination of the copy, and simply replacing the sources would give a
     * program with different semantics.
     */
-   if ((brw_type_size_bits(entry->dst.type) < brw_type_size_bits(inst->src[arg].type) ||
-        entry->is_partial_write) &&
-       inst->opcode != BRW_OPCODE_MOV) {
+   if (brw_type_size_bits(entry->dst.type) < brw_type_size_bits(inst->src[arg].type) ||
+       (entry->is_partial_write && inst->opcode != BRW_OPCODE_MOV)) {
       return false;
    }
 
@@ -1506,8 +1505,7 @@ try_copy_propagate_def(const brw_compiler *compiler,
     * destination of the copy, and simply replacing the sources would give a
     * program with different semantics.
     */
-   if (inst->opcode != BRW_OPCODE_MOV &&
-       brw_type_size_bits(def->dst.type) <
+   if (brw_type_size_bits(def->dst.type) <
        brw_type_size_bits(inst->src[arg].type))
       return false;
 
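The condition rewrite above changes which copies are rejected: previously a narrowing copy slipped through whenever the consuming instruction was a MOV, now narrowing always rejects and only the partial-write case is forgiven for MOVs. A small boolean sketch of the two forms (illustrative only):

#include <stdbool.h>

/* Old form: (narrower || partial) && !is_mov */
static bool reject_old(bool narrower, bool partial, bool is_mov)
{
   return (narrower || partial) && !is_mov;
}

/* New form: narrower || (partial && !is_mov) */
static bool reject_new(bool narrower, bool partial, bool is_mov)
{
   return narrower || (partial && !is_mov);
}

/* They differ for a narrowing copy feeding a MOV:
 * reject_old(true, false, true) == false, reject_new(true, false, true) == true. */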
|
@@ -475,6 +475,19 @@ brw_fs_opt_cse_defs(fs_visitor &s)
|
||||
assert(ops_must_match);
|
||||
}
|
||||
|
||||
/* Some later instruction could depend on the flags written by
|
||||
* this instruction. It can only be removed if the previous
|
||||
* instruction that write the flags is identical.
|
||||
*/
|
||||
if (inst->flags_written(devinfo)) {
|
||||
bool ignored;
|
||||
|
||||
if (last_flag_write == NULL ||
|
||||
!instructions_match(last_flag_write, inst, &ignored)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
progress = true;
|
||||
need_remaps = true;
|
||||
remap_table[inst->dst.nr] =
|
||||
|
@@ -159,7 +159,7 @@ get_ray_query_shadow_addr(nir_builder *b,
                nir_imul(
                   b,
                   brw_load_btd_dss_id(b),
-                  brw_nir_rt_load_num_simd_lanes_per_dss(b, state->devinfo)),
+                  state->globals.num_dss_rt_stacks),
                brw_nir_rt_sync_stack_id(b)),
          BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
 
@@ -232,7 +232,8 @@ lower_ray_query_intrinsic(nir_builder *b,
    nir_def *shadow_stack_addr =
       get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
    nir_def *hw_stack_addr =
-      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr, state->devinfo);
+      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
+                                 state->globals.num_dss_rt_stacks);
    nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
 
    switch (intrin->intrinsic) {
@@ -74,15 +74,6 @@ brw_load_btd_dss_id(nir_builder *b)
    return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_DSS);
 }
 
-static inline nir_def *
-brw_nir_rt_load_num_simd_lanes_per_dss(nir_builder *b,
-                                       const struct intel_device_info *devinfo)
-{
-   return nir_imm_int(b, devinfo->num_thread_per_eu *
-                         devinfo->max_eus_per_subslice *
-                         16 /* The RT computation is based off SIMD16 */);
-}
-
 static inline nir_def *
 brw_load_eu_thread_simd(nir_builder *b)
 {
@@ -187,23 +178,27 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           const struct intel_device_info *devinfo)
+                           nir_def *num_dss_rt_stacks)
 {
-   /* For Ray queries (Synchronous Ray Tracing), the formula is similar but
-    * goes down from rtMemBasePtr :
+   /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
+    *    For Ray queries (Synchronous Ray Tracing), the formula is similar but
+    *    goes down from rtMemBasePtr :
     *
-    *  syncBase  = RTDispatchGlobals.rtMemBasePtr
-    *            - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
-    *            * syncStackSize
+    *    syncBase  = RTDispatchGlobals.rtMemBasePtr
+    *              - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
+    *              * syncStackSize
     *
-    * We assume that we can calculate a 32-bit offset first and then add it
-    * to the 64-bit base address at the end.
+    *    We assume that we can calculate a 32-bit offset first and then add it
+    *    to the 64-bit base address at the end.
+    *
+    * However, on HSD 14020275151 it's clarified that the HW uses
+    * NUM_SYNC_STACKID_PER_DSS instead.
    */
    nir_def *offset32 =
       nir_imul(b,
               nir_iadd(b,
                        nir_imul(b, brw_load_btd_dss_id(b),
-                                brw_nir_rt_load_num_simd_lanes_per_dss(b, devinfo)),
+                                num_dss_rt_stacks),
                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
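The quoted Bspec formula places each synchronous ray-query stack below rtMemBasePtr. A scalar sketch of the same arithmetic, useful for checking the address math on the CPU (assumed helper names, not driver code; the per-DSS stack count stands in for the NUM_SYNC_STACKID_PER_DSS value mentioned in the HSD note):

#include <stdint.h>

static uint64_t sync_stack_base(uint64_t rt_mem_base_ptr, uint32_t dss_id,
                                uint32_t stacks_per_dss, uint32_t sync_stack_id,
                                uint32_t sync_stack_size)
{
   /* syncBase = rtMemBasePtr - (DSSID * stacks_per_dss + SyncStackID + 1) * syncStackSize */
   uint32_t offset32 = (dss_id * stacks_per_dss + sync_stack_id + 1) * sync_stack_size;
   return rt_mem_base_ptr - offset32;   /* stacks grow downward from the base */
}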
@@ -452,10 +452,8 @@ anv_CopyImageToImageEXT(
             .y = dst_offset_el.y + y_el,
          };
          VkExtent3D extent = {
-            .width = MIN2(extent_el.width - src_offset.x,
-                          tile_width_el),
-            .height = MIN2(extent_el.height - src_offset.y,
-                           tile_height_el),
+            .width = MIN2(extent_el.width - x_el, tile_width_el),
+            .height = MIN2(extent_el.height - y_el, tile_height_el),
             .depth = 1,
          };
 
@@ -57,6 +57,72 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
    va_end(args);
 }
 
+struct anv_descriptor_limits {
+   uint32_t max_ubos;
+   uint32_t max_ssbos;
+   uint32_t max_samplers;
+   uint32_t max_images;
+   uint32_t max_resources;
+};
+
+static void
+get_device_descriptor_limits(const struct anv_physical_device *device,
+                             struct anv_descriptor_limits *limits)
+{
+   memset(limits, 0, sizeof(*limits));
+
+   /* It's a bit hard to exactly map our implementation to the limits
+    * described by Vulkan. The bindless surface handle in the extended message
+    * descriptors is 20 bits on <= Gfx12.0, 26 bits on >= Gfx12.5 and it's an
+    * index into the table of RENDER_SURFACE_STATE structs that starts at
+    * bindless surface base address. On <= Gfx12.0, this means that we can
+    * have at must 1M surface states allocated at any given time. Since most
+    * image views take two descriptors, this means we have a limit of about
+    * 500K image views. On >= Gfx12.5, we do not need 2 surfaces per
+    * descriptors and we have 33M+ descriptors (we have a 2GB limit, due to
+    * overlapping heaps for workarounds, but HW can do 4GB).
+    *
+    * However, on <= Gfx12.0, since we allocate surface states at
+    * vkCreateImageView time, this means our limit is actually something on
+    * the order of 500K image views allocated at any time. The actual limit
+    * describe by Vulkan, on the other hand, is a limit of how many you can
+    * have in a descriptor set. Assuming anyone using 1M descriptors will be
+    * using the same image view twice a bunch of times (or a bunch of null
+    * descriptors), we can safely advertise a larger limit here.
+    *
+    * Here we use the size of the heap in which the descriptors are stored and
+    * divide by the size of the descriptor to get a limit value.
+    */
+   const uint64_t descriptor_heap_size =
+      device->indirect_descriptors ?
+      device->va.indirect_descriptor_pool.size :
+      device->va.bindless_surface_state_pool.size;;
+
+   const uint32_t buffer_descriptor_size =
+      device->indirect_descriptors ?
+      sizeof(struct anv_address_range_descriptor) :
+      ANV_SURFACE_STATE_SIZE;
+   const uint32_t image_descriptor_size =
+      device->indirect_descriptors ?
+      sizeof(struct anv_address_range_descriptor) :
+      ANV_SURFACE_STATE_SIZE;
+   const uint32_t sampler_descriptor_size =
+      device->indirect_descriptors ?
+      sizeof(struct anv_sampled_image_descriptor) :
+      ANV_SAMPLER_STATE_SIZE;
+
+   limits->max_ubos = descriptor_heap_size / buffer_descriptor_size;
+   limits->max_ssbos = descriptor_heap_size / buffer_descriptor_size;
+   limits->max_images = descriptor_heap_size / image_descriptor_size;
+   limits->max_samplers = descriptor_heap_size / sampler_descriptor_size;
+
+   limits->max_resources = UINT32_MAX;
+   limits->max_resources = MIN2(limits->max_resources, limits->max_ubos);
+   limits->max_resources = MIN2(limits->max_resources, limits->max_ssbos);
+   limits->max_resources = MIN2(limits->max_resources, limits->max_images);
+   limits->max_resources = MIN2(limits->max_resources, limits->max_samplers);
+}
+
 static void
 get_device_extensions(const struct anv_physical_device *device,
                       struct vk_device_extension_table *ext)
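The helper above derives every advertised limit from one division, heap size over descriptor size, then takes the minimum across descriptor types for the combined resource limit. A worked example with made-up numbers (the real sizes come from the device):

#include <stdint.h>

static uint32_t example_descriptor_limits(void)
{
   const uint64_t heap_size = 1ull << 30;  /* assume a 1 GiB descriptor heap */
   const uint32_t image_desc_size = 64;    /* assumed sizes, for illustration only */
   const uint32_t sampler_desc_size = 32;

   uint64_t max_images = heap_size / image_desc_size;      /* 16777216 */
   uint64_t max_samplers = heap_size / sampler_desc_size;  /* 33554432 */

   /* The combined "resources" limit is the smallest of the per-type limits. */
   return (uint32_t)(max_images < max_samplers ? max_images : max_samplers);
}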
@@ -972,25 +1038,10 @@ get_properties_1_2(const struct anv_physical_device *pdevice,
    p->shaderRoundingModeRTZFloat64 = true;
    p->shaderSignedZeroInfNanPreserveFloat64 = true;
 
-   /* It's a bit hard to exactly map our implementation to the limits
-    * described by Vulkan. The bindless surface handle in the extended
-    * message descriptors is 20 bits and it's an index into the table of
-    * RENDER_SURFACE_STATE structs that starts at bindless surface base
-    * address. This means that we can have at must 1M surface states
-    * allocated at any given time. Since most image views take two
-    * descriptors, this means we have a limit of about 500K image views.
-    *
-    * However, since we allocate surface states at vkCreateImageView time,
-    * this means our limit is actually something on the order of 500K image
-    * views allocated at any time. The actual limit describe by Vulkan, on
-    * the other hand, is a limit of how many you can have in a descriptor set.
-    * Assuming anyone using 1M descriptors will be using the same image view
-    * twice a bunch of times (or a bunch of null descriptors), we can safely
-    * advertise a larger limit here.
-    */
-   const unsigned max_bindless_views =
-      anv_physical_device_bindless_heap_size(pdevice, false) / ANV_SURFACE_STATE_SIZE;
-   p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
+   struct anv_descriptor_limits desc_limits;
+   get_device_descriptor_limits(pdevice, &desc_limits);
+
+   p->maxUpdateAfterBindDescriptorsInAllPools = desc_limits.max_resources;
    p->shaderUniformBufferArrayNonUniformIndexingNative = false;
    p->shaderSampledImageArrayNonUniformIndexingNative = false;
    p->shaderStorageBufferArrayNonUniformIndexingNative = true;
@@ -998,20 +1049,20 @@ get_properties_1_2(const struct anv_physical_device *pdevice,
    p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
    p->robustBufferAccessUpdateAfterBind = true;
    p->quadDivergentImplicitLod = false;
-   p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
-   p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
-   p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
-   p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
-   p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
+   p->maxPerStageDescriptorUpdateAfterBindSamplers = desc_limits.max_samplers;
+   p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = desc_limits.max_ubos;
+   p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = desc_limits.max_ssbos;
+   p->maxPerStageDescriptorUpdateAfterBindSampledImages = desc_limits.max_images;
+   p->maxPerStageDescriptorUpdateAfterBindStorageImages = desc_limits.max_images;
    p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
-   p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
-   p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
-   p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+   p->maxPerStageUpdateAfterBindResources = desc_limits.max_resources;
+   p->maxDescriptorSetUpdateAfterBindSamplers = desc_limits.max_samplers;
+   p->maxDescriptorSetUpdateAfterBindUniformBuffers = desc_limits.max_ubos;
    p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
-   p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
+   p->maxDescriptorSetUpdateAfterBindStorageBuffers = desc_limits.max_ssbos;
    p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
-   p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
-   p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
+   p->maxDescriptorSetUpdateAfterBindSampledImages = desc_limits.max_images;
+   p->maxDescriptorSetUpdateAfterBindStorageImages = desc_limits.max_images;
    p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
 
    /* We support all of the depth resolve modes */
@@ -1125,15 +1176,8 @@ get_properties(const struct anv_physical_device *pdevice,
 
    const struct intel_device_info *devinfo = &pdevice->info;
 
-   const uint32_t max_ssbos = UINT16_MAX;
-   const uint32_t max_textures = UINT16_MAX;
-   const uint32_t max_samplers = UINT16_MAX;
-   const uint32_t max_images = UINT16_MAX;
    const VkDeviceSize max_heap_size = anx_get_physical_device_max_heap_size(pdevice);
 
-   /* Claim a high per-stage limit since we have bindless. */
-   const uint32_t max_per_stage = UINT32_MAX;
-
    const uint32_t max_workgroup_size =
       MIN2(1024, 32 * devinfo->max_cs_workgroup_threads);
 
@@ -1158,6 +1202,9 @@ get_properties(const struct anv_physical_device *pdevice,
    }
 #endif /* DETECT_OS_ANDROID */
 
+   struct anv_descriptor_limits desc_limits;
+   get_device_descriptor_limits(pdevice, &desc_limits);
+
    *props = (struct vk_properties) {
       .apiVersion = ANV_API_VERSION,
       .driverVersion = vk_get_driver_version(),
@@ -1183,20 +1230,20 @@ get_properties(const struct anv_physical_device *pdevice,
      .bufferImageGranularity = 1,
      .sparseAddressSpaceSize = sparse_addr_space_size,
      .maxBoundDescriptorSets = MAX_SETS,
-      .maxPerStageDescriptorSamplers = max_samplers,
-      .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
-      .maxPerStageDescriptorStorageBuffers = max_ssbos,
-      .maxPerStageDescriptorSampledImages = max_textures,
-      .maxPerStageDescriptorStorageImages = max_images,
+      .maxPerStageDescriptorSamplers = desc_limits.max_samplers,
+      .maxPerStageDescriptorUniformBuffers = desc_limits.max_ubos,
+      .maxPerStageDescriptorStorageBuffers = desc_limits.max_ssbos,
+      .maxPerStageDescriptorSampledImages = desc_limits.max_images,
+      .maxPerStageDescriptorStorageImages = desc_limits.max_images,
      .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
-      .maxPerStageResources = max_per_stage,
-      .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
-      .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
+      .maxPerStageResources = desc_limits.max_resources,
+      .maxDescriptorSetSamplers = desc_limits.max_samplers,
+      .maxDescriptorSetUniformBuffers = desc_limits.max_ubos,
      .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
-      .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
+      .maxDescriptorSetStorageBuffers = desc_limits.max_ssbos,
      .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
-      .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
-      .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
+      .maxDescriptorSetSampledImages = desc_limits.max_images,
+      .maxDescriptorSetStorageImages = desc_limits.max_images,
      .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
      .maxVertexInputAttributes = MAX_VES,
      .maxVertexInputBindings = MAX_VBS,
@@ -1227,7 +1274,8 @@ get_properties(const struct anv_physical_device *pdevice,
      .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
      .maxFragmentOutputAttachments = 8,
      .maxFragmentDualSrcAttachments = 1,
-      .maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images,
+      .maxFragmentCombinedOutputResources = MAX_RTS + desc_limits.max_ssbos +
+                                            desc_limits.max_images,
      .maxComputeSharedMemorySize = intel_device_info_get_max_slm_size(&pdevice->info),
      .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
      .maxComputeWorkGroupInvocations = max_workgroup_size,
@@ -3035,7 +3035,7 @@ struct anv_descriptor_pool {
     */
    bool host_only;
 
-   char host_mem[0];
+   alignas(8) char host_mem[0];
 };
 
 bool
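The alignas(8) matters because pool bookkeeping is suballocated straight out of the trailing host_mem storage, so its offset inside the struct must be 8-byte aligned. A minimal sketch showing the effect on layout (hypothetical structs, not the anv ones):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

struct pool_before { unsigned count; char host_mem[]; };
struct pool_after  { unsigned count; alignas(8) char host_mem[]; };

int main(void)
{
   /* Without the specifier the trailing member typically starts at offset 4;
    * with it the compiler pads the struct so it starts at offset 8. */
   printf("before: %zu, after: %zu\n",
          offsetof(struct pool_before, host_mem),
          offsetof(struct pool_after, host_mem));
   return 0;
}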
@@ -3151,7 +3151,7 @@ Converter::visit(nir_tex_instr *insn)
       if (lodIdx != -1 && !target.isMS())
          srcs.push_back(getSrc(&insn->src[lodIdx].src, 0));
       else if (op == OP_TXQ)
-         srcs.push_back(zero); // TXQ always needs an LOD
+         srcs.push_back(loadImm(NULL, 0)); // TXQ always needs an LOD
       else if (op == OP_TXF)
         lz = true;
      if (msIdx != -1)
@@ -643,12 +643,8 @@ nvk_cmd_dirty_cbufs_for_descriptors(struct nvk_cmd_buffer *cmd,
 
       case NVK_CBUF_TYPE_DESC_SET:
      case NVK_CBUF_TYPE_UBO_DESC:
-         if (cbuf->desc_set >= sets_start && cbuf->desc_set < sets_end)
-            group->dirty |= BITFIELD_BIT(i);
-         break;
-
      case NVK_CBUF_TYPE_DYNAMIC_UBO:
-         if (cbuf->dynamic_idx >= dyn_start && cbuf->dynamic_idx < dyn_end)
+         if (cbuf->desc_set >= sets_start && cbuf->desc_set < sets_end)
            group->dirty |= BITFIELD_BIT(i);
         break;
 
@@ -749,7 +745,7 @@ nvk_bind_descriptor_sets(struct nvk_cmd_buffer *cmd,
   assert(next_dyn_offset <= info->dynamicOffsetCount);
 
   nvk_descriptor_state_set_root_array(cmd, desc, dynamic_buffers,
-                                       dyn_buffer_start, dyn_buffer_end,
+                                       dyn_buffer_start, dyn_buffer_end - dyn_buffer_start,
                                       &dynamic_buffers[dyn_buffer_start]);
 
   /* We need to set everything above first_set because later calls to
@@ -103,8 +103,9 @@ struct nvk_descriptor_state {
      const struct nvk_root_descriptor_table *root = \
         (const struct nvk_root_descriptor_table *)(desc)->root; \
      unsigned _start = start; \
-      assert(_start + count <= ARRAY_SIZE(root->member)); \
-      for (unsigned i = 0; i < count; i++) \
+      unsigned _count = count; \
+      assert(_start + _count <= ARRAY_SIZE(root->member)); \
+      for (unsigned i = 0; i < _count; i++) \
         (dst)[i] = root->member[i + _start]; \
   } while (0)
 
@@ -125,13 +126,14 @@ struct nvk_descriptor_state {
      struct nvk_root_descriptor_table *root = \
         (struct nvk_root_descriptor_table *)_desc->root; \
      unsigned _start = start; \
-      assert(_start + count <= ARRAY_SIZE(root->member)); \
-      for (unsigned i = 0; i < count; i++) \
+      unsigned _count = count; \
+      assert(_start + _count <= ARRAY_SIZE(root->member)); \
+      for (unsigned i = 0; i < _count; i++) \
         root->member[i + _start] = (src)[i]; \
      if (_desc->flush_root != NULL) { \
         size_t offset = (char *)&root->member[_start] - (char *)root; \
         _desc->flush_root((cmd), _desc, offset, \
-                           count * sizeof(root->member[0])); \
+                           _count * sizeof(root->member[0])); \
      } \
   } while (0)
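The macro change above is classic macro hygiene: count is captured into a local _count so the argument is evaluated exactly once, even if the caller passes an expression such as dyn_buffer_end - dyn_buffer_start or something with side effects, and so the loop bound and the flush size cannot disagree. A self-contained illustration of the pitfall (not the nvk macro itself):

#include <stdio.h>

#define SUM_BAD(dst, arr, count)                 \
   do {                                          \
      for (unsigned i = 0; i < (count); i++)     \
         (dst) += (arr)[i];                      \
   } while (0)

#define SUM_GOOD(dst, arr, count)                \
   do {                                          \
      unsigned _count = (count);                 \
      for (unsigned i = 0; i < _count; i++)      \
         (dst) += (arr)[i];                      \
   } while (0)

int main(void)
{
   int data[4] = {1, 2, 3, 4};
   unsigned n;
   int bad = 0, good = 0;

   n = 4;
   SUM_BAD(bad, data, n--);   /* n-- re-evaluated every iteration: sums only part of the array */
   n = 4;
   SUM_GOOD(good, data, n--); /* evaluated once: sums all four elements */

   printf("bad=%d good=%d\n", bad, good);  /* prints bad=3 good=10 */
   return 0;
}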
@@ -71,6 +71,10 @@ panvk_per_arch(cmd_meta_compute_end)(
 
    cmdbuf->state.compute.shader = save_ctx->cs.shader;
    cmdbuf->state.compute.cs.desc = save_ctx->cs.desc;
+
+#if PAN_ARCH >= 9
+   cmdbuf->state.compute.cs.desc.res_table = 0;
+#endif
 }
 
 void
@@ -136,6 +140,9 @@ panvk_per_arch(cmd_meta_gfx_end)(
    cmdbuf->state.gfx.vs.attribs = 0;
    cmdbuf->state.gfx.vs.attrib_bufs = 0;
    cmdbuf->state.gfx.fs.rsd = 0;
+#else
+   cmdbuf->state.gfx.fs.desc.res_table = 0;
+   cmdbuf->state.gfx.vs.desc.res_table = 0;
 #endif
 
    cmdbuf->vk.dynamic_graphics_state = save_ctx->dyn_state.all;
@@ -25,11 +25,12 @@
 #define _UTIL_PERFETTO_H
 
 #include "util/u_atomic.h"
+#include "util/detect_os.h"
 
 // On Unix, pass a clockid_t to designate which clock was used to gather the timestamp
 // On Windows, this paramter is ignored, and it's expected that `timestamp` comes from QueryPerformanceCounter
-#ifndef _WIN32
-#include <sys/types.h>
+#if DETECT_OS_POSIX
+#include <time.h>
 typedef clockid_t perfetto_clock_id;
 #else
 typedef int32_t perfetto_clock_id;
@@ -1149,41 +1149,47 @@ wsi_wl_surface_get_support(VkIcdSurfaceBase *surface,
    return VK_SUCCESS;
 }
 
+/* For true mailbox mode, we need at least 4 images:
+ * 1) One to scan out from
+ * 2) One to have queued for scan-out
+ * 3) One to be currently held by the Wayland compositor
+ * 4) One to render to
+ */
+#define WSI_WL_BUMPED_NUM_IMAGES 4
+
+/* Catch-all. 3 images is a sound default for everything except MAILBOX. */
+#define WSI_WL_DEFAULT_NUM_IMAGES 3
+
 static uint32_t
 wsi_wl_surface_get_min_image_count(struct wsi_wl_display *display,
                                    const VkSurfacePresentModeEXT *present_mode)
 {
+   /* With legacy frame callback mechanism, report 4 images by default, unless
+    * EXT_surface_maintenance1 query is used to ask explicitly for FIFO. */
    if (present_mode && (present_mode->presentMode == VK_PRESENT_MODE_FIFO_KHR ||
                         present_mode->presentMode == VK_PRESENT_MODE_FIFO_RELAXED_KHR)) {
-      if (display->fifo_manager) {
-         /* When FIFO protocol is supported, applications will no longer block
-          * in QueuePresentKHR due to frame callback, so returning 4 images
-          * for a FIFO swapchain is problematic due to excessive latency. This
-          * latency can only be limited through means of presentWait which few
-          * applications use.
-          * 2 images are enough for forward progress, but 3 is used here
-          * because 2 could result in waiting for the compositor to remove an
-          * old image from scanout when we'd like to be rendering.
-          */
-         return 3;
-      }
-
       /* If we receive a FIFO present mode, only 2 images is required for forward progress.
        * Performance with 2 images will be questionable, but we only allow it for applications
        * using the new API, so we don't risk breaking any existing apps this way.
        * Other ICDs expose 2 images here already. */
       return 2;
-   } else {
-      /* For true mailbox mode, we need at least 4 images:
-       * 1) One to scan out from
-       * 2) One to have queued for scan-out
-       * 3) One to be currently held by the Wayland compositor
-       * 4) One to render to
-       */
-      return 4;
+   }
+
+   if (present_mode) {
+      return present_mode->presentMode == VK_PRESENT_MODE_MAILBOX_KHR ?
+             WSI_WL_BUMPED_NUM_IMAGES : WSI_WL_DEFAULT_NUM_IMAGES;
    }
+
+   /* If explicit present_mode is not being queried, we need to provide a safe "catch-all"
+    * which can work for any presentation mode. Implementations are allowed to bump the minImageCount
+    * on swapchain creation, so this limit should be the lowest value which can guarantee forward progress. */
+
+   /* When FIFO protocol is not supported, we always returned 4 here,
+    * despite it going against the spirit of minImageCount in the specification.
+    * To avoid any unforeseen breakage, just keep using the same values we always have.
+    * In this path, we also never consider bumping the image count in minImageCount in swapchain creation time. */
+
+   /* When FIFO protocol is supported, applications will no longer block
+    * in QueuePresentKHR due to frame callback, so returning 4 images
+    * for a FIFO swapchain is deeply problematic due to excessive latency.
+    * This latency can only be limited through means of presentWait which few applications use, and we cannot
+    * mandate that shipping applications are rewritten to avoid a regression.
+    * 2 images are enough for forward progress in FIFO, but 3 is used here as a pragmatic decision
+    * because 2 could result in waiting for the compositor to remove an
+    * old image from scanout when we'd like to be rendering,
+    * and we don't want naively written applications to head into poor performance territory by default.
+    * X11 backend has very similar logic and rationale here.
+    */
+   return display->fifo_manager ? WSI_WL_DEFAULT_NUM_IMAGES : WSI_WL_BUMPED_NUM_IMAGES;
 }
 
 static VkResult
@@ -2760,9 +2766,10 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
       old_chain->retired = true;
    }
 
-   int num_images = pCreateInfo->minImageCount;
-
-   size_t size = sizeof(*chain) + num_images * sizeof(chain->images[0]);
+   /* We need to allocate the chain handle early, since display initialization code relies on it.
+    * We do not know the actual image count until we have initialized the display handle,
+    * so allocate conservatively in case we need to bump the image count. */
+   size_t size = sizeof(*chain) + MAX2(WSI_WL_BUMPED_NUM_IMAGES, pCreateInfo->minImageCount) * sizeof(chain->images[0]);
    chain = vk_zalloc(pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (chain == NULL)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -2802,6 +2809,31 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
    if (result != VK_SUCCESS)
       goto fail;
 
+   uint32_t num_images = pCreateInfo->minImageCount;
+
+   /* If app provides a present mode list from EXT_swapchain_maintenance1,
+    * we don't know which present mode will be used.
+    * Application is assumed to be well-behaved and be spec-compliant.
+    * It needs to query all per-present mode minImageCounts individually and use the max() of those modes,
+    * so there should never be any need to bump image counts. */
+   bool uses_present_mode_group = vk_find_struct_const(
+      pCreateInfo->pNext, SWAPCHAIN_PRESENT_MODES_CREATE_INFO_EXT) != NULL;
+
+   /* If FIFO manager is not used, minImageCount is already the bumped value for reasons outlined in
+    * wsi_wl_surface_get_min_image_count(), so skip any attempt to bump the counts. */
+   if (wsi_wl_surface->display->fifo_manager && !uses_present_mode_group) {
+      /* With proper FIFO, we return a lower minImageCount to make FIFO viable without requiring the use of KHR_present_wait.
+       * The image count for MAILBOX should be bumped for performance reasons in this case.
+       * This matches strategy for X11. */
+      const VkSurfacePresentModeEXT mode =
+         { VK_STRUCTURE_TYPE_SURFACE_PRESENT_MODE_EXT, NULL, pCreateInfo->presentMode };
+
+      uint32_t min_images = wsi_wl_surface_get_min_image_count(wsi_wl_surface->display, &mode);
+      bool requires_image_count_bump = min_images == WSI_WL_BUMPED_NUM_IMAGES;
+      if (requires_image_count_bump)
+         num_images = MAX2(min_images, num_images);
+   }
+
    VkPresentModeKHR present_mode = wsi_swapchain_get_present_mode(wsi_device, pCreateInfo);
    if (present_mode == VK_PRESENT_MODE_IMMEDIATE_KHR) {
       chain->tearing_control =
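Putting the two new pieces together, the swapchain only bumps the application's requested image count when the FIFO-protocol path reported the lower default and the application did not already use the EXT_swapchain_maintenance1 present-mode group. A compact sketch of that decision as I read the added code (invented names, illustration only):

#include <stdbool.h>

#define BUMPED_NUM_IMAGES  4u
#define DEFAULT_NUM_IMAGES 3u

static unsigned choose_image_count(unsigned requested_min, unsigned surface_min,
                                   bool has_fifo_protocol, bool uses_present_mode_group)
{
   /* Without the FIFO protocol the surface already reports the bumped value,
    * and a spec-compliant present-mode group query needs no help either. */
   if (!has_fifo_protocol || uses_present_mode_group)
      return requested_min;

   /* Only bump when the per-mode minimum is the MAILBOX-style bumped count. */
   if (surface_min == BUMPED_NUM_IMAGES && requested_min < surface_min)
      return surface_min;
   return requested_min;
}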