Compare commits
65 Commits
mesa-23.0.
...
mesa-19.0.
Author | SHA1 | Date | |
---|---|---|---|
|
838baab472 | ||
|
d8534f931c | ||
|
1f33f3cf3a | ||
|
fbcd1ad42c | ||
|
9a5c8d2aab | ||
|
c55008e5a0 | ||
|
ab585817e6 | ||
|
75bec50c2a | ||
|
62b3bd8cd1 | ||
|
fb3485bc92 | ||
|
2a97a3a8e7 | ||
|
ab70eccc75 | ||
|
24bb2771b6 | ||
|
7b5e0f8316 | ||
|
77102d0151 | ||
|
c96d433105 | ||
|
81810fa5db | ||
|
c5b9774eb4 | ||
|
a08aba86da | ||
|
d278b3c187 | ||
|
5a9b7bce9c | ||
|
b9e5e15f87 | ||
|
f305135e0b | ||
|
eb766a259e | ||
|
a1ae60e9a3 | ||
|
37ade3a566 | ||
|
92fa6d6959 | ||
|
5e85df1cfd | ||
|
e9dc4e252f | ||
|
56a47e3421 | ||
|
ca36eb12fd | ||
|
9dd433dfa7 | ||
|
f59c77ef8c | ||
|
61c22ba94b | ||
|
ad2b712a56 | ||
|
07e299a0a0 | ||
|
36d99d9ad0 | ||
|
94f0908216 | ||
|
f880c74717 | ||
|
6f36d3bbc0 | ||
|
b4e8a3294c | ||
|
ef6809ba88 | ||
|
7254d2f4a3 | ||
|
dbc43e3897 | ||
|
262fd16b99 | ||
|
452f9b9984 | ||
|
131f12d49f | ||
|
f8f68c41a1 | ||
|
15e2fc16e9 | ||
|
3f5099180d | ||
|
9667d89fe6 | ||
|
c6649ca94d | ||
|
89f84f98e0 | ||
|
c824f8031c | ||
|
7fdb08375f | ||
|
535cc4f1d5 | ||
|
7f91ae20b9 | ||
|
0a72505a9e | ||
|
31d0079a20 | ||
|
4d1dd3b0cd | ||
|
45d1aa2f6c | ||
|
2fddad9e3f | ||
|
2b603ee4f1 | ||
|
e7f6a5d17f | ||
|
1f5f12687f |
@@ -22,6 +22,7 @@
|
||||
SUBDIRS = src
|
||||
|
||||
AM_DISTCHECK_CONFIGURE_FLAGS = \
|
||||
--enable-autotools \
|
||||
--enable-dri \
|
||||
--enable-dri3 \
|
||||
--enable-egl \
|
||||
|
3
bin/.cherry-ignore
Normal file
3
bin/.cherry-ignore
Normal file
@@ -0,0 +1,3 @@
|
||||
# Both of these were already merged with different shas
|
||||
da48cba61ef6fefb799bf96e6364b70dbf4ec712
|
||||
c812c740e60c14060eb89db66039111881a0f42f
|
@@ -122,7 +122,7 @@ LLVM_REQUIRED_OPENCL=3.9.0
|
||||
LLVM_REQUIRED_R600=3.9.0
|
||||
LLVM_REQUIRED_RADEONSI=7.0.0
|
||||
LLVM_REQUIRED_RADV=7.0.0
|
||||
LLVM_REQUIRED_SWR=6.0.0
|
||||
LLVM_REQUIRED_SWR=7.0.0
|
||||
|
||||
dnl Check for progs
|
||||
AC_PROG_CPP
|
||||
@@ -2845,8 +2845,8 @@ if test -n "$with_gallium_drivers"; then
|
||||
fi
|
||||
|
||||
# XXX: Keep in sync with LLVM_REQUIRED_SWR
|
||||
AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x6.0.0 -a \
|
||||
"x$LLVM_VERSION" != x6.0.1)
|
||||
AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x7.0.0 -a \
|
||||
"x$LLVM_VERSION" != x7.0.1)
|
||||
|
||||
if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then
|
||||
llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium"
|
||||
|
@@ -1400,7 +1400,7 @@ if with_platform_x11
|
||||
dep_xcb_xfixes = dependency('xcb-xfixes')
|
||||
endif
|
||||
if with_xlib_lease
|
||||
dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12')
|
||||
dep_xcb_xrandr = dependency('xcb-randr')
|
||||
dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3')
|
||||
endif
|
||||
endif
|
||||
|
@@ -923,6 +923,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
|
||||
ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
|
||||
}
|
||||
|
||||
LLVMValueRef
|
||||
ac_build_gep_ptr(struct ac_llvm_context *ctx,
|
||||
LLVMValueRef base_ptr,
|
||||
LLVMValueRef index)
|
||||
{
|
||||
return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
|
||||
}
|
||||
|
||||
LLVMValueRef
|
||||
ac_build_gep0(struct ac_llvm_context *ctx,
|
||||
LLVMValueRef base_ptr,
|
||||
|
@@ -223,6 +223,11 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
|
||||
LLVMValueRef attr_number,
|
||||
LLVMValueRef params);
|
||||
|
||||
LLVMValueRef
|
||||
ac_build_gep_ptr(struct ac_llvm_context *ctx,
|
||||
LLVMValueRef base_ptr,
|
||||
LLVMValueRef index);
|
||||
|
||||
LLVMValueRef
|
||||
ac_build_gep0(struct ac_llvm_context *ctx,
|
||||
LLVMValueRef base_ptr,
|
||||
|
@@ -2006,18 +2006,23 @@ static void
|
||||
visit_store_var(struct ac_nir_context *ctx,
|
||||
nir_intrinsic_instr *instr)
|
||||
{
|
||||
nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
|
||||
nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
|
||||
nir_variable *var = nir_deref_instr_get_variable(deref);
|
||||
|
||||
LLVMValueRef temp_ptr, value;
|
||||
int idx = var->data.driver_location;
|
||||
unsigned comp = var->data.location_frac;
|
||||
int idx = 0;
|
||||
unsigned comp = 0;
|
||||
LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1]));
|
||||
int writemask = instr->const_index[0];
|
||||
LLVMValueRef indir_index;
|
||||
unsigned const_index;
|
||||
|
||||
get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false,
|
||||
NULL, NULL, &const_index, &indir_index);
|
||||
if (var) {
|
||||
get_deref_offset(ctx, deref, false,
|
||||
NULL, NULL, &const_index, &indir_index);
|
||||
idx = var->data.driver_location;
|
||||
comp = var->data.location_frac;
|
||||
}
|
||||
|
||||
if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {
|
||||
|
||||
@@ -2030,7 +2035,7 @@ visit_store_var(struct ac_nir_context *ctx,
|
||||
|
||||
writemask = writemask << comp;
|
||||
|
||||
switch (var->data.mode) {
|
||||
switch (deref->mode) {
|
||||
case nir_var_shader_out:
|
||||
|
||||
if (ctx->stage == MESA_SHADER_TESS_CTRL) {
|
||||
@@ -2039,8 +2044,8 @@ visit_store_var(struct ac_nir_context *ctx,
|
||||
unsigned const_index = 0;
|
||||
const bool is_patch = var->data.patch;
|
||||
|
||||
get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
|
||||
false, NULL, is_patch ? NULL : &vertex_index,
|
||||
get_deref_offset(ctx, deref, false, NULL,
|
||||
is_patch ? NULL : &vertex_index,
|
||||
&const_index, &indir_index);
|
||||
|
||||
ctx->abi->store_tcs_outputs(ctx->abi, var,
|
||||
@@ -2107,7 +2112,7 @@ visit_store_var(struct ac_nir_context *ctx,
|
||||
int writemask = instr->const_index[0];
|
||||
LLVMValueRef address = get_src(ctx, instr->src[0]);
|
||||
LLVMValueRef val = get_src(ctx, instr->src[1]);
|
||||
if (util_is_power_of_two_nonzero(writemask)) {
|
||||
if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) {
|
||||
val = LLVMBuildBitCast(
|
||||
ctx->ac.builder, val,
|
||||
LLVMGetElementType(LLVMTypeOf(address)), "");
|
||||
@@ -3818,6 +3823,73 @@ static void visit_jump(struct ac_llvm_context *ctx,
|
||||
}
|
||||
}
|
||||
|
||||
static LLVMTypeRef
|
||||
glsl_base_to_llvm_type(struct ac_llvm_context *ac,
|
||||
enum glsl_base_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case GLSL_TYPE_INT:
|
||||
case GLSL_TYPE_UINT:
|
||||
case GLSL_TYPE_BOOL:
|
||||
case GLSL_TYPE_SUBROUTINE:
|
||||
return ac->i32;
|
||||
case GLSL_TYPE_INT16:
|
||||
case GLSL_TYPE_UINT16:
|
||||
return ac->i16;
|
||||
case GLSL_TYPE_FLOAT:
|
||||
return ac->f32;
|
||||
case GLSL_TYPE_FLOAT16:
|
||||
return ac->f16;
|
||||
case GLSL_TYPE_INT64:
|
||||
case GLSL_TYPE_UINT64:
|
||||
return ac->i64;
|
||||
case GLSL_TYPE_DOUBLE:
|
||||
return ac->f64;
|
||||
default:
|
||||
unreachable("unknown GLSL type");
|
||||
}
|
||||
}
|
||||
|
||||
static LLVMTypeRef
|
||||
glsl_to_llvm_type(struct ac_llvm_context *ac,
|
||||
const struct glsl_type *type)
|
||||
{
|
||||
if (glsl_type_is_scalar(type)) {
|
||||
return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
|
||||
}
|
||||
|
||||
if (glsl_type_is_vector(type)) {
|
||||
return LLVMVectorType(
|
||||
glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
|
||||
glsl_get_vector_elements(type));
|
||||
}
|
||||
|
||||
if (glsl_type_is_matrix(type)) {
|
||||
return LLVMArrayType(
|
||||
glsl_to_llvm_type(ac, glsl_get_column_type(type)),
|
||||
glsl_get_matrix_columns(type));
|
||||
}
|
||||
|
||||
if (glsl_type_is_array(type)) {
|
||||
return LLVMArrayType(
|
||||
glsl_to_llvm_type(ac, glsl_get_array_element(type)),
|
||||
glsl_get_length(type));
|
||||
}
|
||||
|
||||
assert(glsl_type_is_struct(type));
|
||||
|
||||
LLVMTypeRef member_types[glsl_get_length(type)];
|
||||
|
||||
for (unsigned i = 0; i < glsl_get_length(type); i++) {
|
||||
member_types[i] =
|
||||
glsl_to_llvm_type(ac,
|
||||
glsl_get_struct_field(type, i));
|
||||
}
|
||||
|
||||
return LLVMStructTypeInContext(ac->context, member_types,
|
||||
glsl_get_length(type), false);
|
||||
}
|
||||
|
||||
static void visit_deref(struct ac_nir_context *ctx,
|
||||
nir_deref_instr *instr)
|
||||
{
|
||||
@@ -3839,9 +3911,27 @@ static void visit_deref(struct ac_nir_context *ctx,
|
||||
result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
|
||||
get_src(ctx, instr->arr.index));
|
||||
break;
|
||||
case nir_deref_type_cast:
|
||||
result = get_src(ctx, instr->parent);
|
||||
case nir_deref_type_ptr_as_array:
|
||||
result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent),
|
||||
get_src(ctx, instr->arr.index));
|
||||
break;
|
||||
case nir_deref_type_cast: {
|
||||
result = get_src(ctx, instr->parent);
|
||||
|
||||
LLVMTypeRef pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type);
|
||||
LLVMTypeRef type = LLVMPointerType(pointee_type, AC_ADDR_SPACE_LDS);
|
||||
|
||||
if (LLVMTypeOf(result) != type) {
|
||||
if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) {
|
||||
result = LLVMBuildBitCast(ctx->ac.builder, result,
|
||||
type, "");
|
||||
} else {
|
||||
result = LLVMBuildIntToPtr(ctx->ac.builder, result,
|
||||
type, "");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
unreachable("Unhandled deref_instr deref type");
|
||||
}
|
||||
@@ -3990,73 +4080,6 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
|
||||
}
|
||||
}
|
||||
|
||||
static LLVMTypeRef
|
||||
glsl_base_to_llvm_type(struct ac_llvm_context *ac,
|
||||
enum glsl_base_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case GLSL_TYPE_INT:
|
||||
case GLSL_TYPE_UINT:
|
||||
case GLSL_TYPE_BOOL:
|
||||
case GLSL_TYPE_SUBROUTINE:
|
||||
return ac->i32;
|
||||
case GLSL_TYPE_INT16:
|
||||
case GLSL_TYPE_UINT16:
|
||||
return ac->i16;
|
||||
case GLSL_TYPE_FLOAT:
|
||||
return ac->f32;
|
||||
case GLSL_TYPE_FLOAT16:
|
||||
return ac->f16;
|
||||
case GLSL_TYPE_INT64:
|
||||
case GLSL_TYPE_UINT64:
|
||||
return ac->i64;
|
||||
case GLSL_TYPE_DOUBLE:
|
||||
return ac->f64;
|
||||
default:
|
||||
unreachable("unknown GLSL type");
|
||||
}
|
||||
}
|
||||
|
||||
static LLVMTypeRef
|
||||
glsl_to_llvm_type(struct ac_llvm_context *ac,
|
||||
const struct glsl_type *type)
|
||||
{
|
||||
if (glsl_type_is_scalar(type)) {
|
||||
return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
|
||||
}
|
||||
|
||||
if (glsl_type_is_vector(type)) {
|
||||
return LLVMVectorType(
|
||||
glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
|
||||
glsl_get_vector_elements(type));
|
||||
}
|
||||
|
||||
if (glsl_type_is_matrix(type)) {
|
||||
return LLVMArrayType(
|
||||
glsl_to_llvm_type(ac, glsl_get_column_type(type)),
|
||||
glsl_get_matrix_columns(type));
|
||||
}
|
||||
|
||||
if (glsl_type_is_array(type)) {
|
||||
return LLVMArrayType(
|
||||
glsl_to_llvm_type(ac, glsl_get_array_element(type)),
|
||||
glsl_get_length(type));
|
||||
}
|
||||
|
||||
assert(glsl_type_is_struct(type));
|
||||
|
||||
LLVMTypeRef member_types[glsl_get_length(type)];
|
||||
|
||||
for (unsigned i = 0; i < glsl_get_length(type); i++) {
|
||||
member_types[i] =
|
||||
glsl_to_llvm_type(ac,
|
||||
glsl_get_struct_field(type, i));
|
||||
}
|
||||
|
||||
return LLVMStructTypeInContext(ac->context, member_types,
|
||||
glsl_get_length(type), false);
|
||||
}
|
||||
|
||||
static void
|
||||
setup_locals(struct ac_nir_context *ctx,
|
||||
struct nir_function *func)
|
||||
|
@@ -1356,7 +1356,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
|
||||
|
||||
uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
|
||||
|
||||
if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
|
||||
if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
|
||||
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
@@ -1518,14 +1518,13 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
|
||||
|
||||
uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
|
||||
|
||||
if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
|
||||
if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
|
||||
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
|
||||
radeon_emit(cs, 2);
|
||||
} else {
|
||||
/* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */
|
||||
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
|
||||
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
|
||||
COPY_DATA_DST_SEL(COPY_DATA_REG) |
|
||||
|
@@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout(
|
||||
uint32_t immutable_sampler_count = 0;
|
||||
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
|
||||
max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
|
||||
if (pCreateInfo->pBindings[j].pImmutableSamplers)
|
||||
if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
|
||||
pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
|
||||
pCreateInfo->pBindings[j].pImmutableSamplers)
|
||||
immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
|
||||
}
|
||||
|
||||
@@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout(
|
||||
set_layout->has_variable_descriptors = true;
|
||||
}
|
||||
|
||||
if (binding->pImmutableSamplers) {
|
||||
if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
|
||||
binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
|
||||
binding->pImmutableSamplers) {
|
||||
set_layout->binding[b].immutable_samplers_offset = samplers_offset;
|
||||
set_layout->binding[b].immutable_samplers_equal =
|
||||
has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount);
|
||||
|
@@ -369,6 +369,11 @@ radv_physical_device_init(struct radv_physical_device *device,
|
||||
device->dcc_msaa_allowed =
|
||||
(device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);
|
||||
|
||||
/* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */
|
||||
device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 ||
|
||||
(device->rad_info.chip_class >= VI &&
|
||||
device->rad_info.me_fw_feature >= 41);
|
||||
|
||||
radv_physical_device_init_mem_types(device);
|
||||
radv_fill_device_extension_table(device, &device->supported_extensions);
|
||||
|
||||
|
@@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device,
|
||||
.subpass = 0,
|
||||
};
|
||||
|
||||
switch(aspect) {
|
||||
case VK_IMAGE_ASPECT_COLOR_BIT:
|
||||
vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
|
||||
.attachmentCount = 1,
|
||||
.pAttachments = (VkPipelineColorBlendAttachmentState []) {
|
||||
{ .colorWriteMask =
|
||||
VK_COLOR_COMPONENT_A_BIT |
|
||||
VK_COLOR_COMPONENT_R_BIT |
|
||||
VK_COLOR_COMPONENT_G_BIT |
|
||||
VK_COLOR_COMPONENT_B_BIT },
|
||||
VkPipelineColorBlendStateCreateInfo color_blend_info = {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
|
||||
.attachmentCount = 1,
|
||||
.pAttachments = (VkPipelineColorBlendAttachmentState []) {
|
||||
{
|
||||
.colorWriteMask = VK_COLOR_COMPONENT_A_BIT |
|
||||
VK_COLOR_COMPONENT_R_BIT |
|
||||
VK_COLOR_COMPONENT_G_BIT |
|
||||
VK_COLOR_COMPONENT_B_BIT },
|
||||
}
|
||||
};
|
||||
|
||||
VkPipelineDepthStencilStateCreateInfo depth_info = {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
|
||||
.depthTestEnable = true,
|
||||
.depthWriteEnable = true,
|
||||
.depthCompareOp = VK_COMPARE_OP_ALWAYS,
|
||||
};
|
||||
|
||||
VkPipelineDepthStencilStateCreateInfo stencil_info = {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
|
||||
.depthTestEnable = false,
|
||||
.depthWriteEnable = false,
|
||||
.stencilTestEnable = true,
|
||||
.front = {
|
||||
.failOp = VK_STENCIL_OP_REPLACE,
|
||||
.passOp = VK_STENCIL_OP_REPLACE,
|
||||
.depthFailOp = VK_STENCIL_OP_REPLACE,
|
||||
.compareOp = VK_COMPARE_OP_ALWAYS,
|
||||
.compareMask = 0xff,
|
||||
.writeMask = 0xff,
|
||||
.reference = 0
|
||||
},
|
||||
.back = {
|
||||
.failOp = VK_STENCIL_OP_REPLACE,
|
||||
.passOp = VK_STENCIL_OP_REPLACE,
|
||||
.depthFailOp = VK_STENCIL_OP_REPLACE,
|
||||
.compareOp = VK_COMPARE_OP_ALWAYS,
|
||||
.compareMask = 0xff,
|
||||
.writeMask = 0xff,
|
||||
.reference = 0
|
||||
},
|
||||
.depthCompareOp = VK_COMPARE_OP_ALWAYS,
|
||||
};
|
||||
|
||||
switch(aspect) {
|
||||
case VK_IMAGE_ASPECT_COLOR_BIT:
|
||||
vk_pipeline_info.pColorBlendState = &color_blend_info;
|
||||
break;
|
||||
case VK_IMAGE_ASPECT_DEPTH_BIT:
|
||||
vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
|
||||
.depthTestEnable = true,
|
||||
.depthWriteEnable = true,
|
||||
.depthCompareOp = VK_COMPARE_OP_ALWAYS,
|
||||
};
|
||||
vk_pipeline_info.pDepthStencilState = &depth_info;
|
||||
break;
|
||||
case VK_IMAGE_ASPECT_STENCIL_BIT:
|
||||
vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
|
||||
.depthTestEnable = false,
|
||||
.depthWriteEnable = false,
|
||||
.stencilTestEnable = true,
|
||||
.front = {
|
||||
.failOp = VK_STENCIL_OP_REPLACE,
|
||||
.passOp = VK_STENCIL_OP_REPLACE,
|
||||
.depthFailOp = VK_STENCIL_OP_REPLACE,
|
||||
.compareOp = VK_COMPARE_OP_ALWAYS,
|
||||
.compareMask = 0xff,
|
||||
.writeMask = 0xff,
|
||||
.reference = 0
|
||||
},
|
||||
.back = {
|
||||
.failOp = VK_STENCIL_OP_REPLACE,
|
||||
.passOp = VK_STENCIL_OP_REPLACE,
|
||||
.depthFailOp = VK_STENCIL_OP_REPLACE,
|
||||
.compareOp = VK_COMPARE_OP_ALWAYS,
|
||||
.compareMask = 0xff,
|
||||
.writeMask = 0xff,
|
||||
.reference = 0
|
||||
},
|
||||
.depthCompareOp = VK_COMPARE_OP_ALWAYS,
|
||||
};
|
||||
vk_pipeline_info.pDepthStencilState = &stencil_info;
|
||||
break;
|
||||
default:
|
||||
unreachable("Unhandled aspect");
|
||||
|
@@ -306,6 +306,9 @@ struct radv_physical_device {
|
||||
/* Whether DCC should be enabled for MSAA textures. */
|
||||
bool dcc_msaa_allowed;
|
||||
|
||||
/* Whether LOAD_CONTEXT_REG packets are supported. */
|
||||
bool has_load_ctx_reg_pkt;
|
||||
|
||||
/* This is the drivers on-disk cache used as a fallback as opposed to
|
||||
* the pipeline cache defined by apps.
|
||||
*/
|
||||
|
@@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
|
||||
NIR_PASS(progress, shader, nir_opt_if);
|
||||
NIR_PASS(progress, shader, nir_opt_dead_cf);
|
||||
NIR_PASS(progress, shader, nir_opt_cse);
|
||||
NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
|
||||
NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true);
|
||||
NIR_PASS(progress, shader, nir_opt_algebraic);
|
||||
NIR_PASS(progress, shader, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, shader, nir_opt_undef);
|
||||
|
@@ -101,7 +101,7 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
|
||||
case MESA_SHADER_VERTEX: {
|
||||
nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
|
||||
|
||||
if (var->data.mode == nir_var_shader_in) {
|
||||
if (var && var->data.mode == nir_var_shader_in) {
|
||||
unsigned idx = var->data.location;
|
||||
uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa);
|
||||
|
||||
@@ -150,7 +150,7 @@ gather_intrinsic_store_deref_info(const nir_shader *nir,
|
||||
{
|
||||
nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
|
||||
|
||||
if (var->data.mode == nir_var_shader_out) {
|
||||
if (var && var->data.mode == nir_var_shader_out) {
|
||||
unsigned idx = var->data.location;
|
||||
|
||||
switch (nir->info.stage) {
|
||||
|
@@ -543,7 +543,7 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
|
||||
cs->handles[cs->num_buffers].bo_handle = bo;
|
||||
cs->handles[cs->num_buffers].bo_priority = priority;
|
||||
|
||||
hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
|
||||
hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
|
||||
cs->buffer_hash_table[hash] = cs->num_buffers;
|
||||
|
||||
++cs->num_buffers;
|
||||
|
@@ -159,9 +159,8 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
|
||||
* d0-d7.
|
||||
*/
|
||||
"vstm %[gpu], {q0, q1, q2, q3}\n"
|
||||
:
|
||||
: [cpu] "+r"(cpu)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "q0", "q1", "q2", "q3");
|
||||
return;
|
||||
|
@@ -1455,7 +1455,7 @@ v3d_optimize_nir(struct nir_shader *s)
|
||||
NIR_PASS(progress, s, nir_opt_dce);
|
||||
NIR_PASS(progress, s, nir_opt_dead_cf);
|
||||
NIR_PASS(progress, s, nir_opt_cse);
|
||||
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
|
||||
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
|
||||
NIR_PASS(progress, s, nir_opt_algebraic);
|
||||
NIR_PASS(progress, s, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, s, nir_opt_undef);
|
||||
|
@@ -156,7 +156,7 @@ pack_sint(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
|
||||
int num_components)
|
||||
{
|
||||
color = nir_channels(b, color, (1 << num_components) - 1);
|
||||
color = nir_format_clamp_uint(b, color, bits);
|
||||
color = nir_format_clamp_sint(b, color, bits);
|
||||
return pack_bits(b, color, bits, num_components, true);
|
||||
}
|
||||
|
||||
|
@@ -104,6 +104,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e
|
||||
@mkdir -p $(dir $@)
|
||||
$(hide) $(MESA_PYTHON2) $< strings > $@
|
||||
|
||||
$(intermediates)/compiler/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py
|
||||
$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py
|
||||
@mkdir -p $(dir $@)
|
||||
$(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@
|
||||
|
@@ -2825,7 +2825,7 @@ should_print_nir(void)
|
||||
static inline void nir_validate_shader(nir_shader *shader, const char *when) { (void) shader; (void)when; }
|
||||
static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
|
||||
static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
|
||||
static inline bool should_skip_nir(const char *pass_name) { return false; }
|
||||
static inline bool should_skip_nir(UNUSED const char *pass_name) { return false; }
|
||||
static inline bool should_clone_nir(void) { return false; }
|
||||
static inline bool should_serialize_deserialize_nir(void) { return false; }
|
||||
static inline bool should_print_nir(void) { return false; }
|
||||
@@ -3316,7 +3316,7 @@ bool nir_opt_move_comparisons(nir_shader *shader);
|
||||
bool nir_opt_move_load_ubo(nir_shader *shader);
|
||||
|
||||
bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
|
||||
bool indirect_load_ok, bool expensive_alu_ok);
|
||||
bool indirect_load_ok);
|
||||
|
||||
bool nir_opt_remove_phis(nir_shader *shader);
|
||||
|
||||
|
@@ -574,10 +574,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl)
|
||||
_mesa_hash_table_clear(state.cache, NULL);
|
||||
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type == nir_instr_type_deref) {
|
||||
nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr));
|
||||
if (instr->type == nir_instr_type_deref &&
|
||||
nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)))
|
||||
continue;
|
||||
}
|
||||
|
||||
state.builder.cursor = nir_before_instr(instr);
|
||||
nir_foreach_src(instr, rematerialize_deref_src, &state);
|
||||
|
@@ -59,8 +59,7 @@
|
||||
|
||||
static bool
|
||||
block_check_for_allowed_instrs(nir_block *block, unsigned *count,
|
||||
bool alu_ok, bool indirect_load_ok,
|
||||
bool expensive_alu_ok)
|
||||
bool alu_ok, bool indirect_load_ok)
|
||||
{
|
||||
nir_foreach_instr(instr, block) {
|
||||
switch (instr->type) {
|
||||
@@ -118,25 +117,6 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count,
|
||||
case nir_op_vec3:
|
||||
case nir_op_vec4:
|
||||
break;
|
||||
|
||||
case nir_op_fcos:
|
||||
case nir_op_fdiv:
|
||||
case nir_op_fexp2:
|
||||
case nir_op_flog2:
|
||||
case nir_op_fmod:
|
||||
case nir_op_fpow:
|
||||
case nir_op_frcp:
|
||||
case nir_op_frem:
|
||||
case nir_op_frsq:
|
||||
case nir_op_fsin:
|
||||
case nir_op_idiv:
|
||||
case nir_op_irem:
|
||||
case nir_op_udiv:
|
||||
if (!alu_ok || !expensive_alu_ok)
|
||||
return false;
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
if (!alu_ok) {
|
||||
/* It must be a move-like operation. */
|
||||
@@ -180,8 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count,
|
||||
|
||||
static bool
|
||||
nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
|
||||
unsigned limit, bool indirect_load_ok,
|
||||
bool expensive_alu_ok)
|
||||
unsigned limit, bool indirect_load_ok)
|
||||
{
|
||||
if (nir_cf_node_is_first(&block->cf_node))
|
||||
return false;
|
||||
@@ -202,9 +181,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
|
||||
/* ... and those blocks must only contain "allowed" instructions. */
|
||||
unsigned count = 0;
|
||||
if (!block_check_for_allowed_instrs(then_block, &count, limit != 0,
|
||||
indirect_load_ok, expensive_alu_ok) ||
|
||||
indirect_load_ok) ||
|
||||
!block_check_for_allowed_instrs(else_block, &count, limit != 0,
|
||||
indirect_load_ok, expensive_alu_ok))
|
||||
indirect_load_ok))
|
||||
return false;
|
||||
|
||||
if (count > limit)
|
||||
@@ -271,15 +250,14 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
|
||||
|
||||
static bool
|
||||
nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,
|
||||
bool indirect_load_ok, bool expensive_alu_ok)
|
||||
bool indirect_load_ok)
|
||||
{
|
||||
nir_shader *shader = impl->function->shader;
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_block_safe(block, impl) {
|
||||
progress |= nir_opt_peephole_select_block(block, shader, limit,
|
||||
indirect_load_ok,
|
||||
expensive_alu_ok);
|
||||
indirect_load_ok);
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
@@ -295,15 +273,14 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,
|
||||
|
||||
bool
|
||||
nir_opt_peephole_select(nir_shader *shader, unsigned limit,
|
||||
bool indirect_load_ok, bool expensive_alu_ok)
|
||||
bool indirect_load_ok)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_function(function, shader) {
|
||||
if (function->impl)
|
||||
progress |= nir_opt_peephole_select_impl(function->impl, limit,
|
||||
indirect_load_ok,
|
||||
expensive_alu_ok);
|
||||
indirect_load_ok);
|
||||
}
|
||||
|
||||
return progress;
|
||||
|
@@ -45,6 +45,7 @@ TESTS =
|
||||
BUILT_SOURCES =
|
||||
CLEANFILES =
|
||||
EXTRA_DIST = \
|
||||
meson.build \
|
||||
drm/meson.build \
|
||||
ir3/ir3_nir_trig.py \
|
||||
ir3/meson.build
|
||||
|
@@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s)
|
||||
progress |= OPT(s, nir_opt_gcm, true);
|
||||
else if (gcm == 2)
|
||||
progress |= OPT(s, nir_opt_gcm, false);
|
||||
progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
|
||||
progress |= OPT(s, nir_opt_peephole_select, 16, true);
|
||||
progress |= OPT(s, nir_opt_intrinsics);
|
||||
progress |= OPT(s, nir_opt_algebraic);
|
||||
progress |= OPT(s, nir_opt_constant_folding);
|
||||
|
@@ -1524,7 +1524,8 @@ tc_buffer_do_flush_region(struct threaded_context *tc,
|
||||
if (ttrans->staging) {
|
||||
struct pipe_box src_box;
|
||||
|
||||
u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
|
||||
u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment +
|
||||
(box->x - ttrans->b.box.x),
|
||||
box->width, &src_box);
|
||||
|
||||
/* Copy the staging buffer into the original one. */
|
||||
|
@@ -487,6 +487,10 @@ The integer capabilities:
|
||||
* ``PIPE_CAP_DEST_SURFACE_SRGB_CONTROL``: Indicates whether the drivers
|
||||
supports switching the format between sRGB and linear for a surface that is
|
||||
used as destination in draw and blit calls.
|
||||
* ``PIPE_CAP_MAX_VARYINGS``: The maximum number of fragment shader
|
||||
varyings. This will generally correspond to
|
||||
``PIPE_SHADER_CAP_MAX_INPUTS`` for the fragment shader, but in some
|
||||
cases may be a smaller number.
|
||||
|
||||
.. _pipe_capf:
|
||||
|
||||
|
@@ -360,6 +360,9 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
|
||||
return 0;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return screen->specs.max_varyings;
|
||||
|
||||
case PIPE_CAP_PCI_GROUP:
|
||||
case PIPE_CAP_PCI_BUS:
|
||||
case PIPE_CAP_PCI_DEVICE:
|
||||
|
@@ -23,4 +23,6 @@ libfreedreno_la_SOURCES = \
|
||||
$(a6xx_SOURCES) \
|
||||
$(ir3_SOURCES)
|
||||
|
||||
EXTRA_DIST = meson.build
|
||||
EXTRA_DIST = \
|
||||
ir3/ir3_cmdline.c \
|
||||
meson.build
|
||||
|
@@ -339,7 +339,6 @@ clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring,
|
||||
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
|
||||
OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
|
||||
OUT_RINGP(ring, patch_type, &batch->gmem_patches);
|
||||
OUT_RING(ring, 0);
|
||||
|
||||
OUT_PKT3(ring, CP_SET_CONSTANT, 4);
|
||||
OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
|
||||
|
@@ -74,7 +74,7 @@ ir2_optimize_loop(nir_shader *s)
|
||||
progress |= OPT(s, nir_opt_dce);
|
||||
progress |= OPT(s, nir_opt_cse);
|
||||
/* progress |= OPT(s, nir_opt_gcm, true); */
|
||||
progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
|
||||
progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true);
|
||||
progress |= OPT(s, nir_opt_intrinsics);
|
||||
progress |= OPT(s, nir_opt_algebraic);
|
||||
progress |= OPT(s, nir_opt_constant_folding);
|
||||
|
@@ -438,7 +438,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info)
|
||||
OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) |
|
||||
A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) |
|
||||
A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap));
|
||||
OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */
|
||||
OUT_RELOCW(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */
|
||||
OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch));
|
||||
OUT_RING(ring, 0x00000000);
|
||||
OUT_RING(ring, 0x00000000);
|
||||
|
@@ -317,6 +317,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_MAX_VIEWPORTS:
|
||||
return 1;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 16;
|
||||
|
||||
case PIPE_CAP_SHAREABLE_SHADERS:
|
||||
case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
|
||||
/* manage the variants for these ourself, to avoid breaking precompile: */
|
||||
|
@@ -402,6 +402,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
|
||||
return 0;
|
||||
case PIPE_CAP_ENDIANNESS:
|
||||
return PIPE_ENDIAN_LITTLE;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 10;
|
||||
|
||||
case PIPE_CAP_VENDOR_ID:
|
||||
return 0x8086;
|
||||
|
@@ -310,6 +310,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
|
||||
return 1;
|
||||
case PIPE_CAP_CLEAR_TEXTURE:
|
||||
return 1;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 32;
|
||||
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
|
||||
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
|
||||
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
|
||||
|
@@ -543,6 +543,8 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
|
||||
$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
|
||||
long mov b32 $r3 0x3f800000
|
||||
long nop
|
||||
sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
||||
long nop
|
||||
long ret
|
||||
|
||||
|
||||
@@ -554,7 +556,144 @@ long ret
|
||||
// SIZE: 9 * 8 bytes
|
||||
//
|
||||
gk104_rcp_f64:
|
||||
long nop
|
||||
// Step 1: classify input according to exponent and value, and calculate
|
||||
// result for 0/inf/nan. $r2 holds the exponent value, which starts at
|
||||
// bit 52 (bit 20 of the upper half) and is 11 bits in length
|
||||
ext u32 $r2 $r1 0xb14
|
||||
add b32 $r3 $r2 0xffffffff
|
||||
joinat #rcp_rejoin
|
||||
// We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
|
||||
// denorm, or 0). Do this by subtracting 1 from the exponent, which will
|
||||
// mean that it's > 0x7fd in those cases when doing unsigned comparison
|
||||
set $p0 0x1 gt u32 $r3 0x7fd
|
||||
// $r3: 0 for norms, 0x36 for denorms, -1 for others
|
||||
long mov b32 $r3 0x0
|
||||
sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
|
||||
join (not $p0) nop
|
||||
// Process all special values: NaN, inf, denorm, 0
|
||||
mov b32 $r3 0xffffffff
|
||||
// A number is NaN if its abs value is greater than or unordered with inf
|
||||
set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
|
||||
(not $p0) bra #rcp_inf_or_denorm_or_zero
|
||||
// NaN -> NaN, the next line sets the "quiet" bit of the result. This
|
||||
// behavior is both seen on the CPU and the blob
|
||||
join or b32 $r1 $r1 0x80000
|
||||
rcp_inf_or_denorm_or_zero:
|
||||
and b32 $r4 $r1 0x7ff00000
|
||||
// Other values with nonzero in exponent field should be inf
|
||||
set $p0 0x1 eq s32 $r4 0x0
|
||||
sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
|
||||
$p0 bra #rcp_denorm_or_zero
|
||||
// +/-Inf -> +/-0
|
||||
xor b32 $r1 $r1 0x7ff00000
|
||||
join mov b32 $r0 0x0
|
||||
rcp_denorm_or_zero:
|
||||
set $p0 0x1 gtu f64 abs $r0d 0x0
|
||||
$p0 bra #rcp_denorm
|
||||
// +/-0 -> +/-Inf
|
||||
join or b32 $r1 $r1 0x7ff00000
|
||||
rcp_denorm:
|
||||
// non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
|
||||
mul rn f64 $r0d $r0d 0x4350000000000000
|
||||
sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
|
||||
join mov b32 $r3 0x36
|
||||
rcp_rejoin:
|
||||
// All numbers with -1 in $r3 have their result ready in $r0d, return them
|
||||
// others need further calculation
|
||||
set $p0 0x1 lt s32 $r3 0x0
|
||||
$p0 bra #rcp_end
|
||||
// Step 2: Before the real calculation goes on, renormalize the values to
|
||||
// range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
|
||||
// result in $r6d. The exponent will be recovered later.
|
||||
ext u32 $r2 $r1 0xb14
|
||||
and b32 $r7 $r1 0x800fffff
|
||||
add b32 $r7 $r7 0x3ff00000
|
||||
long mov b32 $r6 $r0
|
||||
sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
|
||||
// Step 3: Convert new value to float (no overflow will occur due to step
|
||||
// 2), calculate rcp and do newton-raphson step once
|
||||
cvt rz f32 $r5 f64 $r6d
|
||||
long rcp f32 $r4 $r5
|
||||
mov b32 $r0 0xbf800000
|
||||
fma rn f32 $r5 $r4 $r5 $r0
|
||||
fma rn f32 $r0 neg $r4 $r5 $r4
|
||||
// Step 4: convert result $r0 back to double, do newton-raphson steps
|
||||
cvt f64 $r0d f32 $r0
|
||||
cvt f64 $r6d neg f64 $r6d
|
||||
sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
|
||||
cvt f64 $r8d f32 0x3f800000
|
||||
// 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
|
||||
// The formula used here (and above) is:
|
||||
// RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
|
||||
// The following code uses 2 FMAs for each step, and it will basically
|
||||
// look like:
|
||||
// tmp = -src * RCP_{n} + 1
|
||||
// RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
// Step 5: Exponent recovery and final processing
|
||||
// The exponent is recovered by adding what we added to the exponent.
|
||||
// Suppose we want to calculate rcp(x), but we have rcp(cx), then
|
||||
// rcp(x) = c * rcp(cx)
|
||||
// The delta in exponent comes from two sources:
|
||||
// 1) The renormalization in step 2. The delta is:
|
||||
// 0x3ff - $r2
|
||||
// 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
|
||||
// in $r3
|
||||
// These 2 sources are calculated in the first two lines below, and then
|
||||
// added to the exponent extracted from the result above.
|
||||
// Note that after processing, the new exponent may be >= 0x7ff (inf)
|
||||
// or <= 0 (denorm). Those cases will be handled respectively below
|
||||
subr b32 $r2 $r2 0x3ff
|
||||
long add b32 $r4 $r2 $r3
|
||||
ext u32 $r3 $r1 0xb14
|
||||
// New exponent in $r3
|
||||
long add b32 $r3 $r3 $r4
|
||||
add b32 $r2 $r3 0xffffffff
|
||||
sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
|
||||
// (exponent-1) < 0x7fe (unsigned) means the result is in norm range
|
||||
// (same logic as in step 1)
|
||||
set $p0 0x1 lt u32 $r2 0x7fe
|
||||
(not $p0) bra #rcp_result_inf_or_denorm
|
||||
// Norms: convert exponents back and return
|
||||
shl b32 $r4 $r4 clamp 0x14
|
||||
long add b32 $r1 $r4 $r1
|
||||
bra #rcp_end
|
||||
rcp_result_inf_or_denorm:
|
||||
// New exponent >= 0x7ff means that result is inf
|
||||
set $p0 0x1 ge s32 $r3 0x7ff
|
||||
(not $p0) bra #rcp_result_denorm
|
||||
sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
|
||||
// Infinity
|
||||
and b32 $r1 $r1 0x80000000
|
||||
long mov b32 $r0 0x0
|
||||
add b32 $r1 $r1 0x7ff00000
|
||||
bra #rcp_end
|
||||
rcp_result_denorm:
|
||||
// Denorm result comes from huge input. The greatest possible fp64, i.e.
|
||||
// 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
|
||||
// normal value. Other rcp result should be greater than that. If we
|
||||
// set the exponent field to 1, we can recover the result by multiplying
|
||||
// it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
|
||||
// 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
|
||||
// the logic here.
|
||||
set $p0 0x1 ne u32 $r3 0x0
|
||||
and b32 $r1 $r1 0x800fffff
|
||||
// 0x3e800000: 1/4
|
||||
$p0 cvt f64 $r6d f32 0x3e800000
|
||||
sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
|
||||
// 0x3f000000: 1/2
|
||||
(not $p0) cvt f64 $r6d f32 0x3f000000
|
||||
add b32 $r1 $r1 0x00100000
|
||||
mul rn f64 $r0d $r0d $r6d
|
||||
rcp_end:
|
||||
long ret
|
||||
|
||||
// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
|
||||
@@ -565,7 +704,67 @@ gk104_rcp_f64:
|
||||
// SIZE: 14 * 8 bytes
|
||||
//
|
||||
gk104_rsq_f64:
|
||||
long nop
|
||||
// Before getting initial result rsqrt64h, two special cases should be
|
||||
// handled first.
|
||||
// 1. NaN: set the highest bit in mantissa so it'll be surely recognized
|
||||
// as NaN in rsqrt64h
|
||||
set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
|
||||
$p0 or b32 $r1 $r1 0x00080000
|
||||
and b32 $r2 $r1 0x7fffffff
|
||||
sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
|
||||
// 2. denorms and small normal values: using their original value will
|
||||
// lose precision either at rsqrt64h or the first step in newton-raphson
|
||||
// steps below. Take 2 as a threshold in exponent field, and multiply
|
||||
// with 2^54 if the exponent is smaller or equal. (will multiply 2^27
|
||||
// to recover in the end)
|
||||
ext u32 $r3 $r1 0xb14
|
||||
set $p1 0x1 le u32 $r3 0x2
|
||||
long or b32 $r2 $r0 $r2
|
||||
$p1 mul rn f64 $r0d $r0d 0x4350000000000000
|
||||
rsqrt64h $r5 $r1
|
||||
// rsqrt64h will give correct result for 0/inf/nan, the following logic
|
||||
// checks whether the input is one of those (exponent is 0x7ff or all 0
|
||||
// except for the sign bit)
|
||||
set b32 $r6 ne u32 $r3 0x7ff
|
||||
long and b32 $r2 $r2 $r6
|
||||
sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
|
||||
set $p0 0x1 ne u32 $r2 0x0
|
||||
$p0 bra #rsq_norm
|
||||
// For 0/inf/nan, make sure the sign bit agrees with input and return
|
||||
and b32 $r1 $r1 0x80000000
|
||||
long mov b32 $r0 0x0
|
||||
long or b32 $r1 $r1 $r5
|
||||
long ret
|
||||
rsq_norm:
|
||||
// For others, do 4 Newton-Raphson steps with the formula:
|
||||
// RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
|
||||
// In the code below, each step is written as:
|
||||
// tmp1 = 0.5 * x * RSQ_{n}
|
||||
// tmp2 = -RSQ_{n} * tmp1 + 0.5
|
||||
// RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
|
||||
long mov b32 $r4 0x0
|
||||
sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
|
||||
// 0x3f000000: 1/2
|
||||
cvt f64 $r8d f32 0x3f000000
|
||||
mul rn f64 $r2d $r0d $r8d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
|
||||
// Multiply 2^27 to result for small inputs to recover
|
||||
$p1 mul rn f64 $r4d $r4d 0x41a0000000000000
|
||||
long mov b32 $r1 $r5
|
||||
long mov b32 $r0 $r4
|
||||
long ret
|
||||
|
||||
//
|
||||
|
@@ -481,12 +481,132 @@ uint64_t gk104_builtin_code[] = {
|
||||
0xd40040000840c785,
|
||||
0x18fe00000000dde2,
|
||||
0x4000000000001de4,
|
||||
0x9000000000001de7,
|
||||
/* 0x0f08: gk104_rcp_f64 */
|
||||
0x2000000000000007,
|
||||
0x4000000000001de4,
|
||||
0x9000000000001de7,
|
||||
/* 0x0f18: gk104_rsq_f64 */
|
||||
0x4000000000001de4,
|
||||
/* 0x0f18: gk104_rcp_f64 */
|
||||
0x7000c02c50109c03,
|
||||
0x0bfffffffc20dc02,
|
||||
0x6000000280000007,
|
||||
0x1a0ec01ff431dc03,
|
||||
0x180000000000dde2,
|
||||
0x228282f2b2d042f7,
|
||||
0x40000000000021f4,
|
||||
0x1bfffffffc00dde2,
|
||||
0x1e0edffc0001dc81,
|
||||
0x40000000200021e7,
|
||||
0x3800200000105c52,
|
||||
/* 0x0f70: rcp_inf_or_denorm_or_zero */
|
||||
0x39ffc00000111c02,
|
||||
0x190e0000fc41dc23,
|
||||
0x2202f2b2d2f042b7,
|
||||
0x40000000400001e7,
|
||||
0x39ffc00000105c82,
|
||||
0x1800000000001df2,
|
||||
/* 0x0fa0: rcp_denorm_or_zero */
|
||||
0x1e0ec0000001dc81,
|
||||
0x40000000200001e7,
|
||||
0x39ffc00000105c52,
|
||||
/* 0x0fb8: rcp_denorm */
|
||||
0x5000d0d400001c01,
|
||||
0x2280428282b282f7,
|
||||
0x18000000d800ddf2,
|
||||
/* 0x0fd0: rcp_rejoin */
|
||||
0x188e0000fc31dc23,
|
||||
0x40000006000001e7,
|
||||
0x7000c02c50109c03,
|
||||
0x3a003ffffc11dc02,
|
||||
0x08ffc0000071dc02,
|
||||
0x2800000000019de4,
|
||||
0x22e2b2a2828042b7,
|
||||
0x1006000019a15c04,
|
||||
0xc800000010511c00,
|
||||
0x1afe000000001de2,
|
||||
0x3000000014415c00,
|
||||
0x3008000014401e00,
|
||||
0x1000000001301c04,
|
||||
0x1000000019b19d04,
|
||||
0x22929292929292e7,
|
||||
0x1000cfe001321c04,
|
||||
0x2010000000611c01,
|
||||
0x2000000010001c01,
|
||||
0x2010000000611c01,
|
||||
0x2000000010001c01,
|
||||
0x2010000000611c01,
|
||||
0x2000000010001c01,
|
||||
0x2282828282820297,
|
||||
0x2010000000611c01,
|
||||
0x2000000010001c01,
|
||||
0x0800000ffc209e02,
|
||||
0x480000000c211c03,
|
||||
0x7000c02c5010dc03,
|
||||
0x480000001030dc03,
|
||||
0x0bfffffffc309c02,
|
||||
0x22b28282b282b287,
|
||||
0x188ec01ff821dc03,
|
||||
0x40000000600021e7,
|
||||
0x6000c00050411c03,
|
||||
0x4800000004405c03,
|
||||
0x40000001c0001de7,
|
||||
/* 0x10f0: rcp_result_inf_or_denorm */
|
||||
0x1b0ec01ffc31dc23,
|
||||
0x40000000a00021e7,
|
||||
0x22f25232b2825207,
|
||||
0x3a00000000105c02,
|
||||
0x1800000000001de2,
|
||||
0x09ffc00000105c02,
|
||||
0x40000000e0001de7,
|
||||
/* 0x1128: rcp_result_denorm */
|
||||
0x1a8e0000fc31dc03,
|
||||
0x3a003ffffc105c02,
|
||||
0x1000cfa001318004,
|
||||
0x227202a2e2c282f7,
|
||||
0x1000cfc00131a004,
|
||||
0x0800400000105c02,
|
||||
0x5000000018001c01,
|
||||
/* 0x1160: rcp_end */
|
||||
0x9000000000001de7,
|
||||
/* 0x1168: gk104_rsq_f64 */
|
||||
0x1e0edffc0001dc81,
|
||||
0x3800200000104042,
|
||||
0x39fffffffc109c02,
|
||||
0x22828252c2820277,
|
||||
0x7000c02c5010dc03,
|
||||
0x198ec0000833dc03,
|
||||
0x6800000008009c43,
|
||||
0x5000d0d400000401,
|
||||
0xc80000001c115c00,
|
||||
0x128ec01ffc319c03,
|
||||
0x6800000018209c03,
|
||||
0x2282e2827202b287,
|
||||
0x1a8e0000fc21dc03,
|
||||
0x40000000800001e7,
|
||||
0x3a00000000105c02,
|
||||
0x1800000000001de2,
|
||||
0x6800000014105c43,
|
||||
0x9000000000001de7,
|
||||
/* 0x11f8: rsq_norm */
|
||||
0x1800000000011de2,
|
||||
0x22929292929292f7,
|
||||
0x1000cfc001321c04,
|
||||
0x5000000020009c01,
|
||||
0x5000000010201c01,
|
||||
0x2010000000419e01,
|
||||
0x2008000018411c01,
|
||||
0x5000000010201c01,
|
||||
0x2010000000419e01,
|
||||
0x2292929292929297,
|
||||
0x2008000018411c01,
|
||||
0x5000000010201c01,
|
||||
0x2010000000419e01,
|
||||
0x2008000018411c01,
|
||||
0x5000000010201c01,
|
||||
0x2010000000419e01,
|
||||
0x2008000018411c01,
|
||||
0x20000002e2820297,
|
||||
0x5000d06800410401,
|
||||
0x2800000014005de4,
|
||||
0x2800000010001de4,
|
||||
0x9000000000001de7,
|
||||
0xc800000003f01cc5,
|
||||
0x2c00000100005c04,
|
||||
@@ -495,7 +615,7 @@ uint64_t gk104_builtin_code[] = {
|
||||
0x680100000c1fdc03,
|
||||
0x4000000a60001c47,
|
||||
0x180000004000dde2,
|
||||
/* 0x0f60: spill_cfstack */
|
||||
/* 0x12e0: spill_cfstack */
|
||||
0x78000009c0000007,
|
||||
0x0c0000000430dd02,
|
||||
0x4003ffffa0001ca7,
|
||||
@@ -543,14 +663,14 @@ uint64_t gk104_builtin_code[] = {
|
||||
0x4000000100001ea7,
|
||||
0x480100000c001c03,
|
||||
0x0800000000105c42,
|
||||
/* 0x10d8: shared_loop */
|
||||
/* 0x1458: shared_loop */
|
||||
0xc100000000309c85,
|
||||
0x9400000500009c85,
|
||||
0x0c00000010001d02,
|
||||
0x0800000000105d42,
|
||||
0x0c0000001030dd02,
|
||||
0x4003ffff40001ca7,
|
||||
/* 0x1108: shared_done */
|
||||
/* 0x1488: shared_done */
|
||||
0x2800406420001de4,
|
||||
0x2800406430005de4,
|
||||
0xe000000000001c45,
|
||||
@@ -564,7 +684,7 @@ uint64_t gk104_builtin_code[] = {
|
||||
0x480000000c209c03,
|
||||
0x4801000008001c03,
|
||||
0x0800000000105c42,
|
||||
/* 0x1170: search_cstack */
|
||||
/* 0x14f0: search_cstack */
|
||||
0x280040646000dde4,
|
||||
0x8400000020009f05,
|
||||
0x190ec0002821dc03,
|
||||
@@ -573,17 +693,17 @@ uint64_t gk104_builtin_code[] = {
|
||||
0x0800000000105c42,
|
||||
0x0c0000004030dd02,
|
||||
0x00029dff0ffc5cbf,
|
||||
/* 0x11b0: entry_found */
|
||||
/* 0x1530: entry_found */
|
||||
0x8400000000009f85,
|
||||
0x2800406400001de4,
|
||||
0x2800406410005de4,
|
||||
0x9400000010009c85,
|
||||
0x4000000000001df4,
|
||||
/* 0x11d8: end_exit */
|
||||
/* 0x1558: end_exit */
|
||||
0x9800000003ffdcc5,
|
||||
0xd000000000008007,
|
||||
0xa000000000004007,
|
||||
/* 0x11f0: end_cont */
|
||||
/* 0x1570: end_cont */
|
||||
0xd000000000008007,
|
||||
0x3400c3fffc201c04,
|
||||
0xc000000003f01ec5,
|
||||
@@ -593,6 +713,6 @@ uint64_t gk104_builtin_code[] = {
|
||||
uint64_t gk104_builtin_offsets[] = {
|
||||
0x0000000000000000,
|
||||
0x00000000000000f0,
|
||||
0x0000000000000f08,
|
||||
0x0000000000000f18,
|
||||
0x0000000000001168,
|
||||
};
|
||||
|
@@ -83,12 +83,229 @@ gk110_div_s32:
|
||||
$p0 sub b32 $r1 $r1 $r2
|
||||
$p0 add b32 $r0 $r0 0x1
|
||||
$p3 cvt s32 $r0 neg s32 $r0
|
||||
sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
|
||||
sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
|
||||
$p2 cvt s32 $r1 neg s32 $r1
|
||||
ret
|
||||
|
||||
// RCP F64
|
||||
//
|
||||
// INPUT: $r0d
|
||||
// OUTPUT: $r0d
|
||||
// CLOBBER: $r2 - $r9, $p0
|
||||
//
|
||||
// The core of RCP and RSQ implementation is Newton-Raphson step, which is
|
||||
// used to find successively better approximation from an imprecise initial
|
||||
// value (single precision rcp in RCP and rsqrt64h in RSQ).
|
||||
//
|
||||
gk110_rcp_f64:
|
||||
// Step 1: classify input according to exponent and value, and calculate
|
||||
// result for 0/inf/nan. $r2 holds the exponent value, which starts at
|
||||
// bit 52 (bit 20 of the upper half) and is 11 bits in length
|
||||
ext u32 $r2 $r1 0xb14
|
||||
add b32 $r3 $r2 0xffffffff
|
||||
joinat #rcp_rejoin
|
||||
// We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
|
||||
// denorm, or 0). Do this by subtracting 1 from the exponent, which will
|
||||
// mean that it's > 0x7fd in those cases when doing unsigned comparison
|
||||
set b32 $p0 0x1 gt u32 $r3 0x7fd
|
||||
// $r3: 0 for norms, 0x36 for denorms, -1 for others
|
||||
mov b32 $r3 0x0
|
||||
sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
|
||||
join (not $p0) nop
|
||||
// Process all special values: NaN, inf, denorm, 0
|
||||
mov b32 $r3 0xffffffff
|
||||
// A number is NaN if its abs value is greater than or unordered with inf
|
||||
set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
|
||||
(not $p0) bra #rcp_inf_or_denorm_or_zero
|
||||
// NaN -> NaN, the next line sets the "quiet" bit of the result. This
|
||||
// behavior is both seen on the CPU and the blob
|
||||
join or b32 $r1 $r1 0x80000
|
||||
rcp_inf_or_denorm_or_zero:
|
||||
and b32 $r4 $r1 0x7ff00000
|
||||
// Other values with nonzero in exponent field should be inf
|
||||
set b32 $p0 0x1 eq s32 $r4 0x0
|
||||
sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
|
||||
$p0 bra #rcp_denorm_or_zero
|
||||
// +/-Inf -> +/-0
|
||||
xor b32 $r1 $r1 0x7ff00000
|
||||
join mov b32 $r0 0x0
|
||||
rcp_denorm_or_zero:
|
||||
set $p0 0x1 gtu f64 abs $r0d 0x0
|
||||
$p0 bra #rcp_denorm
|
||||
// +/-0 -> +/-Inf
|
||||
join or b32 $r1 $r1 0x7ff00000
|
||||
rcp_denorm:
|
||||
// non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
|
||||
mul rn f64 $r0d $r0d 0x4350000000000000
|
||||
sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
|
||||
join mov b32 $r3 0x36
|
||||
rcp_rejoin:
|
||||
// All numbers with -1 in $r3 have their result ready in $r0d, return them
|
||||
// others need further calculation
|
||||
set b32 $p0 0x1 lt s32 $r3 0x0
|
||||
$p0 bra #rcp_end
|
||||
// Step 2: Before the real calculation goes on, renormalize the values to
|
||||
// range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
|
||||
// result in $r6d. The exponent will be recovered later.
|
||||
ext u32 $r2 $r1 0xb14
|
||||
and b32 $r7 $r1 0x800fffff
|
||||
add b32 $r7 $r7 0x3ff00000
|
||||
mov b32 $r6 $r0
|
||||
sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
|
||||
// Step 3: Convert new value to float (no overflow will occur due to step
|
||||
// 2), calculate rcp and do newton-raphson step once
|
||||
cvt rz f32 $r5 f64 $r6d
|
||||
rcp f32 $r4 $r5
|
||||
mov b32 $r0 0xbf800000
|
||||
fma rn f32 $r5 $r4 $r5 $r0
|
||||
fma rn f32 $r0 neg $r4 $r5 $r4
|
||||
// Step 4: convert result $r0 back to double, do newton-raphson steps
|
||||
cvt f64 $r0d f32 $r0
|
||||
cvt f64 $r6d f64 neg $r6d
|
||||
sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
|
||||
cvt f64 $r8d f32 0x3f800000
|
||||
// 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
|
||||
// The formula used here (and above) is:
|
||||
// RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
|
||||
// The following code uses 2 FMAs for each step, and it will basically
|
||||
// look like:
|
||||
// tmp = -src * RCP_{n} + 1
|
||||
// RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
|
||||
fma rn f64 $r4d $r6d $r0d $r8d
|
||||
fma rn f64 $r0d $r0d $r4d $r0d
|
||||
// Step 5: Exponent recovery and final processing
|
||||
// The exponent is recovered by adding what we added to the exponent.
|
||||
// Suppose we want to calculate rcp(x), but we have rcp(cx), then
|
||||
// rcp(x) = c * rcp(cx)
|
||||
// The delta in exponent comes from two sources:
|
||||
// 1) The renormalization in step 2. The delta is:
|
||||
// 0x3ff - $r2
|
||||
// 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
|
||||
// in $r3
|
||||
// These 2 sources are calculated in the first two lines below, and then
|
||||
// added to the exponent extracted from the result above.
|
||||
// Note that after processing, the new exponent may be >= 0x7ff (inf)
|
||||
// or <= 0 (denorm). Those cases will be handled respectively below
|
||||
subr b32 $r2 $r2 0x3ff
|
||||
add b32 $r4 $r2 $r3
|
||||
ext u32 $r3 $r1 0xb14
|
||||
// New exponent in $r3
|
||||
add b32 $r3 $r3 $r4
|
||||
add b32 $r2 $r3 0xffffffff
|
||||
sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
|
||||
// (exponent-1) < 0x7fe (unsigned) means the result is in norm range
|
||||
// (same logic as in step 1)
|
||||
set b32 $p0 0x1 lt u32 $r2 0x7fe
|
||||
(not $p0) bra #rcp_result_inf_or_denorm
|
||||
// Norms: convert exponents back and return
|
||||
shl b32 $r4 $r4 clamp 0x14
|
||||
add b32 $r1 $r4 $r1
|
||||
bra #rcp_end
|
||||
rcp_result_inf_or_denorm:
|
||||
// New exponent >= 0x7ff means that result is inf
|
||||
set b32 $p0 0x1 ge s32 $r3 0x7ff
|
||||
(not $p0) bra #rcp_result_denorm
|
||||
sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
|
||||
// Infinity
|
||||
and b32 $r1 $r1 0x80000000
|
||||
mov b32 $r0 0x0
|
||||
add b32 $r1 $r1 0x7ff00000
|
||||
bra #rcp_end
|
||||
rcp_result_denorm:
|
||||
// Denorm result comes from huge input. The greatest possible fp64, i.e.
|
||||
// 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
|
||||
// normal value. Other rcp result should be greater than that. If we
|
||||
// set the exponent field to 1, we can recover the result by multiplying
|
||||
// it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
|
||||
// 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
|
||||
// the logic here.
|
||||
set b32 $p0 0x1 ne u32 $r3 0x0
|
||||
and b32 $r1 $r1 0x800fffff
|
||||
// 0x3e800000: 1/4
|
||||
$p0 cvt f64 $r6d f32 0x3e800000
|
||||
sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
|
||||
// 0x3f000000: 1/2
|
||||
(not $p0) cvt f64 $r6d f32 0x3f000000
|
||||
add b32 $r1 $r1 0x00100000
|
||||
mul rn f64 $r0d $r0d $r6d
|
||||
rcp_end:
|
||||
ret
|
||||
|
||||
// RSQ F64
|
||||
//
|
||||
// INPUT: $r0d
|
||||
// OUTPUT: $r0d
|
||||
// CLOBBER: $r2 - $r9, $p0 - $p1
|
||||
//
|
||||
gk110_rsq_f64:
|
||||
// Before getting initial result rsqrt64h, two special cases should be
|
||||
// handled first.
|
||||
// 1. NaN: set the highest bit in mantissa so it'll be surely recognized
|
||||
// as NaN in rsqrt64h
|
||||
set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
|
||||
$p0 or b32 $r1 $r1 0x00080000
|
||||
and b32 $r2 $r1 0x7fffffff
|
||||
sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
|
||||
// 2. denorms and small normal values: using their original value will
|
||||
// lose precision either at rsqrt64h or the first step in newton-raphson
|
||||
// steps below. Take 2 as a threshold in exponent field, and multiply
|
||||
// with 2^54 if the exponent is smaller or equal. (will multiply 2^27
|
||||
// to recover in the end)
|
||||
ext u32 $r3 $r1 0xb14
|
||||
set b32 $p1 0x1 le u32 $r3 0x2
|
||||
or b32 $r2 $r0 $r2
|
||||
$p1 mul rn f64 $r0d $r0d 0x4350000000000000
|
||||
rsqrt64h f32 $r5 $r1
|
||||
// rsqrt64h will give correct result for 0/inf/nan, the following logic
|
||||
// checks whether the input is one of those (exponent is 0x7ff or all 0
|
||||
// except for the sign bit)
|
||||
set b32 $r6 ne u32 $r3 0x7ff
|
||||
and b32 $r2 $r2 $r6
|
||||
sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
|
||||
set b32 $p0 0x1 ne u32 $r2 0x0
|
||||
$p0 bra #rsq_norm
|
||||
// For 0/inf/nan, make sure the sign bit agrees with input and return
|
||||
and b32 $r1 $r1 0x80000000
|
||||
mov b32 $r0 0x0
|
||||
or b32 $r1 $r1 $r5
|
||||
ret
|
||||
rsq_norm:
|
||||
// For others, do 4 Newton-Raphson steps with the formula:
|
||||
// RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
|
||||
// In the code below, each step is written as:
|
||||
// tmp1 = 0.5 * x * RSQ_{n}
|
||||
// tmp2 = -RSQ_{n} * tmp1 + 0.5
|
||||
// RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
|
||||
mov b32 $r4 0x0
|
||||
sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
|
||||
// 0x3f000000: 1/2
|
||||
cvt f64 $r8d f32 0x3f000000
|
||||
mul rn f64 $r2d $r0d $r8d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
mul rn f64 $r0d $r2d $r4d
|
||||
fma rn f64 $r6d neg $r4d $r0d $r8d
|
||||
fma rn f64 $r4d $r4d $r6d $r4d
|
||||
sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
|
||||
// Multiply 2^27 to result for small inputs to recover
|
||||
$p1 mul rn f64 $r4d $r4d 0x41a0000000000000
|
||||
mov b32 $r1 $r5
|
||||
mov b32 $r0 $r4
|
||||
ret
|
||||
|
||||
.section #gk110_builtin_offsets
|
||||
|
@@ -65,11 +65,132 @@ uint64_t gk110_builtin_code[] = {
|
||||
0xe088000001000406,
|
||||
0x4000000000800001,
|
||||
0xe6010000000ce802,
|
||||
0x08b08010a010b810,
|
||||
0x08a0a0a010a0b810,
|
||||
0xe60100000088e806,
|
||||
0x19000000001c003c,
|
||||
/* 0x0218: gk110_rcp_f64 */
|
||||
/* 0x0218: gk110_rsq_f64 */
|
||||
0xc00000058a1c0409,
|
||||
0x407fffffff9c080d,
|
||||
0x1480000050000000,
|
||||
0xb3401c03fe9c0c1d,
|
||||
0xe4c03c007f9c000e,
|
||||
0x08a0a0bcacb410bc,
|
||||
0x8580000000603c02,
|
||||
0x747fffffff9fc00e,
|
||||
0xb4601fff801c021d,
|
||||
0x120000000420003c,
|
||||
0x21000400005c0404,
|
||||
/* 0x0270: rcp_inf_or_denorm_or_zero */
|
||||
0x203ff800001c0410,
|
||||
0xb3281c00001c101d,
|
||||
0x0880bcacb4bc10ac,
|
||||
0x120000000800003c,
|
||||
0x223ff800001c0404,
|
||||
0xe4c03c007fdc0002,
|
||||
/* 0x02a0: rcp_denorm_or_zero */
|
||||
0xb4601c00001c021d,
|
||||
0x120000000400003c,
|
||||
0x213ff800005c0404,
|
||||
/* 0x02b8: rcp_denorm */
|
||||
0xc400021a801c0001,
|
||||
0x08a010a0a0aca0bc,
|
||||
0x740000001b5fc00e,
|
||||
/* 0x02d0: rcp_rejoin */
|
||||
0xb3181c00001c0c1d,
|
||||
0x12000000c000003c,
|
||||
0xc00000058a1c0409,
|
||||
0x204007ffff9c041c,
|
||||
0x401ff800001c1c1d,
|
||||
0xe4c03c00001c001a,
|
||||
0x08b8aca8a0a010ac,
|
||||
0xe5400c00031c3816,
|
||||
0x84000000021c1412,
|
||||
0x745fc000001fc002,
|
||||
0xcc000000029c1016,
|
||||
0xcc081000029c1002,
|
||||
0xe5400000001c2c02,
|
||||
0xe5410000031c3c1a,
|
||||
0x08a4a4a4a4a4a4b8,
|
||||
0xc54001fc001c2c21,
|
||||
0xdb802000001c1812,
|
||||
0xdb800000021c0002,
|
||||
0xdb802000001c1812,
|
||||
0xdb800000021c0002,
|
||||
0xdb802000001c1812,
|
||||
0xdb800000021c0002,
|
||||
0x08a0a0a0a0a080a4,
|
||||
0xdb802000001c1812,
|
||||
0xdb800000021c0002,
|
||||
0x48000001ff9c0809,
|
||||
0xe0800000019c0812,
|
||||
0xc00000058a1c040d,
|
||||
0xe0800000021c0c0e,
|
||||
0x407fffffff9c0c09,
|
||||
0x08aca0a0aca0aca0,
|
||||
0xb3101c03ff1c081d,
|
||||
0x120000000c20003c,
|
||||
0xc24000000a1c1011,
|
||||
0xe0800000009c1006,
|
||||
0x12000000381c003c,
|
||||
/* 0x03f0: rcp_result_inf_or_denorm */
|
||||
0xb3681c03ff9c0c1d,
|
||||
0x120000001420003c,
|
||||
0x08bc948caca09480,
|
||||
0x20400000001c0404,
|
||||
0xe4c03c007f9c0002,
|
||||
0x403ff800001c0405,
|
||||
0x120000001c1c003c,
|
||||
/* 0x0428: rcp_result_denorm */
|
||||
0xb3501c00001c0c1d,
|
||||
0x204007ffff9c0404,
|
||||
0xc54001f400002c19,
|
||||
0x089c80a8b8b0a0bc,
|
||||
0xc54001f800202c19,
|
||||
0x40000800001c0405,
|
||||
0xe4000000031c0002,
|
||||
/* 0x0460: rcp_end */
|
||||
0x19000000001c003c,
|
||||
/* 0x0468: gk110_rsq_f64 */
|
||||
0xb4601fff801c021d,
|
||||
0x2100040000000404,
|
||||
0x203fffffff9c0408,
|
||||
0x08a0a094b0a0809c,
|
||||
0xc00000058a1c040d,
|
||||
0xb3301c00011c0c3d,
|
||||
0xe2001000011c000a,
|
||||
0xc400021a80040001,
|
||||
0x84000000039c0416,
|
||||
0xb2d01c03ff9c0c19,
|
||||
0xe2000000031c080a,
|
||||
0x08a0b8a09c80aca0,
|
||||
0xb3501c00001c081d,
|
||||
0x120000001000003c,
|
||||
0x20400000001c0404,
|
||||
0xe4c03c007f9c0002,
|
||||
0xe2001000029c0406,
|
||||
0x19000000001c003c,
|
||||
/* 0x04f8: rsq_norm */
|
||||
0xe4c03c007f9c0012,
|
||||
0x08a4a4a4a4a4a4bc,
|
||||
0xc54001f8001c2c21,
|
||||
0xe4000000041c000a,
|
||||
0xe4000000021c0802,
|
||||
0xdb882000001c101a,
|
||||
0xdb801000031c1012,
|
||||
0xe4000000021c0802,
|
||||
0xdb882000001c101a,
|
||||
0x08a4a4a4a4a4a4a4,
|
||||
0xdb801000031c1012,
|
||||
0xe4000000021c0802,
|
||||
0xdb882000001c101a,
|
||||
0xdb801000031c1012,
|
||||
0xe4000000021c0802,
|
||||
0xdb882000001c101a,
|
||||
0xdb801000031c1012,
|
||||
0x08000000b8a080a4,
|
||||
0xc400020d00041011,
|
||||
0xe4c03c00029c0006,
|
||||
0xe4c03c00021c0002,
|
||||
0x19000000001c003c,
|
||||
};
|
||||
|
||||
@@ -77,5 +198,5 @@ uint64_t gk110_builtin_offsets[] = {
|
||||
0x0000000000000000,
|
||||
0x00000000000000f0,
|
||||
0x0000000000000218,
|
||||
0x0000000000000218,
|
||||
0x0000000000000468,
|
||||
};
|
||||
|
@@ -100,10 +100,253 @@ gm107_div_s32:
|
||||
ret
|
||||
nop 0
|
||||
|
||||
// STUB
|
||||
// RCP F64
|
||||
//
|
||||
// INPUT: $r0d
|
||||
// OUTPUT: $r0d
|
||||
// CLOBBER: $r2 - $r9, $p0
|
||||
//
|
||||
// The core of RCP and RSQ implementation is Newton-Raphson step, which is
|
||||
// used to find successively better approximation from an imprecise initial
|
||||
// value (single precision rcp in RCP and rsqrt64h in RSQ).
|
||||
//
|
||||
gm107_rcp_f64:
|
||||
gm107_rsq_f64:
|
||||
// Step 1: classify input according to exponent and value, and calculate
|
||||
// result for 0/inf/nan. $r2 holds the exponent value, which starts at
|
||||
// bit 52 (bit 20 of the upper half) and is 11 bits in length
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
bfe u32 $r2 $r1 0xb14
|
||||
iadd32i $r3 $r2 -1
|
||||
ssy #rcp_rejoin
|
||||
// We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
|
||||
// denorm, or 0). Do this by subtracting 1 from the exponent, which will
|
||||
// mean that it's > 0x7fd in those cases when doing unsigned comparison
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
isetp gt u32 and $p0 1 $r3 0x7fd 1
|
||||
// $r3: 0 for norms, 0x36 for denorms, -1 for others
|
||||
mov $r3 0x0 0xf
|
||||
not $p0 sync
|
||||
// Process all special values: NaN, inf, denorm, 0
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
mov32i $r3 0xffffffff 0xf
|
||||
// A number is NaN if its abs value is greater than or unordered with inf
|
||||
dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
|
||||
not $p0 bra #rcp_inf_or_denorm_or_zero
|
||||
// NaN -> NaN, the next line sets the "quiet" bit of the result. This
|
||||
// behavior is both seen on the CPU and the blob
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
lop32i or $r1 $r1 0x80000
|
||||
sync
|
||||
rcp_inf_or_denorm_or_zero:
|
||||
lop32i and $r4 $r1 0x7ff00000
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
// Other values with nonzero in exponent field should be inf
|
||||
isetp eq and $p0 1 $r4 0x0 1
|
||||
$p0 bra #rcp_denorm_or_zero
|
||||
// +/-Inf -> +/-0
|
||||
lop32i xor $r1 $r1 0x7ff00000
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
mov $r0 0x0 0xf
|
||||
sync
|
||||
rcp_denorm_or_zero:
|
||||
dsetp gtu and $p0 1 abs $r0 0x0 1
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
$p0 bra #rcp_denorm
|
||||
// +/-0 -> +/-Inf
|
||||
lop32i or $r1 $r1 0x7ff00000
|
||||
sync
|
||||
rcp_denorm:
|
||||
// non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
dmul $r0 $r0 0x4350000000000000
|
||||
mov $r3 0x36 0xf
|
||||
sync
|
||||
rcp_rejoin:
|
||||
// All numbers with -1 in $r3 have their result ready in $r0d, return them
|
||||
// others need further calculation
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
isetp lt and $p0 1 $r3 0x0 1
|
||||
$p0 bra #rcp_end
|
||||
// Step 2: Before the real calculation goes on, renormalize the values to
|
||||
// range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
|
||||
// result in $r6d. The exponent will be recovered later.
|
||||
bfe u32 $r2 $r1 0xb14
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
lop32i and $r7 $r1 0x800fffff
|
||||
iadd32i $r7 $r7 0x3ff00000
|
||||
mov $r6 $r0 0xf
|
||||
// Step 3: Convert new value to float (no overflow will occur due to step
|
||||
// 2), calculate rcp and do newton-raphson step once
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
f2f ftz f64 f32 $r5 $r6
|
||||
mufu rcp $r4 $r5
|
||||
mov32i $r0 0xbf800000 0xf
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
ffma $r5 $r4 $r5 $r0
|
||||
ffma $r0 $r5 neg $r4 $r4
|
||||
// Step 4: convert result $r0 back to double, do newton-raphson steps
|
||||
f2f f32 f64 $r0 $r0
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
f2f f64 f64 $r6 neg $r6
|
||||
f2f f32 f64 $r8 0x3f800000
|
||||
// 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
|
||||
// The formula used here (and above) is:
|
||||
// RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
|
||||
// The following code uses 2 FMAs for each step, and it will basically
|
||||
// look like:
|
||||
// tmp = -src * RCP_{n} + 1
|
||||
// RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
|
||||
dfma $r4 $r6 $r0 $r8
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
dfma $r0 $r0 $r4 $r0
|
||||
dfma $r4 $r6 $r0 $r8
|
||||
dfma $r0 $r0 $r4 $r0
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
dfma $r4 $r6 $r0 $r8
|
||||
dfma $r0 $r0 $r4 $r0
|
||||
dfma $r4 $r6 $r0 $r8
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
dfma $r0 $r0 $r4 $r0
|
||||
// Step 5: Exponent recovery and final processing
|
||||
// The exponent is recovered by adding what we added to the exponent.
|
||||
// Suppose we want to calculate rcp(x), but we have rcp(cx), then
|
||||
// rcp(x) = c * rcp(cx)
|
||||
// The delta in exponent comes from two sources:
|
||||
// 1) The renormalization in step 2. The delta is:
|
||||
// 0x3ff - $r2
|
||||
// 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
|
||||
// in $r3
|
||||
// These 2 sources are calculated in the first two lines below, and then
|
||||
// added to the exponent extracted from the result above.
|
||||
// Note that after processing, the new exponent may be >= 0x7ff (inf)
|
||||
// or <= 0 (denorm). Those cases are handled separately below
|
||||
iadd $r2 neg $r2 0x3ff
|
||||
iadd $r4 $r2 $r3
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
bfe u32 $r3 $r1 0xb14
|
||||
// New exponent in $r3
|
||||
iadd $r3 $r3 $r4
|
||||
iadd32i $r2 $r3 -1
|
||||
// (exponent-1) < 0x7fe (unsigned) means the result is in norm range
|
||||
// (same logic as in step 1)
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
isetp lt u32 and $p0 1 $r2 0x7fe 1
|
||||
not $p0 bra #rcp_result_inf_or_denorm
|
||||
// Norms: convert exponents back and return
|
||||
shl $r4 $r4 0x14
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
iadd $r1 $r4 $r1
|
||||
bra #rcp_end
|
||||
rcp_result_inf_or_denorm:
|
||||
// New exponent >= 0x7ff means that result is inf
|
||||
isetp ge and $p0 1 $r3 0x7ff 1
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
not $p0 bra #rcp_result_denorm
|
||||
// Infinity
|
||||
lop32i and $r1 $r1 0x80000000
|
||||
mov $r0 0x0 0xf
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
iadd32i $r1 $r1 0x7ff00000
|
||||
bra #rcp_end
|
||||
rcp_result_denorm:
|
||||
// Denorm result comes from huge input. The greatest possible fp64, i.e.
|
||||
// 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
|
||||
// normal value. Other rcp results should be greater than that. If we
|
||||
// set the exponent field to 1, we can recover the result by multiplying
|
||||
// it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
|
||||
// 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
|
||||
// the logic here.
|
||||
isetp ne u32 and $p0 1 $r3 0x0 1
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
lop32i and $r1 $r1 0x800fffff
|
||||
// 0x3e800000: 1/4
|
||||
$p0 f2f f32 f64 $r6 0x3e800000
|
||||
// 0x3f000000: 1/2
|
||||
not $p0 f2f f32 f64 $r6 0x3f000000
|
||||
sched (st 0x0) (st 0x0) (st 0x0)
|
||||
iadd32i $r1 $r1 0x00100000
|
||||
dmul $r0 $r0 $r6
|
||||
rcp_end:
|
||||
ret
|
||||
|
||||
// RSQ F64
|
||||
//
|
||||
// INPUT: $r0d
|
||||
// OUTPUT: $r0d
|
||||
// CLOBBER: $r2 - $r9, $p0 - $p1
|
||||
//
|
||||
gm107_rsq_f64:
|
||||
// Before getting initial result rsqrt64h, two special cases should be
|
||||
// handled first.
|
||||
// 1. NaN: set the highest bit in mantissa so it'll be surely recognized
|
||||
// as NaN in rsqrt64h
|
||||
sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd)
|
||||
dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
|
||||
$p0 lop32i or $r1 $r1 0x00080000
|
||||
lop32i and $r2 $r1 0x7fffffff
|
||||
// 2. denorms and small normal values: using their original value will
|
||||
// lose precision either at rsqrt64h or the first step in newton-raphson
|
||||
// steps below. Take 2 as a threshold in exponent field, and multiply
|
||||
// with 2^54 if the exponent is smaller or equal. (will multiply 2^27
|
||||
// to recover in the end)
|
||||
sched (st 0xd) (st 0xd) (st 0xd)
|
||||
bfe u32 $r3 $r1 0xb14
|
||||
isetp le u32 and $p1 1 $r3 0x2 1
|
||||
lop or 1 $r2 $r0 $r2
|
||||
sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd)
|
||||
$p1 dmul $r0 $r0 0x4350000000000000
|
||||
mufu rsq64h $r5 $r1
|
||||
// rsqrt64h will give correct result for 0/inf/nan, the following logic
|
||||
// checks whether the input is one of those (exponent is 0x7ff or all 0
|
||||
// except for the sign bit)
|
||||
iset ne u32 and $r6 $r3 0x7ff 1
|
||||
sched (st 0xd) (st 0xd) (st 0xd)
|
||||
lop and 1 $r2 $r2 $r6
|
||||
isetp ne u32 and $p0 1 $r2 0x0 1
|
||||
$p0 bra #rsq_norm
|
||||
// For 0/inf/nan, make sure the sign bit agrees with input and return
|
||||
sched (st 0xd) (st 0xd) (st 0xd wt 0x1)
|
||||
lop32i and $r1 $r1 0x80000000
|
||||
mov $r0 0x0 0xf
|
||||
lop or 1 $r1 $r1 $r5
|
||||
sched (st 0xd) (st 0xf) (st 0xf)
|
||||
ret
|
||||
nop 0
|
||||
nop 0
|
||||
rsq_norm:
|
||||
// For others, do 4 Newton-Raphson steps with the formula:
|
||||
// RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
|
||||
// In the code below, each step is written as:
|
||||
// tmp1 = 0.5 * x * RSQ_{n}
|
||||
// tmp2 = -RSQ_{n} * tmp1 + 0.5
|
||||
// RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
|
||||
sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3)
|
||||
mov $r4 0x0 0xf
|
||||
// 0x3f000000: 1/2
|
||||
f2f f32 f64 $r8 0x3f000000
|
||||
dmul $r2 $r0 $r8
|
||||
sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
|
||||
dmul $r0 $r2 $r4
|
||||
dfma $r6 $r0 neg $r4 $r8
|
||||
dfma $r4 $r4 $r6 $r4
|
||||
sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
|
||||
dmul $r0 $r2 $r4
|
||||
dfma $r6 $r0 neg $r4 $r8
|
||||
dfma $r4 $r4 $r6 $r4
|
||||
sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
|
||||
dmul $r0 $r2 $r4
|
||||
dfma $r6 $r0 neg $r4 $r8
|
||||
dfma $r4 $r4 $r6 $r4
|
||||
sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
|
||||
dmul $r0 $r2 $r4
|
||||
dfma $r6 $r0 neg $r4 $r8
|
||||
dfma $r4 $r4 $r6 $r4
|
||||
// Multiply the result by 2^27 for small inputs to recover
|
||||
sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd)
|
||||
$p1 dmul $r4 $r4 0x41a0000000000000
|
||||
mov $r1 $r5 0xf
|
||||
mov $r0 $r4 0xf
|
||||
sched (st 0xd) (st 0xf) (st 0xf)
|
||||
ret
|
||||
nop 0
|
||||
nop 0
|
||||
|
@@ -82,8 +82,156 @@ uint64_t gm107_builtin_code[] = {
|
||||
0xe32000000007000f,
|
||||
0x50b0000000070f00,
|
||||
/* 0x0280: gm107_rcp_f64 */
|
||||
/* 0x0280: gm107_rsq_f64 */
|
||||
0x001f8000fc0007e0,
|
||||
0x38000000b1470102,
|
||||
0x1c0ffffffff70203,
|
||||
0xe29000000e000000,
|
||||
0x001f8000fc0007e0,
|
||||
0x366803807fd70307,
|
||||
0x5c9807800ff70003,
|
||||
0xf0f800000008000f,
|
||||
0x001f8000fc0007e0,
|
||||
0x010ffffffff7f003,
|
||||
0x368c03fff0070087,
|
||||
0xe24000000188000f,
|
||||
0x001f8000fc0007e0,
|
||||
0x0420008000070101,
|
||||
0xf0f800000007000f,
|
||||
/* 0x02f8: rcp_inf_or_denorm_or_zero */
|
||||
0x0407ff0000070104,
|
||||
0x001f8000fc0007e0,
|
||||
0x5b6503800ff70407,
|
||||
0xe24000000200000f,
|
||||
0x0447ff0000070101,
|
||||
0x001f8000fc0007e0,
|
||||
0x5c9807800ff70000,
|
||||
0xf0f800000007000f,
|
||||
/* 0x0338: rcp_denorm_or_zero */
|
||||
0x5b8c03800ff70087,
|
||||
0x001f8000fc0007e0,
|
||||
0xe24000000100000f,
|
||||
0x0427ff0000070101,
|
||||
0xf0f800000007000f,
|
||||
/* 0x0360: rcp_denorm */
|
||||
0x001f8000fc0007e0,
|
||||
0x3880004350070000,
|
||||
0x3898078003670003,
|
||||
0xf0f800000007000f,
|
||||
/* 0x0380: rcp_rejoin */
|
||||
0x001f8000fc0007e0,
|
||||
0x5b6303800ff70307,
|
||||
0xe24000001c00000f,
|
||||
0x38000000b1470102,
|
||||
0x001f8000fc0007e0,
|
||||
0x040800fffff70107,
|
||||
0x1c03ff0000070707,
|
||||
0x5c98078000070006,
|
||||
0x001f8000fc0007e0,
|
||||
0x5ca8100000670e05,
|
||||
0x5080000000470504,
|
||||
0x010bf8000007f000,
|
||||
0x001f8000fc0007e0,
|
||||
0x5980000000570405,
|
||||
0x5981020000470500,
|
||||
0x5ca8000000070b00,
|
||||
0x001f8000fc0007e0,
|
||||
0x5ca8200000670f06,
|
||||
0x38a8003f80070b08,
|
||||
0x5b70040000070604,
|
||||
0x001f8000fc0007e0,
|
||||
0x5b70000000470000,
|
||||
0x5b70040000070604,
|
||||
0x5b70000000470000,
|
||||
0x001f8000fc0007e0,
|
||||
0x5b70040000070604,
|
||||
0x5b70000000470000,
|
||||
0x5b70040000070604,
|
||||
0x001f8000fc0007e0,
|
||||
0x5b70000000470000,
|
||||
0x381200003ff70202,
|
||||
0x5c10000000370204,
|
||||
0x001f8000fc0007e0,
|
||||
0x38000000b1470103,
|
||||
0x5c10000000470303,
|
||||
0x1c0ffffffff70302,
|
||||
0x001f8000fc0007e0,
|
||||
0x366203807fe70207,
|
||||
0xe24000000208000f,
|
||||
0x3848000001470404,
|
||||
0x001f8000fc0007e0,
|
||||
0x5c10000000170401,
|
||||
0xe24000000807000f,
|
||||
/* 0x04d8: rcp_result_inf_or_denorm */
|
||||
0x366d03807ff70307,
|
||||
0x001f8000fc0007e0,
|
||||
0xe24000000288000f,
|
||||
0x0408000000070101,
|
||||
0x5c9807800ff70000,
|
||||
0x001f8000fc0007e0,
|
||||
0x1c07ff0000070101,
|
||||
0xe24000000407000f,
|
||||
/* 0x0518: rcp_result_denorm */
|
||||
0x5b6a03800ff70307,
|
||||
0x001f8000fc0007e0,
|
||||
0x040800fffff70101,
|
||||
0x38a8003e80000b06,
|
||||
0x38a8003f00080b06,
|
||||
0x001f8000fc0007e0,
|
||||
0x1c00010000070101,
|
||||
0x5c80000000670000,
|
||||
/* 0x0558: rcp_end */
|
||||
0xe32000000007000f,
|
||||
/* 0x0560: gm107_rsq_f64 */
|
||||
0x001fb401fda1ff0d,
|
||||
0x368c03fff0070087,
|
||||
0x0420008000000101,
|
||||
0x0407fffffff70102,
|
||||
0x001fb400fda007ed,
|
||||
0x38000000b1470103,
|
||||
0x366603800027030f,
|
||||
0x5c47020000270002,
|
||||
0x001fb401e1a0070d,
|
||||
0x3880004350010000,
|
||||
0x5080000000770105,
|
||||
0x365a03807ff70306,
|
||||
0x001fb400fda007ed,
|
||||
0x5c47000000670202,
|
||||
0x5b6a03800ff70207,
|
||||
0xe24000000400000f,
|
||||
0x003fb400fda007ed,
|
||||
0x0408000000070101,
|
||||
0x5c9807800ff70000,
|
||||
0x5c47020000570101,
|
||||
0x001fbc00fde007ed,
|
||||
0xe32000000007000f,
|
||||
0x50b0000000070f00,
|
||||
0x50b0000000070f00,
|
||||
/* 0x0620: rsq_norm */
|
||||
0x0060b400e5a007ed,
|
||||
0x5c9807800ff70004,
|
||||
0x38a8003f00070b08,
|
||||
0x5c80000000870002,
|
||||
0x003c3401e1a01f0d,
|
||||
0x5c80000000470200,
|
||||
0x5b71040000470006,
|
||||
0x5b70020000670404,
|
||||
0x003c3401e1a00f0d,
|
||||
0x5c80000000470200,
|
||||
0x5b71040000470006,
|
||||
0x5b70020000670404,
|
||||
0x003c3401e1a00f0d,
|
||||
0x5c80000000470200,
|
||||
0x5b71040000470006,
|
||||
0x5b70020000670404,
|
||||
0x003c3401e1a00f0d,
|
||||
0x5c80000000470200,
|
||||
0x5b71040000470006,
|
||||
0x5b70020000670404,
|
||||
0x001fb401fda00f0d,
|
||||
0x38800041a0010404,
|
||||
0x5c98078000570001,
|
||||
0x5c98078000470000,
|
||||
0x001fbc00fde007ed,
|
||||
0xe32000000007000f,
|
||||
0x50b0000000070f00,
|
||||
0x50b0000000070f00,
|
||||
@@ -93,5 +241,5 @@ uint64_t gm107_builtin_offsets[] = {
|
||||
0x0000000000000000,
|
||||
0x0000000000000120,
|
||||
0x0000000000000280,
|
||||
0x0000000000000280,
|
||||
0x0000000000000560,
|
||||
};
|
||||
|
@@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch)
|
||||
binSize = 0;
|
||||
|
||||
maxGPR = -1;
|
||||
fp64 = false;
|
||||
|
||||
main = new Function(this, "MAIN", ~0);
|
||||
calls.insert(&main->call);
|
||||
|
@@ -1311,6 +1311,7 @@ public:
|
||||
uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL
|
||||
|
||||
int maxGPR;
|
||||
bool fp64;
|
||||
|
||||
MemoryPool mem_Instruction;
|
||||
MemoryPool mem_CmpInstruction;
|
||||
|
@@ -1087,6 +1087,8 @@ public:
|
||||
};
|
||||
std::vector<MemoryFile> memoryFiles;
|
||||
|
||||
std::vector<bool> bufferAtomics;
|
||||
|
||||
private:
|
||||
int inferSysValDirection(unsigned sn) const;
|
||||
bool scanDeclaration(const struct tgsi_full_declaration *);
|
||||
@@ -1137,6 +1139,7 @@ bool Source::scanSource()
|
||||
//resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
|
||||
tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1);
|
||||
memoryFiles.resize(scan.file_max[TGSI_FILE_MEMORY] + 1);
|
||||
bufferAtomics.resize(scan.file_max[TGSI_FILE_BUFFER] + 1);
|
||||
|
||||
info->immd.bufSize = 0;
|
||||
|
||||
@@ -1483,11 +1486,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
|
||||
tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair(
|
||||
first, last - first + 1)));
|
||||
break;
|
||||
case TGSI_FILE_BUFFER:
|
||||
for (i = first; i <= last; ++i)
|
||||
bufferAtomics[i] = decl->Declaration.Atomic;
|
||||
break;
|
||||
case TGSI_FILE_ADDRESS:
|
||||
case TGSI_FILE_CONSTANT:
|
||||
case TGSI_FILE_IMMEDIATE:
|
||||
case TGSI_FILE_SAMPLER:
|
||||
case TGSI_FILE_BUFFER:
|
||||
case TGSI_FILE_IMAGE:
|
||||
break;
|
||||
default:
|
||||
@@ -2720,7 +2726,11 @@ Converter::handleLOAD(Value *dst0[4])
|
||||
}
|
||||
|
||||
Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
|
||||
ld->cache = tgsi.getCacheMode();
|
||||
if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER &&
|
||||
code->bufferAtomics[r])
|
||||
ld->cache = nv50_ir::CACHE_CG;
|
||||
else
|
||||
ld->cache = tgsi.getCacheMode();
|
||||
if (ind)
|
||||
ld->setIndirect(0, 1, ind);
|
||||
}
|
||||
|
@@ -83,6 +83,38 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
|
||||
delete_Instruction(prog, i);
|
||||
}
|
||||
|
||||
// Replace a 64-bit OP_RCP / OP_RSQ instruction with a call to the matching
// builtin library routine (NVC0_BUILTIN_RCP_F64 or NVC0_BUILTIN_RSQ_F64).
//
// i   - the instruction being lowered; it is deleted before returning.
// src - two values that are moved into fixed registers 0 and 1 before the
//       call (presumably the two 32-bit halves of the f64 operand -- confirm
//       against the caller).
//
// After the call the result is read back from registers 0 and 1 and merged
// into i's original def as a TYPE_U64 value. Also sets prog->fp64.
void
|
||||
NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
|
||||
{
|
||||
FlowInstruction *call;
|
||||
Value *def[2];
|
||||
int builtin;
|
||||
|
||||
// Place the operand in fixed registers 0 and 1. Note: def[] is reassigned
// after the call below, so the values stored here are not read again in
// this function.
def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
|
||||
def[1] = bld.mkMovToReg(1, src[1])->getDef(0);
|
||||
|
||||
// Anything that is not OP_RCP is treated as RSQ.
if (i->op == OP_RCP)
|
||||
builtin = NVC0_BUILTIN_RCP_F64;
|
||||
else
|
||||
builtin = NVC0_BUILTIN_RSQ_F64;
|
||||
|
||||
call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
|
||||
// Fresh SSA values that receive the contents of registers 0 and 1 after the
// call.
def[0] = bld.getSSA();
|
||||
def[1] = bld.getSSA();
|
||||
bld.mkMovFromReg(def[0], 0);
|
||||
bld.mkMovFromReg(def[1], 1);
|
||||
// 0x3fc has bits 2..9 set -- presumably marks GPRs 2-9 as clobbered by the
// builtin; TODO confirm mkClobber's mask/size argument semantics.
bld.mkClobber(FILE_GPR, 0x3fc, 2);
|
||||
// RSQ gets predicate mask 0x3 (bits 0-1), RCP gets 0x1 (bit 0 only).
bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
|
||||
// Recombine the two 32-bit values into the original 64-bit destination.
bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);
|
||||
|
||||
// Flag the call as fixed, absolute and targeting a builtin, then record
// which builtin it refers to.
call->fixed = 1;
|
||||
call->absolute = call->builtin = 1;
|
||||
call->target.builtin = builtin;
|
||||
delete_Instruction(prog, i);
|
||||
|
||||
// Record on the program that fp64 is in use.
prog->fp64 = true;
|
||||
}
|
||||
|
||||
void
|
||||
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
|
||||
{
|
||||
@@ -96,6 +128,12 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
|
||||
Value *src[2], *dst[2], *def = i->getDef(0);
|
||||
bld.mkSplit(src, 4, i->getSrc(0));
|
||||
|
||||
int chip = prog->getTarget()->getChipset();
|
||||
if (chip >= NVISA_GK104_CHIPSET) {
|
||||
handleRCPRSQLib(i, src);
|
||||
return;
|
||||
}
|
||||
|
||||
// 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
|
||||
dst[0] = bld.loadImm(NULL, 0);
|
||||
dst[1] = bld.getSSA();
|
||||
@@ -1063,22 +1101,6 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
|
||||
}
|
||||
}
|
||||
|
||||
if (chipset >= NVISA_GK104_CHIPSET) {
|
||||
//
|
||||
// If TEX requires more than 4 sources, the 2nd register tuple must be
|
||||
// aligned to 4, even if it consists of just a single 4-byte register.
|
||||
//
|
||||
// XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
|
||||
//
|
||||
int s = i->srcCount(0xff, true);
|
||||
if (s > 4 && s < 7) {
|
||||
if (i->srcExists(s)) // move potential predicate out of the way
|
||||
i->moveSources(s, 7 - s);
|
||||
while (s < 7)
|
||||
i->setSrc(s++, bld.loadImm(NULL, 0));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1887,7 +1909,8 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
|
||||
su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
|
||||
const int slot = su->tex.r;
|
||||
const int dim = su->tex.target.getDim();
|
||||
const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
|
||||
const bool array = su->tex.target.isArray() || su->tex.target.isCube();
|
||||
const int arg = dim + array;
|
||||
int c;
|
||||
Value *zero = bld.mkImm(0);
|
||||
Value *p1 = NULL;
|
||||
@@ -1896,6 +1919,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
|
||||
Value *bf, *eau, *off;
|
||||
Value *addr, *pred;
|
||||
Value *ind = su->getIndirectR();
|
||||
Value *y, *z;
|
||||
|
||||
off = bld.getScratch(4);
|
||||
bf = bld.getScratch(4);
|
||||
@@ -1926,34 +1950,42 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
|
||||
for (; c < 3; ++c)
|
||||
src[c] = zero;
|
||||
|
||||
if (dim == 2 && !array) {
|
||||
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
|
||||
src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
|
||||
v, bld.loadImm(NULL, 16));
|
||||
|
||||
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
|
||||
bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
|
||||
->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
|
||||
}
|
||||
|
||||
// set predicate output
|
||||
if (su->tex.target == TEX_TARGET_BUFFER) {
|
||||
src[0]->getInsn()->setFlagsDef(1, pred);
|
||||
} else
|
||||
if (su->tex.target.isArray() || su->tex.target.isCube()) {
|
||||
if (array) {
|
||||
p1 = bld.getSSA(1, FILE_PREDICATE);
|
||||
src[dim]->getInsn()->setFlagsDef(1, p1);
|
||||
}
|
||||
|
||||
// calculate pixel offset
|
||||
if (dim == 1) {
|
||||
y = z = zero;
|
||||
if (su->tex.target != TEX_TARGET_BUFFER)
|
||||
bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
|
||||
} else
|
||||
if (dim == 3) {
|
||||
} else {
|
||||
y = src[1];
|
||||
z = src[2];
|
||||
|
||||
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
|
||||
bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
|
||||
->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
|
||||
->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
|
||||
|
||||
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
|
||||
bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
|
||||
->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
|
||||
} else {
|
||||
assert(dim == 2);
|
||||
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
|
||||
bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
|
||||
->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
|
||||
NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
|
||||
->subOp = array ?
|
||||
NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
|
||||
}
|
||||
|
||||
// calculate effective address part 1
|
||||
@@ -1966,19 +1998,15 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
|
||||
->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
|
||||
}
|
||||
} else {
|
||||
Value *y = src[1];
|
||||
Value *z = src[2];
|
||||
uint16_t subOp = 0;
|
||||
|
||||
switch (dim) {
|
||||
case 1:
|
||||
y = zero;
|
||||
z = zero;
|
||||
break;
|
||||
case 2:
|
||||
z = off;
|
||||
if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
|
||||
z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
|
||||
if (array) {
|
||||
z = off;
|
||||
} else {
|
||||
subOp = NV50_IR_SUBOP_SUBFM_3D;
|
||||
}
|
||||
break;
|
||||
@@ -2001,7 +2029,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
|
||||
eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
|
||||
}
|
||||
// add array layer offset
|
||||
if (su->tex.target.isArray() || su->tex.target.isCube()) {
|
||||
if (array) {
|
||||
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
|
||||
if (dim == 1)
|
||||
bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
|
||||
|
@@ -62,6 +62,7 @@ private:
|
||||
|
||||
// we want to insert calls to the builtin library only after optimization
|
||||
void handleDIV(Instruction *); // integer division, modulus
|
||||
void handleRCPRSQLib(Instruction *, Value *[]);
|
||||
void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
|
||||
void handleFTZ(Instruction *);
|
||||
void handleSET(CmpInstruction *);
|
||||
|
@@ -2341,9 +2341,19 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
|
||||
if (!tex->tex.target.isArray() && tex->tex.useOffsets)
|
||||
s++;
|
||||
}
|
||||
n = tex->srcCount(0xff) - s;
|
||||
n = tex->srcCount(0xff, true) - s;
|
||||
// TODO: Is this necessary? Perhaps just has to be aligned to the
|
||||
// level that the first arg is, not necessarily to 4. This
|
||||
// requirement has not been rigorously verified, as it has been on
|
||||
// Kepler.
|
||||
if (n > 0 && n < 3) {
|
||||
if (tex->srcExists(n + s)) // move potential predicate out of the way
|
||||
tex->moveSources(n + s, 3 - n);
|
||||
while (n < 3)
|
||||
tex->setSrc(s + n++, new_LValue(func, FILE_GPR));
|
||||
}
|
||||
} else {
|
||||
s = tex->srcCount(0xff);
|
||||
s = tex->srcCount(0xff, true);
|
||||
n = 0;
|
||||
}
|
||||
|
||||
@@ -2366,14 +2376,18 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
|
||||
} else
|
||||
if (isTextureOp(tex->op)) {
|
||||
int n = tex->srcCount(0xff, true);
|
||||
if (n > 4) {
|
||||
condenseSrcs(tex, 0, 3);
|
||||
if (n > 5) // NOTE: first call modified positions already
|
||||
condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
|
||||
} else
|
||||
if (n > 1) {
|
||||
condenseSrcs(tex, 0, n - 1);
|
||||
int s = n > 4 ? 4 : n;
|
||||
if (n > 4 && n < 7) {
|
||||
if (tex->srcExists(n)) // move potential predicate out of the way
|
||||
tex->moveSources(n, 7 - n);
|
||||
|
||||
while (n < 7)
|
||||
tex->setSrc(n++, new_LValue(func, FILE_GPR));
|
||||
}
|
||||
if (s > 1)
|
||||
condenseSrcs(tex, 0, s - 1);
|
||||
if (n > 4)
|
||||
condenseSrcs(tex, 1, n - s);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2510,6 +2524,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s)
|
||||
assert(cst->getSrc(s)->defs.size() == 1); // still SSA
|
||||
|
||||
Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
|
||||
|
||||
bool imm = defi->op == OP_MOV &&
|
||||
defi->src(0).getFile() == FILE_IMMEDIATE;
|
||||
bool load = defi->op == OP_LOAD &&
|
||||
|
@@ -399,6 +399,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
|
||||
}
|
||||
}
|
||||
}
|
||||
info->io.fp64 |= fp64;
|
||||
info->bin.relocData = emit->getRelocInfo();
|
||||
info->bin.fixupData = emit->getFixupInfo();
|
||||
|
||||
|
@@ -79,6 +79,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
return 2048;
|
||||
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
|
||||
return 8 * 1024 * 1024;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 8;
|
||||
|
||||
/* supported capabilities */
|
||||
case PIPE_CAP_ANISOTROPIC_FILTER:
|
||||
case PIPE_CAP_POINT_SPRITE:
|
||||
|
@@ -98,12 +98,10 @@ nv50_render_condition(struct pipe_context *pipe,
|
||||
case PIPE_QUERY_OCCLUSION_COUNTER:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
|
||||
if (hq->state == NV50_HW_QUERY_STATE_READY)
|
||||
wait = true;
|
||||
if (likely(!condition)) {
|
||||
if (unlikely(hq->nesting))
|
||||
cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
|
||||
NV50_3D_COND_MODE_ALWAYS;
|
||||
else
|
||||
cond = NV50_3D_COND_MODE_RES_NON_ZERO;
|
||||
cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS;
|
||||
} else {
|
||||
cond = wait ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_ALWAYS;
|
||||
}
|
||||
@@ -129,7 +127,7 @@ nv50_render_condition(struct pipe_context *pipe,
|
||||
|
||||
PUSH_SPACE(push, 9);
|
||||
|
||||
if (wait) {
|
||||
if (wait && hq->state != NV50_HW_QUERY_STATE_READY) {
|
||||
BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
|
||||
PUSH_DATA (push, 0);
|
||||
}
|
||||
|
@@ -29,11 +29,6 @@
|
||||
#include "nv50/nv50_query_hw_sm.h"
|
||||
#include "nv_object.xml.h"
|
||||
|
||||
#define NV50_HW_QUERY_STATE_READY 0
|
||||
#define NV50_HW_QUERY_STATE_ACTIVE 1
|
||||
#define NV50_HW_QUERY_STATE_ENDED 2
|
||||
#define NV50_HW_QUERY_STATE_FLUSHED 3
|
||||
|
||||
/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
|
||||
* (since we use only a single GPU channel per screen) will not work properly.
|
||||
*
|
||||
@@ -158,8 +153,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
|
||||
case PIPE_QUERY_OCCLUSION_COUNTER:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
|
||||
hq->nesting = nv50->screen->num_occlusion_queries_active++;
|
||||
if (hq->nesting) {
|
||||
if (nv50->screen->num_occlusion_queries_active++) {
|
||||
nv50_hw_query_get(push, q, 0x10, 0x0100f002);
|
||||
} else {
|
||||
PUSH_SPACE(push, 4);
|
||||
|
@@ -6,6 +6,11 @@
|
||||
|
||||
#include "nv50_query.h"
|
||||
|
||||
#define NV50_HW_QUERY_STATE_READY 0
|
||||
#define NV50_HW_QUERY_STATE_ACTIVE 1
|
||||
#define NV50_HW_QUERY_STATE_ENDED 2
|
||||
#define NV50_HW_QUERY_STATE_FLUSHED 3
|
||||
|
||||
#define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
|
||||
|
||||
struct nv50_hw_query;
|
||||
@@ -29,7 +34,6 @@ struct nv50_hw_query {
|
||||
uint8_t state;
|
||||
bool is64bit;
|
||||
uint8_t rotate;
|
||||
int nesting; /* only used for occlusion queries */
|
||||
struct nouveau_mm_allocation *mm;
|
||||
struct nouveau_fence *fence;
|
||||
};
|
||||
|
@@ -156,6 +156,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
return NV50_MAX_WINDOW_RECTANGLES;
|
||||
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
|
||||
return 16 * 1024 * 1024;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 15;
|
||||
|
||||
/* supported caps */
|
||||
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
|
||||
@@ -215,6 +217,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_TGSI_CLOCK:
|
||||
case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
|
||||
case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
|
||||
case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
|
||||
return 1;
|
||||
case PIPE_CAP_SEAMLESS_CUBE_MAP:
|
||||
return 1; /* class_3d >= NVA0_3D_CLASS; */
|
||||
@@ -312,6 +315,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_TGSI_ATOMFADD:
|
||||
case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
|
||||
case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
|
||||
case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
|
||||
return 0;
|
||||
|
||||
case PIPE_CAP_VENDOR_ID:
|
||||
|
@@ -434,6 +434,7 @@ nvc0_video_buffer_create(struct pipe_context *pipe,
|
||||
|
||||
/* nvc0_push.c */
|
||||
void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *);
|
||||
void nvc0_push_vbo_indirect(struct nvc0_context *, const struct pipe_draw_info *);
|
||||
|
||||
/* nve4_compute.c */
|
||||
void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
|
||||
|
@@ -121,12 +121,10 @@ nvc0_render_condition(struct pipe_context *pipe,
|
||||
case PIPE_QUERY_OCCLUSION_COUNTER:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
|
||||
if (hq->state == NVC0_HW_QUERY_STATE_READY)
|
||||
wait = true;
|
||||
if (likely(!condition)) {
|
||||
if (unlikely(hq->nesting))
|
||||
cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
|
||||
NVC0_3D_COND_MODE_ALWAYS;
|
||||
else
|
||||
cond = NVC0_3D_COND_MODE_RES_NON_ZERO;
|
||||
cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
|
||||
} else {
|
||||
cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
|
||||
}
|
||||
@@ -151,7 +149,7 @@ nvc0_render_condition(struct pipe_context *pipe,
|
||||
return;
|
||||
}
|
||||
|
||||
if (wait)
|
||||
if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
|
||||
nvc0_hw_query_fifo_wait(nvc0, q);
|
||||
|
||||
PUSH_SPACE(push, 10);
|
||||
|
@@ -28,11 +28,6 @@
|
||||
#include "nvc0/nvc0_query_hw_metric.h"
|
||||
#include "nvc0/nvc0_query_hw_sm.h"
|
||||
|
||||
#define NVC0_HW_QUERY_STATE_READY 0
|
||||
#define NVC0_HW_QUERY_STATE_ACTIVE 1
|
||||
#define NVC0_HW_QUERY_STATE_ENDED 2
|
||||
#define NVC0_HW_QUERY_STATE_FLUSHED 3
|
||||
|
||||
#define NVC0_HW_QUERY_ALLOC_SPACE 256
|
||||
|
||||
bool
|
||||
@@ -158,14 +153,18 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
|
||||
case PIPE_QUERY_OCCLUSION_COUNTER:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE:
|
||||
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
|
||||
hq->nesting = nvc0->screen->num_occlusion_queries_active++;
|
||||
if (hq->nesting) {
|
||||
if (nvc0->screen->num_occlusion_queries_active++) {
|
||||
nvc0_hw_query_get(push, q, 0x10, 0x0100f002);
|
||||
} else {
|
||||
PUSH_SPACE(push, 3);
|
||||
BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
|
||||
PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
|
||||
IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
|
||||
/* Given that the counter is reset, the contents at 0x10 are
|
||||
* equivalent to doing the query -- we would get hq->sequence as the
|
||||
* payload and 0 as the reported value. This is already set up above
|
||||
* as in the hq->rotate case.
|
||||
*/
|
||||
}
|
||||
break;
|
||||
case PIPE_QUERY_PRIMITIVES_GENERATED:
|
||||
@@ -199,6 +198,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
|
||||
nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
|
||||
nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
|
||||
nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
|
||||
((uint64_t *)hq->data)[(12 + 10) * 2] = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -271,6 +271,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
|
||||
nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
|
||||
nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
|
||||
nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
|
||||
((uint64_t *)hq->data)[10 * 2] = 0;
|
||||
break;
|
||||
case PIPE_QUERY_TIMESTAMP_DISJOINT:
|
||||
/* This query is not issued on GPU because disjoint is forced to false */
|
||||
|
@@ -6,6 +6,11 @@
|
||||
|
||||
#include "nvc0_query.h"
|
||||
|
||||
#define NVC0_HW_QUERY_STATE_READY 0
|
||||
#define NVC0_HW_QUERY_STATE_ACTIVE 1
|
||||
#define NVC0_HW_QUERY_STATE_ENDED 2
|
||||
#define NVC0_HW_QUERY_STATE_FLUSHED 3
|
||||
|
||||
#define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
|
||||
|
||||
struct nvc0_hw_query;
|
||||
@@ -29,7 +34,6 @@ struct nvc0_hw_query {
|
||||
uint8_t state;
|
||||
boolean is64bit;
|
||||
uint8_t rotate;
|
||||
int nesting; /* only used for occlusion queries */
|
||||
struct nouveau_mm_allocation *mm;
|
||||
struct nouveau_fence *fence;
|
||||
};
|
||||
|
@@ -182,6 +182,13 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
return class_3d >= GM200_3D_CLASS ? 8 : 0;
|
||||
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
|
||||
return 64 * 1024 * 1024;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
/* NOTE: These only count our slots for GENERIC varyings.
|
||||
* The address space may be larger, but the actual hard limit seems to be
|
||||
* less than what the address space layout permits, so don't add TEXCOORD,
|
||||
* COLOR, etc. here.
|
||||
*/
|
||||
return 0x1f0 / 16;
|
||||
|
||||
/* supported caps */
|
||||
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
|
||||
@@ -266,6 +273,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
|
||||
case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
|
||||
case PIPE_CAP_QUERY_SO_OVERFLOW:
|
||||
case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
|
||||
return 1;
|
||||
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
|
||||
return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
|
||||
@@ -336,6 +344,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
|
||||
case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
|
||||
case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
|
||||
case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
|
||||
return 0;
|
||||
|
||||
case PIPE_CAP_VENDOR_ID:
|
||||
@@ -392,18 +401,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
|
||||
case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
|
||||
return 16;
|
||||
case PIPE_SHADER_CAP_MAX_INPUTS:
|
||||
if (shader == PIPE_SHADER_VERTEX)
|
||||
return 32;
|
||||
/* NOTE: These only count our slots for GENERIC varyings.
|
||||
* The address space may be larger, but the actual hard limit seems to be
|
||||
* less than what the address space layout permits, so don't add TEXCOORD,
|
||||
* COLOR, etc. here.
|
||||
*/
|
||||
if (shader == PIPE_SHADER_FRAGMENT)
|
||||
return 0x1f0 / 16;
|
||||
/* Actually this counts CLIPVERTEX, which occupies the last generic slot,
|
||||
* and excludes 0x60 per-patch inputs.
|
||||
*/
|
||||
return 0x200 / 16;
|
||||
case PIPE_SHADER_CAP_MAX_OUTPUTS:
|
||||
return 32;
|
||||
@@ -1286,8 +1283,8 @@ nvc0_screen_create(struct nouveau_device *dev)
|
||||
for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) {
|
||||
BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3);
|
||||
PUSH_DATA (push, 1);
|
||||
PUSH_DATA (push, 8192 << 16);
|
||||
PUSH_DATA (push, 8192 << 16);
|
||||
PUSH_DATA (push, 16384 << 16);
|
||||
PUSH_DATA (push, 16384 << 16);
|
||||
}
|
||||
|
||||
#define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
|
||||
|
@@ -1051,21 +1051,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
|
||||
} else {
|
||||
struct nv50_miptree *mt = nv50_miptree(&res->base);
|
||||
struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level];
|
||||
const unsigned z = view->u.tex.first_layer;
|
||||
unsigned z = view->u.tex.first_layer;
|
||||
|
||||
if (z) {
|
||||
if (mt->layout_3d) {
|
||||
address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z);
|
||||
/* doesn't work if z passes z-tile boundary */
|
||||
if (depth > 1) {
|
||||
pipe_debug_message(&nvc0->base.debug, CONFORMANCE,
|
||||
"3D images are not really supported!");
|
||||
debug_printf("3D images are not really supported!\n");
|
||||
}
|
||||
} else {
|
||||
address += mt->layer_stride * z;
|
||||
}
|
||||
if (!mt->layout_3d) {
|
||||
address += mt->layer_stride * z;
|
||||
z = 0;
|
||||
}
|
||||
|
||||
address += lvl->offset;
|
||||
|
||||
info[0] = address >> 8;
|
||||
@@ -1080,7 +1072,8 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
|
||||
info[6] = depth - 1;
|
||||
info[6] |= (lvl->tile_mode & 0xf00) << 21;
|
||||
info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22;
|
||||
info[7] = 0;
|
||||
info[7] = mt->layout_3d ? 1 : 0;
|
||||
info[7] |= z << 16;
|
||||
info[14] = mt->ms_x;
|
||||
info[15] = mt->ms_y;
|
||||
}
|
||||
|
@@ -1040,7 +1040,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
|
||||
}
|
||||
|
||||
if (nvc0->state.vbo_mode) {
|
||||
nvc0_push_vbo(nvc0, info);
|
||||
if (info->indirect)
|
||||
nvc0_push_vbo_indirect(nvc0, info);
|
||||
else
|
||||
nvc0_push_vbo(nvc0, info);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
@@ -466,6 +466,83 @@ nvc0_prim_gl(unsigned prim)
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
uint32_t count;
|
||||
uint32_t primCount;
|
||||
uint32_t first;
|
||||
uint32_t baseInstance;
|
||||
} DrawArraysIndirectCommand;
|
||||
|
||||
typedef struct {
|
||||
uint32_t count;
|
||||
uint32_t primCount;
|
||||
uint32_t firstIndex;
|
||||
int32_t baseVertex;
|
||||
uint32_t baseInstance;
|
||||
} DrawElementsIndirectCommand;
|
||||
|
||||
void
|
||||
nvc0_push_vbo_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
|
||||
{
|
||||
/* The strategy here is to just read the commands from the indirect buffer
|
||||
* and do the draws. This is suboptimal, but will only happen in the case
|
||||
* that conversion is required for FIXED or DOUBLE inputs.
|
||||
*/
|
||||
struct nvc0_screen *screen = nvc0->screen;
|
||||
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
|
||||
struct nv04_resource *buf = nv04_resource(info->indirect->buffer);
|
||||
struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count);
|
||||
unsigned i;
|
||||
|
||||
unsigned draw_count = info->indirect->draw_count;
|
||||
if (buf_count) {
|
||||
uint32_t *count = nouveau_resource_map_offset(
|
||||
&nvc0->base, buf_count, info->indirect->indirect_draw_count_offset,
|
||||
NOUVEAU_BO_RD);
|
||||
draw_count = *count;
|
||||
}
|
||||
|
||||
uint8_t *buf_data = nouveau_resource_map_offset(
|
||||
&nvc0->base, buf, info->indirect->offset, NOUVEAU_BO_RD);
|
||||
struct pipe_draw_info single = *info;
|
||||
single.indirect = NULL;
|
||||
for (i = 0; i < draw_count; i++, buf_data += info->indirect->stride) {
|
||||
if (info->index_size) {
|
||||
DrawElementsIndirectCommand *cmd = (void *)buf_data;
|
||||
single.start = info->start + cmd->firstIndex;
|
||||
single.count = cmd->count;
|
||||
single.start_instance = cmd->baseInstance;
|
||||
single.instance_count = cmd->primCount;
|
||||
single.index_bias = cmd->baseVertex;
|
||||
} else {
|
||||
DrawArraysIndirectCommand *cmd = (void *)buf_data;
|
||||
single.start = cmd->first;
|
||||
single.count = cmd->count;
|
||||
single.start_instance = cmd->baseInstance;
|
||||
single.instance_count = cmd->primCount;
|
||||
}
|
||||
|
||||
if (nvc0->vertprog->vp.need_draw_parameters) {
|
||||
PUSH_SPACE(push, 9);
|
||||
BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
|
||||
PUSH_DATA (push, NVC0_CB_AUX_SIZE);
|
||||
PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
|
||||
PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
|
||||
BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
|
||||
PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
|
||||
PUSH_DATA (push, single.index_bias);
|
||||
PUSH_DATA (push, single.start_instance);
|
||||
PUSH_DATA (push, single.drawid + i);
|
||||
}
|
||||
|
||||
nvc0_push_vbo(nvc0, &single);
|
||||
}
|
||||
|
||||
nouveau_resource_unmap(buf);
|
||||
if (buf_count)
|
||||
nouveau_resource_unmap(buf_count);
|
||||
}
|
||||
|
||||
void
|
||||
nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
|
||||
{
|
||||
|
@@ -304,6 +304,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
|
||||
return 2048;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 10;
|
||||
|
||||
case PIPE_CAP_VENDOR_ID:
|
||||
return 0x1002;
|
||||
case PIPE_CAP_DEVICE_ID:
|
||||
|
@@ -536,6 +536,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_MAX_TEXEL_OFFSET:
|
||||
return 7;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 32;
|
||||
|
||||
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
|
||||
return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600;
|
||||
case PIPE_CAP_ENDIANNESS:
|
||||
|
@@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
|
||||
struct si_resource *buf = si_resource(transfer->resource);
|
||||
|
||||
if (stransfer->staging) {
|
||||
unsigned src_offset = stransfer->offset +
|
||||
transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
|
||||
(box->x - transfer->box.x);
|
||||
|
||||
/* Copy the staging buffer into the original one. */
|
||||
si_copy_buffer((struct si_context*)ctx, transfer->resource,
|
||||
&stransfer->staging->b.b, box->x,
|
||||
stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
|
||||
&stransfer->staging->b.b, box->x, src_offset,
|
||||
box->width);
|
||||
}
|
||||
|
||||
|
@@ -254,6 +254,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
|
||||
return 30;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 32;
|
||||
|
||||
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
|
||||
return sscreen->info.chip_class <= VI ?
|
||||
PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
|
||||
|
@@ -1333,7 +1333,7 @@ void si_init_perfcounters(struct si_screen *screen)
|
||||
for (i = 0; i < num_blocks; ++i) {
|
||||
struct si_pc_block *block = &pc->blocks[i];
|
||||
block->b = &blocks[i];
|
||||
block->num_instances = block->b->instances;
|
||||
block->num_instances = MAX2(1, block->b->instances);
|
||||
|
||||
if (!strcmp(block->b->b->name, "CB") ||
|
||||
!strcmp(block->b->b->name, "DB"))
|
||||
|
@@ -834,7 +834,7 @@ si_lower_nir(struct si_shader_selector* sel)
|
||||
NIR_PASS(progress, sel->nir, nir_opt_if);
|
||||
NIR_PASS(progress, sel->nir, nir_opt_dead_cf);
|
||||
NIR_PASS(progress, sel->nir, nir_opt_cse);
|
||||
NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true, true);
|
||||
NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true);
|
||||
|
||||
/* Needed for algebraic lowering */
|
||||
NIR_PASS(progress, sel->nir, nir_opt_algebraic);
|
||||
|
@@ -265,6 +265,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
|
||||
return 1;
|
||||
case PIPE_CAP_CLEAR_TEXTURE:
|
||||
return 1;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return TGSI_EXEC_MAX_INPUT_ATTRIBS;
|
||||
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
|
||||
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
|
||||
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
|
||||
|
@@ -350,6 +350,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
|
||||
|
||||
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
|
||||
return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return sws->have_vgpu10 ? VGPU10_MAX_FS_INPUTS : 10;
|
||||
|
||||
/* Unsupported features */
|
||||
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
|
||||
|
@@ -780,7 +780,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
|
||||
rsc->tiled = false;
|
||||
} else {
|
||||
fprintf(stderr, "Unsupported modifier requested\n");
|
||||
return NULL;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
rsc->internal_format = prsc->format;
|
||||
|
@@ -177,6 +177,9 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
|
||||
return 4;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return V3D_MAX_FS_INPUTS / 4;
|
||||
|
||||
/* Texturing. */
|
||||
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
|
||||
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
|
||||
|
@@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s)
|
||||
NIR_PASS(progress, s, nir_opt_dce);
|
||||
NIR_PASS(progress, s, nir_opt_dead_cf);
|
||||
NIR_PASS(progress, s, nir_opt_cse);
|
||||
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
|
||||
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
|
||||
NIR_PASS(progress, s, nir_opt_algebraic);
|
||||
NIR_PASS(progress, s, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, s, nir_opt_undef);
|
||||
|
@@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
|
||||
|
||||
/* We can't mix HW and non-HW queries. */
|
||||
if (nhwqueries && nhwqueries != num_queries)
|
||||
return NULL;
|
||||
goto err_free_query;
|
||||
|
||||
if (!nhwqueries)
|
||||
return (struct pipe_query *)query;
|
||||
|
@@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
||||
/* Note: Not supported in hardware, just faking it. */
|
||||
return 5;
|
||||
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
return 8;
|
||||
|
||||
case PIPE_CAP_VENDOR_ID:
|
||||
return 0x14E4;
|
||||
case PIPE_CAP_ACCELERATED:
|
||||
|
@@ -258,6 +258,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
|
||||
case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
|
||||
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
|
||||
return 1; /* TODO: need to introduce a hw-cap for this */
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
if (vscreen->caps.caps.v1.glsl_level < 150)
|
||||
return vscreen->caps.caps.v2.max_vertex_attribs;
|
||||
return 32;
|
||||
case PIPE_CAP_TEXTURE_GATHER_SM5:
|
||||
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
|
||||
case PIPE_CAP_FAKE_SW_MSAA:
|
||||
|
@@ -856,6 +856,7 @@ enum pipe_cap
|
||||
PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE,
|
||||
PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND,
|
||||
PIPE_CAP_DEST_SURFACE_SRGB_CONTROL,
|
||||
PIPE_CAP_MAX_VARYINGS,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@@ -70,7 +70,8 @@ enum pipe_video_profile
|
||||
PIPE_VIDEO_PROFILE_HEVC_MAIN_444,
|
||||
PIPE_VIDEO_PROFILE_JPEG_BASELINE,
|
||||
PIPE_VIDEO_PROFILE_VP9_PROFILE0,
|
||||
PIPE_VIDEO_PROFILE_VP9_PROFILE2
|
||||
PIPE_VIDEO_PROFILE_VP9_PROFILE2,
|
||||
PIPE_VIDEO_PROFILE_MAX
|
||||
};
|
||||
|
||||
/* Video caps, can be different for each codec/profile */
|
||||
|
@@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
|
||||
ctx->version_minor = 1;
|
||||
*ctx->vtable = vtable;
|
||||
*ctx->vtable_vpp = vtable_vpp;
|
||||
ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN;
|
||||
ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1;
|
||||
ctx->max_entrypoints = 2;
|
||||
ctx->max_attributes = 1;
|
||||
ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS;
|
||||
|
@@ -28,6 +28,8 @@
|
||||
#include "vl/vl_vlc.h"
|
||||
#include "va_private.h"
|
||||
|
||||
#define NUM_VP9_REFS 8
|
||||
|
||||
void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
|
||||
{
|
||||
VADecPictureParameterBufferVP9 *vp9 = buf->data;
|
||||
@@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context,
|
||||
|
||||
context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth;
|
||||
|
||||
for (i = 0 ; i < 8 ; i++)
|
||||
for (i = 0 ; i < NUM_VP9_REFS ; i++)
|
||||
vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]);
|
||||
|
||||
if (!context->decoder && !context->templat.max_references)
|
||||
context->templat.max_references = NUM_VP9_REFS;
|
||||
}
|
||||
|
||||
void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf)
|
||||
|
@@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
|
||||
if (!attr)
|
||||
return XvMCBadContext;
|
||||
|
||||
if (strcmp(attr, XV_BRIGHTNESS))
|
||||
if (strcmp(attr, XV_BRIGHTNESS) == 0)
|
||||
context_priv->procamp.brightness = value / 1000.0f;
|
||||
else if (strcmp(attr, XV_CONTRAST))
|
||||
else if (strcmp(attr, XV_CONTRAST) == 0)
|
||||
context_priv->procamp.contrast = value / 1000.0f + 1.0f;
|
||||
else if (strcmp(attr, XV_SATURATION))
|
||||
else if (strcmp(attr, XV_SATURATION) == 0)
|
||||
context_priv->procamp.saturation = value / 1000.0f + 1.0f;
|
||||
else if (strcmp(attr, XV_HUE))
|
||||
else if (strcmp(attr, XV_HUE) == 0)
|
||||
context_priv->procamp.hue = value / 1000.0f;
|
||||
else if (strcmp(attr, XV_COLORSPACE))
|
||||
else if (strcmp(attr, XV_COLORSPACE) == 0)
|
||||
context_priv->color_standard = value ?
|
||||
VL_CSC_COLOR_STANDARD_BT_601 :
|
||||
VL_CSC_COLOR_STANDARD_BT_709;
|
||||
@@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
|
||||
if (!attr)
|
||||
return XvMCBadContext;
|
||||
|
||||
if (strcmp(attr, XV_BRIGHTNESS))
|
||||
if (strcmp(attr, XV_BRIGHTNESS) == 0)
|
||||
*value = context_priv->procamp.brightness * 1000;
|
||||
else if (strcmp(attr, XV_CONTRAST))
|
||||
else if (strcmp(attr, XV_CONTRAST) == 0)
|
||||
*value = context_priv->procamp.contrast * 1000 - 1000;
|
||||
else if (strcmp(attr, XV_SATURATION))
|
||||
else if (strcmp(attr, XV_SATURATION) == 0)
|
||||
*value = context_priv->procamp.saturation * 1000 + 1000;
|
||||
else if (strcmp(attr, XV_HUE))
|
||||
else if (strcmp(attr, XV_HUE) == 0)
|
||||
*value = context_priv->procamp.hue * 1000;
|
||||
else if (strcmp(attr, XV_COLORSPACE))
|
||||
else if (strcmp(attr, XV_COLORSPACE) == 0)
|
||||
*value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709;
|
||||
else
|
||||
return BadName;
|
||||
|
@@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config)
|
||||
|
||||
while (token && !fail)
|
||||
{
|
||||
if (strcmp(token, "i"))
|
||||
if (strcmp(token, "i") == 0)
|
||||
config->mb_types |= MB_TYPE_I;
|
||||
else if (strcmp(token, "p"))
|
||||
else if (strcmp(token, "p") == 0)
|
||||
config->mb_types |= MB_TYPE_P;
|
||||
else if (strcmp(token, "b"))
|
||||
else if (strcmp(token, "b") == 0)
|
||||
config->mb_types |= MB_TYPE_B;
|
||||
else
|
||||
fail = 1;
|
||||
|
@@ -1219,8 +1219,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs)
|
||||
{
|
||||
struct amdgpu_cs_context *cs = acs->csc;
|
||||
|
||||
cs->num_fence_dependencies = 0;
|
||||
|
||||
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
|
||||
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
|
||||
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
|
||||
|
@@ -396,6 +396,7 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
|
||||
{
|
||||
struct xlib_displaytarget *xlib_dt;
|
||||
unsigned nblocksy, size;
|
||||
int ignore;
|
||||
|
||||
xlib_dt = CALLOC_STRUCT(xlib_displaytarget);
|
||||
if (!xlib_dt)
|
||||
@@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
|
||||
xlib_dt->stride = align(util_format_get_stride(format, width), alignment);
|
||||
size = xlib_dt->stride * nblocksy;
|
||||
|
||||
if (!debug_get_option_xlib_no_shm()) {
|
||||
if (!debug_get_option_xlib_no_shm() &&
|
||||
XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) {
|
||||
xlib_dt->data = alloc_shm(xlib_dt, size);
|
||||
if (xlib_dt->data) {
|
||||
xlib_dt->shm = True;
|
||||
|
@@ -253,6 +253,7 @@ VULKAN_TESTS = \
|
||||
vulkan/tests/block_pool_no_free \
|
||||
vulkan/tests/state_pool_no_free \
|
||||
vulkan/tests/state_pool_free_list_only \
|
||||
vulkan/tests/state_pool_padding \
|
||||
vulkan/tests/state_pool
|
||||
|
||||
VULKAN_TEST_LDADD = \
|
||||
@@ -274,6 +275,10 @@ vulkan_tests_state_pool_free_list_only_CFLAGS = $(VULKAN_CFLAGS)
|
||||
vulkan_tests_state_pool_free_list_only_CPPFLAGS = $(VULKAN_CPPFLAGS)
|
||||
vulkan_tests_state_pool_free_list_only_LDADD = $(VULKAN_TEST_LDADD)
|
||||
|
||||
vulkan_tests_state_pool_padding_CFLAGS = $(VULKAN_CFLAGS)
|
||||
vulkan_tests_state_pool_padding_CPPFLAGS = $(VULKAN_CPPFLAGS)
|
||||
vulkan_tests_state_pool_padding_LDADD = $(VULKAN_TEST_LDADD)
|
||||
|
||||
vulkan_tests_state_pool_CFLAGS = $(VULKAN_CFLAGS)
|
||||
vulkan_tests_state_pool_CPPFLAGS = $(VULKAN_CPPFLAGS)
|
||||
vulkan_tests_state_pool_LDADD = $(VULKAN_TEST_LDADD)
|
||||
|
@@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
|
||||
* messages adding a node interference to the grf127_send_hack_node.
|
||||
* This node has a fixed assignment to grf127.
|
||||
*
|
||||
* We don't apply it to SIMD16 because previous code avoids any register
|
||||
* overlap between sources and destination.
|
||||
* We don't apply it to SIMD16 instructions because previous code avoids
|
||||
* any register overlap between sources and destination.
|
||||
*/
|
||||
ra_set_node_reg(g, grf127_send_hack_node, 127);
|
||||
if (dispatch_width == 8) {
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
if (inst->is_send_from_grf() && inst->dst.file == VGRF)
|
||||
ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
|
||||
}
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
if (inst->exec_size < 16 && inst->is_send_from_grf() &&
|
||||
inst->dst.file == VGRF)
|
||||
ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
|
||||
}
|
||||
|
||||
if (spilled_any_registers) {
|
||||
|
@@ -570,18 +570,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
|
||||
OPT(nir_opt_dce);
|
||||
OPT(nir_opt_cse);
|
||||
|
||||
/* Passing 0 to the peephole select pass causes it to convert
|
||||
* if-statements that contain only move instructions in the branches
|
||||
* regardless of the count.
|
||||
*
|
||||
* Passing 1 to the peephole select pass causes it to convert
|
||||
* if-statements that contain at most a single ALU instruction (total)
|
||||
* in both branches. Before Gen6, some math instructions were
|
||||
* prohibitively expensive and the results of compare operations need an
|
||||
* extra resolve step. For these reasons, this pass is more harmful
|
||||
* than good on those platforms.
|
||||
*
|
||||
* For indirect loads of uniforms (push constants), we assume that array
|
||||
/* For indirect loads of uniforms (push constants), we assume that array
|
||||
* indices will nearly always be in bounds and the cost of the load is
|
||||
* low. Therefore there shouldn't be a performance benefit to avoid it.
|
||||
* However, in vec4 tessellation shaders, these loads operate by
|
||||
@@ -590,9 +579,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
|
||||
const bool is_vec4_tessellation = !is_scalar &&
|
||||
(nir->info.stage == MESA_SHADER_TESS_CTRL ||
|
||||
nir->info.stage == MESA_SHADER_TESS_EVAL);
|
||||
OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
|
||||
OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation,
|
||||
compiler->devinfo->gen >= 6);
|
||||
OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation);
|
||||
|
||||
OPT(nir_opt_intrinsics);
|
||||
OPT(nir_opt_idiv_const, 32);
|
||||
|
@@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
|
||||
};
|
||||
|
||||
const int max = 0xffff;
|
||||
|
||||
uint32_t y_min = s->offset.y;
|
||||
uint32_t x_min = s->offset.x;
|
||||
uint32_t y_max = s->offset.y + s->extent.height - 1;
|
||||
uint32_t x_max = s->offset.x + s->extent.width - 1;
|
||||
|
||||
/* Do this math using int64_t so overflow gets clamped correctly. */
|
||||
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
|
||||
y_min = clamp_int64((uint64_t) y_min,
|
||||
cmd_buffer->state.render_area.offset.y, max);
|
||||
x_min = clamp_int64((uint64_t) x_min,
|
||||
cmd_buffer->state.render_area.offset.x, max);
|
||||
y_max = clamp_int64((uint64_t) y_max, 0,
|
||||
cmd_buffer->state.render_area.offset.y +
|
||||
cmd_buffer->state.render_area.extent.height - 1);
|
||||
x_max = clamp_int64((uint64_t) x_max, 0,
|
||||
cmd_buffer->state.render_area.offset.x +
|
||||
cmd_buffer->state.render_area.extent.width - 1);
|
||||
} else if (fb) {
|
||||
y_min = clamp_int64((uint64_t) y_min, 0, max);
|
||||
x_min = clamp_int64((uint64_t) x_min, 0, max);
|
||||
y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1);
|
||||
x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1);
|
||||
}
|
||||
|
||||
struct GEN7_SCISSOR_RECT scissor = {
|
||||
/* Do this math using int64_t so overflow gets clamped correctly. */
|
||||
.ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
|
||||
.ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
|
||||
.ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1),
|
||||
.ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1)
|
||||
.ScissorRectangleYMin = y_min,
|
||||
.ScissorRectangleXMin = x_min,
|
||||
.ScissorRectangleYMax = y_max,
|
||||
.ScissorRectangleXMax = x_max
|
||||
};
|
||||
|
||||
if (s->extent.width <= 0 || s->extent.height <= 0) {
|
||||
|
@@ -1211,13 +1211,30 @@ emit_3dstate_streamout(struct anv_pipeline *pipeline,
|
||||
hole_dwords -= 4;
|
||||
}
|
||||
|
||||
int varying = output->location;
|
||||
uint8_t component_mask = output->component_mask;
|
||||
/* VARYING_SLOT_PSIZ contains three scalar fields packed together:
|
||||
* - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y
|
||||
* - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
|
||||
* - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w
|
||||
*/
|
||||
if (varying == VARYING_SLOT_LAYER) {
|
||||
varying = VARYING_SLOT_PSIZ;
|
||||
component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
|
||||
} else if (varying == VARYING_SLOT_VIEWPORT) {
|
||||
varying = VARYING_SLOT_PSIZ;
|
||||
component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
|
||||
} else if (varying == VARYING_SLOT_PSIZ) {
|
||||
component_mask = 1 << 3; // SO_DECL_COMPMASK_W
|
||||
}
|
||||
|
||||
next_offset[buffer] = output->offset +
|
||||
__builtin_popcount(output->component_mask) * 4;
|
||||
__builtin_popcount(component_mask) * 4;
|
||||
|
||||
so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
|
||||
.OutputBufferSlot = buffer,
|
||||
.RegisterIndex = vue_map->varying_to_slot[output->location],
|
||||
.ComponentMask = output->component_mask,
|
||||
.RegisterIndex = vue_map->varying_to_slot[varying],
|
||||
.ComponentMask = component_mask,
|
||||
};
|
||||
}
|
||||
|
||||
|
@@ -111,7 +111,7 @@ set_adaptive_sync_property(xcb_connection_t *conn, xcb_drawable_t drawable,
|
||||
xcb_intern_atom_reply_t* reply;
|
||||
xcb_void_cookie_t check;
|
||||
|
||||
cookie = xcb_intern_atom(conn, 0, sizeof(name), name);
|
||||
cookie = xcb_intern_atom(conn, 0, strlen(name), name);
|
||||
reply = xcb_intern_atom_reply(conn, cookie, NULL);
|
||||
if (reply == NULL)
|
||||
return;
|
||||
|
@@ -34,6 +34,8 @@ AM_CFLAGS = \
|
||||
-I$(top_builddir)/src/util \
|
||||
-I$(top_srcdir)/src/mesa/drivers/dri/common \
|
||||
-I$(top_srcdir)/src/gtest/include \
|
||||
-I$(top_builddir)/src/compiler \
|
||||
-I$(top_srcdir)/src/compiler \
|
||||
-I$(top_builddir)/src/compiler/glsl \
|
||||
-I$(top_builddir)/src/compiler/nir \
|
||||
-I$(top_srcdir)/src/compiler/nir \
|
||||
|
@@ -42,7 +42,7 @@
|
||||
#include "compiler/glsl/ir.h"
|
||||
#include "compiler/glsl/program.h"
|
||||
#include "compiler/glsl/glsl_to_nir.h"
|
||||
#include "compiler/glsl/float64_glsl.h"
|
||||
#include "glsl/float64_glsl.h"
|
||||
|
||||
#include "brw_program.h"
|
||||
#include "brw_context.h"
|
||||
|
@@ -4691,6 +4691,29 @@ discard_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
|
||||
if (!att)
|
||||
continue;
|
||||
|
||||
/* If we're asked to invalidate just depth or just stencil, but the
|
||||
* attachment is packed depth/stencil, then we can only use
|
||||
* Driver.DiscardFramebuffer if the attachments list includes both depth
|
||||
* and stencil and they both point at the same renderbuffer.
|
||||
*/
|
||||
if ((attachments[i] == GL_DEPTH_ATTACHMENT ||
|
||||
attachments[i] == GL_STENCIL_ATTACHMENT) &&
|
||||
(!att->Renderbuffer ||
|
||||
att->Renderbuffer->_BaseFormat == GL_DEPTH_STENCIL)) {
|
||||
GLenum other_format = (attachments[i] == GL_DEPTH_ATTACHMENT ?
|
||||
GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT);
|
||||
bool has_both = false;
|
||||
/* Scan the whole attachment list for the complementary depth/stencil
 * attachment. The break must be inside the match branch; otherwise the
 * loop stops after the first element and has_both is only set when the
 * other attachment happens to be attachments[0].
 */
for (int j = 0; j < numAttachments; j++) {
   if (attachments[j] == other_format) {
      has_both = true;
      break;
   }
}
|
||||
|
||||
if (fb->Attachment[BUFFER_DEPTH].Renderbuffer !=
|
||||
fb->Attachment[BUFFER_STENCIL].Renderbuffer || !has_both)
|
||||
continue;
|
||||
}
|
||||
|
||||
ctx->Driver.DiscardFramebuffer(ctx, fb, att);
|
||||
}
|
||||
}
|
||||
|
@@ -208,6 +208,10 @@ new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw)
|
||||
rs->prim.end = 1;
|
||||
rs->prim.start = 0;
|
||||
rs->prim.count = 1;
|
||||
rs->prim.pad = 0;
|
||||
rs->prim.num_instances = 1;
|
||||
rs->prim.base_instance = 0;
|
||||
rs->prim.is_indirect = 0;
|
||||
|
||||
return rs;
|
||||
}
|
||||
|
@@ -223,8 +223,13 @@ void st_init_limits(struct pipe_screen *screen,
|
||||
pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents,
|
||||
MAX_UNIFORMS * 4);
|
||||
|
||||
/* For ARB programs, prog_src_register::Index is a signed 13-bit number.
|
||||
* This gives us a limit of 4096 values - but we may need to generate
|
||||
* internal values in addition to what the source program uses. So, we
|
||||
* drop the limit one step lower, to 2048, to be safe.
|
||||
*/
|
||||
pc->MaxParameters =
|
||||
pc->MaxNativeParameters = pc->MaxUniformComponents / 4;
|
||||
pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048);
|
||||
pc->MaxInputComponents =
|
||||
screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4;
|
||||
pc->MaxOutputComponents =
|
||||
@@ -362,10 +367,7 @@ void st_init_limits(struct pipe_screen *screen,
|
||||
c->Program[MESA_SHADER_VERTEX].MaxAttribs =
|
||||
MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16);
|
||||
|
||||
/* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number
|
||||
* of inputs. It's always 2 colors + N generic inputs. */
|
||||
c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
|
||||
PIPE_SHADER_CAP_MAX_INPUTS);
|
||||
c->MaxVarying = screen->get_param(screen, PIPE_CAP_MAX_VARYINGS);
|
||||
c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING);
|
||||
c->MaxGeometryOutputVertices =
|
||||
screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES);
|
||||
|
@@ -2356,6 +2356,8 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
|
||||
bindings |= PIPE_BIND_DEPTH_STENCIL;
|
||||
else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 ||
|
||||
internalFormat == GL_RGB || internalFormat == GL_RGBA ||
|
||||
internalFormat == GL_RGBA2 ||
|
||||
internalFormat == GL_RGB4 || internalFormat == GL_RGBA4 ||
|
||||
internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 ||
|
||||
internalFormat == GL_BGRA ||
|
||||
internalFormat == GL_RGB16F ||
|
||||
|
@@ -327,7 +327,7 @@ st_nir_opts(nir_shader *nir, bool scalar)
|
||||
NIR_PASS(progress, nir, nir_opt_if);
|
||||
NIR_PASS(progress, nir, nir_opt_dead_cf);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
|
||||
NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true);
|
||||
|
||||
NIR_PASS(progress, nir, nir_opt_algebraic);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
|
Reference in New Issue
Block a user