glsl: make sure UBO arrays are sized in ES

This check was removed in 5b2675093e; add it back in. Reviewed-by: Dave Airlie <airlied@redhat.com> Cc: "12.0" <mesa-stable@lists.freedesktop.org> https://bugs.freedesktop.org/show_bug.cgi?id=96349
clover: Update OpenCL version string to match OpenGL
2016-06-14 11:33:24 +10:00 · 2016-06-13 15:55:59 -07:00 · 2016-06-13 15:55:59 -07:00 · 2016-06-13 15:55:58 -07:00 · 2016-06-13 15:55:58 -07:00 · 2016-06-13 15:55:58 -07:00
283 changed files with 5564 additions and 3728 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -34,6 +34,10 @@ MESA_VERSION := $(shell cat $(MESA_TOP)/VERSION)
 LOCAL_CFLAGS += \
 	-Wno-unused-parameter \
 	-Wno-date-time \
+	-Wno-pointer-arith \
+	-Wno-missing-field-initializers \
+	-Wno-initializer-overrides \
+	-Wno-mismatched-tags \
 	-DPACKAGE_VERSION=\"$(MESA_VERSION)\" \
 	-DPACKAGE_BUGREPORT=\"https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa\" \
 	-DANDROID_VERSION=0x0$(MESA_ANDROID_MAJOR_VERSION)0$(MESA_ANDROID_MINOR_VERSION)
@@ -78,6 +82,12 @@ LOCAL_CFLAGS += \
 	-D__STDC_LIMIT_MACROS
 endif

+# add libdrm if there are hardware drivers
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
+LOCAL_SHARED_LIBRARIES += libdrm
+endif
+
 LOCAL_CPPFLAGS += \
 	$(if $(filter true,$(MESA_LOLLIPOP_BUILD)),-D_USING_LIBCXX) \
 	-Wno-error=non-virtual-dtor \
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-12.0.0-rc3
+12.1.0-devel
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -1,2 +0,0 @@
-# The offending commit that this patch (part) reverts isn't in 12.0
-be32a2132785fbc119f17e62070e007ee7d17af7 i965/compiler: Bring back the INTEL_PRECISE_TRIG environment variable
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -146,45 +146,45 @@ GL 4.1, GLSL 4.10 --- all DONE: nvc0, r600, radeonsi
  GL_ARB_viewport_array                                 DONE (i965, nv50, llvmpipe, softpipe)


-GL 4.2, GLSL 4.20 -- all DONE: radeonsi
+GL 4.2, GLSL 4.20 -- all DONE: nvc0, radeonsi

-  GL_ARB_texture_compression_bptc                       DONE (i965, nvc0, r600, radeonsi)
+  GL_ARB_texture_compression_bptc                       DONE (i965, r600)
  GL_ARB_compressed_texture_pixel_storage               DONE (all drivers)
-  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0, radeonsi, softpipe)
+  GL_ARB_shader_atomic_counters                         DONE (i965, softpipe)
  GL_ARB_texture_storage                                DONE (all drivers)
-  GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_base_instance                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_shader_image_load_store                        DONE (i965, nvc0, radeonsi, softpipe)
+  GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, r600, llvmpipe, softpipe, swr)
+  GL_ARB_base_instance                                  DONE (i965, nv50, r600, llvmpipe, softpipe, swr)
+  GL_ARB_shader_image_load_store                        DONE (i965, softpipe)
  GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
  GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
  GL_ARB_shading_language_packing                       DONE (all drivers)
-  GL_ARB_internalformat_query                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
+  GL_ARB_internalformat_query                           DONE (i965, nv50, r600, llvmpipe, softpipe, swr)
  GL_ARB_map_buffer_alignment                           DONE (all drivers)


-GL 4.3, GLSL 4.30:
+GL 4.3, GLSL 4.30 -- all DONE: nvc0, radeonsi

  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
  GL_ARB_ES3_compatibility                              DONE (all drivers that support GLSL 3.30)
  GL_ARB_clear_buffer_object                            DONE (all drivers)
-  GL_ARB_compute_shader                                 DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_copy_image                                     DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_compute_shader                                 DONE (i965, softpipe)
+  GL_ARB_copy_image                                     DONE (i965, nv50, r600, softpipe, llvmpipe)
  GL_KHR_debug                                          DONE (all drivers)
  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
-  GL_ARB_framebuffer_no_attachments                     DONE (i965, nvc0, r600, radeonsi, softpipe)
+  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, r600, llvmpipe)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965, r600, softpipe)
  GL_ARB_internalformat_query2                          DONE (all drivers)
  GL_ARB_invalidate_subdata                             DONE (all drivers)
-  GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
+  GL_ARB_multi_draw_indirect                            DONE (i965, r600, llvmpipe, softpipe, swr)
  GL_ARB_program_interface_query                        DONE (all drivers)
-  GL_ARB_robust_buffer_access_behavior                  DONE (i965, nvc0, radeonsi)
-  GL_ARB_shader_image_size                              DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_texture_buffer_range                           DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
+  GL_ARB_robust_buffer_access_behavior                  DONE (i965)
+  GL_ARB_shader_image_size                              DONE (i965, softpipe)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, softpipe)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, r600, llvmpipe, softpipe, swr)
+  GL_ARB_texture_buffer_range                           DONE (nv50, i965, r600, llvmpipe)
  GL_ARB_texture_query_levels                           DONE (all drivers that support GLSL 1.30)
  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
+  GL_ARB_texture_view                                   DONE (i965, nv50, r600, llvmpipe, softpipe, swr)
  GL_ARB_vertex_attrib_binding                          DONE (all drivers)


@@ -211,7 +211,7 @@ GL 4.5, GLSL 4.50:
  GL_ARB_ES3_1_compatibility                            DONE (nvc0, radeonsi)
  GL_ARB_clip_control                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
  GL_ARB_conditional_render_inverted                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_cull_distance                                  DONE (i965, nv50, nvc0, llvmpipe, softpipe)
+  GL_ARB_cull_distance                                  DONE (i965, nv50, nvc0, llvmpipe, softpipe, swr)
  GL_ARB_derivative_control                             DONE (i965, nv50, nvc0, r600, radeonsi)
  GL_ARB_direct_state_access                            DONE (all drivers)
  GL_ARB_get_texture_sub_image                          DONE (all drivers)
@@ -222,32 +222,32 @@ GL 4.5, GLSL 4.50:
  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)

 These are the extensions cherry-picked to make GLES 3.1
-GLES3.1, GLSL ES 3.1
+GLES3.1, GLSL ES 3.1 -- all DONE: nvc0, radeonsi
  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
-  GL_ARB_compute_shader                                 DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_draw_indirect                                  DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
+  GL_ARB_compute_shader                                 DONE (i965, softpipe)
+  GL_ARB_draw_indirect                                  DONE (i965, r600, llvmpipe, softpipe, swr)
  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
-  GL_ARB_framebuffer_no_attachments                     DONE (i965, nvc0, r600, radeonsi, softpipe)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965, r600, softpipe)
  GL_ARB_program_interface_query                        DONE (all drivers)
-  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_shader_image_load_store                        DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_shader_image_size                              DONE (i965, nvc0, radeonsi, softpipe)
-  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0, radeonsi, softpipe)
+  GL_ARB_shader_atomic_counters                         DONE (i965, softpipe)
+  GL_ARB_shader_image_load_store                        DONE (i965, softpipe)
+  GL_ARB_shader_image_size                              DONE (i965, softpipe)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, softpipe)
  GL_ARB_shading_language_packing                       DONE (all drivers)
  GL_ARB_separate_shader_objects                        DONE (all drivers)
-  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_texture_multisample (Multisample textures)     DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, r600, llvmpipe, softpipe, swr)
+  GL_ARB_texture_multisample (Multisample textures)     DONE (i965, nv50, r600, llvmpipe, softpipe)
  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
-  GS5 Enhanced textureGather                            DONE (i965, nvc0, r600, radeonsi)
-  GS5 Packing/bitfield/conversion functions             DONE (i965, nvc0, r600, radeonsi)
+  GS5 Enhanced textureGather                            DONE (i965, r600)
+  GS5 Packing/bitfield/conversion functions             DONE (i965, r600)
  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)

  Additional functionality not covered above:
      glMemoryBarrierByRegion                           DONE
      glGetTexLevelParameter[fi]v - needs updates       DONE
      glGetBooleani_v - restrict to GLES enums
-      gl_HelperInvocation support                       DONE (i965, nvc0, r600, radeonsi)
+      gl_HelperInvocation support                       DONE (i965, r600)

 GLES3.2, GLSL ES 3.2
  GL_EXT_color_buffer_float                             DONE (all drivers)
--- a/docs/devinfo.html
+++ b/docs/devinfo.html
@@ -684,9 +684,11 @@ To add a new GL extension to Mesa you have to do at least the following.
 </li>
 <li>
   Add a new entry to the <code>gl_extensions</code> struct in mtypes.h
+   if the extension requires driver capabilities not already exposed by
+   another extension.
 </li>
 <li>
-   Update the <code>extensions.c</code> file.
+   Add a new entry to the src/mesa/main/extensions_table.h file.
 </li>
 <li>
   From this point, the best way to proceed is to find another extension,
@@ -697,12 +699,18 @@ To add a new GL extension to Mesa you have to do at least the following.
   If the new extension adds new GL state, the functions in get.c, enable.c
   and attrib.c will most likely require new code.
 </li>
+<li>
+   To determine if the new extension is active in the current context,
+   use the auto-generated _mesa_has_##name_str() function defined in
+   src/mesa/main/extensions.h.
+</li>
 <li>
   The dispatch tests check_table.cpp and dispatch_sanity.cpp
   should be updated with details about the new extensions functions. These
   tests are run using 'make check'
 </li>
 </ul>
+</p>



--- a/docs/relnotes/12.1.0.html
+++ b/docs/relnotes/12.1.0.html
@@ -0,0 +1,60 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.1.0 Release Notes / TBD</h1>
+
+<p>
+Mesa 12.1.0 is a new development release.
+People who are concerned with stability and reliability should stick
+with a previous release or wait for Mesa 12.1.1.
+</p>
+<p>
+Mesa 12.1.0 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD.
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+<li>GL_ARB_shader_group_vote on nvc0</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+TBD.
+
+<h2>Changes</h2>
+
+TBD.
+
+</div>
+</body>
+</html>
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -235,6 +235,7 @@ NIR_FILES = \
 	nir/nir_repair_ssa.c \
 	nir/nir_search.c \
 	nir/nir_search.h \
+	nir/nir_search_helpers.h \
 	nir/nir_split_var_copies.c \
 	nir/nir_sweep.c \
 	nir/nir_to_ssa.c \
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -3393,7 +3393,7 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
                (qual_component + components - 1) > 3) {
               _mesa_glsl_error(loc, state, "component overflow (%u > 3)",
                                (qual_component + components - 1));
-            } else if (qual_component == 1 && type->is_double()) {
+            } else if (qual_component == 1 && type->is_64bit()) {
               /* We don't bother checking for 3 as it should be caught by the
                * overflow check above.
                */
@@ -6843,7 +6843,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
            }
         } else {
            if (layout && layout->flags.q.explicit_xfb_offset) {
-               unsigned align = field_type->is_double() ? 8 : 4;
+               unsigned align = field_type->is_64bit() ? 8 : 4;
               fields[i].offset = glsl_align(block_xfb_offset, align);
               block_xfb_offset +=
                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -528,6 +528,12 @@ barrier_supported(const _mesa_glsl_parse_state *state)
          state->stage == MESA_SHADER_TESS_CTRL;
 }

+static bool
+vote(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_shader_group_vote_enable;
+}
+
 /** @} */

 /******************************************************************************/
@@ -853,6 +859,8 @@ private:
   ir_function_signature *_shader_clock(builtin_available_predicate avail,
                                        const glsl_type *type);

+   ir_function_signature *_vote(enum ir_expression_operation opcode);
+
 #undef B0
 #undef B1
 #undef B2
@@ -2935,6 +2943,10 @@ builtin_builder::create_builtins()
                              glsl_type::uvec2_type),
                NULL);

+   add_function("anyInvocationARB", _vote(ir_unop_vote_any), NULL);
+   add_function("allInvocationsARB", _vote(ir_unop_vote_all), NULL);
+   add_function("allInvocationsEqualARB", _vote(ir_unop_vote_eq), NULL);
+
 #undef F
 #undef FI
 #undef FIUD
@@ -5576,6 +5588,16 @@ builtin_builder::_shader_clock(builtin_available_predicate avail,
   return sig;
 }

+ir_function_signature *
+builtin_builder::_vote(enum ir_expression_operation opcode)
+{
+   ir_variable *value = in_var(glsl_type::bool_type, "value");
+
+   MAKE_SIG(glsl_type::bool_type, vote, 1, value);
+   body.emit(ret(expr(opcode, value)));
+   return sig;
+}
+
 /** @} */

 /******************************************************************************/
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2467,6 +2467,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio

         if (extensions->ARB_cull_distance)
            add_builtin_define(parser, "GL_ARB_cull_distance", 1);
+
+         if (extensions->ARB_shader_group_vote)
+            add_builtin_define(parser, "GL_ARB_shader_group_vote", 1);
      }
   }

--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -594,6 +594,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
   EXT(ARB_shader_bit_encoding,          true,  false,     ARB_shader_bit_encoding),
   EXT(ARB_shader_clock,                 true,  false,     ARB_shader_clock),
   EXT(ARB_shader_draw_parameters,       true,  false,     ARB_shader_draw_parameters),
+   EXT(ARB_shader_group_vote,            true,  false,     ARB_shader_group_vote),
   EXT(ARB_shader_image_load_store,      true,  false,     ARB_shader_image_load_store),
   EXT(ARB_shader_image_size,            true,  false,     ARB_shader_image_size),
   EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
@@ -1602,6 +1603,7 @@ ast_struct_specifier::ast_struct_specifier(const char *identifier,
   name = identifier;
   this->declarations.push_degenerate_list_at_head(&declarator_list->link);
   is_declaration = true;
+   layout = NULL;
 }

 void ast_subroutine_list::print(void) const
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -575,6 +575,8 @@ struct _mesa_glsl_parse_state {
   bool ARB_shader_clock_warn;
   bool ARB_shader_draw_parameters_enable;
   bool ARB_shader_draw_parameters_warn;
+   bool ARB_shader_group_vote_enable;
+   bool ARB_shader_group_vote_warn;
   bool ARB_shader_image_load_store_enable;
   bool ARB_shader_image_load_store_warn;
   bool ARB_shader_image_size_enable;
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -1284,9 +1284,6 @@ nir_visitor::visit(ir_expression *ir)
          intrin->intrinsic == nir_intrinsic_interp_var_at_sample)
         intrin->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));

-      if (intrin->intrinsic == nir_intrinsic_interp_var_at_offset)
-         shader->info.uses_interp_var_at_offset = true;
-
      unsigned bit_size =  glsl_get_bit_size(deref->type);
      add_instr(&intrin->instr, deref->type->vector_elements, bit_size);

--- a/src/compiler/glsl/ir.cpp
+++ b/src/compiler/glsl/ir.cpp
@@ -341,6 +341,12 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
      this->type = glsl_type::int_type;
      break;

+   case ir_unop_vote_any:
+   case ir_unop_vote_all:
+   case ir_unop_vote_eq:
+      this->type = glsl_type::bool_type;
+      break;
+
   default:
      assert(!"not reached: missing automatic type setup for ir_expression");
      this->type = op0->type;
@@ -563,6 +569,9 @@ static const char *const operator_strs[] = {
   "interpolate_at_centroid",
   "get_buffer_size",
   "ssbo_unsized_array_length",
+   "vote_any",
+   "vote_all",
+   "vote_eq",
   "+",
   "-",
   "*",
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -537,6 +537,10 @@ public:
      return this->interface_type;
   }

+   enum glsl_interface_packing get_interface_type_packing() const
+   {
+     return this->interface_type->get_interface_packing();
+   }
   /**
    * Get the max_ifc_array_access pointer
    *
@@ -1477,10 +1481,17 @@ enum ir_expression_operation {
    */
   ir_unop_ssbo_unsized_array_length,

+   /**
+    * Vote among threads on the value of the boolean argument.
+    */
+   ir_unop_vote_any,
+   ir_unop_vote_all,
+   ir_unop_vote_eq,
+
   /**
    * A sentinel marking the last of the unary operations.
    */
-   ir_last_unop = ir_unop_ssbo_unsized_array_length,
+   ir_last_unop = ir_unop_vote_eq,

   ir_binop_add,
   ir_binop_sub,
--- a/src/compiler/glsl/ir_constant_expression.cpp
+++ b/src/compiler/glsl/ir_constant_expression.cpp
--- a/src/compiler/glsl/ir_set_program_inouts.cpp
+++ b/src/compiler/glsl/ir_set_program_inouts.cpp
@@ -119,7 +119,7 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,

         /* double inputs read is only for vertex inputs */
         if (stage == MESA_SHADER_VERTEX &&
-             var->type->without_array()->is_dual_slot_double())
+             var->type->without_array()->is_dual_slot())
            prog->DoubleInputsRead |= bitfield;

         if (stage == MESA_SHADER_FRAGMENT) {
@@ -306,7 +306,7 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
   /* double element width for double types that takes two slots */
   if (this->shader_stage != MESA_SHADER_VERTEX ||
       var->data.mode != ir_var_shader_in) {
-      if (type->without_array()->is_dual_slot_double())
+      if (type->without_array()->is_dual_slot())
 	 elem_width *= 2;
   }

--- a/src/compiler/glsl/ir_validate.cpp
+++ b/src/compiler/glsl/ir_validate.cpp
@@ -453,6 +453,14 @@ ir_validate::visit_leave(ir_expression *ir)
      assert(ir->operands[0]->type->base_type == GLSL_TYPE_SUBROUTINE);
      assert(ir->type->base_type == GLSL_TYPE_INT);
      break;
+
+   case ir_unop_vote_any:
+   case ir_unop_vote_all:
+   case ir_unop_vote_eq:
+      assert(ir->type == glsl_type::bool_type);
+      assert(ir->operands[0]->type == glsl_type::bool_type);
+      break;
+
   case ir_binop_add:
   case ir_binop_sub:
   case ir_binop_mul:
--- a/src/compiler/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/compiler/glsl/link_uniform_block_active_visitor.cpp
@@ -167,8 +167,7 @@ link_uniform_block_active_visitor::visit(ir_variable *var)
    *     also considered active, even if no member of the block is
    *     referenced."
    */
-   if (var->get_interface_type()->interface_packing ==
-       GLSL_INTERFACE_PACKING_PACKED)
+   if (var->get_interface_type_packing() == GLSL_INTERFACE_PACKING_PACKED)
      return visit_continue;

   /* Process the block.  Bail if there was an error.
@@ -258,8 +257,7 @@ link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir)
    * std140 layout qualifier, all its instances have been already marked
    * as used in link_uniform_block_active_visitor::visit(ir_variable *).
    */
-   if (var->get_interface_type()->interface_packing ==
-       GLSL_INTERFACE_PACKING_PACKED) {
+   if (var->get_interface_type_packing() == GLSL_INTERFACE_PACKING_PACKED) {
      b->var = var;
      process_arrays(this->mem_ctx, ir, b);
   }
--- a/src/compiler/glsl/link_uniform_blocks.cpp
+++ b/src/compiler/glsl/link_uniform_blocks.cpp
@@ -70,7 +70,7 @@ private:
   }

   virtual void enter_record(const glsl_type *type, const char *,
-                             bool row_major, const unsigned packing) {
+                             bool row_major, const enum glsl_interface_packing packing) {
      assert(type->is_record());
      if (packing == GLSL_INTERFACE_PACKING_STD430)
         this->offset = glsl_align(
@@ -81,7 +81,7 @@ private:
   }

   virtual void leave_record(const glsl_type *type, const char *,
-                             bool row_major, const unsigned packing) {
+                             bool row_major, const enum glsl_interface_packing packing) {
      assert(type->is_record());

      /* If this is the last field of a structure, apply rule #9.  The
@@ -106,7 +106,7 @@ private:

   virtual void visit_field(const glsl_type *type, const char *name,
                            bool row_major, const glsl_type *,
-                            const unsigned packing,
+                            const enum glsl_interface_packing packing,
                            bool last_field)
   {
      assert(this->index < this->num_variables);
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -222,7 +222,7 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
 	 val->array_elements[0]->type->base_type;
      const unsigned int elements = val->array_elements[0]->type->components();
      unsigned int idx = 0;
-      unsigned dmul = (base_type == GLSL_TYPE_DOUBLE) ? 2 : 1;
+      unsigned dmul = glsl_base_type_is_64bit(base_type) ? 2 : 1;

      assert(val->type->length >= storage->array_elements);
      for (unsigned int i = 0; i < storage->array_elements; i++) {
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -65,7 +65,7 @@ program_resource_visitor::process(const glsl_type *type, const char *name)

   unsigned record_array_count = 1;
   char *name_copy = ralloc_strdup(NULL, name);
-   unsigned packing = type->interface_packing;
+   enum glsl_interface_packing packing = type->get_interface_packing();

   recursion(type, &name_copy, strlen(name), false, NULL, packing, false,
             record_array_count, NULL);
@@ -79,9 +79,9 @@ program_resource_visitor::process(ir_variable *var)
   const bool row_major =
      var->data.matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;

-   const unsigned packing = var->get_interface_type() ?
-      var->get_interface_type()->interface_packing :
-      var->type->interface_packing;
+   const enum glsl_interface_packing packing = var->get_interface_type() ?
+      var->get_interface_type_packing() :
+      var->type->get_interface_packing();

   const glsl_type *t =
      var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
@@ -116,7 +116,7 @@ void
 program_resource_visitor::recursion(const glsl_type *t, char **name,
                                    size_t name_length, bool row_major,
                                    const glsl_type *record_type,
-                                    const unsigned packing,
+                                    const enum glsl_interface_packing packing,
                                    bool last_field,
                                    unsigned record_array_count,
                                    const glsl_struct_field *named_ifc_member)
@@ -228,7 +228,7 @@ void
 program_resource_visitor::visit_field(const glsl_type *type, const char *name,
                                      bool row_major,
                                      const glsl_type *,
-                                      const unsigned,
+                                      const enum glsl_interface_packing,
                                      bool /* last_field */)
 {
   visit_field(type, name, row_major);
@@ -243,13 +243,13 @@ program_resource_visitor::visit_field(const glsl_struct_field *field)

 void
 program_resource_visitor::enter_record(const glsl_type *, const char *, bool,
-                                       const unsigned)
+                                       const enum glsl_interface_packing)
 {
 }

 void
 program_resource_visitor::leave_record(const glsl_type *, const char *, bool,
-                                       const unsigned)
+                                       const enum glsl_interface_packing)
 {
 }

@@ -402,7 +402,9 @@ private:
       * uniforms.
       */
      this->num_active_uniforms++;
-      this->num_values += values;
+
+      if(!is_gl_identifier(name) && !is_shader_storage)
+         this->num_values += values;
   }

   struct string_to_uint_map *hidden_map;
@@ -660,7 +662,7 @@ private:
   }

   virtual void enter_record(const glsl_type *type, const char *,
-                             bool row_major, const unsigned packing) {
+                             bool row_major, const enum glsl_interface_packing packing) {
      assert(type->is_record());
      if (this->buffer_block_index == -1)
         return;
@@ -673,7 +675,7 @@ private:
   }

   virtual void leave_record(const glsl_type *type, const char *,
-                             bool row_major, const unsigned packing) {
+                             bool row_major, const enum glsl_interface_packing packing) {
      assert(type->is_record());
      if (this->buffer_block_index == -1)
         return;
@@ -687,7 +689,7 @@ private:

   virtual void visit_field(const glsl_type *type, const char *name,
                            bool row_major, const glsl_type * /* record_type */,
-                            const unsigned packing,
+                            const enum glsl_interface_packing packing,
                            bool /* last_field */)
   {
      assert(!type->without_array()->is_record());
@@ -762,13 +764,14 @@ private:
         current_var->data.how_declared == ir_var_hidden;
      this->uniforms[id].builtin = is_gl_identifier(name);

-      /* Do not assign storage if the uniform is builtin */
-      if (!this->uniforms[id].builtin)
-         this->uniforms[id].storage = this->values;
-
      this->uniforms[id].is_shader_storage =
         current_var->is_in_shader_storage_block();

+      /* Do not assign storage to builtin or shader storage uniforms */
+      if (!this->uniforms[id].builtin &&
+          !this->uniforms[id].is_shader_storage)
+         this->uniforms[id].storage = this->values;
+
      if (this->buffer_block_index != -1) {
         this->uniforms[id].block_index = this->buffer_block_index;

@@ -819,7 +822,9 @@ private:
         this->uniforms[id].row_major = false;
      }

-      this->values += values_for_type(type);
+      if (!this->uniforms[id].builtin &&
+          !this->uniforms[id].is_shader_storage)
+         this->values += values_for_type(type);
   }

   /**
@@ -1251,7 +1256,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,

 #ifndef NDEBUG
   for (unsigned i = 0; i < num_uniforms; i++) {
-      assert(uniforms[i].storage != NULL || uniforms[i].builtin);
+      assert(uniforms[i].storage != NULL || uniforms[i].builtin ||
+             uniforms[i].is_shader_storage);
   }

   assert(parcel.values == data_end);
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -397,15 +397,15 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
         unsigned slot_limit = idx + num_elements;
         unsigned last_comp;

-         if (var->type->without_array()->is_record()) {
+         if (type->without_array()->is_record()) {
            /* The component qualifier can't be used on structs so just treat
             * all component slots as used.
             */
            last_comp = 4;
         } else {
-            unsigned dmul = var->type->is_double() ? 2 : 1;
+            unsigned dmul = type->without_array()->is_64bit() ? 2 : 1;
            last_comp = var->data.location_frac +
-               var->type->without_array()->vector_elements * dmul;
+               type->without_array()->vector_elements * dmul;
         }

         while (idx < slot_limit) {
@@ -425,7 +425,7 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
               for (unsigned j = 0; j < 4; j++) {
                  if (explicit_locations[idx][j] &&
                      (explicit_locations[idx][j]->type->without_array()
-                       ->base_type != var->type->without_array()->base_type)) {
+                       ->base_type != type->without_array()->base_type)) {
                     linker_error(prog,
                                  "Varyings sharing the same location must "
                                  "have the same underlying numerical type. "
@@ -443,7 +443,7 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,
                * worry about components beginning at anything other than 0 as
                * the spec does not allow this for dvec3 and dvec4.
                */
-               if (i == 3 && last_comp > 4) {
+               if (i == 4 && last_comp > 4) {
                  last_comp = last_comp - 4;
                  /* Bump location index and reset the component index */
                  idx++;
@@ -708,7 +708,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
      + this->matched_candidate->toplevel_var->data.location_frac
      + this->matched_candidate->offset;
   const unsigned dmul =
-      this->matched_candidate->type->without_array()->is_double() ? 2 : 1;
+      this->matched_candidate->type->without_array()->is_64bit() ? 2 : 1;

   if (this->matched_candidate->type->is_array()) {
      /* Array variable */
@@ -886,7 +886,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
   }

   if (explicit_stride && explicit_stride[buffer]) {
-      if (this->is_double() && info->Buffers[buffer].Stride % 2) {
+      if (this->is_64bit() && info->Buffers[buffer].Stride % 2) {
         linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
                      "multiple of 8 as its applied to a type that is or "
                      "contains a double.",
@@ -1937,7 +1937,7 @@ canonicalize_shader_io(exec_list *ir, enum ir_variable_mode io_mode)
 * 64 bit map. Per-vertex and per-patch both have separate location domains
 * with a max of MAX_VARYING.
 */
-static uint64_t
+uint64_t
 reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode)
 {
   assert(io_mode == ir_var_shader_in || io_mode == ir_var_shader_out);
@@ -1999,7 +1999,8 @@ assign_varying_locations(struct gl_context *ctx,
                         struct gl_shader_program *prog,
                         gl_shader *producer, gl_shader *consumer,
                         unsigned num_tfeedback_decls,
-                         tfeedback_decl *tfeedback_decls)
+                         tfeedback_decl *tfeedback_decls,
+                         const uint64_t reserved_slots)
 {
   /* Tessellation shaders treat inputs and outputs as shared memory and can
    * access inputs and outputs of other invocations.
@@ -2177,10 +2178,6 @@ assign_varying_locations(struct gl_context *ctx,
      }
   }

-   const uint64_t reserved_slots =
-      reserved_varying_slot(producer, ir_var_shader_out) |
-      reserved_varying_slot(consumer, ir_var_shader_in);
-
   const unsigned slots_used = matches.assign_locations(prog, reserved_slots);
   matches.store_locations();

@@ -2263,14 +2260,16 @@ assign_varying_locations(struct gl_context *ctx,
 bool
 check_against_output_limit(struct gl_context *ctx,
                           struct gl_shader_program *prog,
-                           gl_shader *producer)
+                           gl_shader *producer,
+                           unsigned num_explicit_locations)
 {
-   unsigned output_vectors = 0;
+   unsigned output_vectors = num_explicit_locations;

   foreach_in_list(ir_instruction, node, producer->ir) {
      ir_variable *const var = node->as_variable();

-      if (var && var->data.mode == ir_var_shader_out &&
+      if (var && !var->data.explicit_location &&
+          var->data.mode == ir_var_shader_out &&
          var_counts_against_varying_limit(producer->Stage, var)) {
         /* outputs for fragment shader can't be doubles */
         output_vectors += var->type->count_attribute_slots(false);
@@ -2305,14 +2304,16 @@ check_against_output_limit(struct gl_context *ctx,
 bool
 check_against_input_limit(struct gl_context *ctx,
                          struct gl_shader_program *prog,
-                          gl_shader *consumer)
+                          gl_shader *consumer,
+                          unsigned num_explicit_locations)
 {
-   unsigned input_vectors = 0;
+   unsigned input_vectors = num_explicit_locations;

   foreach_in_list(ir_instruction, node, consumer->ir) {
      ir_variable *const var = node->as_variable();

-      if (var && var->data.mode == ir_var_shader_in &&
+      if (var && !var->data.explicit_location &&
+          var->data.mode == ir_var_shader_in &&
          var_counts_against_varying_limit(consumer->Stage, var)) {
         /* vertex inputs aren't varying counted */
         input_vectors += var->type->count_attribute_slots(false);
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -151,7 +151,7 @@ public:
         return this->size;
      else
         return this->vector_elements * this->matrix_columns * this->size *
-            (this->is_double() ? 2 : 1);
+            (this->is_64bit() ? 2 : 1);
   }

   unsigned get_location() const {
@@ -160,7 +160,7 @@ public:

 private:

-   bool is_double() const
+   bool is_64bit() const
   {
      switch (this->type) {
      case GL_DOUBLE:
@@ -320,16 +320,22 @@ assign_varying_locations(struct gl_context *ctx,
 			 struct gl_shader_program *prog,
 			 gl_shader *producer, gl_shader *consumer,
                         unsigned num_tfeedback_decls,
-                         tfeedback_decl *tfeedback_decls);
+                         tfeedback_decl *tfeedback_decls,
+                         const uint64_t reserved_slots);
+
+uint64_t
+reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode);

 bool
 check_against_output_limit(struct gl_context *ctx,
                           struct gl_shader_program *prog,
-                           gl_shader *producer);
+                           gl_shader *producer,
+                           unsigned num_explicit_locations);

 bool
 check_against_input_limit(struct gl_context *ctx,
                          struct gl_shader_program *prog,
-                          gl_shader *consumer);
+                          gl_shader *consumer,
+                          unsigned num_explicit_locations);

 #endif /* GLSL_LINK_VARYINGS_H */
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -2863,7 +2863,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
             * issue (3) of the GL_ARB_vertex_attrib_64bit behavior, this
             * is optional behavior, but it seems preferable.
             */
-            if (var->type->without_array()->is_dual_slot_double())
+            if (var->type->without_array()->is_dual_slot())
               double_storage_locations |= (use_mask << attr);
 	 }

@@ -2940,7 +2940,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
      to_assign[i].var->data.is_unmatched_generic_inout = 0;
      used_locations |= (use_mask << location);

-      if (to_assign[i].var->type->without_array()->is_dual_slot_double())
+      if (to_assign[i].var->type->without_array()->is_dual_slot())
         double_storage_locations |= (use_mask << location);
   }

@@ -4850,9 +4850,12 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    */
   if (last < MESA_SHADER_FRAGMENT &&
       (num_tfeedback_decls != 0 || prog->SeparateShader)) {
+      const uint64_t reserved_out_slots =
+         reserved_varying_slot(prog->_LinkedShaders[last], ir_var_shader_out);
      if (!assign_varying_locations(ctx, mem_ctx, prog,
                                    prog->_LinkedShaders[last], NULL,
-                                    num_tfeedback_decls, tfeedback_decls))
+                                    num_tfeedback_decls, tfeedback_decls,
+                                    reserved_out_slots))
         goto done;
   }

@@ -4870,6 +4873,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)

         gl_shader *const sh = prog->_LinkedShaders[last];
         if (prog->SeparateShader) {
+            const uint64_t reserved_slots =
+               reserved_varying_slot(sh, ir_var_shader_in);
+
            /* Assign input locations for SSO, output locations are already
             * assigned.
             */
@@ -4877,7 +4883,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                                          NULL /* producer */,
                                          sh /* consumer */,
                                          0 /* num_tfeedback_decls */,
-                                          NULL /* tfeedback_decls */))
+                                          NULL /* tfeedback_decls */,
+                                          reserved_slots))
               goto done;
         }

@@ -4898,9 +4905,15 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
            gl_shader *const sh_i = prog->_LinkedShaders[i];
            gl_shader *const sh_next = prog->_LinkedShaders[next];

+            const uint64_t reserved_out_slots =
+               reserved_varying_slot(sh_i, ir_var_shader_out);
+            const uint64_t reserved_in_slots =
+               reserved_varying_slot(sh_next, ir_var_shader_in);
+
            if (!assign_varying_locations(ctx, mem_ctx, prog, sh_i, sh_next,
                      next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
-                      tfeedback_decls))
+                      tfeedback_decls,
+                      reserved_out_slots | reserved_in_slots))
               goto done;

            do_dead_builtin_varyings(ctx, sh_i, sh_next,
@@ -4909,11 +4922,14 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)

            /* This must be done after all dead varyings are eliminated. */
            if (sh_i != NULL) {
-               if (!check_against_output_limit(ctx, prog, sh_i)) {
+               unsigned slots_used = _mesa_bitcount_64(reserved_out_slots);
+               if (!check_against_output_limit(ctx, prog, sh_i, slots_used)) {
                  goto done;
               }
            }
-            if (!check_against_input_limit(ctx, prog, sh_next))
+
+            unsigned slots_used = _mesa_bitcount_64(reserved_in_slots);
+            if (!check_against_input_limit(ctx, prog, sh_next, slots_used))
               goto done;

            next = i;
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -156,7 +156,7 @@ protected:
    */
   virtual void visit_field(const glsl_type *type, const char *name,
                            bool row_major, const glsl_type *record_type,
-                            const unsigned packing,
+                            const enum glsl_interface_packing packing,
                            bool last_field);

   /**
@@ -180,10 +180,10 @@ protected:
   virtual void visit_field(const glsl_struct_field *field);

   virtual void enter_record(const glsl_type *type, const char *name,
-                             bool row_major, const unsigned packing);
+                             bool row_major, const enum glsl_interface_packing packing);

   virtual void leave_record(const glsl_type *type, const char *name,
-                             bool row_major, const unsigned packing);
+                             bool row_major, const enum glsl_interface_packing packing);

   virtual void set_buffer_offset(unsigned offset);

@@ -199,7 +199,7 @@ private:
    */
   void recursion(const glsl_type *t, char **name, size_t name_length,
                  bool row_major, const glsl_type *record_type,
-                  const unsigned packing,
+                  const enum glsl_interface_packing packing,
                  bool last_field, unsigned record_array_count,
                  const glsl_struct_field *named_ifc_member);
 };
--- a/src/compiler/glsl/lower_buffer_access.cpp
+++ b/src/compiler/glsl/lower_buffer_access.cpp
@@ -114,7 +114,7 @@ lower_buffer_access::emit_access(void *mem_ctx,
            /* For a row-major matrix, the next column starts at the next
             * element.
             */
-            int size_mul = deref->type->is_double() ? 8 : 4;
+            int size_mul = deref->type->is_64bit() ? 8 : 4;
            emit_access(mem_ctx, is_write, col_deref, base_offset,
                        deref_offset + i * size_mul,
                        row_major, deref->type->matrix_columns, packing,
@@ -125,7 +125,7 @@ lower_buffer_access::emit_access(void *mem_ctx,
            /* std430 doesn't round up vec2 size to a vec4 size */
            if (packing == GLSL_INTERFACE_PACKING_STD430 &&
                deref->type->vector_elements == 2 &&
-                !deref->type->is_double()) {
+                !deref->type->is_64bit()) {
               size_mul = 8;
            } else {
               /* std140 always rounds the stride of arrays (and matrices) to a
@@ -137,7 +137,7 @@ lower_buffer_access::emit_access(void *mem_ctx,
                * machine units, the base alignment is 4N. For vec4, base
                * alignment is 4N.
                */
-               size_mul = (deref->type->is_double() &&
+               size_mul = (deref->type->is_64bit() &&
                           deref->type->vector_elements > 2) ? 32 : 16;
            }

@@ -159,7 +159,7 @@ lower_buffer_access::emit_access(void *mem_ctx,
         is_write ? write_mask : (1 << deref->type->vector_elements) - 1;
      insert_buffer_access(mem_ctx, deref, deref->type, offset, mask, -1);
   } else {
-      unsigned N = deref->type->is_double() ? 8 : 4;
+      unsigned N = deref->type->is_64bit() ? 8 : 4;

      /* We're dereffing a column out of a row-major matrix, so we
       * gather the vector from each stored row.
@@ -328,7 +328,7 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
                                         bool *row_major,
                                         int *matrix_columns,
                                         const glsl_struct_field **struct_field,
-                                         unsigned packing)
+                                         enum glsl_interface_packing packing)
 {
   *offset = new(mem_ctx) ir_constant(0u);
   *row_major = is_dereferenced_thing_row_major(deref);
@@ -358,7 +358,7 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
             * thread or SIMD channel is modifying the same vector.
             */
            array_stride = 4;
-            if (deref_array->array->type->is_double())
+            if (deref_array->array->type->is_64bit())
               array_stride *= 2;
         } else if (deref_array->array->type->is_matrix() && *row_major) {
            /* When loading a vector out of a row major matrix, the
@@ -367,7 +367,7 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx,
             * vector) is handled below in emit_ubo_loads.
             */
            array_stride = 4;
-            if (deref_array->array->type->is_double())
+            if (deref_array->array->type->is_64bit())
               array_stride *= 2;
            *matrix_columns = deref_array->array->type->matrix_columns;
         } else if (deref_array->type->without_array()->is_interface()) {
--- a/src/compiler/glsl/lower_buffer_access.h
+++ b/src/compiler/glsl/lower_buffer_access.h
@@ -58,7 +58,7 @@ public:
                            ir_rvalue **offset, unsigned *const_offset,
                            bool *row_major, int *matrix_columns,
                            const glsl_struct_field **struct_field,
-                            unsigned packing);
+                            enum glsl_interface_packing packing);
 };

 } /* namespace lower_buffer_access */
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -432,7 +432,7 @@ lower_packed_varyings_visitor::lower_rvalue(ir_rvalue *rvalue,
                                            bool gs_input_toplevel,
                                            unsigned vertex_index)
 {
-   unsigned dmul = rvalue->type->is_double() ? 2 : 1;
+   unsigned dmul = rvalue->type->is_64bit() ? 2 : 1;
   /* When gs_input_toplevel is set, we should be looking at a geometry shader
    * input array.
    */
@@ -480,7 +480,7 @@ lower_packed_varyings_visitor::lower_rvalue(ir_rvalue *rvalue,
      char right_swizzle_name[4] = { 0, 0, 0, 0 };

      left_components = 4 - fine_location % 4;
-      if (rvalue->type->is_double()) {
+      if (rvalue->type->is_64bit()) {
         /* We might actually end up with 0 left components! */
         left_components /= 2;
      }
@@ -676,7 +676,7 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
      return false;

   type = type->without_array();
-   if (type->vector_elements == 4 && !type->is_double())
+   if (type->vector_elements == 4 && !type->is_64bit())
      return false;
   return true;
 }
--- a/src/compiler/glsl/lower_shared_reference.cpp
+++ b/src/compiler/glsl/lower_shared_reference.cpp
@@ -138,7 +138,7 @@ lower_shared_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
   bool row_major;
   int matrix_columns;
   assert(var->get_interface_type() == NULL);
-   const unsigned packing = GLSL_INTERFACE_PACKING_STD430;
+   const enum glsl_interface_packing packing = GLSL_INTERFACE_PACKING_STD430;

   setup_buffer_access(mem_ctx, var, deref,
                       &offset, &const_offset,
@@ -206,7 +206,7 @@ lower_shared_reference_visitor::handle_assignment(ir_assignment *ir)
   bool row_major;
   int matrix_columns;
   assert(var->get_interface_type() == NULL);
-   const unsigned packing = GLSL_INTERFACE_PACKING_STD430;
+   const enum glsl_interface_packing packing = GLSL_INTERFACE_PACKING_STD430;

   setup_buffer_access(mem_ctx, var, deref,
                       &offset, &const_offset,
@@ -365,7 +365,7 @@ lower_shared_reference_visitor::lower_shared_atomic_intrinsic(ir_call *ir)
   bool row_major;
   int matrix_columns;
   assert(var->get_interface_type() == NULL);
-   const unsigned packing = GLSL_INTERFACE_PACKING_STD430;
+   const enum glsl_interface_packing packing = GLSL_INTERFACE_PACKING_STD430;
   buffer_access_type = shared_atomic_access;

   setup_buffer_access(mem_ctx, var, deref,
--- a/src/compiler/glsl/lower_ubo_reference.cpp
+++ b/src/compiler/glsl/lower_ubo_reference.cpp
@@ -61,7 +61,7 @@ public:
                                unsigned *const_offset,
                                bool *row_major,
                                int *matrix_columns,
-                                unsigned packing);
+                                enum glsl_interface_packing packing);
   uint32_t ssbo_access_params();
   ir_expression *ubo_load(void *mem_ctx, const struct glsl_type *type,
 			   ir_rvalue *offset);
@@ -99,7 +99,7 @@ public:
   ir_expression *emit_ssbo_get_buffer_size(void *mem_ctx);

   unsigned calculate_unsized_array_stride(ir_dereference *deref,
-                                           unsigned packing);
+                                           enum glsl_interface_packing packing);

   ir_call *lower_ssbo_atomic_intrinsic(ir_call *ir);
   ir_call *check_for_ssbo_atomic_intrinsic(ir_call *ir);
@@ -273,7 +273,7 @@ lower_ubo_reference_visitor::setup_for_load_or_store(void *mem_ctx,
                                                     unsigned *const_offset,
                                                     bool *row_major,
                                                     int *matrix_columns,
-                                                     unsigned packing)
+                                                     enum glsl_interface_packing packing)
 {
   /* Determine the name of the interface block */
   ir_rvalue *nonconst_block_index;
@@ -344,7 +344,7 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
   unsigned const_offset;
   bool row_major;
   int matrix_columns;
-   unsigned packing = var->get_interface_type()->interface_packing;
+   enum glsl_interface_packing packing = var->get_interface_type_packing();

   this->buffer_access_type =
      var->is_in_shader_storage_block() ?
@@ -557,7 +557,7 @@ lower_ubo_reference_visitor::write_to_memory(void *mem_ctx,
   unsigned const_offset;
   bool row_major;
   int matrix_columns;
-   unsigned packing = var->get_interface_type()->interface_packing;
+   enum glsl_interface_packing packing = var->get_interface_type_packing();

   this->buffer_access_type = ssbo_store_access;
   this->variable = var;
@@ -666,7 +666,7 @@ lower_ubo_reference_visitor::emit_ssbo_get_buffer_size(void *mem_ctx)

 unsigned
 lower_ubo_reference_visitor::calculate_unsized_array_stride(ir_dereference *deref,
-                                                            unsigned packing)
+                                                            enum glsl_interface_packing packing)
 {
   unsigned array_stride = 0;

@@ -736,7 +736,7 @@ lower_ubo_reference_visitor::process_ssbo_unsized_array_length(ir_rvalue **rvalu
   unsigned const_offset;
   bool row_major;
   int matrix_columns;
-   unsigned packing = var->get_interface_type()->interface_packing;
+   enum glsl_interface_packing packing = var->get_interface_type_packing();
   int unsized_array_stride = calculate_unsized_array_stride(deref, packing);

   this->buffer_access_type = ssbo_unsized_array_length_access;
@@ -970,7 +970,7 @@ lower_ubo_reference_visitor::lower_ssbo_atomic_intrinsic(ir_call *ir)
   unsigned const_offset;
   bool row_major;
   int matrix_columns;
-   unsigned packing = var->get_interface_type()->interface_packing;
+   enum glsl_interface_packing packing = var->get_interface_type_packing();

   this->buffer_access_type = ssbo_atomic_access;
   this->variable = var;
--- a/src/compiler/glsl/opt_copy_propagation.cpp
+++ b/src/compiler/glsl/opt_copy_propagation.cpp
@@ -83,6 +83,7 @@ public:
   }

   virtual ir_visitor_status visit(class ir_dereference_variable *);
+   void handle_loop(class ir_loop *, bool keep_acp);
   virtual ir_visitor_status visit_enter(class ir_loop *);
   virtual ir_visitor_status visit_enter(class ir_function_signature *);
   virtual ir_visitor_status visit_enter(class ir_function *);
@@ -252,21 +253,24 @@ ir_copy_propagation_visitor::visit_enter(ir_if *ir)
   return visit_continue_with_parent;
 }

-ir_visitor_status
-ir_copy_propagation_visitor::visit_enter(ir_loop *ir)
+void
+ir_copy_propagation_visitor::handle_loop(ir_loop *ir, bool keep_acp)
 {
   exec_list *orig_acp = this->acp;
   exec_list *orig_kills = this->kills;
   bool orig_killed_all = this->killed_all;

-   /* FINISHME: For now, the initial acp for loops is totally empty.
-    * We could go through once, then go through again with the acp
-    * cloned minus the killed entries after the first run through.
-    */
   this->acp = new(mem_ctx) exec_list;
   this->kills = new(mem_ctx) exec_list;
   this->killed_all = false;

+   if (keep_acp) {
+      /* Populate the initial acp with a copy of the original */
+      foreach_in_list(acp_entry, a, orig_acp) {
+         this->acp->push_tail(new(this->acp) acp_entry(a->lhs, a->rhs));
+      }
+   }
+
   visit_list_elements(this, &ir->body_instructions);

   if (this->killed_all) {
@@ -284,6 +288,20 @@ ir_copy_propagation_visitor::visit_enter(ir_loop *ir)
   }

   ralloc_free(new_kills);
+}
+
+ir_visitor_status
+ir_copy_propagation_visitor::visit_enter(ir_loop *ir)
+{
+   /* Make a conservative first pass over the loop with an empty ACP set.
+    * This also removes any killed entries from the original ACP set.
+    */
+   handle_loop(ir, false);
+
+   /* Then, run it again with the real ACP set, minus any killed entries.
+    * This takes care of propagating values from before the loop into it.
+    */
+   handle_loop(ir, true);

   /* already descended into the children. */
   return visit_continue_with_parent;
--- a/src/compiler/glsl/opt_copy_propagation_elements.cpp
+++ b/src/compiler/glsl/opt_copy_propagation_elements.cpp
@@ -106,6 +106,7 @@ public:
      ralloc_free(mem_ctx);
   }

+   void handle_loop(ir_loop *, bool keep_acp);
   virtual ir_visitor_status visit_enter(class ir_loop *);
   virtual ir_visitor_status visit_enter(class ir_function_signature *);
   virtual ir_visitor_status visit_leave(class ir_assignment *);
@@ -374,8 +375,8 @@ ir_copy_propagation_elements_visitor::visit_enter(ir_if *ir)
   return visit_continue_with_parent;
 }

-ir_visitor_status
-ir_copy_propagation_elements_visitor::visit_enter(ir_loop *ir)
+void
+ir_copy_propagation_elements_visitor::handle_loop(ir_loop *ir, bool keep_acp)
 {
   exec_list *orig_acp = this->acp;
   exec_list *orig_kills = this->kills;
@@ -389,6 +390,13 @@ ir_copy_propagation_elements_visitor::visit_enter(ir_loop *ir)
   this->kills = new(mem_ctx) exec_list;
   this->killed_all = false;

+   if (keep_acp) {
+      /* Populate the initial acp with a copy of the original */
+      foreach_in_list(acp_entry, a, orig_acp) {
+         this->acp->push_tail(new(this->acp) acp_entry(a));
+      }
+   }
+
   visit_list_elements(this, &ir->body_instructions);

   if (this->killed_all) {
@@ -406,6 +414,13 @@ ir_copy_propagation_elements_visitor::visit_enter(ir_loop *ir)
   }

   ralloc_free(new_kills);
+}
+
+ir_visitor_status
+ir_copy_propagation_elements_visitor::visit_enter(ir_loop *ir)
+{
+   handle_loop(ir, false);
+   handle_loop(ir, true);

   /* already descended into the children. */
   return visit_continue_with_parent;
--- a/src/compiler/glsl/opt_dead_code.cpp
+++ b/src/compiler/glsl/opt_dead_code.cpp
@@ -144,7 +144,7 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
             * layouts, do not eliminate it.
             */
            if (entry->var->is_in_buffer_block()) {
-               if (entry->var->get_interface_type()->interface_packing !=
+               if (entry->var->get_interface_type_packing() !=
                   GLSL_INTERFACE_PACKING_PACKED)
                  continue;
            }
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -1434,7 +1434,7 @@ glsl_type::can_implicitly_convert_to(const glsl_type *desired,
 unsigned
 glsl_type::std140_base_alignment(bool row_major) const
 {
-   unsigned N = is_double() ? 8 : 4;
+   unsigned N = is_64bit() ? 8 : 4;

   /* (1) If the member is a scalar consuming <N> basic machine units, the
    *     base alignment is <N>.
@@ -1552,7 +1552,7 @@ glsl_type::std140_base_alignment(bool row_major) const
 unsigned
 glsl_type::std140_size(bool row_major) const
 {
-   unsigned N = is_double() ? 8 : 4;
+   unsigned N = is_64bit() ? 8 : 4;

   /* (1) If the member is a scalar consuming <N> basic machine units, the
    *     base alignment is <N>.
@@ -1689,7 +1689,7 @@ unsigned
 glsl_type::std430_base_alignment(bool row_major) const
 {

-   unsigned N = is_double() ? 8 : 4;
+   unsigned N = is_64bit() ? 8 : 4;

   /* (1) If the member is a scalar consuming <N> basic machine units, the
    *     base alignment is <N>.
@@ -1798,7 +1798,7 @@ glsl_type::std430_base_alignment(bool row_major) const
 unsigned
 glsl_type::std430_array_stride(bool row_major) const
 {
-   unsigned N = is_double() ? 8 : 4;
+   unsigned N = is_64bit() ? 8 : 4;

   /* Notice that the array stride of a vec3 is not 3 * N but 4 * N.
    * See OpenGL 4.30 spec, section 7.6.2.2 "Standard Uniform Block Layout"
@@ -1816,7 +1816,7 @@ glsl_type::std430_array_stride(bool row_major) const
 unsigned
 glsl_type::std430_size(bool row_major) const
 {
-   unsigned N = is_double() ? 8 : 4;
+   unsigned N = is_64bit() ? 8 : 4;

   /* OpenGL 4.30 spec, section 7.6.2.2 "Standard Uniform Block Layout":
    *
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -64,6 +64,11 @@ enum glsl_base_type {
   GLSL_TYPE_ERROR
 };

+static inline bool glsl_base_type_is_64bit(enum glsl_base_type type)
+{
+   return type == GLSL_TYPE_DOUBLE;
+}
+
 enum glsl_sampler_dim {
   GLSL_SAMPLER_DIM_1D = 0,
   GLSL_SAMPLER_DIM_2D,
@@ -490,11 +495,19 @@ struct glsl_type {
   }

   /**
-    * Query whether a double takes two slots.
+    * Query whether a 64-bit type takes two slots.
    */
-   bool is_dual_slot_double() const
+   bool is_dual_slot() const
   {
-      return base_type == GLSL_TYPE_DOUBLE && vector_elements > 2;
+      return is_64bit() && vector_elements > 2;
+   }
+
+   /**
+    * Query whether or not a type is 64-bit
+    */
+   bool is_64bit() const
+   {
+      return glsl_base_type_is_64bit(base_type);
   }

   /**
@@ -745,6 +758,14 @@ struct glsl_type {
    */
   bool record_compare(const glsl_type *b, bool match_locations = true) const;

+   /**
+    * Get the type interface packing.
+    */
+   enum glsl_interface_packing get_interface_packing() const
+   {
+      return (enum glsl_interface_packing)interface_packing;
+   }
+
 private:

   static mtx_t mutex;
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1651,6 +1651,9 @@ typedef struct nir_shader_compiler_options {
   /* lower {slt,sge,seq,sne} to {flt,fge,feq,fne} + b2f: */
   bool lower_scmp;

+   /** enables rules to lower idiv by power-of-two: */
+   bool lower_idiv;
+
   /* Does the native fdot instruction replicate its result for four
    * components?  If so, then opt_algebraic_late will turn all fdotN
    * instructions into fdot_replicatedN instructions.
@@ -1720,9 +1723,6 @@ typedef struct nir_shader_info {
   /* Whether or not this shader ever uses textureGather() */
   bool uses_texture_gather;

-   /** Whether or not this shader uses nir_intrinsic_interp_var_at_offset */
-   bool uses_interp_var_at_offset;
-
   /* Whether or not this shader uses the gl_ClipDistance output */
   bool uses_clip_distance_out;

--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -76,6 +76,7 @@ class Value(object):
         return Constant(val, name_base)

   __template = mako.template.Template("""
+#include "compiler/nir/nir_search_helpers.h"
 static const ${val.c_type} ${val.name} = {
   { ${val.type_enum}, ${val.bit_size} },
 % if isinstance(val, Constant):
@@ -84,6 +85,7 @@ static const ${val.c_type} ${val.name} = {
   ${val.index}, /* ${val.var_name} */
   ${'true' if val.is_constant else 'false'},
   ${val.type() or 'nir_type_invalid' },
+   ${val.cond if val.cond else 'NULL'},
 % elif isinstance(val, Expression):
   ${'true' if val.inexact else 'false'},
   nir_op_${val.opcode},
@@ -113,7 +115,7 @@ static const ${val.c_type} ${val.name} = {
                                    Variable=Variable,
                                    Expression=Expression)

-_constant_re = re.compile(r"(?P<value>[^@]+)(?:@(?P<bits>\d+))?")
+_constant_re = re.compile(r"(?P<value>[^@\(]+)(?:@(?P<bits>\d+))?")

 class Constant(Value):
   def __init__(self, val, name):
@@ -150,7 +152,8 @@ class Constant(Value):
         return "nir_type_float"

 _var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)"
-                          r"(?:@(?P<type>int|uint|bool|float)?(?P<bits>\d+)?)?")
+                          r"(?:@(?P<type>int|uint|bool|float)?(?P<bits>\d+)?)?"
+                          r"(?P<cond>\([^\)]+\))?")

 class Variable(Value):
   def __init__(self, val, name, varset):
@@ -161,6 +164,7 @@ class Variable(Value):

      self.var_name = m.group('name')
      self.is_constant = m.group('const') is not None
+      self.cond = m.group('cond')
      self.required_type = m.group('type')
      self.bit_size = int(m.group('bits')) if m.group('bits') else 0

--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@@ -57,10 +57,6 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
      shader->info.gs.uses_end_primitive = 1;
      break;

-   case nir_intrinsic_interp_var_at_offset:
-      shader->info.uses_interp_var_at_offset = 1;
-      break;
-
   default:
      break;
   }
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -45,10 +45,11 @@ d = 'd'
 # however, be used for backend-requested lowering operations as those need to
 # happen regardless of precision.
 #
-# Variable names are specified as "[#]name[@type]" where "#" inicates that
-# the given variable will only match constants and the type indicates that
+# Variable names are specified as "[#]name[@type][(cond)]" where "#" indicates
+# that the given variable will only match constants and the type indicates that
 # the given variable will only match values from ALU instructions with the
-# given output type.
+# given output type, and (cond) specifies an additional condition function
+# (see nir_search_helpers.h).
 #
 # For constants, you have to be careful to make sure that it is the right
 # type because python is unaware of the source and destination types of the
@@ -62,6 +63,14 @@ d = 'd'
 # constructed value should have that bit-size.

 optimizations = [
+
+   (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
+   (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
+   (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))),
+   (('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
+   (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
+   (('umod', a, '#b(is_pos_power_of_two)'),    ('iand', a, ('isub', b, 1))),
+
   (('fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fabs', a)), ('fabs', a)),
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -127,6 +127,9 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
             instr->src[src].src.ssa->parent_instr->type != nir_instr_type_load_const)
            return false;

+         if (var->cond && !var->cond(instr, src, num_components, new_swizzle))
+            return false;
+
         if (var->type != nir_type_invalid) {
            if (instr->src[src].src.ssa->parent_instr->type != nir_instr_type_alu)
               return false;
--- a/src/compiler/nir/nir_search.h
+++ b/src/compiler/nir/nir_search.h
@@ -68,6 +68,16 @@ typedef struct {
    * never match anything.
    */
   nir_alu_type type;
+
+   /** Optional condition function pointer
+    *
+    * This is only allowed in search expressions, and allows additional
+    * constraints to be placed on the match.  Typically used for 'is_constant'
+    * variables to require, for example, power-of-two in order for the search
+    * to match.
+    */
+   bool (*cond)(nir_alu_instr *instr, unsigned src,
+                unsigned num_components, const uint8_t *swizzle);
 } nir_search_variable;

 typedef struct {
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2016 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef _NIR_SEARCH_HELPERS_
+#define _NIR_SEARCH_HELPERS_
+
+#include "nir.h"
+
+static inline bool
+__is_power_of_two(unsigned int x)
+{
+   return ((x != 0) && !(x & (x - 1))); /* nonzero with exactly one bit set */
+}
+
+static inline bool
+is_pos_power_of_two(nir_alu_instr *instr, unsigned src, unsigned num_components,
+                    const uint8_t *swizzle)
+{
+   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);
+
+   /* Only constant sources can match. */
+   if (!val)
+      return false;
+   /* Each swizzled 32-bit component must be a positive power of two. */
+   for (unsigned i = 0; i < num_components; i++) {
+      switch (nir_op_infos[instr->op].input_types[src]) {
+      case nir_type_int:
+         if (val->i32[swizzle[i]] < 0)
+            return false;
+         if (!__is_power_of_two(val->i32[swizzle[i]]))
+            return false;
+         break;
+      case nir_type_uint:
+         if (!__is_power_of_two(val->u32[swizzle[i]]))
+            return false;
+         break;
+      default:
+         return false; /* any other input type never matches */
+      }
+   }
+
+   return true;
+}
+
+/* True if src is a constant whose swizzled components are all -(2^n). */
+static inline bool
+is_neg_power_of_two(nir_alu_instr *instr, unsigned src, unsigned num_components,
+                    const uint8_t *swizzle)
+{
+   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);
+
+   /* only constant src's: */
+   if (!val)
+      return false;
+
+   for (unsigned i = 0; i < num_components; i++) {
+      /* Only signed-integer sources can hold a negative power of two. */
+      if (nir_op_infos[instr->op].input_types[src] != nir_type_int)
+         return false;
+
+      int32_t v = val->i32[swizzle[i]];
+      if (v > 0)
+         return false;
+      /* abs(INT32_MIN) is undefined; negate in unsigned arithmetic instead. */
+      if (!__is_power_of_two(0u - (uint32_t)v))
+         return false;
+   }
+   return true;
+}
+
+#endif /* _NIR_SEARCH_HELPERS_ */
--- a/src/egl/Android.mk
+++ b/src/egl/Android.mk
@@ -61,12 +61,6 @@ ifeq ($(shell echo "$(MESA_ANDROID_VERSION) >= 4.2" | bc),1)
 LOCAL_SHARED_LIBRARIES += libsync
 endif

-# add libdrm if there are hardware drivers
-ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DHAVE_LIBDRM
-LOCAL_SHARED_LIBRARIES += libdrm
-endif
-
 ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
 # require i915_dri and/or i965_dri
 LOCAL_REQUIRED_MODULES += \
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -160,8 +160,14 @@ droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf)
 }

 static EGLBoolean
-droid_window_enqueue_buffer(struct dri2_egl_surface *dri2_surf)
+droid_window_enqueue_buffer(_EGLDisplay *disp, struct dri2_egl_surface *dri2_surf)
 {
+   /* To avoid blocking other EGL calls, release the display mutex while
+    * the buffer is queued to the ANativeWindow and re-acquire the mutex
+    * before returning.
+    */
+   mtx_unlock(&disp->Mutex);
+
 #if ANDROID_VERSION >= 0x0402
   /* Queue the buffer without a sync fence. This informs the ANativeWindow
    * that it may access the buffer immediately.
@@ -185,14 +191,15 @@ droid_window_enqueue_buffer(struct dri2_egl_surface *dri2_surf)
   dri2_surf->buffer->common.decRef(&dri2_surf->buffer->common);
   dri2_surf->buffer = NULL;

+   mtx_lock(&disp->Mutex);
   return EGL_TRUE;
 }

 static void
-droid_window_cancel_buffer(struct dri2_egl_surface *dri2_surf)
+droid_window_cancel_buffer(_EGLDisplay *disp, struct dri2_egl_surface *dri2_surf)
 {
   /* no cancel buffer? */
-   droid_window_enqueue_buffer(dri2_surf);
+   droid_window_enqueue_buffer(disp, dri2_surf);
 }

 static __DRIbuffer *
@@ -325,7 +332,7 @@ droid_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)

   if (dri2_surf->base.Type == EGL_WINDOW_BIT) {
      if (dri2_surf->buffer)
-         droid_window_cancel_buffer(dri2_surf);
+         droid_window_cancel_buffer(disp, dri2_surf);

      dri2_surf->window->common.decRef(&dri2_surf->window->common);
   }
@@ -435,7 +442,7 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
   dri2_flush_drawable_for_swapbuffers(disp, draw);

   if (dri2_surf->buffer)
-      droid_window_enqueue_buffer(dri2_surf);
+      droid_window_enqueue_buffer(disp, dri2_surf);

   (*dri2_dpy->flush->invalidate)(dri2_surf->dri_drawable);

--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -236,6 +236,12 @@ _eglParseSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list)
      }

      if (type == EGL_PBUFFER_BIT) {
+         if (tex_target == -1)
+            tex_target = surf->TextureTarget;
+
+         if (tex_format == -1)
+            tex_format = surf->TextureFormat;
+
         if ((tex_target == EGL_NO_TEXTURE && tex_format != EGL_NO_TEXTURE) ||
             (tex_format == EGL_NO_TEXTURE && tex_target != EGL_NO_TEXTURE)) {
            err = EGL_BAD_MATCH;
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -288,8 +288,6 @@ C_SOURCES := \
 	util/u_slab.h \
 	util/u_split_prim.h \
 	util/u_sse.h \
-	util/u_staging.c \
-	util/u_staging.h \
 	util/u_string.h \
 	util/u_suballoc.c \
 	util/u_suballoc.h \
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -91,6 +91,9 @@ struct cso_context {
   struct pipe_constant_buffer aux_constbuf_current[PIPE_SHADER_TYPES];
   struct pipe_constant_buffer aux_constbuf_saved[PIPE_SHADER_TYPES];

+   struct pipe_image_view fragment_image0_current;
+   struct pipe_image_view fragment_image0_saved;
+
   unsigned nr_so_targets;
   struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS];

@@ -371,6 +374,9 @@ void cso_destroy_context( struct cso_context *ctx )
      pipe_resource_reference(&ctx->aux_constbuf_saved[i].buffer, NULL);
   }

+   pipe_resource_reference(&ctx->fragment_image0_current.resource, NULL);
+   pipe_resource_reference(&ctx->fragment_image0_saved.resource, NULL);
+
   for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      pipe_so_target_reference(&ctx->so_targets[i], NULL);
      pipe_so_target_reference(&ctx->so_targets_saved[i], NULL);
@@ -1352,6 +1358,35 @@ cso_restore_fragment_sampler_views(struct cso_context *ctx)
 }


+void
+cso_set_shader_images(struct cso_context *ctx, unsigned shader_stage,
+                      unsigned start, unsigned count,
+                      struct pipe_image_view *images)
+{
+   if (shader_stage == PIPE_SHADER_FRAGMENT && start == 0 && count >= 1) {
+      util_copy_image_view(&ctx->fragment_image0_current, &images[0]);
+   }
+   /* Only fragment image 0 is tracked; the full range goes to the pipe. */
+   ctx->pipe->set_shader_images(ctx->pipe, shader_stage, start, count, images);
+}
+
+/* Copy the current fragment image 0 view into the saved slot. */
+static void
+cso_save_fragment_image0(struct cso_context *ctx)
+{
+   util_copy_image_view(&ctx->fragment_image0_saved,
+                        &ctx->fragment_image0_current);
+}
+
+/* Re-bind the saved fragment image 0 view through cso_set_shader_images. */
+static void
+cso_restore_fragment_image0(struct cso_context *ctx)
+{
+   cso_set_shader_images(ctx, PIPE_SHADER_FRAGMENT, 0, 1,
+                         &ctx->fragment_image0_saved);
+}
+
+
 void
 cso_set_stream_outputs(struct cso_context *ctx,
                       unsigned num_targets,
@@ -1541,6 +1576,8 @@ cso_save_state(struct cso_context *cso, unsigned state_mask)
      cso_save_viewport(cso);
   if (state_mask & CSO_BIT_PAUSE_QUERIES)
      cso->pipe->set_active_query_state(cso->pipe, false);
+   if (state_mask & CSO_BIT_FRAGMENT_IMAGE0)
+      cso_save_fragment_image0(cso);
 }


@@ -1594,6 +1631,8 @@ cso_restore_state(struct cso_context *cso)
      cso_restore_viewport(cso);
   if (state_mask & CSO_BIT_PAUSE_QUERIES)
      cso->pipe->set_active_query_state(cso->pipe, true);
+   if (state_mask & CSO_BIT_FRAGMENT_IMAGE0)
+      cso_restore_fragment_image0(cso);

   cso->saved_state = 0;
 }
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -171,6 +171,7 @@ void cso_set_render_condition(struct cso_context *cso,
 #define CSO_BIT_VERTEX_SHADER         0x20000
 #define CSO_BIT_VIEWPORT              0x40000
 #define CSO_BIT_PAUSE_QUERIES         0x80000
+#define CSO_BIT_FRAGMENT_IMAGE0      0x100000

 #define CSO_BITS_ALL_SHADERS (CSO_BIT_VERTEX_SHADER | \
                              CSO_BIT_FRAGMENT_SHADER | \
@@ -191,6 +192,14 @@ cso_set_sampler_views(struct cso_context *cso,
                      struct pipe_sampler_view **views);


+/* shader images */
+
+void
+cso_set_shader_images(struct cso_context *cso, unsigned shader_stage,
+                      unsigned start, unsigned count,
+                      struct pipe_image_view *views);
+
+
 /* constant buffers */

 void cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage,
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1123,10 +1123,8 @@ generate_viewport(struct draw_llvm_variant *variant,

      /* divide by w */
      out = LLVMBuildFMul(builder, out, out3, "");
-      /* mult by scale */
-      out = LLVMBuildFMul(builder, out, scale, "");
-      /* add translation */
-      out = LLVMBuildFAdd(builder, out, trans, "");
+      /* mult by scale, add translation */
+      out = lp_build_fmuladd(builder, out, scale, trans);

      /* store transformed outputs */
      LLVMBuildStore(builder, out, outputs[pos][i]);
@@ -1303,22 +1301,19 @@ generate_clipmask(struct draw_llvm *llvm,
            plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
            plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y");
            planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
-            test = LLVMBuildFMul(builder, planes, cv_y, "");
-            sum = LLVMBuildFAdd(builder, sum, test, "");
+            sum = lp_build_fmuladd(builder, planes, cv_y, sum);

            indices[2] = lp_build_const_int32(gallivm, 2);
            plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
            plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z");
            planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
-            test = LLVMBuildFMul(builder, planes, cv_z, "");
-            sum = LLVMBuildFAdd(builder, sum, test, "");
+            sum = lp_build_fmuladd(builder, planes, cv_z, sum);

            indices[2] = lp_build_const_int32(gallivm, 3);
            plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
            plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w");
            planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
-            test = LLVMBuildFMul(builder, planes, cv_w, "");
-            sum = LLVMBuildFAdd(builder, sum, test, "");
+            sum = lp_build_fmuladd(builder, planes, cv_w, sum);

            test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, sum);
            temp = lp_build_const_int_vec(gallivm, i32_type, 1LL << plane_idx);
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -50,7 +50,6 @@
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 #include "util/u_math.h"
-#include "util/u_string.h"
 #include "util/u_cpu_detect.h"

 #include "lp_bld_type.h"
@@ -262,6 +261,28 @@ lp_build_min_simple(struct lp_build_context *bld,
 }


+LLVMValueRef
+lp_build_fmuladd(LLVMBuilderRef builder,
+                 LLVMValueRef a,
+                 LLVMValueRef b,
+                 LLVMValueRef c)
+{
+   LLVMTypeRef type = LLVMTypeOf(a);
+   assert(type == LLVMTypeOf(b));
+   assert(type == LLVMTypeOf(c));
+   if (HAVE_LLVM < 0x0304) {
+      /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
+       * not supported, and instead it falls-back to a C function.
+       */
+      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
+   }
+   char intrinsic[32];
+   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
+   LLVMValueRef args[] = { a, b, c };
+   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
+}
+
+
 /**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
@@ -1023,6 +1044,22 @@ lp_build_mul(struct lp_build_context *bld,
 }


+/* a * b + c (lp_build_fmuladd for floating-point types, mul + add otherwise) */
+LLVMValueRef
+lp_build_mad(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b,
+             LLVMValueRef c)
+{
+   const struct lp_type type = bld->type;
+   if (type.floating) {
+      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
+   } else {
+      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
+   }
+}
+
+
 /**
 * Small vector x scale multiplication optimization.
 */
@@ -1153,6 +1190,11 @@ lp_build_lerp_simple(struct lp_build_context *bld,

   delta = lp_build_sub(bld, v1, v0);

+   if (bld->type.floating) {
+      assert(flags == 0);
+      return lp_build_mad(bld, x, delta, v0);
+   }
+
   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
@@ -2717,23 +2759,10 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
-    * xmm1 = _mm_mul_ps(y, xmm1);
-    * xmm2 = _mm_mul_ps(y, xmm2);
-    * xmm3 = _mm_mul_ps(y, xmm3);
    */
-   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
-   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
-   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
-
-   /*
-    * x = _mm_add_ps(x, xmm1);
-    * x = _mm_add_ps(x, xmm2);
-    * x = _mm_add_ps(x, xmm3);
-    */
-
-   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
-   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
-   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
+   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
+   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
+   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynom  (0 <= x <= Pi/4)
@@ -2755,10 +2784,8 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
-   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
-   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
-   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
-   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
+   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

@@ -2796,13 +2823,10 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
    * y2 = _mm_add_ps(y2, x);
    */

-   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
-   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
-   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
-   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
+   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
-   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
-   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
+   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynoms
@@ -2969,19 +2993,19 @@ lp_build_polynomial(struct lp_build_context *bld,

      if (i % 2 == 0) {
         if (even)
-            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
+            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
-            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
+            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
-      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
+      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
@@ -3212,7 +3236,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
-   LLVMValueRef logmant = NULL;
+   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));
@@ -3261,13 +3285,11 @@ lp_build_log2_approx(struct lp_build_context *bld,
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
-      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
-                                    ARRAY_SIZE(lp_build_log2_polynomial));
+      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
+                                ARRAY_SIZE(lp_build_log2_polynomial));

-      /* logmant = y * P(z) */
-      logmant = lp_build_mul(bld, y, logmant);
-
-      res = lp_build_add(bld, logmant, logexp);
+      /* y * P(z) + logexp */
+      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask,  zmask;
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -87,6 +87,21 @@ lp_build_div(struct lp_build_context *bld,
             LLVMValueRef b);


+/* llvm.fmuladd.* intrinsic */
+LLVMValueRef
+lp_build_fmuladd(LLVMBuilderRef builder,
+                 LLVMValueRef a,
+                 LLVMValueRef b,
+                 LLVMValueRef c);
+
+/* a * b + c */
+LLVMValueRef
+lp_build_mad(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b,
+             LLVMValueRef c);
+
+
 /**
 * Set when the weights for normalized are prescaled, that is, in range
 * 0..2**n, as opposed to range 0..2**(n-1).
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -311,7 +311,7 @@ lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
       * important, we also get exact results for 0.0 and 1.0.
       */

-      unsigned n = MIN2(src_type.width - 1, dst_width);
+      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
@@ -445,7 +445,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
 {
-   int i;
+   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
@@ -289,8 +289,7 @@ lp_build_linear_to_srgb(struct gallivm_state *gallivm,
      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);

      tmp = lp_build_mul(&f32_bld, a_const, x0375);
-      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
-      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
+      tmp2 = lp_build_mad(&f32_bld, b_const, x05, c_const);
      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
   }

--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -420,6 +420,7 @@ lp_build_init(void)
      util_cpu_caps.has_avx = 0;
      util_cpu_caps.has_avx2 = 0;
      util_cpu_caps.has_f16c = 0;
+      util_cpu_caps.has_fma = 0;
   }
 #endif

@@ -454,6 +455,7 @@ lp_build_init(void)
      util_cpu_caps.has_avx = 0;
      util_cpu_caps.has_avx2 = 0;
      util_cpu_caps.has_f16c = 0;
+      util_cpu_caps.has_fma = 0;
   }

 #ifdef PIPE_ARCH_PPC_64
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -88,8 +88,6 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
   LLVMValueRef cond;
   LLVMValueRef res;

-   assert(func >= PIPE_FUNC_NEVER);
-   assert(func <= PIPE_FUNC_ALWAYS);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

@@ -98,6 +96,9 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

+   assert(func > PIPE_FUNC_NEVER);
+   assert(func < PIPE_FUNC_ALWAYS);
+
   if(type.floating) {
      LLVMRealPredicate op;
      switch(func) {
@@ -176,8 +177,6 @@ lp_build_compare(struct gallivm_state *gallivm,
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);

-   assert(func >= PIPE_FUNC_NEVER);
-   assert(func <= PIPE_FUNC_ALWAYS);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

@@ -186,6 +185,9 @@ lp_build_compare(struct gallivm_state *gallivm,
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

+   assert(func > PIPE_FUNC_NEVER);
+   assert(func < PIPE_FUNC_ALWAYS);
+
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -570,6 +570,15 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
    */
   MAttrs.push_back(util_cpu_caps.has_avx  ? "+avx"  : "-avx");
   MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
+   if (HAVE_LLVM >= 0x0304) {
+      MAttrs.push_back(util_cpu_caps.has_fma  ? "+fma"  : "-fma");
+   } else {
+      /*
+       * The old JIT in LLVM 3.3 has a bug encoding llvm.fmuladd.f32 and
+       * llvm.fmuladd.v2f32 intrinsics when FMA is available.
+       */
+      MAttrs.push_back("-fma");
+   }
   MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
   /* disable avx512 and all subvariants */
 #if HAVE_LLVM >= 0x0304
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -236,7 +236,7 @@ lp_build_concat_n(struct gallivm_state *gallivm,
                  unsigned num_dsts)
 {
   int size = num_srcs / num_dsts;
-   int i;
+   unsigned i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -155,10 +155,10 @@ lp_build_print_value(struct gallivm_state *gallivm,
 }


-static int
+static unsigned
 lp_get_printf_arg_count(const char *fmt)
 {
-   int count =0;
+   unsigned count = 0;
   const char *p = fmt;
   int c;

@@ -195,8 +195,7 @@ lp_build_printf(struct gallivm_state *gallivm,
 {
   LLVMValueRef params[50];
   va_list arglist;
-   int argcount;
-   int i;
+   unsigned argcount, i;

   argcount = lp_get_printf_arg_count(fmt);
   assert(ARRAY_SIZE(params) >= argcount + 1);
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -580,10 +580,8 @@ lp_build_brilinear_lod(struct lp_build_context *bld,

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

-   lod_fpart = lp_build_mul(bld, lod_fpart,
-                            lp_build_const_vec(bld->gallivm, bld->type, factor));
-
-   lod_fpart = lp_build_add(bld, lod_fpart,
+   lod_fpart = lp_build_mad(bld, lod_fpart,
+                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
@@ -639,10 +637,8 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

-   lod_fpart = lp_build_mul(bld, lod_fpart,
-                            lp_build_const_vec(bld->gallivm, bld->type, factor));
-
-   lod_fpart = lp_build_add(bld, lod_fpart,
+   lod_fpart = lp_build_mad(bld, lod_fpart,
+                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -467,7 +467,7 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
      LLVMValueRef res;
      struct lp_type type4;
      unsigned cond = 0;
-      unsigned chan;
+      int chan;
      int shift;

      /*
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -186,15 +186,15 @@ void lp_build_fetch_args(
 }

 /**
- * with doubles src and dst channels aren't 1:1.
+ * with 64-bit src and dst channels aren't 1:1.
 * check the src/dst types for the opcode,
- * 1. if neither is double then src == dst;
- * 2. if dest is double
+ * 1. if neither is 64-bit then src == dst;
+ * 2. if dest is 64-bit
 *     - don't store to y or w
- *     - if src is double then src == dst.
+ *     - if src is 64-bit then src == dst.
 *     - else for f2d, d.xy = s.x
 *     - else for f2d, d.zw = s.y
- * 3. if dst is single, src is double
+ * 3. if dst is single, src is 64-bit
 *    - map dst x,z to src xy;
 *    - map dst y,w to src zw;
 */
@@ -204,12 +204,12 @@ static int get_src_chan_idx(unsigned opcode,
   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(opcode);
   enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(opcode);

-   if (dtype != TGSI_TYPE_DOUBLE && stype != TGSI_TYPE_DOUBLE)
+   if (!tgsi_type_is_64bit(dtype) && !tgsi_type_is_64bit(stype))
      return dst_chan_index;
-   if (dtype == TGSI_TYPE_DOUBLE) {
+   if (tgsi_type_is_64bit(dtype)) {
      if (dst_chan_index == 1 || dst_chan_index == 3)
         return -1;
-      if (stype == TGSI_TYPE_DOUBLE)
+      if (tgsi_type_is_64bit(stype))
         return dst_chan_index;
      if (dst_chan_index == 0)
         return 0;
@@ -335,7 +335,7 @@ lp_build_emit_fetch(
   enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(inst->Instruction.Opcode);

   if (chan_index == LP_CHAN_ALL) {
-      swizzle = ~0;
+      swizzle = ~0u;
   } else {
      swizzle = tgsi_util_get_full_src_register_swizzle(reg, chan_index);
      if (swizzle > 3) {
@@ -398,7 +398,7 @@ lp_build_emit_fetch(
    * Swizzle the argument
    */

-   if (swizzle == ~0) {
+   if (swizzle == ~0u) {
      res = bld_base->emit_swizzle(bld_base, res,
                     reg->Register.SwizzleX,
                     reg->Register.SwizzleY,
@@ -453,7 +453,7 @@ lp_build_emit_fetch_texoffset(
    * Swizzle the argument
    */

-   if (swizzle == ~0) {
+   if (swizzle == ~0u) {
      res = bld_base->emit_swizzle(bld_base, res,
                                   off->SwizzleX,
                                   off->SwizzleY,
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -52,7 +52,7 @@
 extern "C" {
 #endif

-#define LP_CHAN_ALL ~0
+#define LP_CHAN_ALL ~0u

 #define LP_MAX_INSTRUCTIONS 256

--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1577,6 +1577,19 @@ log_emit_cpu(

 }

+/* TGSI_OPCODE_MAD (CPU Only): args[0] * args[1] + args[2] via lp_build_mad */
+
+static void
+mad_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      lp_build_mad(&bld_base->base,
+                   emit_data->args[0], emit_data->args[1], emit_data->args[2]);
+}
+
 /* TGSI_OPCODE_MAX (CPU Only) */

 static void
@@ -2162,6 +2175,7 @@ lp_set_default_actions_cpu(

   bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_MAX].emit = max_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_MIN].emit = min_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_MOD].emit = mod_emit_cpu;
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -642,7 +642,7 @@ static boolean default_analyse_is_last(struct lp_exec_mask *mask,
 {
   unsigned pc = bld_base->pc;
   struct function_ctx *ctx = func_ctx(mask);
-   unsigned curr_switch_stack = ctx->switch_stack_size;
+   int curr_switch_stack = ctx->switch_stack_size;

   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
      return false;
@@ -653,7 +653,7 @@ static boolean default_analyse_is_last(struct lp_exec_mask *mask,
      pc++;
   }

-   while (pc != -1 && pc < bld_base->num_instructions) {
+   while (pc != ~0u && pc < bld_base->num_instructions) {
      unsigned opcode = bld_base->instructions[pc].Instruction.Opcode;
      switch (opcode) {
      case TGSI_OPCODE_CASE:
@@ -856,7 +856,7 @@ static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
 static LLVMValueRef
 get_file_ptr(struct lp_build_tgsi_soa_context *bld,
             unsigned file,
-             unsigned index,
+             int index,
             unsigned chan)
 {
   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
@@ -1227,7 +1227,7 @@ emit_fetch_constant(
   LLVMValueRef res;

   /* XXX: Handle fetching xyzw components as a vector */
-   assert(swizzle != ~0);
+   assert(swizzle != ~0u);

   if (reg->Register.Dimension) {
      assert(!reg->Dimension.Indirect);
@@ -1264,7 +1264,7 @@ emit_fetch_constant(
      index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
      index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);

-      if (stype == TGSI_TYPE_DOUBLE) {
+      if (tgsi_type_is_64bit(stype)) {
         LLVMValueRef swizzle_vec2;
         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle + 1);
         index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
@@ -1299,14 +1299,14 @@ emit_fetch_constant(
 }

 /**
- * Fetch double values from two separate channels.
- * Doubles are stored split across two channels, like xy and zw.
+ * Fetch 64-bit values from two separate channels.
+ * 64-bit values are stored split across two channels, like xy and zw.
 * This function creates a set of 16 floats,
 * extracts the values from the two channels,
- * puts them in the correct place, then casts to 8 doubles.
+ * puts them in the correct place, then casts to 8 64-bit values.
 */
 static LLVMValueRef
-emit_fetch_double(
+emit_fetch_64bit(
   struct lp_build_tgsi_context * bld_base,
   enum tgsi_opcode_type stype,
   LLVMValueRef input,
@@ -1369,7 +1369,7 @@ emit_fetch_immediate(
                                           indirect_index,
                                           swizzle,
                                           FALSE);
-         if (stype == TGSI_TYPE_DOUBLE)
+         if (tgsi_type_is_64bit(stype))
            index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
                                              indirect_index,
                                              swizzle + 1,
@@ -1383,7 +1383,7 @@ emit_fetch_immediate(
                                                bld->imms_array, &lindex, 1, "");
         res = LLVMBuildLoad(builder, imms_ptr, "");

-         if (stype == TGSI_TYPE_DOUBLE) {
+         if (tgsi_type_is_64bit(stype)) {
            LLVMValueRef lindex1;
            LLVMValueRef imms_ptr2;
            LLVMValueRef res2;
@@ -1393,22 +1393,19 @@ emit_fetch_immediate(
            imms_ptr2 = LLVMBuildGEP(builder,
                                      bld->imms_array, &lindex1, 1, "");
            res2 = LLVMBuildLoad(builder, imms_ptr2, "");
-            res = emit_fetch_double(bld_base, stype, res, res2);
+            res = emit_fetch_64bit(bld_base, stype, res, res2);
         }
      }
   }
   else {
      res = bld->immediates[reg->Register.Index][swizzle];
-      if (stype == TGSI_TYPE_DOUBLE)
-         res = emit_fetch_double(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
+      if (tgsi_type_is_64bit(stype))
+         res = emit_fetch_64bit(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
   }

-   if (stype == TGSI_TYPE_UNSIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
-   } else if (stype == TGSI_TYPE_SIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
-   } else if (stype == TGSI_TYPE_DOUBLE) {
-      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
+      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
   }
   return res;
 }
@@ -1441,7 +1438,7 @@ emit_fetch_input(
                                        indirect_index,
                                        swizzle,
                                        TRUE);
-      if (stype == TGSI_TYPE_DOUBLE) {
+      if (tgsi_type_is_64bit(stype)) {
         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
                                           indirect_index,
                                           swizzle + 1,
@@ -1461,7 +1458,7 @@ emit_fetch_input(
                                               bld->inputs_array, &lindex, 1, "");

         res = LLVMBuildLoad(builder, input_ptr, "");
-         if (stype == TGSI_TYPE_DOUBLE) {
+         if (tgsi_type_is_64bit(stype)) {
            LLVMValueRef lindex1;
            LLVMValueRef input_ptr2;
            LLVMValueRef res2;
@@ -1471,24 +1468,21 @@ emit_fetch_input(
            input_ptr2 = LLVMBuildGEP(builder,
                                      bld->inputs_array, &lindex1, 1, "");
            res2 = LLVMBuildLoad(builder, input_ptr2, "");
-            res = emit_fetch_double(bld_base, stype, res, res2);
+            res = emit_fetch_64bit(bld_base, stype, res, res2);
         }
      }
      else {
         res = bld->inputs[reg->Register.Index][swizzle];
-         if (stype == TGSI_TYPE_DOUBLE)
-            res = emit_fetch_double(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
+         if (tgsi_type_is_64bit(stype))
+            res = emit_fetch_64bit(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
      }
   }

   assert(res);

-   if (stype == TGSI_TYPE_UNSIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
-   } else if (stype == TGSI_TYPE_SIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
-   } else if (stype == TGSI_TYPE_DOUBLE) {
-      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
+      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
   }

   return res;
@@ -1548,7 +1542,7 @@ emit_fetch_gs_input(
                                    swizzle_index);

   assert(res);
-   if (stype == TGSI_TYPE_DOUBLE) {
+   if (tgsi_type_is_64bit(stype)) {
      LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle + 1);
      LLVMValueRef res2;
      res2 = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
@@ -1558,7 +1552,7 @@ emit_fetch_gs_input(
                                        attrib_index,
                                        swizzle_index);
      assert(res2);
-      res = emit_fetch_double(bld_base, stype, res, res2);
+      res = emit_fetch_64bit(bld_base, stype, res, res2);
   } else if (stype == TGSI_TYPE_UNSIGNED) {
      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
   } else if (stype == TGSI_TYPE_SIGNED) {
@@ -1595,7 +1589,7 @@ emit_fetch_temporary(
                                        indirect_index,
                                        swizzle,
                                        TRUE);
-      if (stype == TGSI_TYPE_DOUBLE) {
+      if (tgsi_type_is_64bit(stype)) {
               index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
                                                  indirect_index,
                                                  swizzle + 1,
@@ -1614,12 +1608,12 @@ emit_fetch_temporary(
      temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
      res = LLVMBuildLoad(builder, temp_ptr, "");

-      if (stype == TGSI_TYPE_DOUBLE) {
+      if (tgsi_type_is_64bit(stype)) {
         LLVMValueRef temp_ptr2, res2;

         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
-         res = emit_fetch_double(bld_base, stype, res, res2);
+         res = emit_fetch_64bit(bld_base, stype, res, res2);
      }
   }

@@ -1790,20 +1784,19 @@ emit_fetch_predicate(
 }

 /**
- * store an array of 8 doubles into two arrays of 8 floats
+ * store an array of 8 64-bit values into two arrays of 8 floats
 * i.e.
 * value is d0, d1, d2, d3 etc.
- * each double has high and low pieces x, y
+ * each 64-bit value has high and low pieces x, y
 * so gets stored into the separate channels as:
 * chan_ptr = d0.x, d1.x, d2.x, d3.x
 * chan_ptr2 = d0.y, d1.y, d2.y, d3.y
 */
 static void
-emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
-                       int dtype,
-                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
-                       LLVMValueRef pred,
-                       LLVMValueRef value)
+emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
+                      LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
+                      LLVMValueRef pred,
+                      LLVMValueRef value)
 {
   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
   struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1870,9 +1863,9 @@ emit_store_chan(
   if (reg->Register.Indirect) {
      /*
       * Currently the mesa/st doesn't generate indirect stores
-       * to doubles, it normally uses MOV to do indirect stores.
+       * to 64-bit values, it normally uses MOV to do indirect stores.
       */
-      assert(dtype != TGSI_TYPE_DOUBLE);
+      assert(!tgsi_type_is_64bit(dtype));
      indirect_index = get_indirect_index(bld,
                                          reg->Register.File,
                                          reg->Register.Index,
@@ -1912,11 +1905,11 @@ emit_store_chan(
         LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                  chan_index);

-         if (dtype == TGSI_TYPE_DOUBLE) {
+         if (tgsi_type_is_64bit(dtype)) {
            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
                                                      chan_index + 1);
-            emit_store_double_chan(bld_base, dtype, out_ptr, out_ptr2,
-                                   pred, value);
+            emit_store_64bit_chan(bld_base, out_ptr, out_ptr2,
+                                  pred, value);
         } else
            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
      }
@@ -1924,7 +1917,7 @@ emit_store_chan(

   case TGSI_FILE_TEMPORARY:
      /* Temporaries are always stored as floats */
-      if (dtype != TGSI_TYPE_DOUBLE)
+      if (!tgsi_type_is_64bit(dtype))
         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
      else
         value = LLVMBuildBitCast(builder, value,  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");
@@ -1950,12 +1943,12 @@ emit_store_chan(
         LLVMValueRef temp_ptr;
         temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);

-         if (dtype == TGSI_TYPE_DOUBLE) {
+         if (tgsi_type_is_64bit(dtype)) {
            LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
                                                         reg->Register.Index,
                                                         chan_index + 1);
-            emit_store_double_chan(bld_base, dtype, temp_ptr, temp_ptr2,
-                                   pred, value);
+            emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2,
+                                  pred, value);
         }
         else
            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
@@ -2035,7 +2028,7 @@ emit_store(

      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {

-         if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
+         if (tgsi_type_is_64bit(dtype) && (chan_index == 1 || chan_index == 3))
             continue;
         emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
      }
@@ -2882,7 +2875,7 @@ emit_dump_file(struct lp_build_tgsi_soa_context *bld,
      int chan;

      if (index < 8 * sizeof(unsigned) &&
-          (info->file_mask[file] & (1 << index)) == 0)  {
+          (info->file_mask[file] & (1u << index)) == 0)  {
         /* This was not declared.*/
         continue;
      }
--- a/src/gallium/auxiliary/pipe-loader/Android.mk
+++ b/src/gallium/auxiliary/pipe-loader/Android.mk
@@ -38,10 +38,7 @@ LOCAL_SRC_FILES := $(COMMON_SOURCES)
 LOCAL_MODULE := libmesa_pipe_loader

 ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SRC_FILES += $(DRM_SOURCES)
-
-LOCAL_SHARED_LIBRARIES := libdrm
 LOCAL_STATIC_LIBRARIES := libmesa_loader
 endif

--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -676,10 +676,10 @@ static void
 micro_trunc(union tgsi_exec_channel *dst,
            const union tgsi_exec_channel *src)
 {
-   dst->f[0] = (float)(int)src->f[0];
-   dst->f[1] = (float)(int)src->f[1];
-   dst->f[2] = (float)(int)src->f[2];
-   dst->f[3] = (float)(int)src->f[3];
+   dst->f[0] = truncf(src->f[0]);
+   dst->f[1] = truncf(src->f[1]);
+   dst->f[2] = truncf(src->f[2]);
+   dst->f[3] = truncf(src->f[3]);
 }

 static void
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -262,6 +262,9 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
   { 1, 1, 0, 0, 0, 0, 0, COMP, "DFLR", TGSI_OPCODE_DFLR },
   { 1, 1, 0, 0, 0, 0, 0, COMP, "DROUND", TGSI_OPCODE_DROUND },
   { 1, 1, 0, 0, 0, 0, 0, COMP, "DSSG", TGSI_OPCODE_DSSG },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "VOTE_ANY", TGSI_OPCODE_VOTE_ANY },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "VOTE_ALL", TGSI_OPCODE_VOTE_ALL },
+   { 1, 1, 0, 0, 0, 0, 0, COMP, "VOTE_EQ", TGSI_OPCODE_VOTE_EQ },
 };

 const struct tgsi_opcode_info *
--- a/src/gallium/auxiliary/tgsi/tgsi_info.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.h
@@ -101,6 +101,13 @@ enum tgsi_opcode_type {
   TGSI_TYPE_DOUBLE
 };

+static inline bool tgsi_type_is_64bit(enum tgsi_opcode_type type)
+{
+   if (type == TGSI_TYPE_DOUBLE)
+      return true;
+   return false;
+}
+
 enum tgsi_opcode_type
 tgsi_opcode_infer_src_type( uint opcode );

--- a/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
@@ -96,7 +96,7 @@ struct psprite_transform_context
   unsigned stream_out_point_pos:1; // set if to stream out original point pos
   unsigned aa_point:1;             // set if doing aa point
   unsigned out_tmp_index[PIPE_MAX_SHADER_OUTPUTS];
-   int max_generic;
+   int max_generic;                 // max generic semantic index
 };

 static inline struct psprite_transform_context *
@@ -133,7 +133,7 @@ psprite_decl(struct tgsi_transform_context *ctx,
      else if (decl->Semantic.Name == TGSI_SEMANTIC_GENERIC &&
               decl->Semantic.Index < 32) {
         ts->point_coord_decl |= 1 << decl->Semantic.Index;
-         ts->max_generic = MAX2(ts->max_generic, decl->Semantic.Index);
+         ts->max_generic = MAX2(ts->max_generic, (int)decl->Semantic.Index);
      }
      ts->num_out = MAX2(ts->num_out, decl->Range.Last + 1);
   }
@@ -216,7 +216,7 @@ psprite_prolog(struct tgsi_transform_context *ctx)
         if (en & 0x1) {
            tgsi_transform_output_decl(ctx, ts->num_out++,
                                       TGSI_SEMANTIC_GENERIC, i, 0);
-            ts->max_generic = MAX2(ts->max_generic, i);
+            ts->max_generic = MAX2(ts->max_generic, (int)i);
         }
      }
   }
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -100,8 +100,6 @@ struct blitter_context_priv

   /* FS which outputs an average of all samples. */
   void *fs_resolve[PIPE_MAX_TEXTURE_TYPES][NUM_RESOLVE_FRAG_SHADERS][2];
-   void *fs_resolve_sint[PIPE_MAX_TEXTURE_TYPES][NUM_RESOLVE_FRAG_SHADERS][2];
-   void *fs_resolve_uint[PIPE_MAX_TEXTURE_TYPES][NUM_RESOLVE_FRAG_SHADERS][2];

   /* Blend state. */
   void *blend[PIPE_MASK_RGBA+1][2]; /**< blend state with writemask */
@@ -487,16 +485,6 @@ void util_blitter_destroy(struct blitter_context *blitter)
         for (f = 0; f < 2; f++)
            if (ctx->fs_resolve[i][j][f])
               ctx->delete_fs_state(pipe, ctx->fs_resolve[i][j][f]);
-
-      for (j = 0; j< ARRAY_SIZE(ctx->fs_resolve_sint[i]); j++)
-         for (f = 0; f < 2; f++)
-            if (ctx->fs_resolve_sint[i][j][f])
-               ctx->delete_fs_state(pipe, ctx->fs_resolve_sint[i][j][f]);
-
-      for (j = 0; j< ARRAY_SIZE(ctx->fs_resolve_uint[i]); j++)
-         for (f = 0; f < 2; f++)
-            if (ctx->fs_resolve_uint[i][j][f])
-               ctx->delete_fs_state(pipe, ctx->fs_resolve_uint[i][j][f]);
   }

   if (ctx->fs_empty)
@@ -891,18 +879,18 @@ static void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
   if (src_nr_samples > 1) {
      void **shader;

-      if (dst_nr_samples <= 1) {
+      /* OpenGL requires that integer textures just copy 1 sample instead
+       * of averaging.
+       */
+      if (dst_nr_samples <= 1 &&
+          stype != TGSI_RETURN_TYPE_UINT &&
+          stype != TGSI_RETURN_TYPE_SINT) {
         /* The destination has one sample, so we'll do color resolve. */
         unsigned index = GET_MSAA_RESOLVE_FS_IDX(src_nr_samples);

         assert(filter < 2);

-         if (stype == TGSI_RETURN_TYPE_UINT)
-            shader = &ctx->fs_resolve_uint[target][index][filter];
-         else if (stype == TGSI_RETURN_TYPE_SINT)
-            shader = &ctx->fs_resolve_sint[target][index][filter];
-         else
-            shader = &ctx->fs_resolve[target][index][filter];
+         shader = &ctx->fs_resolve[target][index][filter];

         if (!*shader) {
            assert(!ctx->cached_all_shaders);
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -369,6 +369,7 @@ util_cpu_detect(void)
                                    ((regs2[2] >> 27) & 1) && // OSXSAVE
                                    ((xgetbv() & 6) == 6);    // XMM & YMM
         util_cpu_caps.has_f16c   = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
+         util_cpu_caps.has_fma    = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
         util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
 #if defined(PIPE_ARCH_X86_64)
         util_cpu_caps.has_daz = 1;
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -66,6 +66,7 @@ struct util_cpu_caps {
   unsigned has_avx:1;
   unsigned has_avx2:1;
   unsigned has_f16c:1;
+   unsigned has_fma:1;
   unsigned has_3dnow:1;
   unsigned has_3dnow_ext:1;
   unsigned has_xop:1;
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -626,10 +626,17 @@ static inline void
 util_copy_image_view(struct pipe_image_view *dst,
                     const struct pipe_image_view *src)
 {
-   pipe_resource_reference(&dst->resource, src->resource);
-   dst->format = src->format;
-   dst->access = src->access;
-   dst->u = src->u;
+   if (src) {
+      pipe_resource_reference(&dst->resource, src->resource);
+      dst->format = src->format;
+      dst->access = src->access;
+      dst->u = src->u;
+   } else {
+      pipe_resource_reference(&dst->resource, NULL);
+      dst->format = PIPE_FORMAT_NONE;
+      dst->access = 0;
+      memset(&dst->u, 0, sizeof(dst->u));
+   }
 }

 static inline unsigned
@@ -650,6 +657,18 @@ util_max_layer(const struct pipe_resource *r, unsigned level)
   }
 }

+static inline bool
+util_texrange_covers_whole_level(const struct pipe_resource *tex,
+                                 unsigned level, unsigned x, unsigned y,
+                                 unsigned z, unsigned width,
+                                 unsigned height, unsigned depth)
+{
+   return x == 0 && y == 0 && z == 0 &&
+          width == u_minify(tex->width0, level) &&
+          height == u_minify(tex->height0, level) &&
+          depth == util_max_layer(tex, level) + 1;
+}
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/gallium/auxiliary/util/u_staging.c
+++ b/src/gallium/auxiliary/util/u_staging.c
@@ -1,136 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 Luca Barbieri
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "util/u_staging.h"
-#include "pipe/p_context.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-
-
-static void
-util_staging_resource_template(struct pipe_resource *pt, unsigned width,
-                               unsigned height, unsigned depth,
-                               struct pipe_resource *template)
-{
-   memset(template, 0, sizeof(struct pipe_resource));
-   if (pt->target != PIPE_BUFFER && depth <= 1)
-      template->target = PIPE_TEXTURE_RECT;
-   else
-      template->target = pt->target;
-   template->format = pt->format;
-   template->width0 = width;
-   template->height0 = height;
-   template->depth0 = depth;
-   template->array_size = 1;
-   template->last_level = 0;
-   template->nr_samples = pt->nr_samples;
-   template->bind = 0;
-   template->usage = PIPE_USAGE_STAGING;
-   template->flags = 0;
-}
-
-
-struct util_staging_transfer *
-util_staging_transfer_init(struct pipe_context *pipe,
-                           struct pipe_resource *pt,
-                           unsigned level, enum pipe_resource_usage usage,
-                           const struct pipe_box *box,
-                           boolean direct, struct util_staging_transfer *tx)
-{
-   struct pipe_screen *pscreen = pipe->screen;
-   struct pipe_resource staging_resource_template;
-
-   pipe_resource_reference(&tx->base.resource, pt);
-   tx->base.level = level;
-   tx->base.usage = usage;
-   tx->base.box = *box;
-
-   if (direct) {
-      tx->staging_resource = pt;
-      return tx;
-   }
-
-   util_staging_resource_template(pt, box->width, box->height,
-                                  box->depth, &staging_resource_template);
-   tx->staging_resource = pscreen->resource_create(pscreen,
-                                                   &staging_resource_template);
-   if (!tx->staging_resource) {
-      pipe_resource_reference(&tx->base.resource, NULL);
-      FREE(tx);
-      return NULL;
-   }
-
-   if (usage & PIPE_TRANSFER_READ) {
-      /* XXX this looks wrong dst is always the same but looping over src z? */
-      int zi;
-      struct pipe_box sbox;
-      sbox.x = box->x;
-      sbox.y = box->y;
-      sbox.z = box->z;
-      sbox.width = box->width;
-      sbox.height = box->height;
-      sbox.depth = 1;
-      for (zi = 0; zi < box->depth; ++zi) {
-         sbox.z = sbox.z + zi;
-         pipe->resource_copy_region(pipe, tx->staging_resource, 0, 0, 0, 0,
-                                    tx->base.resource, level, &sbox);
-      }
-   }
-
-   return tx;
-}
-
-
-void
-util_staging_transfer_destroy(struct pipe_context *pipe,
-                              struct pipe_transfer *ptx)
-{
-   struct util_staging_transfer *tx = (struct util_staging_transfer *)ptx;
-
-   if (tx->staging_resource != tx->base.resource) {
-      if (tx->base.usage & PIPE_TRANSFER_WRITE) {
-         /* XXX this looks wrong src is always the same but looping over dst z? */
-         int zi;
-         struct pipe_box sbox;
-         sbox.x = 0;
-         sbox.y = 0;
-         sbox.z = 0;
-         sbox.width = tx->base.box.width;
-         sbox.height = tx->base.box.height;
-         sbox.depth = 1;
-         for (zi = 0; zi < tx->base.box.depth; ++zi)
-            pipe->resource_copy_region(pipe, tx->base.resource, tx->base.level,
-                                       tx->base.box.x, tx->base.box.y,
-                                       tx->base.box.z + zi,
-                                       tx->staging_resource, 0, &sbox);
-      }
-
-      pipe_resource_reference(&tx->staging_resource, NULL);
-   }
-
-   pipe_resource_reference(&ptx->resource, NULL);
-   FREE(ptx);
-}
--- a/src/gallium/auxiliary/util/u_staging.h
+++ b/src/gallium/auxiliary/util/u_staging.h
@@ -1,67 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 Luca Barbieri
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/* Direct3D 10/11 has no concept of transfers. Applications instead
- * create resources with a STAGING or DYNAMIC usage, copy between them
- * and the real resource and use Map to map the STAGING/DYNAMIC resource.
- *
- * This util module allows to implement Gallium drivers as a Direct3D
- * driver would be implemented: transfers allocate a resource with
- * PIPE_USAGE_STAGING, and copy the data between it and the real resource
- * with resource_copy_region.
- */
-
-#ifndef U_STAGING_H
-#define U_STAGING_H
-
-#include "pipe/p_state.h"
-
-struct util_staging_transfer {
-   struct pipe_transfer base;
-
-   /* if direct, same as base.resource, otherwise the temporary staging
-    * resource
-    */
-   struct pipe_resource *staging_resource;
-};
-
-/* user must be stride, slice_stride and offset.
- * pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING
- * should be a good value to pass for direct staging resource is currently
- * created with PIPE_USAGE_STAGING
- */
-struct util_staging_transfer *
-util_staging_transfer_init(struct pipe_context *pipe,
-                           struct pipe_resource *pt,
-                           unsigned level, enum pipe_resource_usage usage,
-                           const struct pipe_box *box,
-                           boolean direct, struct util_staging_transfer *tx);
-
-void
-util_staging_transfer_destroy(struct pipe_context *pipe,
-                              struct pipe_transfer *ptx);
-
-#endif
--- a/src/gallium/auxiliary/util/u_suballoc.c
+++ b/src/gallium/auxiliary/util/u_suballoc.c
@@ -41,7 +41,6 @@ struct u_suballocator {
   struct pipe_context *pipe;

   unsigned size;          /* Size of the whole buffer, in bytes. */
-   unsigned alignment;     /* Alignment of each sub-allocation. */
   unsigned bind;          /* Bitmask of PIPE_BIND_* flags. */
   enum pipe_resource_usage usage;
   boolean zero_buffer_memory; /* If the buffer contents should be zeroed. */
@@ -58,8 +57,7 @@ struct u_suballocator {
 * cleared to 0 after the allocation.
 */
 struct u_suballocator *
-u_suballocator_create(struct pipe_context *pipe, unsigned size,
-                      unsigned alignment, unsigned bind,
+u_suballocator_create(struct pipe_context *pipe, unsigned size, unsigned bind,
                      enum pipe_resource_usage usage,
 		      boolean zero_buffer_memory)
 {
@@ -68,8 +66,7 @@ u_suballocator_create(struct pipe_context *pipe, unsigned size,
      return NULL;

   allocator->pipe = pipe;
-   allocator->size = align(size, alignment);
-   allocator->alignment = alignment;
+   allocator->size = size;
   allocator->bind = bind;
   allocator->usage = usage;
   allocator->zero_buffer_memory = zero_buffer_memory;
@@ -85,17 +82,18 @@ u_suballocator_destroy(struct u_suballocator *allocator)

 void
 u_suballocator_alloc(struct u_suballocator *allocator, unsigned size,
-                     unsigned *out_offset, struct pipe_resource **outbuf)
+                     unsigned alignment, unsigned *out_offset,
+                     struct pipe_resource **outbuf)
 {
-   unsigned alloc_size = align(size, allocator->alignment);
+   allocator->offset = align(allocator->offset, alignment);

   /* Don't allow allocations larger than the buffer size. */
-   if (alloc_size > allocator->size)
+   if (size > allocator->size)
      goto fail;

   /* Make sure we have enough space in the buffer. */
   if (!allocator->buffer ||
-       allocator->offset + alloc_size > allocator->size) {
+       allocator->offset + size > allocator->size) {
      /* Allocate a new buffer. */
      pipe_resource_reference(&allocator->buffer, NULL);
      allocator->offset = 0;
@@ -117,15 +115,15 @@ u_suballocator_alloc(struct u_suballocator *allocator, unsigned size,
      }
   }

-   assert(allocator->offset % allocator->alignment == 0);
+   assert(allocator->offset % alignment == 0);
   assert(allocator->offset < allocator->buffer->width0);
-   assert(allocator->offset + alloc_size <= allocator->buffer->width0);
+   assert(allocator->offset + size <= allocator->buffer->width0);

   /* Return the buffer. */
   *out_offset = allocator->offset;
   pipe_resource_reference(outbuf, allocator->buffer);

-   allocator->offset += alloc_size;
+   allocator->offset += size;
   return;

 fail:
--- a/src/gallium/auxiliary/util/u_suballoc.h
+++ b/src/gallium/auxiliary/util/u_suballoc.h
@@ -34,8 +34,7 @@
 struct u_suballocator;

 struct u_suballocator *
-u_suballocator_create(struct pipe_context *pipe, unsigned size,
-                      unsigned alignment, unsigned bind,
+u_suballocator_create(struct pipe_context *pipe, unsigned size, unsigned bind,
                      enum pipe_resource_usage usage,
 		      boolean zero_buffer_memory);

@@ -44,6 +43,7 @@ u_suballocator_destroy(struct u_suballocator *allocator);

 void
 u_suballocator_alloc(struct u_suballocator *allocator, unsigned size,
-                     unsigned *out_offset, struct pipe_resource **outbuf);
+                     unsigned alignment, unsigned *out_offset,
+                     struct pipe_resource **outbuf);

 #endif
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -238,8 +238,21 @@ util_fill_box(ubyte * dst,
 }


+/** Mipmap level size computation, with minimum block size */
+static inline unsigned
+minify(unsigned value, unsigned levels, unsigned blocksize)
+{
+   return MAX2(blocksize, value >> levels);
+}
+
+
 /**
 * Fallback function for pipe->resource_copy_region().
+ * We support copying between different formats (including compressed/
+ * uncompressed) if the bytes per block or pixel matches.  If copying
+ * compressed -> uncompressed, the dst region is reduced by the block
+ * width, height.  If copying uncompressed -> compressed, the dest region
+ * is expanded by the block width, height.  See GL_ARB_copy_image.
 * Note: (X,Y)=(0,0) is always the upper-left corner.
 */
 void
@@ -249,14 +262,15 @@ util_resource_copy_region(struct pipe_context *pipe,
                          unsigned dst_x, unsigned dst_y, unsigned dst_z,
                          struct pipe_resource *src,
                          unsigned src_level,
-                          const struct pipe_box *src_box)
+                          const struct pipe_box *src_box_in)
 {
   struct pipe_transfer *src_trans, *dst_trans;
   uint8_t *dst_map;
   const uint8_t *src_map;
   MAYBE_UNUSED enum pipe_format src_format;
   enum pipe_format dst_format;
-   struct pipe_box dst_box;
+   struct pipe_box src_box, dst_box;
+   unsigned src_bs, dst_bs, src_bw, dst_bw, src_bh, dst_bh;

   assert(src && dst);
   if (!src || !dst)
@@ -268,47 +282,112 @@ util_resource_copy_region(struct pipe_context *pipe,
   src_format = src->format;
   dst_format = dst->format;

-   assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format));
-   assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format));
-   assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format));
+   /* init src box */
+   src_box = *src_box_in;
+
+   /* init dst box */
+   dst_box.x = dst_x;
+   dst_box.y = dst_y;
+   dst_box.z = dst_z;
+   dst_box.width  = src_box.width;
+   dst_box.height = src_box.height;
+   dst_box.depth  = src_box.depth;
+
+   src_bs = util_format_get_blocksize(src_format);
+   src_bw = util_format_get_blockwidth(src_format);
+   src_bh = util_format_get_blockheight(src_format);
+   dst_bs = util_format_get_blocksize(dst_format);
+   dst_bw = util_format_get_blockwidth(dst_format);
+   dst_bh = util_format_get_blockheight(dst_format);
+
+   /* Note: all box positions and sizes are in pixels */
+   if (src_bw > 1 && dst_bw == 1) {
+      /* Copy from compressed to uncompressed.
+       * Shrink dest box by the src block size.
+       */
+      dst_box.width /= src_bw;
+      dst_box.height /= src_bh;
+   }
+   else if (src_bw == 1 && dst_bw > 1) {
+      /* Copy from uncompressed to compressed.
+       * Expand dest box by the dest block size.
+       */
+      dst_box.width *= dst_bw;
+      dst_box.height *= dst_bh;
+   }
+   else {
+      /* compressed -> compressed or uncompressed -> uncompressed copy */
+      assert(src_bw == dst_bw);
+      assert(src_bh == dst_bh);
+   }
+
+   assert(src_bs == dst_bs);
+   if (src_bs != dst_bs) {
+      /* This can happen if we fail to do format checking beforehand.
+       * Don't crash below.
+       */
+      return;
+   }
+
+   /* check that region boxes are block aligned */
+   assert(src_box.x % src_bw == 0);
+   assert(src_box.y % src_bh == 0);
+   assert(src_box.width % src_bw == 0 ||
+          src_box.x + src_box.width == minify(src->width0, src_level, src_bw));
+   assert(src_box.height % src_bh == 0 ||
+          src_box.y + src_box.height == minify(src->height0, src_level, src_bh));
+   assert(dst_box.x % dst_bw == 0);
+   assert(dst_box.y % dst_bh == 0);
+   assert(dst_box.width % dst_bw == 0 ||
+          dst_box.x + dst_box.width == minify(dst->width0, dst_level, dst_bw));
+   assert(dst_box.height % dst_bh == 0 ||
+          dst_box.y + dst_box.height == minify(dst->height0, dst_level, dst_bh));
+
+   /* check that region boxes are not out of bounds */
+   assert(src_box.x + src_box.width <=
+          minify(src->width0, src_level, src_bw));
+   assert(src_box.y + src_box.height <=
+          minify(src->height0, src_level, src_bh));
+   assert(dst_box.x + dst_box.width <=
+          minify(dst->width0, dst_level, dst_bw));
+   assert(dst_box.y + dst_box.height <=
+          minify(dst->height0, dst_level, dst_bh));
+
+   /* check that total number of src, dest bytes match */
+   assert((src_box.width / src_bw) * (src_box.height / src_bh) * src_bs ==
+          (dst_box.width / dst_bw) * (dst_box.height / dst_bh) * dst_bs);

   src_map = pipe->transfer_map(pipe,
                                src,
                                src_level,
                                PIPE_TRANSFER_READ,
-                                src_box, &src_trans);
+                                &src_box, &src_trans);
   assert(src_map);
   if (!src_map) {
      goto no_src_map;
   }

-   dst_box.x = dst_x;
-   dst_box.y = dst_y;
-   dst_box.z = dst_z;
-   dst_box.width  = src_box->width;
-   dst_box.height = src_box->height;
-   dst_box.depth  = src_box->depth;
-
   dst_map = pipe->transfer_map(pipe,
                                dst,
                                dst_level,
-                                PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
-                                &dst_box, &dst_trans);
+                                PIPE_TRANSFER_WRITE |
+                                PIPE_TRANSFER_DISCARD_RANGE, &dst_box,
+                                &dst_trans);
   assert(dst_map);
   if (!dst_map) {
      goto no_dst_map;
   }

   if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
-      assert(src_box->height == 1);
-      assert(src_box->depth == 1);
-      memcpy(dst_map, src_map, src_box->width);
+      assert(src_box.height == 1);
+      assert(src_box.depth == 1);
+      memcpy(dst_map, src_map, src_box.width);
   } else {
      util_copy_box(dst_map,
-                    dst_format,
+                    src_format,
                    dst_trans->stride, dst_trans->layer_stride,
                    0, 0, 0,
-                    src_box->width, src_box->height, src_box->depth,
+                    src_box.width, src_box.height, src_box.depth,
                    src_map,
                    src_trans->stride, src_trans->layer_stride,
                    0, 0, 0);
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -132,8 +132,10 @@ create_frag_shader_video_buffer(struct vl_compositor *c)
   struct ureg_src tc;
   struct ureg_src csc[3];
   struct ureg_src sampler[3];
+   struct ureg_src lumakey;
   struct ureg_dst texel;
   struct ureg_dst fragment;
+   struct ureg_dst temp[2];
   unsigned i;

   shader = ureg_create(PIPE_SHADER_FRAGMENT);
@@ -145,6 +147,11 @@ create_frag_shader_video_buffer(struct vl_compositor *c)
      csc[i] = ureg_DECL_constant(shader, i);
      sampler[i] = ureg_DECL_sampler(shader, i);
   }
+
+   for (i = 0; i < 2; ++i)
+      temp[i] = ureg_DECL_temporary(shader);
+
+   lumakey = ureg_DECL_constant(shader, 3);
   texel = ureg_DECL_temporary(shader);
   fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);

@@ -160,7 +167,17 @@ create_frag_shader_video_buffer(struct vl_compositor *c)
   for (i = 0; i < 3; ++i)
      ureg_DP4(shader, ureg_writemask(fragment, TGSI_WRITEMASK_X << i), csc[i], ureg_src(texel));

-   ureg_MOV(shader, ureg_writemask(fragment, TGSI_WRITEMASK_W), ureg_imm1f(shader, 1.0f));
+   ureg_MOV(shader, ureg_writemask(temp[0], TGSI_WRITEMASK_W),
+            ureg_scalar(ureg_src(texel), TGSI_SWIZZLE_Z));
+   ureg_SLE(shader, ureg_writemask(temp[1],TGSI_WRITEMASK_W),
+            ureg_src(temp[0]), ureg_scalar(lumakey, TGSI_SWIZZLE_X));
+   ureg_SGT(shader, ureg_writemask(temp[0],TGSI_WRITEMASK_W),
+            ureg_src(temp[0]), ureg_scalar(lumakey, TGSI_SWIZZLE_Y));
+   ureg_MAX(shader, ureg_writemask(fragment, TGSI_WRITEMASK_W),
+            ureg_src(temp[0]), ureg_src(temp[1]));
+
+   for (i = 0; i < 2; ++i)
+       ureg_release_temporary(shader, temp[i]);

   ureg_release_temporary(shader, texel);
   ureg_END(shader);
@@ -852,20 +869,23 @@ vl_compositor_cleanup(struct vl_compositor *c)
 }

 void
-vl_compositor_set_csc_matrix(struct vl_compositor_state *s, vl_csc_matrix const *matrix)
+vl_compositor_set_csc_matrix(struct vl_compositor_state *s,
+                             vl_csc_matrix const *matrix,
+                             float luma_min, float luma_max)
 {
   struct pipe_transfer *buf_transfer;

   assert(s);

-   memcpy
-   (
-      pipe_buffer_map(s->pipe, s->csc_matrix,
-                      PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
-                      &buf_transfer),
-      matrix,
-      sizeof(vl_csc_matrix)
-   );
+   float *ptr = pipe_buffer_map(s->pipe, s->csc_matrix,
+                               PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
+                               &buf_transfer);
+
+   memcpy(ptr, matrix, sizeof(vl_csc_matrix));
+
+   ptr += sizeof(vl_csc_matrix)/sizeof(float);
+   ptr[0] = luma_min;
+   ptr[1] = luma_max;

   pipe_buffer_unmap(s->pipe, buf_transfer);
 }
@@ -1142,13 +1162,13 @@ vl_compositor_init_state(struct vl_compositor_state *s, struct pipe_context *pip
      pipe->screen,
      PIPE_BIND_CONSTANT_BUFFER,
      PIPE_USAGE_DEFAULT,
-      sizeof(csc_matrix)
+      sizeof(csc_matrix) + 2*sizeof(float)
   );

   vl_compositor_clear_layers(s);

   vl_csc_get_matrix(VL_CSC_COLOR_STANDARD_IDENTITY, NULL, true, &csc_matrix);
-   vl_compositor_set_csc_matrix(s, (const vl_csc_matrix *)&csc_matrix);
+   vl_compositor_set_csc_matrix(s, (const vl_csc_matrix *)&csc_matrix, 1.0f, 0.0f);

   return true;
 }
--- a/src/gallium/auxiliary/vl/vl_compositor.h
+++ b/src/gallium/auxiliary/vl/vl_compositor.h
@@ -138,7 +138,9 @@ vl_compositor_init_state(struct vl_compositor_state *state, struct pipe_context
 * set yuv -> rgba conversion matrix
 */
 void
-vl_compositor_set_csc_matrix(struct vl_compositor_state *settings, const vl_csc_matrix *matrix);
+vl_compositor_set_csc_matrix(struct vl_compositor_state *settings,
+                             const vl_csc_matrix *matrix,
+                             float luma_min, float luma_max);

 /**
 * reset dirty area, so it's cleared with the clear colour
--- a/src/gallium/auxiliary/vl/vl_deint_filter.c
+++ b/src/gallium/auxiliary/vl/vl_deint_filter.c
@@ -447,7 +447,8 @@ vl_deint_filter_render(struct vl_deint_filter *filter,
   struct pipe_sampler_view *sampler_views[4];
   struct pipe_surface **dst_surfaces;
   const unsigned *plane_order;
-   int i, j;
+   int i;
+   unsigned j;

   assert(filter && prevprev && prev && cur && next && field <= 1);

--- a/src/gallium/auxiliary/vl/vl_idct.c
+++ b/src/gallium/auxiliary/vl/vl_idct.c
@@ -321,13 +321,11 @@ static void *
 create_stage1_frag_shader(struct vl_idct *idct)
 {
   struct ureg_program *shader;
-
   struct ureg_src l_addr[2], r_addr[2];
-
   struct ureg_dst l[4][2], r[2];
   struct ureg_dst *fragment;
-
-   int i, j;
+   unsigned i;
+   int j;

   shader = ureg_create(PIPE_SHADER_FRAGMENT);
   if (!shader)
--- a/src/gallium/auxiliary/vl/vl_matrix_filter.c
+++ b/src/gallium/auxiliary/vl/vl_matrix_filter.c
@@ -85,7 +85,7 @@ create_frag_shader(struct vl_matrix_filter *filter, unsigned num_offsets,
   struct ureg_dst t_sum;
   struct ureg_dst o_fragment;
   bool first;
-   int i;
+   unsigned i;

   shader = ureg_create(PIPE_SHADER_FRAGMENT);
   if (!shader) {
--- a/src/gallium/auxiliary/vl/vl_median_filter.c
+++ b/src/gallium/auxiliary/vl/vl_median_filter.c
@@ -84,7 +84,7 @@ create_frag_shader(struct vl_median_filter *filter,
   struct ureg_dst *t_array = MALLOC(sizeof(struct ureg_dst) * num_offsets);
   struct ureg_dst o_fragment;
   const unsigned median = num_offsets >> 1;
-   int i, j;
+   unsigned i, j;

   assert(num_offsets & 1); /* we need an odd number of offsets */
   if (!(num_offsets & 1)) { /* yeah, we REALLY need an odd number of offsets!!! */
@@ -158,7 +158,8 @@ static void
 generate_offsets(enum vl_median_filter_shape shape, unsigned size,
                 struct vertex2f **offsets, unsigned *num_offsets)
 {
-   int i = 0, half_size;
+   unsigned i = 0;
+   int half_size;
   struct vertex2f v;

   assert(offsets && num_offsets);
--- a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
@@ -583,12 +583,12 @@ init_dct_coeff_table(struct dct_coeff *dst, const struct dct_coeff_compressed *s
         break;
      }

-      for(i=0; i<(1 << (17 - coeff.length)); ++i)
+      for(i = 0; i < (1u << (17 - coeff.length)); ++i)
         dst[src->bitcode << 1 | i] = coeff;

      if (has_sign) {
 	 coeff.level = -coeff.level;
-         for(; i<(1 << (18 - coeff.length)); ++i)
+         for(; i < (1u << (18 - coeff.length)); ++i)
            dst[src->bitcode << 1 | i] = coeff;
      }
   }
--- a/src/gallium/auxiliary/vl/vl_vlc.h
+++ b/src/gallium/auxiliary/vl/vl_vlc.h
@@ -79,7 +79,7 @@ vl_vlc_init_table(struct vl_vlc_entry *dst, unsigned dst_size, const struct vl_v
   }

   for(; src_size > 0; --src_size, ++src) {
-      for(i=0; i<(1 << (bits - src->entry.length)); ++i)
+      for(i = 0; i < (1u << (bits - src->entry.length)); ++i)
         dst[src->bitcode >> (16 - bits) | i] = src->entry;
   }
 }
@@ -293,7 +293,7 @@ vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)
 {
   /* make sure we are on a byte boundary */
   assert((vl_vlc_valid_bits(vlc) % 8) == 0);
-   assert(num_bits == ~0 || (num_bits % 8) == 0);
+   assert(num_bits == ~0u || (num_bits % 8) == 0);

   /* deplete the bit buffer */
   while (vl_vlc_valid_bits(vlc) > 0) {
@@ -305,7 +305,7 @@ vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)

      vl_vlc_eatbits(vlc, 8);

-      if (num_bits != ~0) {
+      if (num_bits != ~0u) {
         num_bits -= 8;
         if (num_bits == 0)
            return FALSE;
@@ -332,7 +332,7 @@ vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)
      }

      ++vlc->data;
-      if (num_bits != ~0) {
+      if (num_bits != ~0u) {
         num_bits -= 8;
         if (num_bits == 0) {
            vl_vlc_align_data_ptr(vlc);
--- a/src/gallium/auxiliary/vl/vl_zscan.c
+++ b/src/gallium/auxiliary/vl/vl_zscan.c
@@ -99,15 +99,12 @@ static void *
 create_vert_shader(struct vl_zscan *zscan)
 {
   struct ureg_program *shader;
-
   struct ureg_src scale;
   struct ureg_src vrect, vpos, block_num;
-
   struct ureg_dst tmp;
   struct ureg_dst o_vpos;
   struct ureg_dst *o_vtex;
-
-   signed i;
+   unsigned i;

   shader = ureg_create(PIPE_SHADER_VERTEX);
   if (!shader)
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -340,6 +340,7 @@ The integer capabilities:
  extension and thus implements proper support for culling planes.
 * ``PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES``: Whether primitive restart is
  supported for patch primitives.
+* ``PIPE_CAP_TGSI_VOTE``: Whether the ``VOTE_*`` ops can be used in shaders.


 .. _pipe_capf:
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2557,6 +2557,23 @@ only be used with 32-bit integer image formats.
  resource[offset] = (dst_x > src_x ? dst_x : src_x)


+.. _voteopcodes:
+
+Vote opcodes
+^^^^^^^^^^^^
+
+These opcodes compare the given value across the shader invocations
+running in the current SIMD group. The details of exactly which
+invocations get compared are implementation-defined, and it would be a
+correct implementation to only ever consider the current thread's
+value (i.e. a SIMD group of 1). The argument is treated as a boolean.
+
+.. opcode:: VOTE_ANY - Value is set in any of the current invocations
+
+.. opcode:: VOTE_ALL - Value is set in all of the current invocations
+
+.. opcode:: VOTE_EQ - Value is the same in all of the current invocations
+

 Explanation of symbols used
 ------------------------------
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -40,7 +40,7 @@ LOCAL_C_INCLUDES := \

 LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)

-LOCAL_SHARED_LIBRARIES := libdrm libdrm_freedreno
+LOCAL_SHARED_LIBRARIES := libdrm_freedreno
 LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir
 LOCAL_MODULE := libmesa_pipe_freedreno

--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -142,16 +142,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			[SB_FRAG_TEX] = REG_A3XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR,
 	};
 	struct fd3_context *fd3_ctx = fd3_context(ctx);
-	unsigned i, j, off;
-	void *ptr;
-
-	u_upload_alloc(fd3_ctx->border_color_uploader,
-			0, BORDER_COLOR_UPLOAD_SIZE,
-		       BORDER_COLOR_UPLOAD_SIZE, &off,
-			&fd3_ctx->border_color_buf,
-			&ptr);
-
-	fd_setup_border_colors(tex, ptr, tex_off[sb]);
+	bool needs_border = false;
+	unsigned i, j;

 	if (tex->num_samplers > 0) {
 		/* output sampler state: */
@@ -170,6 +162,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,

 			OUT_RING(ring, sampler->texsamp0);
 			OUT_RING(ring, sampler->texsamp1);
+
+			needs_border |= sampler->needs_border;
 		}
 	}

@@ -233,10 +227,23 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		}
 	}

-	OUT_PKT0(ring, bcolor_reg[sb], 1);
-	OUT_RELOC(ring, fd_resource(fd3_ctx->border_color_buf)->bo, off, 0, 0);
+	if (needs_border) {
+		unsigned off;
+		void *ptr;

-	u_upload_unmap(fd3_ctx->border_color_uploader);
+		u_upload_alloc(fd3_ctx->border_color_uploader,
+				0, BORDER_COLOR_UPLOAD_SIZE,
+			       BORDER_COLOR_UPLOAD_SIZE, &off,
+				&fd3_ctx->border_color_buf,
+				&ptr);
+
+		fd_setup_border_colors(tex, ptr, tex_off[sb]);
+
+		OUT_PKT0(ring, bcolor_reg[sb], 1);
+		OUT_RELOC(ring, fd_resource(fd3_ctx->border_color_buf)->bo, off, 0, 0);
+
+		u_upload_unmap(fd3_ctx->border_color_uploader);
+	}
 }

 /* emit texture state for mem->gmem restore operation.. eventually it would
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -79,7 +79,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			if (rsc->stencil) {
 				rsc = rsc->stencil;
 				pformat = rsc->base.b.format;
-				bases++;
+				if (bases)
+					bases++;
 			}
 			slice = fd_resource_slice(rsc, psurf->u.tex.level);
 			format = fd3_pipe2color(pformat);
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -36,7 +36,7 @@
 #include "fd3_format.h"

 static enum a3xx_tex_clamp
-tex_clamp(unsigned wrap, bool clamp_to_edge)
+tex_clamp(unsigned wrap, bool clamp_to_edge, bool *needs_border)
 {
 	/* Hardware does not support _CLAMP, but we emulate it: */
 	if (wrap == PIPE_TEX_WRAP_CLAMP) {
@@ -50,6 +50,7 @@ tex_clamp(unsigned wrap, bool clamp_to_edge)
 	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 		return A3XX_TEX_CLAMP_TO_EDGE;
 	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		*needs_border = true;
 		return A3XX_TEX_CLAMP_TO_BORDER;
 	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 		/* only works for PoT.. need to emulate otherwise! */
@@ -113,6 +114,7 @@ fd3_sampler_state_create(struct pipe_context *pctx,
 		so->saturate_r = (cso->wrap_r == PIPE_TEX_WRAP_CLAMP);
 	}

+	so->needs_border = false;
 	so->texsamp0 =
 			COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) |
 			COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) |
@@ -120,9 +122,9 @@ fd3_sampler_state_create(struct pipe_context *pctx,
 			A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) |
 			A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) |
 			A3XX_TEX_SAMP_0_ANISO(aniso) |
-			A3XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, clamp_to_edge)) |
-			A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, clamp_to_edge)) |
-			A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge));
+			A3XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, clamp_to_edge, &so->needs_border)) |
+			A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, clamp_to_edge, &so->needs_border)) |
+			A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge, &so->needs_border));

 	if (cso->compare_mode)
 		so->texsamp0 |= A3XX_TEX_SAMP_0_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
@@ -41,6 +41,7 @@ struct fd3_sampler_stateobj {
 	struct pipe_sampler_state base;
 	uint32_t texsamp0, texsamp1;
 	bool saturate_s, saturate_t, saturate_r;
+	bool needs_border;
 };

 static inline struct fd3_sampler_stateobj *
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -88,7 +88,7 @@ fd4_draw(struct fd_context *ctx, struct fd_ringbuffer *ring,
 }


-static inline enum pc_di_index_size
+static inline enum a4xx_index_size
 fd4_size2indextype(unsigned index_size)
 {
 	switch (index_size) {
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -131,16 +131,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			[SB_FRAG_TEX] = REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR,
 	};
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
-	unsigned i, off;
-	void *ptr;
-
-	u_upload_alloc(fd4_ctx->border_color_uploader,
-			0, BORDER_COLOR_UPLOAD_SIZE,
-		       BORDER_COLOR_UPLOAD_SIZE, &off,
-			&fd4_ctx->border_color_buf,
-			&ptr);
-
-	fd_setup_border_colors(tex, ptr, 0);
+	bool needs_border = false;
+	unsigned i;

 	if (tex->num_samplers > 0) {
 		int num_samplers;
@@ -166,6 +158,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 					&dummy_sampler;
 			OUT_RING(ring, sampler->texsamp0);
 			OUT_RING(ring, sampler->texsamp1);
+
+			needs_border |= sampler->needs_border;
 		}

 		for (; i < num_samplers; i++) {
@@ -235,10 +229,22 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		debug_assert(v->astc_srgb.count == 0);
 	}

-	OUT_PKT0(ring, bcolor_reg[sb], 1);
-	OUT_RELOC(ring, fd_resource(fd4_ctx->border_color_buf)->bo, off, 0, 0);
+	if (needs_border) {
+		unsigned off;
+		void *ptr;

-	u_upload_unmap(fd4_ctx->border_color_uploader);
+		u_upload_alloc(fd4_ctx->border_color_uploader,
+				0, BORDER_COLOR_UPLOAD_SIZE,
+				BORDER_COLOR_UPLOAD_SIZE, &off,
+				&fd4_ctx->border_color_buf,
+				&ptr);
+
+		fd_setup_border_colors(tex, ptr, 0);
+		OUT_PKT0(ring, bcolor_reg[sb], 1);
+		OUT_RELOC(ring, fd_resource(fd4_ctx->border_color_buf)->bo, off, 0, 0);
+
+		u_upload_unmap(fd4_ctx->border_color_uploader);
+	}
 }

 /* emit texture state for mem->gmem restore operation.. eventually it would
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -80,7 +80,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			if (rsc->stencil) {
 				rsc = rsc->stencil;
 				pformat = rsc->base.b.format;
-				bases++;
+				if (bases)
+					bases++;
 			}

 			slice = fd_resource_slice(rsc, psurf->u.tex.level);
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -121,6 +121,12 @@ emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
 		OUT_RELOC(ring, so->bo, 0,
 				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0);
 	}
+
+	/* for how clever coverity is, it is sometimes rather dull, and
+	 * doesn't realize that bin==NULL only happens when sz==0:
+	 */
+	assume(bin || (sz == 0));
+
 	for (i = 0; i < sz; i++) {
 		OUT_RING(ring, bin[i]);
 	}
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -36,7 +36,7 @@
 #include "fd4_format.h"

 static enum a4xx_tex_clamp
-tex_clamp(unsigned wrap, bool clamp_to_edge)
+tex_clamp(unsigned wrap, bool clamp_to_edge, bool *needs_border)
 {
 	/* Hardware does not support _CLAMP, but we emulate it: */
 	if (wrap == PIPE_TEX_WRAP_CLAMP) {
@@ -50,6 +50,7 @@ tex_clamp(unsigned wrap, bool clamp_to_edge)
 	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 		return A4XX_TEX_CLAMP_TO_EDGE;
 	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		*needs_border = true;
 		return A4XX_TEX_CLAMP_TO_BORDER;
 	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 		/* only works for PoT.. need to emulate otherwise! */
@@ -113,14 +114,15 @@ fd4_sampler_state_create(struct pipe_context *pctx,
 		so->saturate_r = (cso->wrap_r == PIPE_TEX_WRAP_CLAMP);
 	}

+	so->needs_border = false;
 	so->texsamp0 =
 		COND(miplinear, A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) |
 		A4XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) |
 		A4XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) |
 		A4XX_TEX_SAMP_0_ANISO(aniso) |
-		A4XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, clamp_to_edge)) |
-		A4XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, clamp_to_edge)) |
-		A4XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge));
+		A4XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, clamp_to_edge, &so->needs_border)) |
+		A4XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, clamp_to_edge, &so->needs_border)) |
+		A4XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge, &so->needs_border));

 	so->texsamp1 =
 //		COND(miplinear, A4XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) |
--- a/Show More
+++ b/Show More