Compare commits


95 Commits

Author SHA1 Message Date
Ian Romanick
1b69ea1c6d Bump version to 10.2.1 2014-06-06 23:20:00 -07:00
Ian Romanick
c2fc9fb907 radeonsi: Fix build error introduced in 5ab9a9c
While resolving conflicts in cherry picking commit d226191, I
accidentally introduced some garbage.  Because radeonsi isn't built by
default, the problem went unnoticed by me.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reported-by: Laurent Carlier <lordheavym@gmail.com>
Tested-by: Laurent Carlier <lordheavym@gmail.com>
2014-06-06 23:19:53 -07:00
Ian Romanick
28d41e409d docs: Add MD5 checksum, etc. for 10.2 release
Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
2014-06-06 21:17:02 -07:00
Ian Romanick
f836ef63fd Bump version to 10.2 (final)
Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
2014-06-06 20:40:00 -07:00
Ilia Mirkin
99b9a0973a gk110/ir: fix slct emission
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 9fef8b3d81)
2014-06-06 20:40:00 -07:00
Ilia Mirkin
d36d53b564 gk110/ir: fix interp mode emission
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit d588a4919b)
2014-06-06 18:40:58 -07:00
Ilia Mirkin
283cd12933 nvc0: don't bother trying to set up compute for gk110+
The nouveau fw currently prints a bunch of errors. There's no point in seeing
those all the time, especially since compute doesn't really work in the first
place.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>

Conflicts:
	src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
(cherry picked from commit ca65fc418f)
2014-06-06 18:40:21 -07:00
Ilia Mirkin
aa8ea648f4 gk110: add in forgotten code for gk110 isa
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>

Conflicts:
	src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
(cherry picked from commit b9ec766bd0)
2014-06-06 18:37:07 -07:00
Ilia Mirkin
e901f40764 gk110/ir: fix ISAD emission with register args
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit ed1b9e5721)
2014-06-06 18:19:45 -07:00
Ilia Mirkin
d5e47ee66b gk110/ir: fix quadon opcode emission
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 6e046508a1)
2014-06-06 18:19:10 -07:00
Ilia Mirkin
932a5dadda gk110/ir: emit texbar the same way that the blob does
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 73eec47ef8)
2014-06-06 18:14:50 -07:00
Tobias Klausmann
203bc289a0 nv50/ir: clear subop when folding constant expressions
Some operations (e.g. OP_MUL/OP_MAD/OP_EXTBF) might have a subop set.
After folding, make sure that it is cleared.

Signed-off-by: Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 3164bfc734)
2014-06-06 18:14:22 -07:00
Kenneth Graunke
11b3011805 i965: Support GL_CLAMP natively on Broadwell.
The new hardware actually supports this OpenGL 1.x feature natively,
so we can finally drop our shader workarounds.

Not many applications use GL_CLAMP, and most use it unintentionally, but
it's trivial to do right, so we should.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 221169693b)
2014-06-06 18:13:03 -07:00
Kenneth Graunke
c62bc58cce i965: Pass brw to translate_wrap_mode().
This lets us do generation checks.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 7f3d64a77b)
2014-06-06 18:12:20 -07:00
Kenneth Graunke
304e80e356 i965: Fix copy and pasted values in Broadwell code.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 7913b4b97b)
2014-06-06 18:11:54 -07:00
Sinclair Yeh
f4aca6868a egl: Check for NULL native_window in eglCreateWindowSurface
We have customers using NULL as a way to test the robustness of the API.
Without this check, EGL will segfault trying to dereference
dri2_surf->wl_win->private because wl_win is NULL.

This fix adds a check and sets the EGL_BAD_NATIVE_WINDOW error.

v2: Incorporated feedback from idr - moved the check to a higher level
function.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Chad Versace <chad.versace@linux.intel.com>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 91ff0d4c65)
2014-06-06 18:11:30 -07:00
Marek Olšák
5ab9a9c0cc r600g,radeonsi: don't use hardware MSAA resolve if dst is fast-cleared
It doesn't work and our docs say so too.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
(cherry picked from commit d226191820)
2014-06-06 18:08:23 -07:00
Marek Olšák
ae16f443c2 r600g,radeonsi: disable fast clear if render condition is on
For some reason, CP DMA doesn't follow the predicate bit if I enable it,
so this is the only option.

This fixes piglit: spec/NV_conditional_render/clear

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
(cherry picked from commit bf701a84eb)
2014-06-06 18:03:10 -07:00
José Fonseca
b8241bb3f2 mesa: Make glGetIntegerv(GL_*_ARRAY_SIZE) return GL_BGRA.
Same as b026b6bbfe, but for
COLOR_ARRAY_SIZE/SECONDARY_COLOR_ARRAY_SIZE.

Ideally we wouldn't munge the incoming state, so that we wouldn't need
to unmunge it back on glGet*.  But the array size state is copied and
referred to in many places, many of which couldn't take a GLenum like
GL_BGRA instead of a plain integer.  So just hack around it in glGet*,
to ensure there is no risk of introducing regressions elsewhere.

This bug causes problems for Apitrace, resulting in wrong traces.  See
https://github.com/apitrace/apitrace/issues/261 for details.

Tested with piglit arb_vertex_array_bgra-get, which was created for this
purpose.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit e3e13d6b85)
2014-06-06 17:54:32 -07:00
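For reference, a minimal sketch of the behavior this restores, using only standard GL calls (the helper function below is hypothetical, not Mesa code):

    #include <GL/gl.h>

    /* Set up a BGRA color array, then query its size: the query should
     * report GL_BGRA rather than the literal component count 4. */
    static void check_bgra_array_size(void)
    {
        static const GLubyte colors[4 * 4];
        GLint size = 0;

        glColorPointer(GL_BGRA, GL_UNSIGNED_BYTE, 0, colors);
        glGetIntegerv(GL_COLOR_ARRAY_SIZE, &size);
        /* With this fix size == GL_BGRA; before it, Mesa returned 4,
         * which is what produced the wrong apitrace traces. */
    }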
José Fonseca
224c193237 mesa/main: Make get_hash.c values constant.
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 53468dee03)
2014-06-06 17:35:45 -07:00
Beren Minor
494f916125 egl/main: Fix eglMakeCurrent when releasing context from current thread.
EGL 1.4 Specification says that
eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT)
can be used to release the current thread's ownership on the surfaces
and context.

Mesa's EGL implementation was only accepting these parameters when the
KHR_surfaceless_context extension is supported.

[chadv] Add quote from the EGL 1.4 spec.
Cc: "10,1, 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Chad Versace <chad.versace@linux.intel.com>
(cherry picked from commit 0ca0d5743f)
2014-06-06 17:15:51 -07:00
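For reference, the release call the spec describes, and which this change makes Mesa accept regardless of KHR_surfaceless_context (dpy is assumed to be a valid, initialized EGLDisplay):

    #include <EGL/egl.h>

    /* Release the calling thread's current context and surfaces. */
    static void release_current(EGLDisplay dpy)
    {
        eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
    }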
Marek Olšák
767bc05309 Revert "glx: load dri driver with RTLD_LOCAL so dlclose never fails to unload"
This reverts commit e3cc0d90e1.

It breaks too many apps and completely breaks my desktop too.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=79469

We'll probably need to re-release all stable versions after this is committed.

Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 0d5ec2c615)
2014-06-06 17:13:03 -07:00
Roland Scheidegger
3aaae6056e llvmpipe: fix crash when not all attachments are populated in a fb
Framebuffers have been able to have NULL attachments for a while. llvmpipe
handled that properly for lp_rast_shade_quads_mask, but it seems the change
didn't make it to lp_rast_shade_tile.
This fixes the piglit fbo-drawbuffers-none test (though I need to increase
the FB_SIZE from 32 to 256 so the tris cover some tiles fully).
https://bugs.freedesktop.org/show_bug.cgi?id=79421

Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
(cherry picked from commit 576868140b)
2014-06-06 17:06:55 -07:00
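The shape of the fix, as a rough sketch in plain C (the types and names here are illustrative, not llvmpipe's actual structures):

    struct fb_state {
        unsigned nr_cbufs;
        void *cbufs[8];               /* attachments; entries may be NULL */
    };

    static void shade_whole_tile(const struct fb_state *fb)
    {
        for (unsigned i = 0; i < fb->nr_cbufs; i++) {
            if (!fb->cbufs[i])
                continue;             /* unpopulated attachment: skip it */
            /* ... shade and write the tile into fb->cbufs[i] ... */
        }
    }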
Ian Romanick
8b71741222 Bump version to 10.2-rc5
Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
2014-05-30 17:11:47 -07:00
Lubomir Rintel
15ec4ef0da i915: add a missing NULL pointer check
mesaVisual can be NULL with a configless context since this commit:

    commit 551d459af4
    Author: Neil Roberts <neil@linux.intel.com>
    Date:   Fri Mar 7 18:05:47 2014 +0000

    Add the EGL_MESA_configless_context extension
...
    Previously the i965 and i915 drivers were explicitly creating a zeroed visual
    whenever 0 is passed for the EGLConfig.

We attempt to dereference the visual in i915, and now that we don't create a
zeroed-out one it crashes, breaking at least weston on i915.  There's no point
in doing so, as it would be zero anyway.

v2: Fixed a typo in commit message.  Added some tags.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1100967
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
(cherry picked from commit 90b5747856)
2014-05-30 17:11:47 -07:00
Ian Romanick
9fde5670e2 glapi: Duplicate GLES1 prototypes in glapi_dispatch.c
These prototypes are necessary because GLES1 library builds will create
dispatch functions for them.  We can't directly include GLES/gl.h
because it would conflict the previously-included GL/gl.h.  Since GLES1
ABI is not expected to every add more functions, the path of least
resistance is to just duplicate the prototypes for the functions that
aren't already in desktop OpenGL.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=79294
Acked-by: Matt Turner <mattst88@gmail.com>
Tested-by: Andreas Boll <andreas.boll.dev@gmail.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 7b1aeec9cd)
2014-05-30 17:11:47 -07:00
Ilia Mirkin
76e112380a nvc0: revert mistaken logic to collapse color outputs to the beginning
In commit af38ef907, I added a "fix" to color outputs not being assigned
correctly when sample mask was being output. This was totally wrong --
the color indices (i.e. "si" values) were the ones that were wrong. Undo
that hunk.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit 0d699530ff)

Requested-by: Ilia Mirkin <imirkin@alum.mit.edu>
2014-05-30 17:11:15 -07:00
Ilia Mirkin
8ac81e5b66 mesa/st: fix color outputs in presence of sample mask output
Commit c5d822dad9 added support for the sample mask incorrectly: it was
treated as a color output, which messed up the color output indices.
Revert the hunk that did that, and add explicit support just like for
depth/stencil writes.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit ab7bd7093d)

Requested-by: Ilia Mirkin <imirkin@alum.mit.edu>
2014-05-30 17:11:15 -07:00
Rob Clark
6d23a0b2a6 configure: fix build error with XA
Fixes:

xa_tracker.c: In function 'xa_tracker_create':
 xa_tracker.c:147:5: error: implicit declaration of function 'pipe_loader_drm_probe_fd' [-Werror=implicit-function-declaration]

in some build configurations, as XA now implicitly depends on
gallium_drm_loader.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jakob Bornecrantz <jakob@vmware.com>
(cherry picked from commit 20d14ef263)

Bugzilla: https://bugs.gentoo.org/show_bug.cgi?id=511700
Requested-by: Matt Turner <mattst88@gmail.com>
2014-05-30 17:11:15 -07:00
Pavel Popov
8f984928cc i965: Fix Line Stipple enable bit in 3DSTATE_SF for Haswell.
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Pavel Popov <pavel.e.popov@intel.com>
(cherry picked from commit d292d40207)
2014-05-30 17:11:15 -07:00
Jerome Glisse
7ab2363c11 glx: load dri driver with RTLD_LOCAL so dlclose never fails to unload
There is no reason anymore to load with RTLD_GLOBAL, and for some drivers this
even results in dlclose failing to unload, leading to catastrophic failure with
the swrast fallback.

Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
(cherry picked from commit e3cc0d90e1)
2014-05-29 15:48:53 -07:00
Brian Paul
55b9effa4a glsl: fix use-after free bug/crash in ast_declarator_list::hir()
The call to get_variable_being_redeclared() may delete 'var' so we
can't reference var->name afterward.  We fix that by examining the
var's name before making that call.

Fixes valgrind warnings and possible crash when running the piglit
tests/spec/glsl-1.30/execution/clipping/vs-clip-distance-in-param.shader_test
test (and probably others).

Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
(cherry picked from commit f9cecca7a6)
2014-05-29 15:48:02 -07:00
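The pattern of the fix, as a generic C sketch (the names below are hypothetical stand-ins, not the actual GLSL frontend code):

    #include <stdbool.h>
    #include <string.h>

    struct variable { const char *name; };

    /* Stand-in for get_variable_being_redeclared(): may free 'var' and
     * return the earlier declaration (or NULL). */
    struct variable *merge_redeclaration(struct variable *var);

    static bool process_declaration(struct variable *var)
    {
        /* Read everything we need from 'var' *before* the call that may
         * delete it... */
        bool is_gl_name = strncmp(var->name, "gl_", 3) == 0;

        struct variable *earlier = merge_redeclaration(var);

        /* ...and from here on use only the saved values, never 'var'. */
        return earlier != NULL && is_gl_name;
    }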
Kenneth Graunke
5347fc5295 i965: Fix repeated usage of rectangle texture coordinate scaling.
Previously, we set up new entries in the params[] array on every access
of a rectangle texture.  Unfortunately, we only reserve space for
(2 * MaxTextureImageUnits) extra entries, so programs which accessed
rectangle textures more times than that would write off the end of the
array and likely crash.

We don't really have a decent mapping between the index returned by
_mesa_add_state_reference and our index into the params array, so we
have to manually search for it.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78691
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Cc: mesa-stable@lists.freedesktop.org
(cherry picked from commit bb9623a1a8)
2014-05-29 15:47:29 -07:00
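The idea behind the fix, as a simplified sketch (hypothetical names, not the actual i965 code):

    /* Reuse an existing params[] slot for a given state index instead of
     * appending a new entry on every rectangle-texture access. */
    static int find_or_add_param(int *params, int *nr_params, int max_params,
                                 int state_index)
    {
        for (int i = 0; i < *nr_params; i++) {
            if (params[i] == state_index)
                return i;                   /* already present: reuse it */
        }
        if (*nr_params >= max_params)
            return -1;                      /* table is full */
        params[*nr_params] = state_index;   /* first use: append */
        return (*nr_params)++;
    }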
Topi Pohjolainen
e8e48889e6 meta/blit: Use gl_FragColor also in the msaa blit shader
Fixes framebuffer_blit_functionality_multisampled_to_singlesampled_blit
es3 cts test on bdw. Also fixes this on ivb when ivb is forced to use
the meta path.

No piglit regressions on IVB.

Further input from Ken:

 "Unfortunately, this doesn't fix MRT for integer data.

  In the single-sampled case, since we're directly copying data, we were
  read/copy/write data as "float" values, which actually contained the
  integer bits.  Here, we can't do that since we need to process the
  actual integer data.

  I do wonder if we could use intBitsToFloat/uintBitsToFloat to stuff the
  integer bits in the float gl_FragColor output.  Just a crazy idea.

  In the long term (post 10.2), I think we should draft an extension that
  allows you to do "layout(location = all)" on user-defined fragment
  shader outputs.  (Or some similar syntax.)"

Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
(cherry picked from commit a6022e5405)
2014-05-29 15:46:26 -07:00
Topi Pohjolainen
af3d4eddc1 i965/meta: Store stencil texturing mode
Meta path needs to keep the current texture object's state. Fixes
the following gles3 cts tests on bdw:

framebuffer_blit_functionality_negative_width_blit.test: fail
framebuffer_blit_functionality_all_buffer_blit.test: fail
framebuffer_blit_functionality_negative_height_blit.test: fail
framebuffer_blit_functionality_missing_buffers_blit.test: fail
framebuffer_blit_functionality_negative_dimensions_blit.test: fail
framebuffer_blit_functionality_minifying_blit.test: fail
framebuffer_blit_functionality_magnifying_blit.test: fail

Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 57730d67f6)
2014-05-29 15:45:43 -07:00
Topi Pohjolainen
75ae4fff35 meta/blit: Add stencil texturing mode save and restore
v2 (Ken): Only restore the mode if it has changed.

Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit c246828c4d)
2014-05-29 15:44:45 -07:00
Matt Turner
c984e5bd2e Revert "i965: Don't make instructions with a null dest a barrier to scheduling."
This reverts commit 42a26cb5e4.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78648
(cherry picked from commit 0d3f83f4ad)
2014-05-29 15:44:09 -07:00
Matt Turner
ca6b38b80a Revert "i965/fs: Simplify interference scan in register coalescing."
This reverts commit 5ff1e446d4.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77704
(cherry picked from commit a39428cf5c)
2014-05-29 15:42:43 -07:00
Matt Turner
b814afeb6c Revert "i965/fs: Give up in interference check if we see a WHILE."
This reverts commit 55de1c035c.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit fc025a6719)
2014-05-29 15:41:53 -07:00
Matt Turner
17c7ead727 Revert "i965/fs: Reduce restrictions on interference in register coalescing."
This reverts commit f770123f58.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78692
(cherry picked from commit ccb1ea8a15)
2014-05-29 15:40:55 -07:00
Emil Velikov
2a29dbdc6e glx: do not leak dri3Display
v2: Do not wrap the code in ifdef HAVE_DRI3 (suggested by Keith)

Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Cc: Keith Packard <keithp@keithp.com>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
(cherry picked from commit eb2241f8a9)
2014-05-29 15:40:09 -07:00
Matt Turner
03e93f6079 Revert "i965/fs: Change fs_visitor::emit_lrp to use MAC for gen<6"
This reverts commit a6860100b8.

Why this code didn't work in all circumstances is unknown and without a
working Ironlake simulator (which uses a different AUB format) we'll
probably never know, short of a lot of experimentation, and spending a
bunch of time to try to optimize a few instructions on Ironlake is not
time well spent.

Moreover, for mix(vec4, vec4, vec4) using the accumulator introduces a
dependence between the otherwise independent per-component calculations.
Not using the accumulator, even if it means an extra instruction per
component, might be preferable. We don't know, we don't have data, and
we don't have the necessary register on Ironlake for shader_time to tell
us.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77707
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit c2c639ecf6)
2014-05-29 15:17:53 -07:00
Matt Turner
bc4b9467af Revert "i965/vec4: Change vec4_visitor::emit_lrp to use MAC for gen<6"
This reverts commit 2dfbbeca50 with the
comment about MAC and implicit accumulator removed.

Why this code didn't work in all circumstances is unknown and without a
working Ironlake simulator (which uses a different AUB format) we'll
probably never know, short of a lot of experimentation, and spending a
bunch of time to try to optimize a few instructions on Ironlake is not
time well spent.

Moreover, for mix(vec4, vec4, vec4) using the accumulator introduces a
dependence between the otherwise independent per-component calculations.
Not using the accumulator, even if it means an extra instruction per
component, might be preferable. We don't know, we don't have data, and
we don't have the necessary register on Ironlake for shader_time to tell
us.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77703
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit db42dd8952)
2014-05-29 15:17:28 -07:00
Christoph Bumiller
7efdc55f5f nv50/ir/tgsi: optimize KIL
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit d479713d25)
2014-05-29 15:16:56 -07:00
Christoph Bumiller
9ea859931e nv50/ir: fix lowering of predicated instructions (without defs)
Note that predicated instructions with defs are still not supported
because transformation to SSA doesn't handle them yet.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 452a4151aa)
2014-05-29 15:16:24 -07:00
Christoph Bumiller
4e5296208d nv50/ir/opt: fix constant folding with saturate modifier
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 3b0867f35b)
2014-05-29 15:16:03 -07:00
Christoph Bumiller
1ced952686 nv50/ir/tgsi: TGSI_OPCODE_POW replicates its result
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 2f2d1b3d9b)
2014-05-29 15:15:59 -07:00
Christoph Bumiller
afe723ce5f nv50,nvc0: set constbufs dirty on pipe context switch
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 49eccef06b)
2014-05-29 15:15:39 -07:00
Christoph Bumiller
8b74c2bdbd nv50: setup scissors on clear_render_target/depth_stencil
[imirkin: add logic to also clear the "regular" scissors]
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 200382be85)
2014-05-29 15:15:10 -07:00
Christoph Bumiller
4afbd9b0e2 nv50,nvc0: always pull out bufctx on context destruction
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 7d11b761f2)
2014-05-29 15:01:49 -07:00
Ian Romanick
697316fe06 Bump version to 10.2-rc4
Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
2014-05-23 17:36:42 -07:00
Ian Romanick
bfaee5277a Merge remote-tracking branch 'robclark/freedreno-10.2' into 10.2 2014-05-23 17:21:59 -07:00
Pavel Popov
9a8f12ae03 i965: Properly return *RESET* status in glGetGraphicsResetStatusARB
The glGetGraphicsResetStatusARB from ARB_robustness extension always
returns GUILTY_CONTEXT_RESET_ARB and never returns NO_ERROR for guilty
context with LOSE_CONTEXT_ON_RESET_ARB strategy.  This is because Mesa
returns GUILTY_CONTEXT_RESET_ARB if batch_active !=0 whereas kernel
driver never reset batch_active and this variable always > 0 for guilty
context.  The same behaviour also can be observed for batch_pending and
INNOCENT_CONTEXT_RESET_ARB.

But ARB_robustness spec says:

  If a reset status other than NO_ERROR is returned and subsequent calls
  return NO_ERROR, the context reset was encountered and completed. If a
  reset status is repeatedly returned, the context may be in the process
  of resetting.

  8. How should the application react to a reset context event?
  RESOLVED: For this extension, the application is expected to query the
  reset status until NO_ERROR is returned. If a reset is encountered, at
  least one *RESET* status will be returned. Once NO_ERROR is
  encountered, the application can safely destroy the old context and
  create a new one.

The main problem is that the context may be in the process of resetting, and
in this case a reset status should be repeatedly returned.  But it looks like
the kernel driver returns nonzero active/pending only if the context reset has
already been encountered and completed.  For this reason the *RESET* status
cannot be repeatedly returned and should be returned only once.

The reset_count and brw->reset_count variables can be used to ensure that
glGetGraphicsResetStatusARB returns a *RESET* status only once for each
context.  Note that i915 triggers reset_count twice, which allows the correct
reset count to be returned immediately after active/pending have been
incremented.

v2 (idr): Trivial reformatting of comments.

Signed-off-by: Pavel Popov <pavel.e.popov@intel.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 8dc4a98c44)
2014-05-23 09:57:18 -07:00
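A hedged sketch of the approach in C (the context struct and the kernel-query helper are hypothetical; the enums are the real ARB_robustness values):

    #include <GL/gl.h>
    #include <GL/glext.h>
    #include <stdint.h>

    struct context {
        uint32_t last_reported_reset_count;
    };

    /* Hypothetical stand-in for the kernel reset-stats query. */
    void query_reset_stats(struct context *ctx, uint32_t *reset_count,
                           uint32_t *active, uint32_t *pending);

    static GLenum get_reset_status(struct context *ctx)
    {
        uint32_t reset_count, active, pending;

        query_reset_stats(ctx, &reset_count, &active, &pending);

        /* batch_active/batch_pending stay nonzero forever, so key the
         * answer off the reset count and report each reset only once. */
        if (reset_count == ctx->last_reported_reset_count)
            return GL_NO_ERROR;

        ctx->last_reported_reset_count = reset_count;
        if (active)
            return GL_GUILTY_CONTEXT_RESET_ARB;
        if (pending)
            return GL_INNOCENT_CONTEXT_RESET_ARB;
        return GL_UNKNOWN_CONTEXT_RESET_ARB;
    }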
Emil Velikov
a31062fcb3 targets/egl-static: add missing line break in ldflags
Accidentally omitted by commit 7b7944ee1c.

Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Emil Velikov <emil.l.velikov@gmail.com>
Reviewed-by: Jon TURNEY <jon.turney@dronecode.org.uk>
(cherry picked from commit e0372239a5)
2014-05-23 09:57:15 -07:00
James Legg
a1fff38c96 mesa: Fix unbinding GL_DEPTH_STENCIL_ATTACHMENT
glFramebufferRenderbuffer(..., GL_DEPTH_STENCIL_ATTACHMENT, ..., 0) only
detached the depth buffer and not the stencil buffer.

Bugzilla: http://bugs.freedesktop.org/show_bug.cgi?id=79115
Reviewed-by: Brian Paul <brianp@vmware.com>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 846c715abb)
2014-05-23 09:56:26 -07:00
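The call in question, for reference (standard GL, no Mesa internals assumed):

    #define GL_GLEXT_PROTOTYPES
    #include <GL/gl.h>
    #include <GL/glext.h>

    /* Passing renderbuffer 0 for GL_DEPTH_STENCIL_ATTACHMENT must detach
     * both the depth and the stencil attachment, not just the depth one. */
    static void detach_depth_stencil(void)
    {
        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
                                  GL_RENDERBUFFER, 0);
    }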
Jordan Justen
1db3ebd8a5 meta blit: Set Z texcoord during meta blit to sample the correct layer
If the source renderbuffer has a depth > 0, then send a Z texcoord
which is set to the source attachment Z offset.

This fixes piglit's gl-3.2-layered-rendering-gl-layer-render with the
GL_TEXTURE_2D_MULTISAMPLE_ARRAY case test on i965/gen8.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 57876fee38)
2014-05-23 09:55:23 -07:00
Kenneth Graunke
7cf3a674ea i965: Listen to BRW_NEW_FRAGMENT_PROGRAM for 3DSTATE_PS_BLEND.
brw_color_buffer_write_enabled depends on brw->fragment_program, which
means we have to listen to BRW_NEW_FRAGMENT_PROGRAM.

On most generations, this was only called from a function that already
subscribed.  However, on Broadwell, we failed to listen to the necessary
event in the atom that emits 3DSTATE_PS_BLEND.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 746921cbb4)
2014-05-23 09:54:41 -07:00
Kenneth Graunke
d2521a44af i965: Use WE_all for FB write header setup on Broadwell.
I forgot to disable writemasking on the OR and MOV which set the render
target index and "source 0 alpha present to render target" bit.

Using get_element_ud is equivalent and avoids a line-wrap.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 7d3985ca6c)
2014-05-23 09:54:15 -07:00
Anuj Phogat
00f2dcb791 meta: Use gl_FragColor to output color values to all the draw buffers
_mesa_meta_setup_blit_shader() currently generates a fragment shader
which, irrespective of the number of draw buffers, writes the color
to only one 'out' variable.  The current shader relies on undefined
behavior and possibly works by chance.

From OpenGL 4.0  spec, page 256:
  "If a fragment shader writes to gl_FragColor, DrawBuffers specifies a
   set of draw buffers into which the single fragment color defined by
   gl_FragColor is written. If a fragment shader writes to gl_FragData,
   or a user-defined varying out variable, DrawBuffers specifies a set
   of draw buffers into which each of the multiple output colors defined
   by these variables are separately written. If a fragment shader writes
   to none of gl_FragColor, gl_FragData, nor any user defined varying out
   variables, the values of the fragment colors following shader execution
   are undefined, and may differ for each fragment color."

OpenGL 4.4 spec, page 463, added an additional line in this section:
  "If some, but not all user-defined output variables are written, the
   values of fragment colors corresponding to unwritten variables are
   similarly undefined."

V2: Write color output to gl_FragColor instead of writing to multiple
    'out' variables. This'll avoid recompiling the shader every time the
    draw buffer count is updated.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 46737cebd3)
2014-05-23 09:53:42 -07:00
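A minimal sketch of the kind of fragment shader the meta blit path generates after this change (illustrative only, not the exact string Mesa builds):

    /* Writing gl_FragColor broadcasts one color to every draw buffer, so
     * the shader does not need to be rebuilt when the draw buffer count
     * changes. */
    static const char *meta_blit_fs =
        "#version 130\n"
        "uniform sampler2D texSampler;\n"
        "in vec2 texCoords;\n"
        "void main()\n"
        "{\n"
        "   gl_FragColor = texture(texSampler, texCoords);\n"
        "}\n";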
Anuj Phogat
ed1ffa0197 meta: Refactor _mesa_meta_setup_blit_shader() to avoid duplicate shader code
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit bee2915210)
2014-05-23 09:52:29 -07:00
Ilia Mirkin
5d056f51ab tgsi: add GS_INVOCATIONS to property names array
In commit 4be146b1, I neglected to add the new property to the strings
array.  This leads to the string '(null)' being printed instead when
converting a GS shader to text.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
(cherry picked from commit cdeb7004e0)
2014-05-23 09:51:49 -07:00
Ilia Mirkin
6be7789e11 nv50,nvc0: fix 3d blits with mipmap levels
Make sure to normalize the z coordinates as well as the x/y ones when
there are mipmaps present. Fixes 3d mipmap generation, which now uses
the blit path.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
(cherry picked from commit 28360fcad7)
2014-05-23 09:51:26 -07:00
Ilia Mirkin
d6a4c3c29c nv50/ir: fix constant folding for OP_MUL subop HIGH
These instructions can come in either through IMUL_HI/UMUL_HI TGSI
opcodes, or from OP_DIV constant folding.

Also make sure that the constant foldings which delete the original
instruction still get counted as having done something.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
(cherry picked from commit d2a3de19c6)
2014-05-23 09:51:06 -07:00
Ilia Mirkin
9028b94670 nv50/ir: fix s32 x s32 -> high s32 multiply logic
Retrieving the high 32 bits of a signed multiply is rather annoying. It
appears that the simplest way to do this is to compute the absolute
value of the arguments, and perform a u32 x u32 -> u64 operation. If the
arguments' signs differ, then negate the result. Since there is no u64
support in the cvt instruction, we have the perform the 2's complement
negation "by hand".

This logic can come into use by the IMUL_HI instruction (very unlikely
to be seen), as well as from constant folding of division by a constant.
Fixes dolphin's divisions by 255.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
(cherry picked from commit d3a5cf052c)
2014-05-23 09:50:26 -07:00
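The same lowering written out in plain C, as an illustration of the logic (the actual change emits nv50 IR ops, not C):

    #include <stdint.h>

    static int32_t mul_hi_s32(int32_t a, int32_t b)
    {
        /* u32 x u32 -> u64 multiply of the absolute values... */
        uint64_t ua = (a < 0) ? 0u - (uint32_t)a : (uint32_t)a;
        uint64_t ub = (b < 0) ? 0u - (uint32_t)b : (uint32_t)b;
        uint64_t prod = ua * ub;

        /* ...then negate "by hand" (2's complement) if the signs differ. */
        if ((a < 0) != (b < 0))
            prod = 0 - prod;

        return (int32_t)(prod >> 32);
    }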
Kenneth Graunke
085d6bd5e7 meta: Avoid _swrast_BlitFramebuffer in the meta CopyTexSubImage code.
This is a replacement for bd44ac8b5c
that should actually work.

Fixes Piglit's copyteximage-border on swrast, as well as one of
es3conform's packed_pixels_pixelstore tests.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78546
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77705
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 2ecc7268ba)
2014-05-23 09:49:28 -07:00
Kenneth Graunke
fd0ea5be9d meta: Split _swrast_BlitFramebuffer out of the meta blit path.
Separating the software fallbacks from the rest of the meta path (which
is usually hardware accelerated) gives callers better control over their
blitting options.

For example, i965 might want to try meta blit, hardware blits, then
swrast as a last resort.  Splitting it makes that possible.

This updates all callers to maintain the existing behavior (even in the
few cases where it isn't desirable behavior - later patches can change
that).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 54540ea691)
2014-05-23 09:48:13 -07:00
Kenneth Graunke
27d4836f35 meta: Drop unnecessary early returns in _mesa_meta_BlitFramebuffer.
These aren't necessary - all of the following code is predicated on mask
being non-zero, so no code will get executed anyway.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Courtney Goeltzenleuchter <courtney@lunarg.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit d89ce333cc)
2014-05-23 09:47:37 -07:00
Kenneth Graunke
e306ba9a9b Revert "i965: Don't _swrast_BlitFramebuffer when doing CopyTexSubImage."
This reverts commit bd44ac8b5c.

Fixes:
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78842
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78843

Re-breaks:
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77705
but that will be fixed properly in a few commits.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 2fa3796bc1)
2014-05-23 09:46:57 -07:00
Topi Pohjolainen
81fb9ef112 i965/fbo: Only try stencil meta blits on gen >= 8
I don't have an ILK at hand but the fix should be trivial.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78872
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-and-tested-by: Kenneth Graunke <kenneth@whitecape.org>
(cherry picked from commit 21dddb22c1)
2014-05-23 09:46:28 -07:00
Kenneth Graunke
32549f3f17 mesa: Disable GL_EXT_framebuffer_multisample_blit_scaled on Broadwell.
It's not properly implemented in the meta code, and we don't have time
to fix it for 10.2.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 0b96d362bf)
2014-05-23 09:45:52 -07:00
Ilia Mirkin
9576e17804 nv50/ir: fix integer mul lowering for u32 x u32 -> high u32
UNION appears to expect that all of its sources are conditionally
defined. Otherwise it inserts an unpredicated mov instruction which
overwrites the desired result. This fixes tests that use UMUL_HI, and
much less directly, unsigned integer division by a constant, which uses
this functionality in a peephole pass.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
(cherry picked from commit 5b8f1a0f7c)
2014-05-23 09:45:13 -07:00
Ilia Mirkin
cc65bc4d15 nv50/ir: make sure that texprep/texquerylod's args get coalesced
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
(cherry picked from commit 4ebaabcccb)
2014-05-23 09:40:26 -07:00
Jeremy Huddleston Sequoia
25e641213f darwin: Fix test for kCGLPFAOpenGLProfile support at runtime
Signed-off-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
(cherry picked from commit 7a109268ab)
2014-05-20 10:55:12 -07:00
Rob Clark
e084f71548 freedreno: don't advertise texture arrays for now
I think a3xx and later should support this (it is part of GLES3), but it
isn't needed for the time being and still needs to be reverse engineered.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 10:55:54 -04:00
Rob Clark
cdd328639f freedreno/a3xx: shadow sampler support
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:49 -04:00
Rob Clark
6440561737 freedreno/a3xx/compiler: refactor trans_samp()
Split it up into some smaller fxns so it doesn't grow into a huge
monster as we add things.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:49 -04:00
Rob Clark
fb4461b7dc freedreno: update generated headers
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
fec2b45d02 freedreno/a3xx: use util_format_compose_swizzles()
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
d0c813c40a freedreno/a3xx/compiler: 1D textures
Gallium already gives us height==1 for these, so the texture state is
already set up correctly to emulate 1D textures as Nx1 2D textures.  We
just need to supply the .y coord.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
a05c073d79 freedreno: fix caps
In particular, we want mesa to emulate primitive restart for us.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
031ee21961 freedreno: fix index buffer offset
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
b7604eff4c freedreno/a3xx: add sRGB texture support
That was easy.  Turns out it is just a matter of setting one bit.
Enable sampling from sRGB texture, and therefore enable GL 2.1 :-)

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
80da86c650 freedreno: update generated headers
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:48:20 -04:00
Rob Clark
3c0ca023dd freedreno/a3xx: fix write to bogus register
The loops for updating the multiple packed fields in SP_VS_OUT[] and
SP_VS_VPC_DST[] will zero out one register beyond the last one
required.  This is normally not a problem (and is kinda convenient
when looking at cmdstream dumps) unless we have the maximum (16) varyings.

Fix loop termination condition so that this does not happen.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:47:20 -04:00
Rob Clark
516db26e1e freedreno/a3xx: account for special inputs/outputs
We need to size the input/output tables big enough for special inputs/
outputs (gl_Position, gl_FrontFacing, etc) which, while they don't
count towards the hw limit of 16 attributes or 16 varyings, still need
to be tracked all the same.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:47:19 -04:00
Rob Clark
d5d9984c2b freedreno/a3xx: fix MAX_INPUTS shader cap
The hardware only supports 16, which fd3_shader_variant properly reflected,
but the pipe cap did not, leading to array overflow (and shaders that
could not possibly work).

Also add a bunch of asserts to make problems like this easier to see.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:47:19 -04:00
Ryan Houdek
6db6f05fae freedreno/a3xx/compiler: add KILL_IF
The KILL_IF opcode could potentially be merged into the regular KILL
opcode function.  It was a pain to do so, so I've left it separate
for cleanliness.

Signed-off-by: Ryan Houdek <Sonicadvance1@gmail.com>
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:47:19 -04:00
Ryan Houdek
c338759051 freedreno/a3xx/compiler: start adding integer support
Adds a large number of TGSI opcodes to the a3xx compiler:
28 integer opcodes and 4 floating-point compare opcodes.

If GLSL 1.30 is enabled, this allows the GLSL 1.30 piglits to reach a
completion rate of 432/641.

Signed-off-by: Ryan Houdek <Sonicadvance1@gmail.com>
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:46:38 -04:00
Rob Clark
47a6830e22 freedreno/a3xx: occlusion query support
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:46:38 -04:00
Rob Clark
3ffc507c94 freedreno: add support for hw queries
Real GPU queries need some infrastructure to track samples per tile and
accumulate the results.  But fortunately this can be shared across GPU
generations.

See:
https://github.com/freedreno/freedreno/wiki/Queries#hardware-queries

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:46:38 -04:00
Rob Clark
c94e339adc freedreno/query: allow multiple query implementations
Split out fd_query into an abstract base class, to allow multiple
implementations.  The current sw based queries are moved into
fd_sw_query.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:45:50 -04:00
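Roughly, the split looks like this (a sketch; the field and function names are illustrative rather than the exact freedreno ones):

    #include <stdbool.h>
    #include <stdint.h>

    struct fd_query;

    /* Per-implementation hooks: one set for the existing sw queries,
     * another for the new hw (sample-counting) queries. */
    struct fd_query_funcs {
        void (*destroy)(struct fd_query *q);
        void (*begin_query)(struct fd_query *q);
        void (*end_query)(struct fd_query *q);
        bool (*get_result)(struct fd_query *q, bool wait, uint64_t *result);
    };

    struct fd_query {
        const struct fd_query_funcs *funcs;   /* sw or hw implementation */
        unsigned type;                        /* PIPE_QUERY_* */
    };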
Rob Clark
a5951d09a5 freedreno/a3xx: add point-size
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:45:50 -04:00
Rob Clark
3475ca1f00 freedreno: update generated headers
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:45:50 -04:00
Rob Clark
3733cc3e8f freedreno/a2xx: fix compiler warning
Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-05-20 08:45:50 -04:00
Jeremy Huddleston Sequoia
ac49f97f12 glapi: Avoid heap corruption in _glapi_table
Signed-off-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Reviewed-by: Chia-I Wu <olv@lunarg.com>
(cherry picked from commit ff5456d1ac)
2014-05-20 01:39:17 -07:00
89 changed files with 2486 additions and 487 deletions

View File

@@ -1 +1 @@
10.2.0-rc3
10.2.1

View File

@@ -1293,6 +1293,7 @@ if test "x$enable_xa" = xyes; then
fi
GALLIUM_STATE_TRACKERS_DIRS="xa $GALLIUM_STATE_TRACKERS_DIRS"
enable_gallium_loader=yes
enable_gallium_drm_loader=yes
fi
AM_CONDITIONAL(HAVE_ST_XA, test "x$enable_xa" = xyes)

View File

@@ -16,6 +16,13 @@
<h1>News</h1>
<h2>June 6, 2014</h2>
<p>
<a href="relnotes/10.2.html">Mesa 10.2</a> is released. This is a new
development release. See the release notes for more information about
the release.
</p>
<h2>April 18, 2014</h2>
<p>
<a href="relnotes/10.1.1.html">Mesa 10.1.1</a> is released.

View File

@@ -14,7 +14,7 @@
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 10.2 Release Notes / TBD</h1>
<h1>Mesa 10.2 Release Notes / June 6, 2014</h1>
<p>
Mesa 10.2 is a new development release.
@@ -33,7 +33,9 @@ because compatibility contexts are not supported.
<h2>MD5 checksums</h2>
<pre>
TBD.
c87bfb6dd5cbcf1fdef42e5ccd972581 MesaLib-10.2.0.tar.gz
7aaba90bd7169a94ae2fe83febdec963 MesaLib-10.2.0.tar.bz2
58b203aca15dadc25ab4d1126db1052b MesaLib-10.2.0.zip
</pre>

View File

@@ -524,8 +524,12 @@ eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, EGLSurface read,
if (!context && ctx != EGL_NO_CONTEXT)
RETURN_EGL_ERROR(disp, EGL_BAD_CONTEXT, EGL_FALSE);
if (!draw_surf || !read_surf) {
/* surfaces may be NULL if surfaceless */
if (!disp->Extensions.KHR_surfaceless_context)
/* From the EGL 1.4 (20130211) spec:
*
* To release the current context without assigning a new one, set ctx
* to EGL_NO_CONTEXT and set draw and read to EGL_NO_SURFACE.
*/
if (!disp->Extensions.KHR_surfaceless_context && ctx != EGL_NO_CONTEXT)
RETURN_EGL_ERROR(disp, EGL_BAD_SURFACE, EGL_FALSE);
if ((!draw_surf && draw != EGL_NO_SURFACE) ||
@@ -567,6 +571,10 @@ _eglCreateWindowSurfaceCommon(_EGLDisplay *disp, EGLConfig config,
EGLSurface ret;
_EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);
if (native_window == NULL)
RETURN_EGL_ERROR(disp, EGL_BAD_NATIVE_WINDOW, EGL_NO_SURFACE);
surf = drv->API.CreateWindowSurface(drv, disp, conf, native_window,
attrib_list);
ret = (surf) ? _eglLinkSurface(surf) : EGL_NO_SURFACE;

View File

@@ -120,7 +120,8 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
"FS_COORD_PIXEL_CENTER",
"FS_COLOR0_WRITES_ALL_CBUFS",
"FS_DEPTH_LAYOUT",
"VS_PROHIBIT_UCPS"
"VS_PROHIBIT_UCPS",
"GS_INVOCATIONS",
};
const char *tgsi_type_names[5] =

View File

@@ -3,6 +3,8 @@ C_SOURCES := \
freedreno_lowering.c \
freedreno_program.c \
freedreno_query.c \
freedreno_query_hw.c \
freedreno_query_sw.c \
freedreno_fence.c \
freedreno_resource.c \
freedreno_surface.c \
@@ -38,6 +40,7 @@ a3xx_SOURCES := \
a3xx/fd3_emit.c \
a3xx/fd3_gmem.c \
a3xx/fd3_program.c \
a3xx/fd3_query.c \
a3xx/fd3_rasterizer.c \
a3xx/fd3_screen.c \
a3xx/fd3_texture.c \

View File

@@ -10,11 +10,11 @@ git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32840 bytes, from 2014-01-05 14:44:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 9009 bytes, from 2014-01-11 16:56:35)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 12362 bytes, from 2014-01-07 14:47:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 56545 bytes, from 2014-02-26 16:32:11)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 8344 bytes, from 2013-11-30 14:49:47)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32580 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10186 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14477 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 57831 bytes, from 2014-05-19 21:02:34)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 26293 bytes, from 2014-05-16 11:51:57)
Copyright (C) 2013-2014 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)

View File

@@ -125,7 +125,7 @@ emit_texture(struct fd_ringbuffer *ring, struct fd_context *ctx,
{
unsigned const_idx = fd2_get_const_idx(ctx, tex, samp_id);
static const struct fd2_sampler_stateobj dummy_sampler = {};
struct fd2_sampler_stateobj *sampler;
const struct fd2_sampler_stateobj *sampler;
struct fd2_pipe_sampler_view *view;
if (emitted & (1 << const_idx))

View File

@@ -10,11 +10,11 @@ git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32840 bytes, from 2014-01-05 14:44:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 9009 bytes, from 2014-01-11 16:56:35)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 12362 bytes, from 2014-01-07 14:47:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 56545 bytes, from 2014-02-26 16:32:11)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 8344 bytes, from 2013-11-30 14:49:47)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32580 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10186 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14477 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 57831 bytes, from 2014-05-19 21:02:34)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 26293 bytes, from 2014-05-16 11:51:57)
Copyright (C) 2013-2014 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
@@ -41,31 +41,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
enum a3xx_render_mode {
RB_RENDERING_PASS = 0,
RB_TILING_PASS = 1,
RB_RESOLVE_PASS = 2,
};
enum a3xx_tile_mode {
LINEAR = 0,
TILE_32X32 = 2,
};
enum a3xx_threadmode {
MULTI = 0,
SINGLE = 1,
};
enum a3xx_instrbuffermode {
BUFFER = 1,
};
enum a3xx_threadsize {
TWO_QUADS = 0,
FOUR_QUADS = 1,
};
enum a3xx_state_block_id {
HLSQ_BLOCK_ID_TP_TEX = 2,
HLSQ_BLOCK_ID_TP_MIPMAP = 3,
@@ -180,12 +160,6 @@ enum a3xx_color_swap {
XYZW = 3,
};
enum a3xx_msaa_samples {
MSAA_ONE = 0,
MSAA_TWO = 1,
MSAA_FOUR = 2,
};
enum a3xx_sp_perfcounter_select {
SP_FS_CFLOW_INSTRUCTIONS = 12,
SP_FS_FULL_ALU_INSTRUCTIONS = 14,
@@ -212,11 +186,6 @@ enum a3xx_rop_code {
ROP_SET = 15,
};
enum adreno_rb_copy_control_mode {
RB_COPY_RESOLVE = 1,
RB_COPY_DEPTH_STENCIL = 5,
};
enum a3xx_tex_filter {
A3XX_TEX_NEAREST = 0,
A3XX_TEX_LINEAR = 1,
@@ -337,6 +306,7 @@ enum a3xx_tex_type {
#define REG_A3XX_RBBM_INT_0_STATUS 0x00000064
#define REG_A3XX_RBBM_PERFCTR_CTL 0x00000080
#define A3XX_RBBM_PERFCTR_CTL_ENABLE 0x00000001
#define REG_A3XX_RBBM_PERFCTR_LOAD_CMD0 0x00000081
@@ -570,6 +540,10 @@ static inline uint32_t REG_A3XX_CP_PROTECT_REG(uint32_t i0) { return 0x00000460
#define REG_A3XX_CP_AHB_FAULT 0x0000054d
#define REG_A3XX_SP_GLOBAL_MEM_SIZE 0x00000e22
#define REG_A3XX_SP_GLOBAL_MEM_ADDR 0x00000e23
#define REG_A3XX_GRAS_CL_CLIP_CNTL 0x00002040
#define A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER 0x00001000
#define A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE 0x00010000
@@ -644,8 +618,26 @@ static inline uint32_t A3XX_GRAS_CL_VPORT_ZSCALE(float val)
}
#define REG_A3XX_GRAS_SU_POINT_MINMAX 0x00002068
#define A3XX_GRAS_SU_POINT_MINMAX_MIN__MASK 0x0000ffff
#define A3XX_GRAS_SU_POINT_MINMAX_MIN__SHIFT 0
static inline uint32_t A3XX_GRAS_SU_POINT_MINMAX_MIN(float val)
{
return ((((uint32_t)(val * 8.0))) << A3XX_GRAS_SU_POINT_MINMAX_MIN__SHIFT) & A3XX_GRAS_SU_POINT_MINMAX_MIN__MASK;
}
#define A3XX_GRAS_SU_POINT_MINMAX_MAX__MASK 0xffff0000
#define A3XX_GRAS_SU_POINT_MINMAX_MAX__SHIFT 16
static inline uint32_t A3XX_GRAS_SU_POINT_MINMAX_MAX(float val)
{
return ((((uint32_t)(val * 8.0))) << A3XX_GRAS_SU_POINT_MINMAX_MAX__SHIFT) & A3XX_GRAS_SU_POINT_MINMAX_MAX__MASK;
}
#define REG_A3XX_GRAS_SU_POINT_SIZE 0x00002069
#define A3XX_GRAS_SU_POINT_SIZE__MASK 0xffffffff
#define A3XX_GRAS_SU_POINT_SIZE__SHIFT 0
static inline uint32_t A3XX_GRAS_SU_POINT_SIZE(float val)
{
return ((((uint32_t)(val * 8.0))) << A3XX_GRAS_SU_POINT_SIZE__SHIFT) & A3XX_GRAS_SU_POINT_SIZE__MASK;
}
#define REG_A3XX_GRAS_SU_POLY_OFFSET_SCALE 0x0000206c
#define A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL__MASK 0x00ffffff
@@ -992,6 +984,12 @@ static inline uint32_t A3XX_RB_COPY_CONTROL_MODE(enum adreno_rb_copy_control_mod
{
return ((val) << A3XX_RB_COPY_CONTROL_MODE__SHIFT) & A3XX_RB_COPY_CONTROL_MODE__MASK;
}
#define A3XX_RB_COPY_CONTROL_FASTCLEAR__MASK 0x00000f00
#define A3XX_RB_COPY_CONTROL_FASTCLEAR__SHIFT 8
static inline uint32_t A3XX_RB_COPY_CONTROL_FASTCLEAR(uint32_t val)
{
return ((val) << A3XX_RB_COPY_CONTROL_FASTCLEAR__SHIFT) & A3XX_RB_COPY_CONTROL_FASTCLEAR__MASK;
}
#define A3XX_RB_COPY_CONTROL_GMEM_BASE__MASK 0xffffc000
#define A3XX_RB_COPY_CONTROL_GMEM_BASE__SHIFT 14
static inline uint32_t A3XX_RB_COPY_CONTROL_GMEM_BASE(uint32_t val)
@@ -1034,6 +1032,12 @@ static inline uint32_t A3XX_RB_COPY_DEST_INFO_SWAP(enum a3xx_color_swap val)
{
return ((val) << A3XX_RB_COPY_DEST_INFO_SWAP__SHIFT) & A3XX_RB_COPY_DEST_INFO_SWAP__MASK;
}
#define A3XX_RB_COPY_DEST_INFO_DITHER_MODE__MASK 0x00000c00
#define A3XX_RB_COPY_DEST_INFO_DITHER_MODE__SHIFT 10
static inline uint32_t A3XX_RB_COPY_DEST_INFO_DITHER_MODE(enum adreno_rb_dither_mode val)
{
return ((val) << A3XX_RB_COPY_DEST_INFO_DITHER_MODE__SHIFT) & A3XX_RB_COPY_DEST_INFO_DITHER_MODE__MASK;
}
#define A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE__MASK 0x0003c000
#define A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE__SHIFT 14
static inline uint32_t A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(uint32_t val)
@@ -1202,6 +1206,8 @@ static inline uint32_t A3XX_RB_WINDOW_OFFSET_Y(uint32_t val)
}
#define REG_A3XX_RB_SAMPLE_COUNT_CONTROL 0x00002110
#define A3XX_RB_SAMPLE_COUNT_CONTROL_RESET 0x00000001
#define A3XX_RB_SAMPLE_COUNT_CONTROL_COPY 0x00000002
#define REG_A3XX_RB_SAMPLE_COUNT_ADDR 0x00002111
@@ -1366,10 +1372,36 @@ static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
}
#define REG_A3XX_HLSQ_CL_NDRANGE_0_REG 0x0000220a
#define A3XX_HLSQ_CL_NDRANGE_0_REG_WORKDIM__MASK 0x00000003
#define A3XX_HLSQ_CL_NDRANGE_0_REG_WORKDIM__SHIFT 0
static inline uint32_t A3XX_HLSQ_CL_NDRANGE_0_REG_WORKDIM(uint32_t val)
{
return ((val) << A3XX_HLSQ_CL_NDRANGE_0_REG_WORKDIM__SHIFT) & A3XX_HLSQ_CL_NDRANGE_0_REG_WORKDIM__MASK;
}
#define A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE0__MASK 0x00000ffc
#define A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE0__SHIFT 2
static inline uint32_t A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE0(uint32_t val)
{
return ((val) << A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE0__SHIFT) & A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE0__MASK;
}
#define A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE1__MASK 0x003ff000
#define A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE1__SHIFT 12
static inline uint32_t A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE1(uint32_t val)
{
return ((val) << A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE1__SHIFT) & A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE1__MASK;
}
#define A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE2__MASK 0xffc00000
#define A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE2__SHIFT 22
static inline uint32_t A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE2(uint32_t val)
{
return ((val) << A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE2__SHIFT) & A3XX_HLSQ_CL_NDRANGE_0_REG_LOCALSIZE2__MASK;
}
#define REG_A3XX_HLSQ_CL_NDRANGE_1_REG 0x0000220b
static inline uint32_t REG_A3XX_HLSQ_CL_GLOBAL_WORK(uint32_t i0) { return 0x0000220b + 0x2*i0; }
#define REG_A3XX_HLSQ_CL_NDRANGE_2_REG 0x0000220c
static inline uint32_t REG_A3XX_HLSQ_CL_GLOBAL_WORK_SIZE(uint32_t i0) { return 0x0000220b + 0x2*i0; }
static inline uint32_t REG_A3XX_HLSQ_CL_GLOBAL_WORK_OFFSET(uint32_t i0) { return 0x0000220c + 0x2*i0; }
#define REG_A3XX_HLSQ_CL_CONTROL_0_REG 0x00002211
@@ -1377,7 +1409,9 @@ static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
#define REG_A3XX_HLSQ_CL_KERNEL_CONST_REG 0x00002214
#define REG_A3XX_HLSQ_CL_KERNEL_GROUP_X_REG 0x00002215
static inline uint32_t REG_A3XX_HLSQ_CL_KERNEL_GROUP(uint32_t i0) { return 0x00002215 + 0x1*i0; }
static inline uint32_t REG_A3XX_HLSQ_CL_KERNEL_GROUP_RATIO(uint32_t i0) { return 0x00002215 + 0x1*i0; }
#define REG_A3XX_HLSQ_CL_KERNEL_GROUP_Y_REG 0x00002216
@@ -1624,6 +1658,7 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
}
#define A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE 0x00200000
#define A3XX_SP_VS_CTRL_REG0_PIXLODENABLE 0x00400000
#define A3XX_SP_VS_CTRL_REG0_COMPUTEMODE 0x00800000
#define A3XX_SP_VS_CTRL_REG0_LENGTH__MASK 0xff000000
#define A3XX_SP_VS_CTRL_REG0_LENGTH__SHIFT 24
static inline uint32_t A3XX_SP_VS_CTRL_REG0_LENGTH(uint32_t val)
@@ -1797,6 +1832,7 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
}
#define A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE 0x00200000
#define A3XX_SP_FS_CTRL_REG0_PIXLODENABLE 0x00400000
#define A3XX_SP_FS_CTRL_REG0_COMPUTEMODE 0x00800000
#define A3XX_SP_FS_CTRL_REG0_LENGTH__MASK 0xff000000
#define A3XX_SP_FS_CTRL_REG0_LENGTH__SHIFT 24
static inline uint32_t A3XX_SP_FS_CTRL_REG0_LENGTH(uint32_t val)
@@ -1976,6 +2012,42 @@ static inline uint32_t A3XX_TPL1_TP_FS_TEX_OFFSET_BASETABLEPTR(uint32_t val)
#define REG_A3XX_VBIF_OUT_AXI_AOOO 0x0000305f
#define REG_A3XX_VBIF_PERF_CNT_EN 0x00003070
#define A3XX_VBIF_PERF_CNT_EN_CNT0 0x00000001
#define A3XX_VBIF_PERF_CNT_EN_CNT1 0x00000002
#define A3XX_VBIF_PERF_CNT_EN_PWRCNT0 0x00000004
#define A3XX_VBIF_PERF_CNT_EN_PWRCNT1 0x00000008
#define A3XX_VBIF_PERF_CNT_EN_PWRCNT2 0x00000010
#define REG_A3XX_VBIF_PERF_CNT_CLR 0x00003071
#define A3XX_VBIF_PERF_CNT_CLR_CNT0 0x00000001
#define A3XX_VBIF_PERF_CNT_CLR_CNT1 0x00000002
#define A3XX_VBIF_PERF_CNT_CLR_PWRCNT0 0x00000004
#define A3XX_VBIF_PERF_CNT_CLR_PWRCNT1 0x00000008
#define A3XX_VBIF_PERF_CNT_CLR_PWRCNT2 0x00000010
#define REG_A3XX_VBIF_PERF_CNT_SEL 0x00003072
#define REG_A3XX_VBIF_PERF_CNT0_LO 0x00003073
#define REG_A3XX_VBIF_PERF_CNT0_HI 0x00003074
#define REG_A3XX_VBIF_PERF_CNT1_LO 0x00003075
#define REG_A3XX_VBIF_PERF_CNT1_HI 0x00003076
#define REG_A3XX_VBIF_PERF_PWR_CNT0_LO 0x00003077
#define REG_A3XX_VBIF_PERF_PWR_CNT0_HI 0x00003078
#define REG_A3XX_VBIF_PERF_PWR_CNT1_LO 0x00003079
#define REG_A3XX_VBIF_PERF_PWR_CNT1_HI 0x0000307a
#define REG_A3XX_VBIF_PERF_PWR_CNT2_LO 0x0000307b
#define REG_A3XX_VBIF_PERF_PWR_CNT2_HI 0x0000307c
#define REG_A3XX_VSC_BIN_SIZE 0x00000c01
#define A3XX_VSC_BIN_SIZE_WIDTH__MASK 0x0000001f
#define A3XX_VSC_BIN_SIZE_WIDTH__SHIFT 0
@@ -2249,6 +2321,12 @@ static inline uint32_t A3XX_TEX_SAMP_0_WRAP_R(enum a3xx_tex_clamp val)
{
return ((val) << A3XX_TEX_SAMP_0_WRAP_R__SHIFT) & A3XX_TEX_SAMP_0_WRAP_R__MASK;
}
#define A3XX_TEX_SAMP_0_COMPARE_FUNC__MASK 0x00700000
#define A3XX_TEX_SAMP_0_COMPARE_FUNC__SHIFT 20
static inline uint32_t A3XX_TEX_SAMP_0_COMPARE_FUNC(enum adreno_compare_func val)
{
return ((val) << A3XX_TEX_SAMP_0_COMPARE_FUNC__SHIFT) & A3XX_TEX_SAMP_0_COMPARE_FUNC__MASK;
}
#define A3XX_TEX_SAMP_0_UNNORM_COORDS 0x80000000
#define REG_A3XX_TEX_SAMP_1 0x00000001
@@ -2267,6 +2345,7 @@ static inline uint32_t A3XX_TEX_SAMP_1_MIN_LOD(float val)
#define REG_A3XX_TEX_CONST_0 0x00000000
#define A3XX_TEX_CONST_0_TILED 0x00000001
#define A3XX_TEX_CONST_0_SRGB 0x00000004
#define A3XX_TEX_CONST_0_SWIZ_X__MASK 0x00000070
#define A3XX_TEX_CONST_0_SWIZ_X__SHIFT 4
static inline uint32_t A3XX_TEX_CONST_0_SWIZ_X(enum a3xx_tex_swiz val)
@@ -2303,6 +2382,7 @@ static inline uint32_t A3XX_TEX_CONST_0_FMT(enum a3xx_tex_fmt val)
{
return ((val) << A3XX_TEX_CONST_0_FMT__SHIFT) & A3XX_TEX_CONST_0_FMT__MASK;
}
#define A3XX_TEX_CONST_0_NOCONVERT 0x20000000
#define A3XX_TEX_CONST_0_TYPE__MASK 0xc0000000
#define A3XX_TEX_CONST_0_TYPE__SHIFT 30
static inline uint32_t A3XX_TEX_CONST_0_TYPE(enum a3xx_tex_type val)


@@ -1074,77 +1074,154 @@ trans_arl(const struct instr_translater *t,
add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}
/* texture fetch/sample instructions: */
static void
trans_samp(const struct instr_translater *t,
struct fd3_compile_context *ctx,
/*
* texture fetch/sample instructions:
*/
struct tex_info {
int8_t order[4];
unsigned src_wrmask, flags;
};
static const struct tex_info *
get_tex_info(struct fd3_compile_context *ctx,
struct tgsi_full_instruction *inst)
{
struct ir3_instruction *instr;
struct tgsi_src_register *coord = &inst->Src[0].Register;
struct tgsi_src_register *samp = &inst->Src[1].Register;
unsigned tex = inst->Texture.Texture;
int8_t *order;
unsigned i, flags = 0, src_wrmask;
bool needs_mov = false;
static const struct tex_info tex1d = {
.order = { 0, -1, -1, -1 }, /* coord.x */
.src_wrmask = TGSI_WRITEMASK_XY,
.flags = 0,
};
static const struct tex_info tex1ds = {
.order = { 0, -1, 2, -1 }, /* coord.xz */
.src_wrmask = TGSI_WRITEMASK_XYZ,
.flags = IR3_INSTR_S,
};
static const struct tex_info tex2d = {
.order = { 0, 1, -1, -1 }, /* coord.xy */
.src_wrmask = TGSI_WRITEMASK_XY,
.flags = 0,
};
static const struct tex_info tex2ds = {
.order = { 0, 1, 2, -1 }, /* coord.xyz */
.src_wrmask = TGSI_WRITEMASK_XYZ,
.flags = IR3_INSTR_S,
};
static const struct tex_info tex3d = {
.order = { 0, 1, 2, -1 }, /* coord.xyz */
.src_wrmask = TGSI_WRITEMASK_XYZ,
.flags = IR3_INSTR_3D,
};
static const struct tex_info tex3ds = {
.order = { 0, 1, 2, 3 }, /* coord.xyzw */
.src_wrmask = TGSI_WRITEMASK_XYZW,
.flags = IR3_INSTR_S | IR3_INSTR_3D,
};
static const struct tex_info txp1d = {
.order = { 0, -1, 3, -1 }, /* coord.xw */
.src_wrmask = TGSI_WRITEMASK_XYZ,
.flags = IR3_INSTR_P,
};
static const struct tex_info txp1ds = {
.order = { 0, -1, 2, 3 }, /* coord.xzw */
.src_wrmask = TGSI_WRITEMASK_XYZW,
.flags = IR3_INSTR_P | IR3_INSTR_S,
};
static const struct tex_info txp2d = {
.order = { 0, 1, 3, -1 }, /* coord.xyw */
.src_wrmask = TGSI_WRITEMASK_XYZ,
.flags = IR3_INSTR_P,
};
static const struct tex_info txp2ds = {
.order = { 0, 1, 2, 3 }, /* coord.xyzw */
.src_wrmask = TGSI_WRITEMASK_XYZW,
.flags = IR3_INSTR_P | IR3_INSTR_S,
};
static const struct tex_info txp3d = {
.order = { 0, 1, 2, 3 }, /* coord.xyzw */
.src_wrmask = TGSI_WRITEMASK_XYZW,
.flags = IR3_INSTR_P | IR3_INSTR_3D,
};
switch (t->arg) {
unsigned tex = inst->Texture.Texture;
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_TEX:
switch (tex) {
case TGSI_TEXTURE_1D:
return &tex1d;
case TGSI_TEXTURE_SHADOW1D:
return &tex1ds;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
order = (int8_t[4]){ 0, 1, -1, -1 };
src_wrmask = TGSI_WRITEMASK_XY;
break;
return &tex2d;
case TGSI_TEXTURE_SHADOW2D:
case TGSI_TEXTURE_SHADOWRECT:
return &tex2ds;
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
order = (int8_t[4]){ 0, 1, 2, -1 };
src_wrmask = TGSI_WRITEMASK_XYZ;
flags |= IR3_INSTR_3D;
break;
return &tex3d;
case TGSI_TEXTURE_SHADOWCUBE:
return &tex3ds;
default:
compile_error(ctx, "unknown texture type: %s\n",
tgsi_texture_names[tex]);
break;
return NULL;
}
break;
case TGSI_OPCODE_TXP:
switch (tex) {
case TGSI_TEXTURE_1D:
return &txp1d;
case TGSI_TEXTURE_SHADOW1D:
return &txp1ds;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
order = (int8_t[4]){ 0, 1, 3, -1 };
src_wrmask = TGSI_WRITEMASK_XYZ;
break;
return &txp2d;
case TGSI_TEXTURE_SHADOW2D:
case TGSI_TEXTURE_SHADOWRECT:
return &txp2ds;
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
order = (int8_t[4]){ 0, 1, 2, 3 };
src_wrmask = TGSI_WRITEMASK_XYZW;
flags |= IR3_INSTR_3D;
break;
return &txp3d;
default:
compile_error(ctx, "unknown texture type: %s\n",
tgsi_texture_names[tex]);
break;
}
flags |= IR3_INSTR_P;
break;
default:
compile_assert(ctx, 0);
break;
}
compile_assert(ctx, 0);
return NULL;
}
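/* Note on the lookup tables above (illustrative summary, not from the
 * patch): order[j] names which component of the TGSI coord register
 * supplies the j'th source component of the sam instruction (-1 means
 * unused), src_wrmask is the resulting source writemask, and flags adds
 * IR3_INSTR_{P,S,3D} as needed.  E.g. txp2d ({ 0, 1, 3, -1 }) feeds
 * coord.x, coord.y and the projective divisor coord.w as src.x/.y/.z.
 * get_tex_coord() below only emits shuffling movs when the coord's
 * swizzle is not already laid out in that order.
 */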
static struct tgsi_src_register *
get_tex_coord(struct fd3_compile_context *ctx,
struct tgsi_full_instruction *inst,
const struct tex_info *tinf)
{
struct tgsi_src_register *coord = &inst->Src[0].Register;
struct ir3_instruction *instr;
unsigned tex = inst->Texture.Texture;
bool needs_mov = false;
unsigned i;
/* cat5 instruction cannot seem to handle const or relative: */
if (is_rel_or_const(coord))
needs_mov = true;
/* 1D textures we fix up w/ 0.5 as 2nd coord: */
if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D))
needs_mov = true;
/* The texture sample instructions need the coord in successive
* registers/components (ie. src.xy but not src.yx). And TXP
* needs the .w component in .z for 2D.. so in some cases we
* might need to emit some mov instructions to shuffle things
* around:
*/
for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++)
if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i]))
for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++)
if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i]))
needs_mov = true;
if (needs_mov) {
@@ -1157,28 +1234,55 @@ trans_samp(const struct instr_translater *t,
/* need to move things around: */
tmp_src = get_internal_temp(ctx, &tmp_dst);
for (j = 0; (j < 4) && (order[j] >= 0); j++) {
instr = instr_create(ctx, 1, 0);
for (j = 0; j < 4; j++) {
if (tinf->order[j] < 0)
continue;
instr = instr_create(ctx, 1, 0); /* mov */
instr->cat1.src_type = type_mov;
instr->cat1.dst_type = type_mov;
add_dst_reg(ctx, instr, &tmp_dst, j);
add_src_reg(ctx, instr, coord,
src_swiz(coord, order[j]));
src_swiz(coord, tinf->order[j]));
}
/* fix up .y coord: */
if ((tex == TGSI_TEXTURE_1D) ||
(tex == TGSI_TEXTURE_SHADOW1D)) {
instr = instr_create(ctx, 1, 0); /* mov */
instr->cat1.src_type = type_mov;
instr->cat1.dst_type = type_mov;
add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */
ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5;
}
coord = tmp_src;
}
return coord;
}
static void
trans_samp(const struct instr_translater *t,
struct fd3_compile_context *ctx,
struct tgsi_full_instruction *inst)
{
struct ir3_instruction *instr;
struct tgsi_dst_register *dst = &inst->Dst[0].Register;
struct tgsi_src_register *coord;
struct tgsi_src_register *samp = &inst->Src[1].Register;
const struct tex_info *tinf;
tinf = get_tex_info(ctx, inst);
coord = get_tex_coord(ctx, inst, tinf);
instr = instr_create(ctx, 5, t->opc);
instr->cat5.type = get_ftype(ctx);
instr->cat5.samp = samp->Index;
instr->cat5.tex = samp->Index;
instr->flags |= flags;
instr->flags |= tinf->flags;
add_dst_reg_wrmask(ctx, instr, &inst->Dst[0].Register, 0,
inst->Dst[0].Register.WriteMask);
add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, src_wrmask);
add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask);
}
/*
@@ -1231,15 +1335,19 @@ trans_cmp(const struct instr_translater *t,
switch (t->tgsi_opc) {
case TGSI_OPCODE_SEQ:
case TGSI_OPCODE_FSEQ:
condition = IR3_COND_EQ;
break;
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_FSNE:
condition = IR3_COND_NE;
break;
case TGSI_OPCODE_SGE:
case TGSI_OPCODE_FSGE:
condition = IR3_COND_GE;
break;
case TGSI_OPCODE_SLT:
case TGSI_OPCODE_FSLT:
condition = IR3_COND_LT;
break;
case TGSI_OPCODE_SLE:
@@ -1269,11 +1377,15 @@ trans_cmp(const struct instr_translater *t,
switch (t->tgsi_opc) {
case TGSI_OPCODE_SEQ:
case TGSI_OPCODE_FSEQ:
case TGSI_OPCODE_SGE:
case TGSI_OPCODE_FSGE:
case TGSI_OPCODE_SLE:
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_FSNE:
case TGSI_OPCODE_SGT:
case TGSI_OPCODE_SLT:
case TGSI_OPCODE_FSLT:
/* cov.u16f16 dst, tmp0 */
instr = instr_create(ctx, 1, 0);
instr->cat1.src_type = get_utype(ctx);
@@ -1293,6 +1405,96 @@ trans_cmp(const struct instr_translater *t,
put_dst(ctx, inst, dst);
}
/*
* USNE(a,b) = (a != b) ? 1 : 0
* cmps.u32.ne dst, a, b
*
* USEQ(a,b) = (a == b) ? 1 : 0
* cmps.u32.eq dst, a, b
*
* ISGE(a,b) = (a >= b) ? 1 : 0
* cmps.s32.ge dst, a, b
*
* USGE(a,b) = (a >= b) ? 1 : 0
* cmps.u32.ge dst, a, b
*
* ISLT(a,b) = (a < b) ? 1 : 0
* cmps.s32.lt dst, a, b
*
* USLT(a,b) = (a < b) ? 1 : 0
* cmps.u32.lt dst, a, b
*
* UCMP(a,b,c) = (a < 0) ? b : c
* cmps.u32.lt tmp0, a, {0}
* sel.b16 dst, b, tmp0, c
*/
static void
trans_icmp(const struct instr_translater *t,
struct fd3_compile_context *ctx,
struct tgsi_full_instruction *inst)
{
struct ir3_instruction *instr;
struct tgsi_dst_register *dst = get_dst(ctx, inst);
struct tgsi_src_register constval0;
struct tgsi_src_register *a0, *a1, *a2;
unsigned condition;
a0 = &inst->Src[0].Register; /* a */
a1 = &inst->Src[1].Register; /* b */
switch (t->tgsi_opc) {
case TGSI_OPCODE_USNE:
condition = IR3_COND_NE;
break;
case TGSI_OPCODE_USEQ:
condition = IR3_COND_EQ;
break;
case TGSI_OPCODE_ISGE:
case TGSI_OPCODE_USGE:
condition = IR3_COND_GE;
break;
case TGSI_OPCODE_ISLT:
case TGSI_OPCODE_USLT:
condition = IR3_COND_LT;
break;
case TGSI_OPCODE_UCMP:
get_immediate(ctx, &constval0, 0);
a0 = &inst->Src[0].Register; /* a */
a1 = &constval0; /* {0} */
condition = IR3_COND_LT;
break;
default:
compile_assert(ctx, 0);
return;
}
if (is_const(a0) && is_const(a1))
a0 = get_unconst(ctx, a0);
if (t->tgsi_opc == TGSI_OPCODE_UCMP) {
struct tgsi_dst_register tmp_dst;
struct tgsi_src_register *tmp_src;
tmp_src = get_internal_temp(ctx, &tmp_dst);
/* cmps.u32.lt tmp, a0, a1 */
instr = instr_create(ctx, 2, t->opc);
instr->cat2.condition = condition;
vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
a1 = &inst->Src[1].Register;
a2 = &inst->Src[2].Register;
/* sel.{b32,b16} dst, src2, tmp, src1 */
instr = instr_create(ctx, 3, OPC_SEL_B32);
vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
} else {
/* cmps.{u32,s32}.<cond> dst, a0, a1 */
instr = instr_create(ctx, 2, t->opc);
instr->cat2.condition = condition;
vectorize(ctx, instr, dst, 2, a0, 0, a1, 0);
}
put_dst(ctx, inst, dst);
}
/*
* Conditional / Flow control
*/
@@ -1533,7 +1735,7 @@ trans_endif(const struct instr_translater *t,
}
/*
* Kill / Kill-if
* Kill
*/
static void
@@ -1579,6 +1781,76 @@ trans_kill(const struct instr_translater *t,
ctx->kill[ctx->kill_count++] = instr;
}
/*
* Kill-If
*/
static void
trans_killif(const struct instr_translater *t,
struct fd3_compile_context *ctx,
struct tgsi_full_instruction *inst)
{
struct tgsi_src_register *src = &inst->Src[0].Register;
struct ir3_instruction *instr, *immed, *cond = NULL;
bool inv = false;
immed = create_immed(ctx, 0.0);
/* cmps.f.ne p0.x, cond, {0.0} */
instr = instr_create(ctx, 2, OPC_CMPS_F);
instr->cat2.condition = IR3_COND_NE;
ir3_reg_create(instr, regid(REG_P0, 0), 0);
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
add_src_reg(ctx, instr, src, src->SwizzleX);
cond = instr;
/* kill p0.x */
instr = instr_create(ctx, 0, OPC_KILL);
instr->cat0.inv = inv;
ir3_reg_create(instr, 0, 0); /* dummy dst */
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
ctx->kill[ctx->kill_count++] = instr;
}
/*
* I2F / U2F / F2I / F2U
*/
static void
trans_cov(const struct instr_translater *t,
struct fd3_compile_context *ctx,
struct tgsi_full_instruction *inst)
{
struct ir3_instruction *instr;
struct tgsi_dst_register *dst = get_dst(ctx, inst);
struct tgsi_src_register *src = &inst->Src[0].Register;
/* cov.f32s32 dst, tmp0 */
instr = instr_create(ctx, 1, 0);
switch (t->tgsi_opc) {
case TGSI_OPCODE_U2F:
instr->cat1.src_type = TYPE_U32;
instr->cat1.dst_type = TYPE_F32;
break;
case TGSI_OPCODE_I2F:
instr->cat1.src_type = TYPE_S32;
instr->cat1.dst_type = TYPE_F32;
break;
case TGSI_OPCODE_F2U:
instr->cat1.src_type = TYPE_F32;
instr->cat1.dst_type = TYPE_U32;
break;
case TGSI_OPCODE_F2I:
instr->cat1.src_type = TYPE_F32;
instr->cat1.dst_type = TYPE_S32;
break;
}
vectorize(ctx, instr, dst, 1, src, 0);
}
/*
* Handlers for TGSI instructions which do have 1:1 mapping to native
* instructions:
@@ -1616,9 +1888,11 @@ instr_cat2(const struct instr_translater *t,
switch (t->tgsi_opc) {
case TGSI_OPCODE_ABS:
case TGSI_OPCODE_IABS:
src0_flags = IR3_REG_ABS;
break;
case TGSI_OPCODE_SUB:
case TGSI_OPCODE_INEG:
src1_flags = IR3_REG_NEGATE;
break;
}
@@ -1724,6 +1998,22 @@ static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
INSTR(AND, instr_cat2, .opc = OPC_AND_B),
INSTR(OR, instr_cat2, .opc = OPC_OR_B),
INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
INSTR(UMUL, instr_cat2, .opc = OPC_MUL_U),
INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
INSTR(AND, instr_cat2, .opc = OPC_AND_B),
INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
INSTR(CLAMP, trans_clamp),
@@ -1741,16 +2031,33 @@ static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
INSTR(SGT, trans_cmp),
INSTR(SLT, trans_cmp),
INSTR(FSLT, trans_cmp),
INSTR(SGE, trans_cmp),
INSTR(FSGE, trans_cmp),
INSTR(SLE, trans_cmp),
INSTR(SNE, trans_cmp),
INSTR(FSNE, trans_cmp),
INSTR(SEQ, trans_cmp),
INSTR(FSEQ, trans_cmp),
INSTR(CMP, trans_cmp),
INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
INSTR(UCMP, trans_icmp, .opc = OPC_CMPS_U),
INSTR(IF, trans_if),
INSTR(UIF, trans_if),
INSTR(ELSE, trans_else),
INSTR(ENDIF, trans_endif),
INSTR(END, instr_cat0, .opc = OPC_END),
INSTR(KILL, trans_kill, .opc = OPC_KILL),
INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
INSTR(I2F, trans_cov),
INSTR(U2F, trans_cov),
INSTR(F2I, trans_cov),
INSTR(F2U, trans_cov),
};
static fd3_semantic
@@ -1935,6 +2242,8 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
DBG("decl in -> r%d", i);
compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
so->inputs[n].semantic = decl_semantic(&decl->Semantic);
so->inputs[n].compmask = (1 << ncomp) - 1;
so->inputs[n].regid = r;
@@ -2024,6 +2333,8 @@ decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
ncomp = 4;
compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
so->outputs[n].semantic = decl_semantic(&decl->Semantic);
so->outputs[n].regid = regid(i, comp);
@@ -2147,6 +2458,7 @@ compile_instructions(struct fd3_compile_context *ctx)
struct tgsi_full_immediate *imm =
&ctx->parser.FullToken.FullImmediate;
unsigned n = ctx->so->immediates_count++;
compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
memcpy(ctx->so->immediates[n].val, imm->u, 16);
break;
}


@@ -1324,6 +1324,8 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
DBG("decl in -> r%d", i + base); // XXX
compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
so->inputs[n].semantic = decl_semantic(&decl->Semantic);
so->inputs[n].compmask = (1 << ncomp) - 1;
so->inputs[n].ncomp = ncomp;
@@ -1410,6 +1412,7 @@ decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
for (i = decl->Range.First; i <= decl->Range.Last; i++) {
unsigned n = so->outputs_count++;
compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
so->outputs[n].semantic = decl_semantic(&decl->Semantic);
so->outputs[n].regid = regid(i + base, comp);
}


@@ -33,6 +33,7 @@
#include "fd3_emit.h"
#include "fd3_gmem.h"
#include "fd3_program.h"
#include "fd3_query.h"
#include "fd3_rasterizer.h"
#include "fd3_texture.h"
#include "fd3_zsa.h"
@@ -134,5 +135,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv)
fd3_ctx->solid_vbuf = create_solid_vertexbuf(pctx);
fd3_ctx->blit_texcoord_vbuf = create_blit_texcoord_vertexbuf(pctx);
fd3_query_context_init(pctx);
return pctx;
}


@@ -406,7 +406,7 @@ fd3_program_emit(struct fd_ringbuffer *ring,
A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4));
for (i = 0, j = -1; j < (int)fp->inputs_count; i++) {
for (i = 0, j = -1; (i < 8) && (j < (int)fp->inputs_count); i++) {
uint32_t reg = 0;
OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1);
@@ -428,7 +428,7 @@ fd3_program_emit(struct fd_ringbuffer *ring,
OUT_RING(ring, reg);
}
for (i = 0, j = -1; j < (int)fp->inputs_count; i++) {
for (i = 0, j = -1; (i < 4) && (j < (int)fp->inputs_count); i++) {
uint32_t reg = 0;
OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1);


@@ -91,7 +91,7 @@ struct fd3_shader_variant {
struct {
fd3_semantic semantic;
uint8_t regid;
} outputs[16];
} outputs[16 + 2]; /* +POSITION +PSIZE */
bool writes_pos, writes_psize;
/* vertices/inputs: */
@@ -104,7 +104,7 @@ struct fd3_shader_variant {
/* in theory inloc of fs should match outloc of vs: */
uint8_t inloc;
uint8_t bary;
} inputs[16];
} inputs[16 + 2]; /* +POSITION +FACE */
unsigned total_in; /* sum of inputs (scalar) */


@@ -0,0 +1,139 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#include "freedreno_query_hw.h"
#include "freedreno_context.h"
#include "freedreno_util.h"
#include "fd3_query.h"
#include "fd3_util.h"
struct fd_rb_samp_ctrs {
uint64_t ctr[16];
};
/*
* Occlusion Query:
*
* OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
* interpret results
*/
static struct fd_hw_sample *
occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
struct fd_hw_sample *samp =
fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs));
/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
* HW_QUERY_BASE_REG register:
*/
OUT_PKT3(ring, CP_SET_CONSTANT, 3);
OUT_RING(ring, CP_REG(REG_A3XX_RB_SAMPLE_COUNT_ADDR) | 0x80000000);
OUT_RING(ring, HW_QUERY_BASE_REG);
OUT_RING(ring, samp->offset);
OUT_PKT0(ring, REG_A3XX_RB_SAMPLE_COUNT_CONTROL, 1);
OUT_RING(ring, A3XX_RB_SAMPLE_COUNT_CONTROL_COPY);
OUT_PKT3(ring, CP_DRAW_INDX, 3);
OUT_RING(ring, 0x00000000);
OUT_RING(ring, DRAW(DI_PT_POINTLIST_A2XX, DI_SRC_SEL_AUTO_INDEX,
INDEX_SIZE_IGN, USE_VISIBILITY));
OUT_RING(ring, 0); /* NumIndices */
OUT_PKT3(ring, CP_EVENT_WRITE, 1);
OUT_RING(ring, ZPASS_DONE);
OUT_PKT0(ring, REG_A3XX_RBBM_PERFCTR_CTL, 1);
OUT_RING(ring, A3XX_RBBM_PERFCTR_CTL_ENABLE);
OUT_PKT0(ring, REG_A3XX_VBIF_PERF_CNT_EN, 1);
OUT_RING(ring, A3XX_VBIF_PERF_CNT_EN_CNT0 |
A3XX_VBIF_PERF_CNT_EN_CNT1 |
A3XX_VBIF_PERF_CNT_EN_PWRCNT0 |
A3XX_VBIF_PERF_CNT_EN_PWRCNT1 |
A3XX_VBIF_PERF_CNT_EN_PWRCNT2);
return samp;
}
static uint64_t
count_samples(const struct fd_rb_samp_ctrs *start,
const struct fd_rb_samp_ctrs *end)
{
uint64_t n = 0;
unsigned i;
/* not quite sure what all of these are, possibly different
* counters for each MRT render target:
*/
for (i = 0; i < 16; i += 4)
n += end->ctr[i] - start->ctr[i];
return n;
}
static void
occlusion_counter_accumulate_result(struct fd_context *ctx,
const void *start, const void *end,
union pipe_query_result *result)
{
uint64_t n = count_samples(start, end);
result->u64 += n;
}
static void
occlusion_predicate_accumulate_result(struct fd_context *ctx,
const void *start, const void *end,
union pipe_query_result *result)
{
uint64_t n = count_samples(start, end);
result->b |= (n > 0);
}
static const struct fd_hw_sample_provider occlusion_counter = {
.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
.active = FD_STAGE_DRAW, /* | FD_STAGE_CLEAR ??? */
.get_sample = occlusion_get_sample,
.accumulate_result = occlusion_counter_accumulate_result,
};
static const struct fd_hw_sample_provider occlusion_predicate = {
.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
.active = FD_STAGE_DRAW, /* | FD_STAGE_CLEAR ??? */
.get_sample = occlusion_get_sample,
.accumulate_result = occlusion_predicate_accumulate_result,
};
void fd3_query_context_init(struct pipe_context *pctx)
{
fd_hw_query_register_provider(pctx, &occlusion_counter);
fd_hw_query_register_provider(pctx, &occlusion_predicate);
}
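A minimal sketch (assumed names, not part of this series) of how a further counter could plug into the fd_hw_sample_provider interface registered above; the occlusion callbacks are reused purely for illustration:

/* hypothetical: any new query_type would also need a slot in pidx() in
 * freedreno_query_hw.c so it maps to a sample-provider index.
 */
static const struct fd_hw_sample_provider hypothetical_provider = {
	.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,   /* placeholder */
	.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
	.get_sample = occlusion_get_sample,
	.accumulate_result = occlusion_predicate_accumulate_result,
};

static void example_register_provider(struct pipe_context *pctx)
{
	fd_hw_query_register_provider(pctx, &hypothetical_provider);
}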


@@ -0,0 +1,36 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#ifndef FD3_QUERY_H_
#define FD3_QUERY_H_
#include "pipe/p_context.h"
void fd3_query_context_init(struct pipe_context *pctx);
#endif /* FD3_QUERY_H_ */


@@ -40,6 +40,7 @@ fd3_rasterizer_state_create(struct pipe_context *pctx,
const struct pipe_rasterizer_state *cso)
{
struct fd3_rasterizer_stateobj *so;
float psize_min, psize_max;
so = CALLOC_STRUCT(fd3_rasterizer_stateobj);
if (!so)
@@ -47,19 +48,28 @@ fd3_rasterizer_state_create(struct pipe_context *pctx,
so->base = *cso;
if (cso->point_size_per_vertex) {
psize_min = util_get_min_point_size(cso);
psize_max = 8192;
} else {
/* Force the point size to be as if the vertex output was disabled. */
psize_min = cso->point_size;
psize_max = cso->point_size;
}
/*
if (cso->line_stipple_enable) {
??? TODO line stipple
}
TODO cso->half_pixel_center
TODO cso->point_size
TODO psize_min/psize_max
if (cso->multisample)
TODO
*/
so->gras_cl_clip_cntl = A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER; /* ??? */
so->gras_su_point_minmax = 0xffc00010; /* ??? */
so->gras_su_point_size = 0x00000008; /* ??? */
so->gras_su_point_minmax =
A3XX_GRAS_SU_POINT_MINMAX_MIN(psize_min/2) |
A3XX_GRAS_SU_POINT_MINMAX_MAX(psize_max/2);
so->gras_su_point_size = A3XX_GRAS_SU_POINT_SIZE(cso->point_size/2);
so->gras_su_poly_offset_scale =
A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale);
so->gras_su_poly_offset_offset =


@@ -30,6 +30,7 @@
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_format.h"
#include "fd3_texture.h"
#include "fd3_util.h"
@@ -99,6 +100,9 @@ fd3_sampler_state_create(struct pipe_context *pctx,
A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t)) |
A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r));
if (cso->compare_mode)
so->texsamp0 |= A3XX_TEX_SAMP_0_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */
if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
so->texsamp1 =
A3XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) |
@@ -158,6 +162,10 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
A3XX_TEX_CONST_0_MIPLVLS(miplevels) |
fd3_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g,
cso->swizzle_b, cso->swizzle_a);
if (util_format_is_srgb(cso->format))
so->texconst0 |= A3XX_TEX_CONST_0_SRGB;
so->texconst1 =
A3XX_TEX_CONST_1_FETCHSIZE(fd3_pipe2fetchsize(cso->format)) |
A3XX_TEX_CONST_1_WIDTH(prsc->width0) |


@@ -235,6 +235,10 @@ fd3_pipe2tex(enum pipe_format format)
case PIPE_FORMAT_B8G8R8X8_UNORM:
case PIPE_FORMAT_R8G8B8A8_UNORM:
case PIPE_FORMAT_R8G8B8X8_UNORM:
case PIPE_FORMAT_B8G8R8A8_SRGB:
case PIPE_FORMAT_B8G8R8X8_SRGB:
case PIPE_FORMAT_R8G8B8A8_SRGB:
case PIPE_FORMAT_R8G8B8X8_SRGB:
return TFMT_NORM_UINT_8_8_8_8;
case PIPE_FORMAT_Z24X8_UNORM:
@@ -275,6 +279,12 @@ fd3_pipe2fetchsize(enum pipe_format format)
case PIPE_FORMAT_B8G8R8A8_UNORM:
case PIPE_FORMAT_B8G8R8X8_UNORM:
case PIPE_FORMAT_R8G8B8A8_UNORM:
case PIPE_FORMAT_R8G8B8X8_UNORM:
case PIPE_FORMAT_B8G8R8A8_SRGB:
case PIPE_FORMAT_B8G8R8X8_SRGB:
case PIPE_FORMAT_R8G8B8A8_SRGB:
case PIPE_FORMAT_R8G8B8X8_SRGB:
case PIPE_FORMAT_Z24X8_UNORM:
case PIPE_FORMAT_Z24_UNORM_S8_UINT:
return TFETCH_4_BYTE;
@@ -379,14 +389,14 @@ fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g,
{
const struct util_format_description *desc =
util_format_description(format);
uint8_t swiz[] = {
unsigned char swiz[4] = {
swizzle_r, swizzle_g, swizzle_b, swizzle_a,
PIPE_SWIZZLE_ZERO, PIPE_SWIZZLE_ONE,
PIPE_SWIZZLE_ONE, PIPE_SWIZZLE_ONE,
};
}, rswiz[4];
return A3XX_TEX_CONST_0_SWIZ_X(tex_swiz(swiz[desc->swizzle[0]])) |
A3XX_TEX_CONST_0_SWIZ_Y(tex_swiz(swiz[desc->swizzle[1]])) |
A3XX_TEX_CONST_0_SWIZ_Z(tex_swiz(swiz[desc->swizzle[2]])) |
A3XX_TEX_CONST_0_SWIZ_W(tex_swiz(swiz[desc->swizzle[3]]));
util_format_compose_swizzles(desc->swizzle, swiz, rswiz);
return A3XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) |
A3XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) |
A3XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) |
A3XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3]));
}


@@ -10,11 +10,11 @@ git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32840 bytes, from 2014-01-05 14:44:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 9009 bytes, from 2014-01-11 16:56:35)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 12362 bytes, from 2014-01-07 14:47:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 56545 bytes, from 2014-02-26 16:32:11)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 8344 bytes, from 2013-11-30 14:49:47)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32580 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10186 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14477 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 57831 bytes, from 2014-05-19 21:02:34)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 26293 bytes, from 2014-05-16 11:51:57)
Copyright (C) 2013-2014 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
@@ -116,6 +116,39 @@ enum adreno_rb_depth_format {
DEPTHX_24_8 = 1,
};
enum adreno_rb_copy_control_mode {
RB_COPY_RESOLVE = 1,
RB_COPY_CLEAR = 2,
RB_COPY_DEPTH_STENCIL = 5,
};
enum a3xx_render_mode {
RB_RENDERING_PASS = 0,
RB_TILING_PASS = 1,
RB_RESOLVE_PASS = 2,
RB_COMPUTE_PASS = 3,
};
enum a3xx_msaa_samples {
MSAA_ONE = 0,
MSAA_TWO = 1,
MSAA_FOUR = 2,
};
enum a3xx_threadmode {
MULTI = 0,
SINGLE = 1,
};
enum a3xx_instrbuffermode {
BUFFER = 1,
};
enum a3xx_threadsize {
TWO_QUADS = 0,
FOUR_QUADS = 1,
};
#define REG_AXXX_CP_RB_BASE 0x000001c0
#define REG_AXXX_CP_RB_CNTL 0x000001c1


@@ -10,11 +10,11 @@ git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2013-11-30 14:47:15)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2013-03-31 16:51:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32840 bytes, from 2014-01-05 14:44:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 9009 bytes, from 2014-01-11 16:56:35)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 12362 bytes, from 2014-01-07 14:47:36)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 56545 bytes, from 2014-02-26 16:32:11)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 8344 bytes, from 2013-11-30 14:49:47)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32580 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10186 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14477 bytes, from 2014-05-16 11:51:57)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 57831 bytes, from 2014-05-19 21:02:34)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 26293 bytes, from 2014-05-16 11:51:57)
Copyright (C) 2013-2014 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
@@ -164,6 +164,11 @@ enum adreno_pm4_type3_packets {
CP_SET_BIN = 76,
CP_TEST_TWO_MEMS = 113,
CP_WAIT_FOR_ME = 19,
CP_SET_DRAW_STATE = 67,
CP_DRAW_INDX_OFFSET = 56,
CP_DRAW_INDIRECT = 40,
CP_DRAW_INDX_INDIRECT = 41,
CP_DRAW_AUTO = 36,
IN_IB_PREFETCH_END = 23,
IN_SUBBLK_PREFETCH = 31,
IN_INSTR_PREFETCH = 32,
@@ -351,6 +356,93 @@ static inline uint32_t CP_DRAW_INDX_2_2_NUM_INDICES(uint32_t val)
return ((val) << CP_DRAW_INDX_2_2_NUM_INDICES__SHIFT) & CP_DRAW_INDX_2_2_NUM_INDICES__MASK;
}
#define REG_CP_DRAW_INDX_OFFSET_0 0x00000000
#define CP_DRAW_INDX_OFFSET_0_PRIM_TYPE__MASK 0x0000003f
#define CP_DRAW_INDX_OFFSET_0_PRIM_TYPE__SHIFT 0
static inline uint32_t CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(enum pc_di_primtype val)
{
return ((val) << CP_DRAW_INDX_OFFSET_0_PRIM_TYPE__SHIFT) & CP_DRAW_INDX_OFFSET_0_PRIM_TYPE__MASK;
}
#define CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__MASK 0x000000c0
#define CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__SHIFT 6
static inline uint32_t CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(enum pc_di_src_sel val)
{
return ((val) << CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__SHIFT) & CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__MASK;
}
#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK 0x00000700
#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT 8
static inline uint32_t CP_DRAW_INDX_OFFSET_0_VIS_CULL(enum pc_di_vis_cull_mode val)
{
return ((val) << CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT) & CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK;
}
#define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__MASK 0x00000800
#define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__SHIFT 11
static inline uint32_t CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(enum pc_di_index_size val)
{
return ((val) << CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__SHIFT) & CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__MASK;
}
#define CP_DRAW_INDX_OFFSET_0_NOT_EOP 0x00001000
#define CP_DRAW_INDX_OFFSET_0_SMALL_INDEX 0x00002000
#define CP_DRAW_INDX_OFFSET_0_PRE_DRAW_INITIATOR_ENABLE 0x00004000
#define CP_DRAW_INDX_OFFSET_0_NUM_INDICES__MASK 0xffff0000
#define CP_DRAW_INDX_OFFSET_0_NUM_INDICES__SHIFT 16
static inline uint32_t CP_DRAW_INDX_OFFSET_0_NUM_INDICES(uint32_t val)
{
return ((val) << CP_DRAW_INDX_OFFSET_0_NUM_INDICES__SHIFT) & CP_DRAW_INDX_OFFSET_0_NUM_INDICES__MASK;
}
#define REG_CP_DRAW_INDX_OFFSET_1 0x00000001
#define REG_CP_DRAW_INDX_OFFSET_2 0x00000002
#define CP_DRAW_INDX_OFFSET_2_NUM_INDICES__MASK 0xffffffff
#define CP_DRAW_INDX_OFFSET_2_NUM_INDICES__SHIFT 0
static inline uint32_t CP_DRAW_INDX_OFFSET_2_NUM_INDICES(uint32_t val)
{
return ((val) << CP_DRAW_INDX_OFFSET_2_NUM_INDICES__SHIFT) & CP_DRAW_INDX_OFFSET_2_NUM_INDICES__MASK;
}
#define REG_CP_DRAW_INDX_OFFSET_2 0x00000002
#define CP_DRAW_INDX_OFFSET_2_INDX_BASE__MASK 0xffffffff
#define CP_DRAW_INDX_OFFSET_2_INDX_BASE__SHIFT 0
static inline uint32_t CP_DRAW_INDX_OFFSET_2_INDX_BASE(uint32_t val)
{
return ((val) << CP_DRAW_INDX_OFFSET_2_INDX_BASE__SHIFT) & CP_DRAW_INDX_OFFSET_2_INDX_BASE__MASK;
}
#define REG_CP_DRAW_INDX_OFFSET_2 0x00000002
#define CP_DRAW_INDX_OFFSET_2_INDX_SIZE__MASK 0xffffffff
#define CP_DRAW_INDX_OFFSET_2_INDX_SIZE__SHIFT 0
static inline uint32_t CP_DRAW_INDX_OFFSET_2_INDX_SIZE(uint32_t val)
{
return ((val) << CP_DRAW_INDX_OFFSET_2_INDX_SIZE__SHIFT) & CP_DRAW_INDX_OFFSET_2_INDX_SIZE__MASK;
}
#define REG_CP_SET_DRAW_STATE_0 0x00000000
#define CP_SET_DRAW_STATE_0_COUNT__MASK 0x0000ffff
#define CP_SET_DRAW_STATE_0_COUNT__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE_0_COUNT(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE_0_COUNT__SHIFT) & CP_SET_DRAW_STATE_0_COUNT__MASK;
}
#define CP_SET_DRAW_STATE_0_DIRTY 0x00010000
#define CP_SET_DRAW_STATE_0_DISABLE 0x00020000
#define CP_SET_DRAW_STATE_0_DISABLE_ALL_GROUPS 0x00040000
#define CP_SET_DRAW_STATE_0_LOAD_IMMED 0x00080000
#define CP_SET_DRAW_STATE_0_GROUP_ID__MASK 0x1f000000
#define CP_SET_DRAW_STATE_0_GROUP_ID__SHIFT 24
static inline uint32_t CP_SET_DRAW_STATE_0_GROUP_ID(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE_0_GROUP_ID__SHIFT) & CP_SET_DRAW_STATE_0_GROUP_ID__MASK;
}
#define REG_CP_SET_DRAW_STATE_1 0x00000001
#define CP_SET_DRAW_STATE_1_ADDR__MASK 0xffffffff
#define CP_SET_DRAW_STATE_1_ADDR__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE_1_ADDR(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE_1_ADDR__SHIFT) & CP_SET_DRAW_STATE_1_ADDR__MASK;
}
#define REG_CP_SET_BIN_0 0x00000000
#define REG_CP_SET_BIN_1 0x00000001


@@ -34,6 +34,7 @@
#include "freedreno_state.h"
#include "freedreno_gmem.h"
#include "freedreno_query.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"
static struct fd_ringbuffer *next_rb(struct fd_context *ctx)
@@ -145,6 +146,7 @@ fd_context_destroy(struct pipe_context *pctx)
DBG("");
fd_prog_fini(pctx);
fd_hw_query_fini(pctx);
util_slab_destroy(&ctx->transfer_pool);
@@ -221,6 +223,7 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen,
fd_query_context_init(pctx);
fd_texture_init(pctx);
fd_state_init(pctx);
fd_hw_query_init(pctx);
ctx->blitter = util_blitter_create(pctx);
if (!ctx->blitter)


@@ -33,6 +33,7 @@
#include "pipe/p_context.h"
#include "indices/u_primconvert.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_slab.h"
#include "util/u_string.h"
@@ -82,16 +83,80 @@ struct fd_vertex_stateobj {
unsigned num_elements;
};
/* Bitmask of stages in rendering during which a particular query is
* active. Queries will be automatically started/stopped (generating
* additional fd_hw_sample_period's) on entrance/exit from stages that
* are applicable to the query.
*
* NOTE: set the stage to NULL at end of IB to ensure no query is still
* active. Things aren't going to work out the way you want if a query
* is active across IB's (or between tile IB and draw IB)
*/
enum fd_render_stage {
FD_STAGE_NULL = 0x00,
FD_STAGE_DRAW = 0x01,
FD_STAGE_CLEAR = 0x02,
/* TODO before queries which include MEM2GMEM or GMEM2MEM will
* work we will need to call fd_hw_query_prepare() from somewhere
* appropriate so that queries in the tiling IB get backed with
* memory to write results to.
*/
FD_STAGE_MEM2GMEM = 0x04,
FD_STAGE_GMEM2MEM = 0x08,
/* used for driver internal draws (ie. util_blitter_blit()): */
FD_STAGE_BLIT = 0x10,
};
#define MAX_HW_SAMPLE_PROVIDERS 4
struct fd_hw_sample_provider;
struct fd_hw_sample;
struct fd_context {
struct pipe_context base;
struct fd_device *dev;
struct fd_screen *screen;
struct blitter_context *blitter;
struct primconvert_context *primconvert;
/* slab for pipe_transfer allocations: */
struct util_slab_mempool transfer_pool;
/* slabs for fd_hw_sample and fd_hw_sample_period allocations: */
struct util_slab_mempool sample_pool;
struct util_slab_mempool sample_period_pool;
/* next sample offset.. incremented for each sample in the batch/
* submit, reset to zero on next submit.
*/
uint32_t next_sample_offset;
/* sample-providers for hw queries: */
const struct fd_hw_sample_provider *sample_providers[MAX_HW_SAMPLE_PROVIDERS];
/* cached samples (in case multiple queries need to reference
* the same sample snapshot)
*/
struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];
/* tracking for current stage, to know when to start/stop
* any active queries:
*/
enum fd_render_stage stage;
/* list of active queries: */
struct list_head active_queries;
/* list of queries that are not active, but were active in the
* current submit:
*/
struct list_head current_queries;
/* current query result bo and tile stride: */
struct fd_bo *query_bo;
uint32_t query_tile_stride;
/* table with PIPE_PRIM_MAX entries mapping PIPE_PRIM_x to
* DI_PT_x value to use for draw initiator. There are some
* slight differences between generation:


@@ -36,6 +36,7 @@
#include "freedreno_context.h"
#include "freedreno_state.h"
#include "freedreno_resource.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"
@@ -70,7 +71,7 @@ fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
idx_bo = fd_resource(idx->buffer)->bo;
idx_type = size2indextype(idx->index_size);
idx_size = idx->index_size * info->count;
idx_offset = idx->offset;
idx_offset = idx->offset + (info->start * idx->index_size);
src_sel = DI_SRC_SEL_DMA;
} else {
idx_bo = NULL;
@@ -156,6 +157,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
/* and any buffers used, need to be resolved: */
ctx->resolve |= buffers;
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW);
ctx->draw(ctx, info);
}
@@ -188,6 +190,8 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
util_format_short_name(pipe_surface_format(pfb->zsbuf)));
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_CLEAR);
ctx->clear(ctx, buffers, color, depth, stencil);
ctx->dirty |= FD_DIRTY_ZSA |


@@ -35,6 +35,7 @@
#include "freedreno_gmem.h"
#include "freedreno_context.h"
#include "freedreno_resource.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"
/*
@@ -273,17 +274,24 @@ render_tiles(struct fd_context *ctx)
ctx->emit_tile_prep(ctx, tile);
if (ctx->restore)
if (ctx->restore) {
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_MEM2GMEM);
ctx->emit_tile_mem2gmem(ctx, tile);
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
}
ctx->emit_tile_renderprep(ctx, tile);
fd_hw_query_prepare_tile(ctx, i, ctx->ring);
/* emit IB to drawcmds: */
OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
fd_reset_wfi(ctx);
/* emit gmem2mem to transfer tile back to system memory: */
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_GMEM2MEM);
ctx->emit_tile_gmem2mem(ctx, tile);
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
}
}
@@ -292,6 +300,8 @@ render_sysmem(struct fd_context *ctx)
{
ctx->emit_sysmem_prep(ctx);
fd_hw_query_prepare_tile(ctx, 0, ctx->ring);
/* emit IB to drawcmds: */
OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
fd_reset_wfi(ctx);
@@ -314,6 +324,11 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
}
}
/* close out the draw cmds by making sure any active queries are
* paused:
*/
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
/* mark the end of the clear/draw cmds before emitting per-tile cmds: */
fd_ringmarker_mark(ctx->draw_end);
fd_ringmarker_mark(ctx->binning_end);
@@ -326,6 +341,7 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
DBG("rendering sysmem (%s/%s)",
util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
util_format_short_name(pipe_surface_format(pfb->zsbuf)));
fd_hw_query_prepare(ctx, 1);
render_sysmem(ctx);
ctx->stats.batch_sysmem++;
} else {
@@ -334,6 +350,7 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
util_format_short_name(pipe_surface_format(pfb->zsbuf)));
fd_hw_query_prepare(ctx, gmem->nbins_x * gmem->nbins_y);
render_tiles(ctx);
ctx->stats.batch_gmem++;
}


@@ -1,7 +1,7 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
* Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -27,63 +27,27 @@
*/
#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "os/os_time.h"
#include "freedreno_query.h"
#include "freedreno_query_sw.h"
#include "freedreno_query_hw.h"
#include "freedreno_context.h"
#include "freedreno_util.h"
#define FD_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0)
#define FD_QUERY_BATCH_TOTAL (PIPE_QUERY_DRIVER_SPECIFIC + 1) /* total # of batches (submits) */
#define FD_QUERY_BATCH_SYSMEM (PIPE_QUERY_DRIVER_SPECIFIC + 2) /* batches using system memory (GMEM bypass) */
#define FD_QUERY_BATCH_GMEM (PIPE_QUERY_DRIVER_SPECIFIC + 3) /* batches using GMEM */
#define FD_QUERY_BATCH_RESTORE (PIPE_QUERY_DRIVER_SPECIFIC + 4) /* batches requiring GMEM restore */
/* Currently just simple cpu query's supported.. probably need
* to refactor this a bit when I'm eventually ready to add gpu
* queries:
/*
* Pipe Query interface:
*/
struct fd_query {
int type;
/* storage for the collected data */
union pipe_query_result data;
bool active;
uint64_t begin_value, end_value;
uint64_t begin_time, end_time;
};
static inline struct fd_query *
fd_query(struct pipe_query *pq)
{
return (struct fd_query *)pq;
}
static struct pipe_query *
fd_create_query(struct pipe_context *pctx, unsigned query_type)
{
struct fd_context *ctx = fd_context(pctx);
struct fd_query *q;
switch (query_type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_PRIMITIVES_EMITTED:
case FD_QUERY_DRAW_CALLS:
case FD_QUERY_BATCH_TOTAL:
case FD_QUERY_BATCH_SYSMEM:
case FD_QUERY_BATCH_GMEM:
case FD_QUERY_BATCH_RESTORE:
break;
default:
return NULL;
}
q = CALLOC_STRUCT(fd_query);
q = fd_sw_create_query(ctx, query_type);
if (!q)
return NULL;
q->type = query_type;
q = fd_hw_create_query(ctx, query_type);
return (struct pipe_query *) q;
}
@@ -92,64 +56,21 @@ static void
fd_destroy_query(struct pipe_context *pctx, struct pipe_query *pq)
{
struct fd_query *q = fd_query(pq);
free(q);
}
static uint64_t
read_counter(struct pipe_context *pctx, int type)
{
struct fd_context *ctx = fd_context(pctx);
switch (type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
/* for now same thing as _PRIMITIVES_EMITTED */
case PIPE_QUERY_PRIMITIVES_EMITTED:
return ctx->stats.prims_emitted;
case FD_QUERY_DRAW_CALLS:
return ctx->stats.draw_calls;
case FD_QUERY_BATCH_TOTAL:
return ctx->stats.batch_total;
case FD_QUERY_BATCH_SYSMEM:
return ctx->stats.batch_sysmem;
case FD_QUERY_BATCH_GMEM:
return ctx->stats.batch_gmem;
case FD_QUERY_BATCH_RESTORE:
return ctx->stats.batch_restore;
}
return 0;
}
static bool
is_rate_query(struct fd_query *q)
{
switch (q->type) {
case FD_QUERY_BATCH_TOTAL:
case FD_QUERY_BATCH_SYSMEM:
case FD_QUERY_BATCH_GMEM:
case FD_QUERY_BATCH_RESTORE:
return true;
default:
return false;
}
q->funcs->destroy_query(fd_context(pctx), q);
}
static void
fd_begin_query(struct pipe_context *pctx, struct pipe_query *pq)
{
struct fd_query *q = fd_query(pq);
q->active = true;
q->begin_value = read_counter(pctx, q->type);
if (is_rate_query(q))
q->begin_time = os_time_get();
q->funcs->begin_query(fd_context(pctx), q);
}
static void
fd_end_query(struct pipe_context *pctx, struct pipe_query *pq)
{
struct fd_query *q = fd_query(pq);
q->active = false;
q->end_value = read_counter(pctx, q->type);
if (is_rate_query(q))
q->end_time = os_time_get();
q->funcs->end_query(fd_context(pctx), q);
}
static boolean
@@ -157,21 +78,7 @@ fd_get_query_result(struct pipe_context *pctx, struct pipe_query *pq,
boolean wait, union pipe_query_result *result)
{
struct fd_query *q = fd_query(pq);
if (q->active)
return false;
util_query_clear_result(result, q->type);
result->u64 = q->end_value - q->begin_value;
if (is_rate_query(q)) {
double fps = (result->u64 * 1000000) /
(double)(q->end_time - q->begin_time);
result->u64 = (uint64_t)fps;
}
return true;
return q->funcs->get_query_result(fd_context(pctx), q, wait, result);
}
static int


@@ -1,7 +1,7 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
* Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -31,6 +31,37 @@
#include "pipe/p_context.h"
struct fd_context;
struct fd_query;
struct fd_query_funcs {
void (*destroy_query)(struct fd_context *ctx,
struct fd_query *q);
void (*begin_query)(struct fd_context *ctx, struct fd_query *q);
void (*end_query)(struct fd_context *ctx, struct fd_query *q);
boolean (*get_query_result)(struct fd_context *ctx,
struct fd_query *q, boolean wait,
union pipe_query_result *result);
};
struct fd_query {
const struct fd_query_funcs *funcs;
bool active;
int type;
};
static inline struct fd_query *
fd_query(struct pipe_query *pq)
{
return (struct fd_query *)pq;
}
#define FD_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0)
#define FD_QUERY_BATCH_TOTAL (PIPE_QUERY_DRIVER_SPECIFIC + 1) /* total # of batches (submits) */
#define FD_QUERY_BATCH_SYSMEM (PIPE_QUERY_DRIVER_SPECIFIC + 2) /* batches using system memory (GMEM bypass) */
#define FD_QUERY_BATCH_GMEM (PIPE_QUERY_DRIVER_SPECIFIC + 3) /* batches using GMEM */
#define FD_QUERY_BATCH_RESTORE (PIPE_QUERY_DRIVER_SPECIFIC + 4) /* batches requiring GMEM restore */
void fd_query_screen_init(struct pipe_screen *pscreen);
void fd_query_context_init(struct pipe_context *pctx);


@@ -0,0 +1,465 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#include "pipe/p_state.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "freedreno_query_hw.h"
#include "freedreno_context.h"
#include "freedreno_util.h"
struct fd_hw_sample_period {
struct fd_hw_sample *start, *end;
struct list_head list;
};
/* maps query_type to sample provider idx: */
static int pidx(unsigned query_type)
{
switch (query_type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
return 0;
case PIPE_QUERY_OCCLUSION_PREDICATE:
return 1;
default:
return -1;
}
}
static struct fd_hw_sample *
get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring,
unsigned query_type)
{
struct fd_hw_sample *samp = NULL;
int idx = pidx(query_type);
if (!ctx->sample_cache[idx]) {
ctx->sample_cache[idx] =
ctx->sample_providers[idx]->get_sample(ctx, ring);
}
fd_hw_sample_reference(ctx, &samp, ctx->sample_cache[idx]);
return samp;
}
static void
clear_sample_cache(struct fd_context *ctx)
{
int i;
for (i = 0; i < ARRAY_SIZE(ctx->sample_cache); i++)
fd_hw_sample_reference(ctx, &ctx->sample_cache[i], NULL);
}
static bool
is_active(struct fd_hw_query *hq, enum fd_render_stage stage)
{
return !!(hq->provider->active & stage);
}
static void
resume_query(struct fd_context *ctx, struct fd_hw_query *hq,
struct fd_ringbuffer *ring)
{
assert(!hq->period);
hq->period = util_slab_alloc(&ctx->sample_period_pool);
list_inithead(&hq->period->list);
hq->period->start = get_sample(ctx, ring, hq->base.type);
/* NOTE: util_slab_alloc() does not zero out the buffer: */
hq->period->end = NULL;
}
static void
pause_query(struct fd_context *ctx, struct fd_hw_query *hq,
struct fd_ringbuffer *ring)
{
assert(hq->period && !hq->period->end);
hq->period->end = get_sample(ctx, ring, hq->base.type);
list_addtail(&hq->period->list, &hq->current_periods);
hq->period = NULL;
}
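/* Illustrative timeline (not from the patch) for a query whose provider
 * is active only in FD_STAGE_DRAW:
 *
 *   begin_query while stage == FD_STAGE_DRAW   -> resume: period #1 opens
 *   set_stage(FD_STAGE_GMEM2MEM)               -> pause:  period #1 closes
 *   set_stage(FD_STAGE_DRAW)                   -> resume: period #2 opens
 *   end_query                                  -> pause:  period #2 closes
 *
 * get_query_result() later walks hq->periods and, per tile, accumulates
 * the counter delta of each period via the provider's accumulate_result().
 */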
static void
destroy_periods(struct fd_context *ctx, struct list_head *list)
{
struct fd_hw_sample_period *period, *s;
LIST_FOR_EACH_ENTRY_SAFE(period, s, list, list) {
fd_hw_sample_reference(ctx, &period->start, NULL);
fd_hw_sample_reference(ctx, &period->end, NULL);
list_del(&period->list);
util_slab_free(&ctx->sample_period_pool, period);
}
}
static void
fd_hw_destroy_query(struct fd_context *ctx, struct fd_query *q)
{
struct fd_hw_query *hq = fd_hw_query(q);
destroy_periods(ctx, &hq->periods);
destroy_periods(ctx, &hq->current_periods);
list_del(&hq->list);
free(hq);
}
static void
fd_hw_begin_query(struct fd_context *ctx, struct fd_query *q)
{
struct fd_hw_query *hq = fd_hw_query(q);
if (q->active)
return;
/* begin_query() should clear previous results: */
destroy_periods(ctx, &hq->periods);
if (is_active(hq, ctx->stage))
resume_query(ctx, hq, ctx->ring);
q->active = true;
/* add to active list: */
list_del(&hq->list);
list_addtail(&hq->list, &ctx->active_queries);
}
static void
fd_hw_end_query(struct fd_context *ctx, struct fd_query *q)
{
struct fd_hw_query *hq = fd_hw_query(q);
if (!q->active)
return;
if (is_active(hq, ctx->stage))
pause_query(ctx, hq, ctx->ring);
q->active = false;
/* move to current list: */
list_del(&hq->list);
list_addtail(&hq->list, &ctx->current_queries);
}
/* helper to get ptr to specified sample: */
static void * sampptr(struct fd_hw_sample *samp, uint32_t n, void *ptr)
{
return ((char *)ptr) + (samp->tile_stride * n) + samp->offset;
}
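/* Worked example (illustrative): with two samples of 128 bytes each in a
 * batch (sizeof(struct fd_rb_samp_ctrs) == 16 * 8), tile_stride is 256 and
 * the second sample has offset 128, so for tile n its data lives at
 * ptr + 256*n + 128, which is what sampptr() computes.
 */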
static boolean
fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
boolean wait, union pipe_query_result *result)
{
struct fd_hw_query *hq = fd_hw_query(q);
const struct fd_hw_sample_provider *p = hq->provider;
struct fd_hw_sample_period *period;
if (q->active)
return false;
/* if the app tries to read back the query result before the
* batch is submitted, that forces us to flush so that there
* are actually results to wait for:
*/
if (!LIST_IS_EMPTY(&hq->list)) {
DBG("reading query result forces flush!");
ctx->needs_flush = true;
fd_context_render(&ctx->base);
}
util_query_clear_result(result, q->type);
if (LIST_IS_EMPTY(&hq->periods))
return true;
assert(LIST_IS_EMPTY(&hq->list));
assert(LIST_IS_EMPTY(&hq->current_periods));
assert(!hq->period);
if (LIST_IS_EMPTY(&hq->periods))
return true;
/* if !wait, then check the last sample (the one most likely to
* not be ready yet) and bail if it is not ready:
*/
if (!wait) {
int ret;
period = LIST_ENTRY(struct fd_hw_sample_period,
hq->periods.prev, list);
ret = fd_bo_cpu_prep(period->end->bo, ctx->screen->pipe,
DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC);
if (ret)
return false;
fd_bo_cpu_fini(period->end->bo);
}
/* sum the result across all sample periods: */
LIST_FOR_EACH_ENTRY(period, &hq->periods, list) {
struct fd_hw_sample *start = period->start;
struct fd_hw_sample *end = period->end;
unsigned i;
/* start and end samples should be from same batch: */
assert(start->bo == end->bo);
assert(start->num_tiles == end->num_tiles);
for (i = 0; i < start->num_tiles; i++) {
void *ptr;
fd_bo_cpu_prep(start->bo, ctx->screen->pipe,
DRM_FREEDRENO_PREP_READ);
ptr = fd_bo_map(start->bo);
p->accumulate_result(ctx, sampptr(period->start, i, ptr),
sampptr(period->end, i, ptr), result);
fd_bo_cpu_fini(start->bo);
}
}
return true;
}
static const struct fd_query_funcs hw_query_funcs = {
.destroy_query = fd_hw_destroy_query,
.begin_query = fd_hw_begin_query,
.end_query = fd_hw_end_query,
.get_query_result = fd_hw_get_query_result,
};
struct fd_query *
fd_hw_create_query(struct fd_context *ctx, unsigned query_type)
{
struct fd_hw_query *hq;
struct fd_query *q;
int idx = pidx(query_type);
if ((idx < 0) || !ctx->sample_providers[idx])
return NULL;
hq = CALLOC_STRUCT(fd_hw_query);
if (!hq)
return NULL;
hq->provider = ctx->sample_providers[idx];
list_inithead(&hq->periods);
list_inithead(&hq->current_periods);
list_inithead(&hq->list);
q = &hq->base;
q->funcs = &hw_query_funcs;
q->type = query_type;
return q;
}
struct fd_hw_sample *
fd_hw_sample_init(struct fd_context *ctx, uint32_t size)
{
struct fd_hw_sample *samp = util_slab_alloc(&ctx->sample_pool);
pipe_reference_init(&samp->reference, 1);
samp->size = size;
samp->offset = ctx->next_sample_offset;
/* NOTE: util_slab_alloc() does not zero out the buffer: */
samp->bo = NULL;
samp->num_tiles = 0;
samp->tile_stride = 0;
ctx->next_sample_offset += size;
return samp;
}
void
__fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp)
{
if (samp->bo)
fd_bo_del(samp->bo);
util_slab_free(&ctx->sample_pool, samp);
}
static void
prepare_sample(struct fd_hw_sample *samp, struct fd_bo *bo,
uint32_t num_tiles, uint32_t tile_stride)
{
if (samp->bo) {
assert(samp->bo == bo);
assert(samp->num_tiles == num_tiles);
assert(samp->tile_stride == tile_stride);
return;
}
samp->bo = bo;
samp->num_tiles = num_tiles;
samp->tile_stride = tile_stride;
}
static void
prepare_query(struct fd_hw_query *hq, struct fd_bo *bo,
uint32_t num_tiles, uint32_t tile_stride)
{
struct fd_hw_sample_period *period, *s;
/* prepare all the samples in the query: */
LIST_FOR_EACH_ENTRY_SAFE(period, s, &hq->current_periods, list) {
prepare_sample(period->start, bo, num_tiles, tile_stride);
prepare_sample(period->end, bo, num_tiles, tile_stride);
/* move from current_periods list to periods list: */
list_del(&period->list);
list_addtail(&period->list, &hq->periods);
}
}
static void
prepare_queries(struct fd_context *ctx, struct fd_bo *bo,
uint32_t num_tiles, uint32_t tile_stride,
struct list_head *list, bool remove)
{
struct fd_hw_query *hq, *s;
LIST_FOR_EACH_ENTRY_SAFE(hq, s, list, list) {
prepare_query(hq, bo, num_tiles, tile_stride);
if (remove)
list_delinit(&hq->list);
}
}
/* called from gmem code once total storage requirements are known (ie.
* number of samples times number of tiles)
*/
void
fd_hw_query_prepare(struct fd_context *ctx, uint32_t num_tiles)
{
uint32_t tile_stride = ctx->next_sample_offset;
struct fd_bo *bo;
if (ctx->query_bo)
fd_bo_del(ctx->query_bo);
if (tile_stride > 0) {
bo = fd_bo_new(ctx->dev, tile_stride * num_tiles,
DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
DRM_FREEDRENO_GEM_TYPE_KMEM);
} else {
bo = NULL;
}
ctx->query_bo = bo;
ctx->query_tile_stride = tile_stride;
prepare_queries(ctx, bo, num_tiles, tile_stride,
&ctx->active_queries, false);
prepare_queries(ctx, bo, num_tiles, tile_stride,
&ctx->current_queries, true);
/* reset things for next batch: */
ctx->next_sample_offset = 0;
}
void
fd_hw_query_prepare_tile(struct fd_context *ctx, uint32_t n,
struct fd_ringbuffer *ring)
{
uint32_t tile_stride = ctx->query_tile_stride;
uint32_t offset = tile_stride * n;
/* bail if no queries: */
if (tile_stride == 0)
return;
fd_wfi(ctx, ring);
OUT_PKT0 (ring, HW_QUERY_BASE_REG, 1);
OUT_RELOCW(ring, ctx->query_bo, offset, 0, 0);
}
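To make the call sequence above concrete, here is a minimal sketch (not part of the patch) of how the gmem rendering path is expected to drive these two hooks. gmem_render_tiles_sketch() and the tile loop are hypothetical placeholders; only fd_hw_query_prepare() and fd_hw_query_prepare_tile() come from this file.
static void
gmem_render_tiles_sketch(struct fd_context *ctx, struct fd_ringbuffer *ring,
		uint32_t num_tiles)
{
	uint32_t i;
	/* allocate the backing bo now that sample count and tile count are known: */
	fd_hw_query_prepare(ctx, num_tiles);
	for (i = 0; i < num_tiles; i++) {
		/* point HW_QUERY_BASE_REG at this tile's slice of the bo: */
		fd_hw_query_prepare_tile(ctx, i, ring);
		/* ... emit the per-tile rendering cmdstream ... */
	}
}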
void
fd_hw_query_set_stage(struct fd_context *ctx, struct fd_ringbuffer *ring,
enum fd_render_stage stage)
{
/* special case: internal blits (like mipmap level generation)
* go through normal draw path (via util_blitter_blit()).. but
* we need to ignore the FD_STAGE_DRAW which will be set, so we
* don't enable queries which should be paused during internal
* blits:
*/
if ((ctx->stage == FD_STAGE_BLIT) &&
(stage != FD_STAGE_NULL))
return;
if (stage != ctx->stage) {
struct fd_hw_query *hq;
LIST_FOR_EACH_ENTRY(hq, &ctx->active_queries, list) {
bool was_active = is_active(hq, ctx->stage);
bool now_active = is_active(hq, stage);
if (now_active && !was_active)
resume_query(ctx, hq, ring);
else if (was_active && !now_active)
pause_query(ctx, hq, ring);
}
}
clear_sample_cache(ctx);
ctx->stage = stage;
}
void
fd_hw_query_register_provider(struct pipe_context *pctx,
const struct fd_hw_sample_provider *provider)
{
struct fd_context *ctx = fd_context(pctx);
int idx = pidx(provider->query_type);
assert((0 <= idx) && (idx < MAX_HW_SAMPLE_PROVIDERS));
assert(!ctx->sample_providers[idx]);
ctx->sample_providers[idx] = provider;
}
void
fd_hw_query_init(struct pipe_context *pctx)
{
struct fd_context *ctx = fd_context(pctx);
util_slab_create(&ctx->sample_pool, sizeof(struct fd_hw_sample),
16, UTIL_SLAB_SINGLETHREADED);
util_slab_create(&ctx->sample_period_pool, sizeof(struct fd_hw_sample_period),
16, UTIL_SLAB_SINGLETHREADED);
list_inithead(&ctx->active_queries);
list_inithead(&ctx->current_queries);
}
void
fd_hw_query_fini(struct pipe_context *pctx)
{
struct fd_context *ctx = fd_context(pctx);
util_slab_destroy(&ctx->sample_pool);
util_slab_destroy(&ctx->sample_period_pool);
}



@@ -0,0 +1,164 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#ifndef FREEDRENO_QUERY_HW_H_
#define FREEDRENO_QUERY_HW_H_
#include "util/u_double_list.h"
#include "freedreno_query.h"
#include "freedreno_context.h"
/*
* HW Queries:
*
* See: https://github.com/freedreno/freedreno/wiki/Queries#hardware-queries
*
* Hardware queries will be specific to gpu generation, but they need
* some common infrastructure for triggering start/stop samples at
* various points (for example, to exclude mem2gmem/gmem2mem or clear)
* as well as per tile tracking.
*
* NOTE: in at least some cases hw writes sample values to memory addr
* specified in some register. So we don't really have the option to
* just sample the same counter multiple times for multiple different
* queries with the same query_type. So we cache per sample provider
* the most recent sample since the last draw. This way multiple
* sample periods for multiple queries can reference the same sample.
*
* fd_hw_sample_provider:
* - one per query type, registered/implemented by gpu generation
* specific code
* - can construct fd_hw_samples on demand
* - most recent sample (since last draw) cached so multiple
* different queries can ref the same sample
*
* fd_hw_sample:
* - abstracts one snapshot of counter value(s) across N tiles
* - backing object not allocated until submit time when number
* of samples and number of tiles is known
*
* fd_hw_sample_period:
* - consists of start and stop sample
* - a query accumulates a list of sample periods
* - the query result is the sum of the sample periods
*/
struct fd_hw_sample_provider {
unsigned query_type;
/* stages applicable to the query type: */
enum fd_render_stage active;
/* when a new sample is required, emit appropriate cmdstream
* and return a sample object:
*/
struct fd_hw_sample *(*get_sample)(struct fd_context *ctx,
struct fd_ringbuffer *ring);
/* accumulate the results from specified sample period: */
void (*accumulate_result)(struct fd_context *ctx,
const void *start, const void *end,
union pipe_query_result *result);
};
struct fd_hw_sample {
struct pipe_reference reference; /* keep this first */
/* offset and size of the sample are known at the time the
* sample is constructed.
*/
uint32_t size;
uint32_t offset;
/* backing object, offset/stride/etc are determined not when
* the sample is constructed, but when the batch is submitted.
* This way we can defer allocation until total # of requested
* samples, and total # of tiles, is known.
*/
struct fd_bo *bo;
uint32_t num_tiles;
uint32_t tile_stride;
};
struct fd_hw_sample_period;
struct fd_hw_query {
struct fd_query base;
const struct fd_hw_sample_provider *provider;
/* list of fd_hw_sample_period in previous submits: */
struct list_head periods;
/* list of fd_hw_sample_period's in current submit: */
struct list_head current_periods;
/* if active and not paused, the current sample period (not
* yet added to current_periods):
*/
struct fd_hw_sample_period *period;
struct list_head list; /* list-node in ctx->active_queries */
};
static inline struct fd_hw_query *
fd_hw_query(struct fd_query *q)
{
return (struct fd_hw_query *)q;
}
struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type);
/* helper for sample providers: */
struct fd_hw_sample * fd_hw_sample_init(struct fd_context *ctx, uint32_t size);
/* don't call directly, use fd_hw_sample_reference() */
void __fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp);
void fd_hw_query_prepare(struct fd_context *ctx, uint32_t num_tiles);
void fd_hw_query_prepare_tile(struct fd_context *ctx, uint32_t n,
struct fd_ringbuffer *ring);
void fd_hw_query_set_stage(struct fd_context *ctx,
struct fd_ringbuffer *ring, enum fd_render_stage stage);
void fd_hw_query_register_provider(struct pipe_context *pctx,
const struct fd_hw_sample_provider *provider);
void fd_hw_query_init(struct pipe_context *pctx);
void fd_hw_query_fini(struct pipe_context *pctx);
static inline void
fd_hw_sample_reference(struct fd_context *ctx,
struct fd_hw_sample **ptr, struct fd_hw_sample *samp)
{
struct fd_hw_sample *old_samp = *ptr;
if (pipe_reference(&(*ptr)->reference, &samp->reference))
__fd_hw_sample_destroy(ctx, old_samp);
if (ptr)
*ptr = samp;
}
#endif /* FREEDRENO_QUERY_HW_H_ */
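As a rough illustration of the provider interface described in the comment above, a gpu-generation-specific occlusion counter provider might look like the following sketch. This is not part of the patch: the names, the single-uint64_t sample layout, and the elided counter-copy cmdstream are assumptions; only fd_hw_sample_init(), fd_hw_query_register_provider() and the struct fields come from this header.
static struct fd_hw_sample *
occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
	/* one 64-bit counter snapshot per tile (layout is an assumption): */
	struct fd_hw_sample *samp = fd_hw_sample_init(ctx, sizeof(uint64_t));
	/* ... emit cmdstream to copy the hw counter to samp->offset ... */
	return samp;
}
static void
occlusion_accumulate_result(struct fd_context *ctx,
		const void *start, const void *end,
		union pipe_query_result *result)
{
	result->u64 += *(const uint64_t *)end - *(const uint64_t *)start;
}
static const struct fd_hw_sample_provider occlusion_provider = {
	.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
	.active = FD_STAGE_DRAW,
	.get_sample = occlusion_get_sample,
	.accumulate_result = occlusion_accumulate_result,
};
/* registered once at context init:
 *   fd_hw_query_register_provider(pctx, &occlusion_provider);
 */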


@@ -0,0 +1,165 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "os/os_time.h"
#include "freedreno_query_sw.h"
#include "freedreno_context.h"
#include "freedreno_util.h"
/*
* SW Queries:
*
* In the core, we have some support for basic sw counters
*/
static void
fd_sw_destroy_query(struct fd_context *ctx, struct fd_query *q)
{
struct fd_sw_query *sq = fd_sw_query(q);
free(sq);
}
static uint64_t
read_counter(struct fd_context *ctx, int type)
{
switch (type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
/* for now same thing as _PRIMITIVES_EMITTED */
case PIPE_QUERY_PRIMITIVES_EMITTED:
return ctx->stats.prims_emitted;
case FD_QUERY_DRAW_CALLS:
return ctx->stats.draw_calls;
case FD_QUERY_BATCH_TOTAL:
return ctx->stats.batch_total;
case FD_QUERY_BATCH_SYSMEM:
return ctx->stats.batch_sysmem;
case FD_QUERY_BATCH_GMEM:
return ctx->stats.batch_gmem;
case FD_QUERY_BATCH_RESTORE:
return ctx->stats.batch_restore;
}
return 0;
}
static bool
is_rate_query(struct fd_query *q)
{
switch (q->type) {
case FD_QUERY_BATCH_TOTAL:
case FD_QUERY_BATCH_SYSMEM:
case FD_QUERY_BATCH_GMEM:
case FD_QUERY_BATCH_RESTORE:
return true;
default:
return false;
}
}
static void
fd_sw_begin_query(struct fd_context *ctx, struct fd_query *q)
{
struct fd_sw_query *sq = fd_sw_query(q);
q->active = true;
sq->begin_value = read_counter(ctx, q->type);
if (is_rate_query(q))
sq->begin_time = os_time_get();
}
static void
fd_sw_end_query(struct fd_context *ctx, struct fd_query *q)
{
struct fd_sw_query *sq = fd_sw_query(q);
q->active = false;
sq->end_value = read_counter(ctx, q->type);
if (is_rate_query(q))
sq->end_time = os_time_get();
}
static boolean
fd_sw_get_query_result(struct fd_context *ctx, struct fd_query *q,
boolean wait, union pipe_query_result *result)
{
struct fd_sw_query *sq = fd_sw_query(q);
if (q->active)
return false;
util_query_clear_result(result, q->type);
result->u64 = sq->end_value - sq->begin_value;
if (is_rate_query(q)) {
double fps = (result->u64 * 1000000) /
(double)(sq->end_time - sq->begin_time);
result->u64 = (uint64_t)fps;
}
return true;
}
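For the rate-style queries, the math above converts a raw counter delta into a per-second figure: os_time_get() returns microseconds, so the delta is scaled by 1000000 and divided by the elapsed time. A tiny worked example with assumed numbers (illustration only, not part of the patch):
static uint64_t
rate_example(void)
{
	uint64_t delta_value = 300;      /* end_value - begin_value (e.g. batches) */
	uint64_t delta_time = 2000000;   /* end_time - begin_time, in microseconds */
	/* same conversion as fd_sw_get_query_result(): 300 over 2s -> 150/s */
	return (uint64_t)((delta_value * 1000000) / (double)delta_time);
}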
static const struct fd_query_funcs sw_query_funcs = {
.destroy_query = fd_sw_destroy_query,
.begin_query = fd_sw_begin_query,
.end_query = fd_sw_end_query,
.get_query_result = fd_sw_get_query_result,
};
struct fd_query *
fd_sw_create_query(struct fd_context *ctx, unsigned query_type)
{
struct fd_sw_query *sq;
struct fd_query *q;
switch (query_type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_PRIMITIVES_EMITTED:
case FD_QUERY_DRAW_CALLS:
case FD_QUERY_BATCH_TOTAL:
case FD_QUERY_BATCH_SYSMEM:
case FD_QUERY_BATCH_GMEM:
case FD_QUERY_BATCH_RESTORE:
break;
default:
return NULL;
}
sq = CALLOC_STRUCT(fd_sw_query);
if (!sq)
return NULL;
q = &sq->base;
q->funcs = &sw_query_funcs;
q->type = query_type;
return q;
}


@@ -0,0 +1,55 @@
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
/*
* Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#ifndef FREEDRENO_QUERY_SW_H_
#define FREEDRENO_QUERY_SW_H_
#include "freedreno_query.h"
/*
* SW Queries:
*
* In the core, we have some support for basic sw counters
*/
struct fd_sw_query {
struct fd_query base;
uint64_t begin_value, end_value;
uint64_t begin_time, end_time;
};
static inline struct fd_sw_query *
fd_sw_query(struct fd_query *q)
{
return (struct fd_sw_query *)q;
}
struct fd_query * fd_sw_create_query(struct fd_context *ctx,
unsigned query_type);
#endif /* FREEDRENO_QUERY_SW_H_ */


@@ -36,6 +36,7 @@
#include "freedreno_screen.h"
#include "freedreno_surface.h"
#include "freedreno_context.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"
#include <errno.h>
@@ -401,7 +402,9 @@ render_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
util_blitter_save_fragment_sampler_views(ctx->blitter,
ctx->fragtex.num_textures, ctx->fragtex.textures);
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_BLIT);
util_blitter_blit(ctx->blitter, info);
fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
return true;
}


@@ -143,6 +143,8 @@ tables for things that differ if the delta is not too much..
static int
fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
{
struct fd_screen *screen = fd_screen(pscreen);
/* this is probably not totally correct.. but it's a start: */
switch (param) {
/* Supported features (boolean caps). */
@@ -161,8 +163,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
case PIPE_CAP_SM3:
case PIPE_CAP_SEAMLESS_CUBE_MAP:
case PIPE_CAP_PRIMITIVE_RESTART:
case PIPE_CAP_CONDITIONAL_RENDER:
case PIPE_CAP_TEXTURE_BARRIER:
case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
@@ -180,6 +180,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_STENCIL_EXPORT:
case PIPE_CAP_TGSI_TEXCOORD:
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
case PIPE_CAP_CONDITIONAL_RENDER:
case PIPE_CAP_PRIMITIVE_RESTART:
return 0;
case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@@ -229,17 +231,18 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
return MAX_MIP_LEVELS;
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return 9192;
return 0; /* TODO: a3xx+ should support (required in gles3) */
/* Render targets. */
case PIPE_CAP_MAX_RENDER_TARGETS:
return 1;
/* Timer queries. */
/* Queries. */
case PIPE_CAP_QUERY_TIME_ELAPSED:
case PIPE_CAP_OCCLUSION_QUERY:
case PIPE_CAP_QUERY_TIMESTAMP:
return 0;
case PIPE_CAP_OCCLUSION_QUERY:
return (screen->gpu_id >= 300) ? 1: 0;
case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
case PIPE_CAP_MIN_TEXEL_OFFSET:
@@ -252,7 +255,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_ENDIANNESS:
return PIPE_ENDIAN_LITTLE;
case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
return 64;
default:
@@ -315,7 +318,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
return 8; /* XXX */
case PIPE_SHADER_CAP_MAX_INPUTS:
return 32;
return 16;
case PIPE_SHADER_CAP_MAX_TEMPS:
return 64; /* Max native temporaries. */
case PIPE_SHADER_CAP_MAX_ADDRS:


@@ -223,11 +223,18 @@ OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
emit_marker(ring, 6);
}
/* CP_SCRATCH_REG4 is used to hold base address for query results: */
#define HW_QUERY_BASE_REG REG_AXXX_CP_SCRATCH_REG4
static inline void
emit_marker(struct fd_ringbuffer *ring, int scratch_idx)
{
extern unsigned marker_cnt;
OUT_PKT0(ring, REG_AXXX_CP_SCRATCH_REG0 + scratch_idx, 1);
unsigned reg = REG_AXXX_CP_SCRATCH_REG0 + scratch_idx;
assert(reg != HW_QUERY_BASE_REG);
if (reg == HW_QUERY_BASE_REG)
return;
OUT_PKT0(ring, reg, 1);
OUT_RING(ring, ++marker_cnt);
}


@@ -312,9 +312,15 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
/* color buffer */
for (i = 0; i < scene->fb.nr_cbufs; i++){
stride[i] = scene->cbufs[i].stride;
color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, tile_x + x,
tile_y + y, inputs->layer);
if (scene->fb.cbufs[i]) {
stride[i] = scene->cbufs[i].stride;
color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, tile_x + x,
tile_y + y, inputs->layer);
}
else {
stride[i] = 0;
color[i] = NULL;
}
}
/* depth buffer */


@@ -633,7 +633,7 @@ CodeEmitterGK110::emitISAD(const Instruction *i)
{
assert(i->dType == TYPE_S32 || i->dType == TYPE_U32);
emitForm_21(i, 0x1fc, 0xb74);
emitForm_21(i, 0x1f4, 0xb74);
if (i->dType == TYPE_S32)
code[1] |= 1 << 19;
@@ -952,7 +952,7 @@ CodeEmitterGK110::emitSLCT(const CmpInstruction *i)
FTZ_(32);
emitCondCode(cc, 0x33, 0xf);
} else {
emitForm_21(i, 0x1a4, 0xb20);
emitForm_21(i, 0x1a0, 0xb20);
emitCondCode(cc, 0x34, 0x7);
}
}
@@ -967,7 +967,7 @@ void CodeEmitterGK110::emitSELP(const Instruction *i)
void CodeEmitterGK110::emitTEXBAR(const Instruction *i)
{
code[0] = 0x00000002 | (i->subOp << 23);
code[0] = 0x0000003e | (i->subOp << 23);
code[1] = 0x77000000;
emitPredicate(i);
@@ -1204,7 +1204,7 @@ CodeEmitterGK110::emitFlow(const Instruction *i)
case OP_PRECONT: code[1] = 0x15800000; mask = 2; break;
case OP_PRERET: code[1] = 0x13800000; mask = 2; break;
case OP_QUADON: code[1] = 0x1b000000; mask = 0; break;
case OP_QUADON: code[1] = 0x1b800000; mask = 0; break;
case OP_QUADPOP: code[1] = 0x1c000000; mask = 0; break;
case OP_BRKPT: code[1] = 0x00000000; mask = 0; break;
default:
@@ -1326,7 +1326,8 @@ CodeEmitterGK110::emitOUT(const Instruction *i)
void
CodeEmitterGK110::emitInterpMode(const Instruction *i)
{
code[1] |= i->ipa << 21; // TODO: INTERP_SAMPLEID
code[1] |= (i->ipa & 0x3) << 21; // TODO: INTERP_SAMPLEID
code[1] |= (i->ipa & 0xc) << (19 - 2);
}
void


@@ -2199,7 +2199,6 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
case TGSI_OPCODE_IMUL_HI:
case TGSI_OPCODE_UMUL_HI:
case TGSI_OPCODE_OR:
case TGSI_OPCODE_POW:
case TGSI_OPCODE_SHL:
case TGSI_OPCODE_ISHR:
case TGSI_OPCODE_USHR:
@@ -2254,6 +2253,11 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
mkOp1(OP_MOV, TYPE_U32, dst0[c], fetchSrc(0, c));
break;
case TGSI_OPCODE_POW:
val0 = mkOp2v(op, TYPE_F32, getScratch(), fetchSrc(0, 0), fetchSrc(1, 0));
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
mkOp1(OP_MOV, TYPE_F32, dst0[c], val0);
break;
case TGSI_OPCODE_EX2:
case TGSI_OPCODE_LG2:
val0 = mkOp1(op, TYPE_F32, getScratch(), fetchSrc(0, 0))->getDef(0);
@@ -2453,7 +2457,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
break;
case TGSI_OPCODE_KILL_IF:
val0 = new_LValue(func, FILE_PREDICATE);
mask = 0;
for (c = 0; c < 4; ++c) {
const int s = tgsi.getSrc(0).getSwizzle(c);
if (mask & (1 << s))
continue;
mask |= 1 << s;
mkCmp(OP_SET, CC_LT, TYPE_F32, val0, TYPE_F32, fetchSrc(0, c), zero);
mkOp(OP_DISCARD, TYPE_NONE, NULL)->setPredicate(CC_P, val0);
}


@@ -37,18 +37,25 @@ namespace nv50_ir {
// ah*bl 00
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
DataType fTy = mul->sType; // full type
DataType hTy;
DataType fTy; // full type
switch (mul->sType) {
case TYPE_S32: fTy = TYPE_U32; break;
case TYPE_S64: fTy = TYPE_U64; break;
default: fTy = mul->sType; break;
}
DataType hTy; // half type
switch (fTy) {
case TYPE_S32: hTy = TYPE_S16; break;
case TYPE_U32: hTy = TYPE_U16; break;
case TYPE_U64: hTy = TYPE_U32; break;
case TYPE_S64: hTy = TYPE_S32; break;
default:
return false;
}
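The half-word split the comment describes can be written out in plain C for orientation. The sketch below (not part of the patch) computes the high 32 bits of a 32x32-bit unsigned multiply from 16-bit halves, which is the same decomposition the lowering builds out of OP_MUL/OP_MAD; the signed case is handled in the pass by taking OP_ABS of both sources first and fixing up the sign of the result afterwards.
static uint32_t
mul_high_u32(uint32_t a, uint32_t b)
{
	uint32_t a_lo = a & 0xffff, a_hi = a >> 16;
	uint32_t b_lo = b & 0xffff, b_hi = b >> 16;

	uint32_t lo = a_lo * b_lo;
	uint32_t m1 = a_hi * b_lo + (lo >> 16);      /* cannot overflow 32 bits */
	uint32_t m2 = a_lo * b_hi + (m1 & 0xffff);   /* cannot overflow 32 bits */

	/* high word = ah*bh plus the carries out of the two middle terms */
	return a_hi * b_hi + (m1 >> 16) + (m2 >> 16);
}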
@@ -59,15 +66,25 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
bld->setPosition(mul, true);
Value *s[2];
Value *a[2], *b[2];
Value *c[2];
Value *t[4];
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
s[0] = mul->getSrc(0);
s[1] = mul->getSrc(1);
if (isSignedType(mul->sType)) {
s[0] = bld->getSSA(fullSize);
s[1] = bld->getSSA(fullSize);
bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
}
// split sources into halves
i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
i[0] = bld->mkSplit(a, halfSize, s[0]);
i[1] = bld->mkSplit(b, halfSize, s[1]);
i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
@@ -75,23 +92,76 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
if (highResult) {
Value *r[3];
Value *c[2];
Value *r[5];
Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
c[0] = bld->getSSA(1, FILE_FLAGS);
c[1] = bld->getSSA(1, FILE_FLAGS);
for (int j = 0; j < 3; ++j)
for (int j = 0; j < 5; ++j)
r[j] = bld->getSSA(fullSize);
i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
// set carry defs / sources
i[3]->setFlagsDef(1, c[0]);
i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
// actual result required in negative case, but ignored for
// unsigned. for some reason the compiler ends up dropping the whole
// instruction if the destination is unused but the flags are.
if (isSignedType(mul->sType))
i[4]->setFlagsDef(1, c[1]);
else
i[4]->setFlagsDef(0, c[1]);
i[6]->setPredicate(CC_C, c[0]);
i[5]->setFlagsSrc(3, c[1]);
if (isSignedType(mul->sType)) {
Value *cc[2];
Value *rr[7];
Value *one = bld->getSSA(fullSize);
bld->loadImm(one, 1);
for (int j = 0; j < 7; j++)
rr[j] = bld->getSSA(fullSize);
// NOTE: this logic uses predicates because splitting basic blocks is
// ~impossible during the SSA phase. The RA relies on a correlation
// between edge order and phi node sources.
// Set the sign of the result based on the inputs
bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
// 1s complement of 64-bit value
bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
->setPredicate(CC_S, cc[0]);
bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
->setPredicate(CC_S, cc[0]);
// add to low 32-bits, keep track of the carry
Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
n->setPredicate(CC_S, cc[0]);
n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
// If there was a carry, add 1 to the upper 32 bits
// XXX: These get executed even if they shouldn't be
bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
->setPredicate(CC_C, cc[1]);
bld->mkMov(rr[3], rr[0])
->setPredicate(CC_NC, cc[1]);
bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
// Merge the results from the negative and non-negative paths
bld->mkMov(rr[5], rr[4])
->setPredicate(CC_S, cc[0]);
bld->mkMov(rr[6], r[4])
->setPredicate(CC_NS, cc[0]);
bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
} else {
bld->mkMov(mul->getDef(0), r[4]);
}
} else {
bld->mkMov(mul->getDef(0), t[3]);
}
@@ -1209,8 +1279,11 @@ NV50LoweringPreSSA::checkPredicate(Instruction *insn)
Value *pred = insn->getPredicate();
Value *cdst;
if (!pred || pred->reg.file == FILE_FLAGS)
// FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
if (!pred ||
pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
return;
cdst = bld.getSSA(1, FILE_FLAGS);
bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);


@@ -425,7 +425,17 @@ ConstantFolding::expr(Instruction *i,
case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
case TYPE_S32:
case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
break;
}
/* fallthrough */
case TYPE_U32:
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
break;
}
res.data.u32 = a->data.u32 * b->data.u32; break;
default:
return;
}
@@ -551,8 +561,9 @@ ConstantFolding::expr(Instruction *i,
if (i->src(0).getImmediate(src0))
expr(i, src0, *i->getSrc(1)->asImm());
} else {
i->op = OP_MOV;
i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
}
i->subOp = 0;
}
void
@@ -602,6 +613,7 @@ ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
switch (i->op) {
case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
@@ -691,12 +703,41 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
const int t = !s;
const operation op = i->op;
Instruction *newi = i;
switch (i->op) {
case OP_MUL:
if (i->dType == TYPE_F32)
tryCollapseChainedMULs(i, s, imm0);
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
assert(!isFloatType(i->sType));
if (imm0.isInteger(1) && i->dType == TYPE_S32) {
bld.setPosition(i, false);
// Need to set to the sign value, which is a compare.
newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
TYPE_S32, i->getSrc(t), bld.mkImm(0));
delete_Instruction(prog, i);
} else if (imm0.isInteger(0) || imm0.isInteger(1)) {
// The high bits can't be set in this case (either mul by 0 or
// unsigned by 1)
i->op = OP_MOV;
i->subOp = 0;
i->setSrc(0, new_ImmediateValue(prog, 0u));
i->src(0).mod = Modifier(0);
i->setSrc(1, NULL);
} else if (!imm0.isNegative() && imm0.isPow2()) {
// Translate into a shift
imm0.applyLog2();
i->op = OP_SHR;
i->subOp = 0;
imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
i->setSrc(0, i->getSrc(t));
i->src(0).mod = i->src(t).mod;
i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
i->src(1).mod = 0;
}
} else
if (imm0.isInteger(0)) {
i->op = OP_MOV;
i->setSrc(0, new_ImmediateValue(prog, 0u));
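The power-of-two branch a few lines above relies on a simple identity: the high 32 bits of an unsigned multiply by (1 << k) are just the source shifted right by (32 - k), which is why the pass rewrites the MUL_HIGH into an OP_SHR. A minimal check of that identity (illustration only, not part of the patch):
#include <assert.h>
#include <stdint.h>

static void
mul_high_pow2_check(uint32_t a, unsigned k)   /* assumes 0 < k < 32 */
{
	uint32_t hi = (uint32_t)(((uint64_t)a << k) >> 32);
	assert(hi == (a >> (32 - k)));
}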
@@ -787,7 +828,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
else
tA = tB;
tB = s ? bld.getSSA() : i->getDef(0);
bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
if (s)
bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
@@ -819,7 +860,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
tA = bld.getSSA();
bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
if (d < 0)
bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
@@ -883,6 +924,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
case OP_ABS:
case OP_NEG:
case OP_SAT:
case OP_LG2:
case OP_RCP:
case OP_SQRT:
@@ -897,7 +939,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
default:
return;
}
if (i->op != op)
if (newi->op != op)
foldCount++;
}


@@ -998,7 +998,9 @@ GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
case OP_TXQ:
case OP_TXD:
case OP_TXG:
case OP_TXLQ:
case OP_TEXCSAA:
case OP_TEXPREP:
if (!(mask & JOIN_MASK_TEX))
break;
for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c)


@@ -331,6 +331,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
return false;
if (sf == FILE_IMMEDIATE)
return false;
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH && sf == FILE_MEMORY_CONST)
return false;
ldSize = 2;
} else {
ldSize = typeSizeof(ld->dType);


@@ -122,12 +122,9 @@ nv50_destroy(struct pipe_context *pipe)
{
struct nv50_context *nv50 = nv50_context(pipe);
if (nv50_context_screen(nv50)->cur_ctx == nv50) {
nv50->base.pushbuf->kick_notify = NULL;
if (nv50_context_screen(nv50)->cur_ctx == nv50)
nv50_context_screen(nv50)->cur_ctx = NULL;
nouveau_pushbuf_bufctx(nv50->base.pushbuf, NULL);
}
/* need to flush before destroying the bufctx */
nouveau_pushbuf_bufctx(nv50->base.pushbuf, NULL);
nouveau_pushbuf_kick(nv50->base.pushbuf, nv50->base.pushbuf->channel);
nv50_context_unreference_resources(nv50);


@@ -400,6 +400,10 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
ctx_to->viewports_dirty = ~0;
ctx_to->scissors_dirty = ~0;
ctx_to->constbuf_dirty[0] =
ctx_to->constbuf_dirty[1] =
ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1;
if (!ctx_to->vertex)
ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);


@@ -288,6 +288,14 @@ nv50_clear_render_target(struct pipe_context *pipe,
PUSH_REFN(push, bo, mt->base.domain | NOUVEAU_BO_WR);
BEGIN_NV04(push, NV50_3D(SCREEN_SCISSOR_HORIZ), 2);
PUSH_DATA (push, ( width << 16) | dstx);
PUSH_DATA (push, (height << 16) | dsty);
BEGIN_NV04(push, NV50_3D(SCISSOR_HORIZ(0)), 2);
PUSH_DATA (push, 8192 << 16);
PUSH_DATA (push, 8192 << 16);
nv50->scissors_dirty |= 1;
BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5);
@@ -325,7 +333,7 @@ nv50_clear_render_target(struct pipe_context *pipe,
(z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
}
nv50->dirty |= NV50_NEW_FRAMEBUFFER;
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}
static void
@@ -364,6 +372,14 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
PUSH_REFN(push, bo, mt->base.domain | NOUVEAU_BO_WR);
BEGIN_NV04(push, NV50_3D(SCREEN_SCISSOR_HORIZ), 2);
PUSH_DATA (push, ( width << 16) | dstx);
PUSH_DATA (push, (height << 16) | dsty);
BEGIN_NV04(push, NV50_3D(SCISSOR_HORIZ(0)), 2);
PUSH_DATA (push, 8192 << 16);
PUSH_DATA (push, 8192 << 16);
nv50->scissors_dirty |= 1;
BEGIN_NV04(push, NV50_3D(ZETA_ADDRESS_HIGH), 5);
PUSH_DATAh(push, bo->offset + sf->offset);
PUSH_DATA (push, bo->offset + sf->offset);
@@ -390,7 +406,7 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
(z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
}
nv50->dirty |= NV50_NEW_FRAMEBUFFER;
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}
void
@@ -1142,6 +1158,12 @@ nv50_blit_3d(struct nv50_context *nv50, const struct pipe_blit_info *info)
y0 *= (float)(1 << nv50_miptree(src)->ms_y);
y1 *= (float)(1 << nv50_miptree(src)->ms_y);
/* XXX: multiply by 6 for cube arrays ? */
dz = (float)info->src.box.depth / (float)info->dst.box.depth;
z = (float)info->src.box.z;
if (nv50_miptree(src)->layout_3d)
z += 0.5f * dz;
if (src->last_level > 0) {
/* If there are mip maps, GPU always assumes normalized coordinates. */
const unsigned l = info->src.level;
@@ -1151,14 +1173,12 @@ nv50_blit_3d(struct nv50_context *nv50, const struct pipe_blit_info *info)
x1 /= fh;
y0 /= fv;
y1 /= fv;
if (nv50_miptree(src)->layout_3d) {
z /= u_minify(src->depth0, l);
dz /= u_minify(src->depth0, l);
}
}
/* XXX: multiply by 6 for cube arrays ? */
dz = (float)info->src.box.depth / (float)info->dst.box.depth;
z = (float)info->src.box.z;
if (nv50_miptree(src)->layout_3d)
z += 0.5f * dz;
BEGIN_NV04(push, NV50_3D(VIEWPORT_TRANSFORM_EN), 1);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_3D(VIEW_VOLUME_CLIP_CTRL), 1);


@@ -123,11 +123,12 @@ nvc0_destroy(struct pipe_context *pipe)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
if (nvc0->screen->cur_ctx == nvc0) {
nvc0->base.pushbuf->kick_notify = NULL;
if (nvc0->screen->cur_ctx == nvc0)
nvc0->screen->cur_ctx = NULL;
nouveau_pushbuf_bufctx(nvc0->base.pushbuf, NULL);
}
/* Unset bufctx, we don't want to revalidate any resources after the flush.
* Other contexts will always set their bufctx again on action calls.
*/
nouveau_pushbuf_bufctx(nvc0->base.pushbuf, NULL);
nouveau_pushbuf_kick(nvc0->base.pushbuf, nvc0->base.pushbuf->channel);
nvc0_context_unreference_resources(nvc0);


@@ -133,17 +133,12 @@ static int
nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
{
unsigned count = info->prop.fp.numColourResults * 4;
unsigned i, c, ci;
unsigned i, c;
for (i = 0, ci = 0; i < info->numOutputs; ++i) {
if (info->out[i].sn == TGSI_SEMANTIC_COLOR) {
for (i = 0; i < info->numOutputs; ++i)
if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
for (c = 0; c < 4; ++c)
info->out[i].slot[c] = ci * 4 + c;
ci++;
}
}
assert(ci == info->prop.fp.numColourResults);
info->out[i].slot[c] = info->out[i].si * 4 + c;
if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
info->out[info->io.sampleMask].slot[0] = count++;


@@ -171,7 +171,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
return 0;
case PIPE_CAP_COMPUTE:
return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
return 1;
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
@@ -211,7 +211,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_FRAGMENT:
break;
case PIPE_SHADER_COMPUTE:
if (class_3d < NVE4_3D_CLASS)
if (class_3d != NVE4_3D_CLASS)
return 0;
break;
default:
@@ -514,9 +514,10 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
return nvc0_screen_compute_setup(screen, screen->base.pushbuf);
return 0;
case 0xe0:
return nve4_screen_compute_setup(screen, screen->base.pushbuf);
case 0xf0:
case 0x100:
return nve4_screen_compute_setup(screen, screen->base.pushbuf);
return 0;
default:
return -1;
}


@@ -531,6 +531,7 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
for (s = 0; s < 5; ++s) {
ctx_to->samplers_dirty[s] = ~0;
ctx_to->textures_dirty[s] = ~0;
ctx_to->constbuf_dirty[s] = (1 << NVC0_MAX_PIPE_CONSTBUFS) - 1;
}
if (!ctx_to->vertex)


@@ -543,9 +543,22 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit)
0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */
0x00001de7, 0x80000000, /* exit */
};
static const uint32_t code_gk110[] =
{
0x00000000, 0x08000000, /* sched */
0x401ffc12, 0x7ec7fc00, /* ld b64 $r4d a[0x80] 0x0 0x0 */
0x481ffc02, 0x7ecbfc00, /* ld b96 $r0t a[0x90] 0x0 0x0 */
0x381ffc12, 0x7f07fc00, /* st b64 a[0x70] $r4d 0x0 0x0 */
0x401ffc02, 0x7f0bfc00, /* st b96 a[0x80] $r0t 0x0 0x0 */
0x001c003c, 0x18000000, /* exit */
};
blit->vp.type = PIPE_SHADER_VERTEX;
blit->vp.translated = TRUE;
if (blit->screen->base.class_3d >= NVF0_3D_CLASS) {
blit->vp.code = (uint32_t *)code_gk110; /* const_cast */
blit->vp.code_size = sizeof(code_gk110);
} else
if (blit->screen->base.class_3d >= NVE4_3D_CLASS) {
blit->vp.code = (uint32_t *)code_nve4; /* const_cast */
blit->vp.code_size = sizeof(code_nve4);
@@ -896,6 +909,11 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
y0 *= (float)(1 << nv50_miptree(src)->ms_y);
y1 *= (float)(1 << nv50_miptree(src)->ms_y);
dz = (float)info->src.box.depth / (float)info->dst.box.depth;
z = (float)info->src.box.z;
if (nv50_miptree(src)->layout_3d)
z += 0.5f * dz;
if (src->last_level > 0) {
/* If there are mip maps, GPU always assumes normalized coordinates. */
const unsigned l = info->src.level;
@@ -905,13 +923,12 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
x1 /= fh;
y0 /= fv;
y1 /= fv;
if (nv50_miptree(src)->layout_3d) {
z /= u_minify(src->depth0, l);
dz /= u_minify(src->depth0, l);
}
}
dz = (float)info->src.box.depth / (float)info->dst.box.depth;
z = (float)info->src.box.z;
if (nv50_miptree(src)->layout_3d)
z += 0.5f * dz;
IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 0);
IMMED_NVC0(push, NVC0_3D(VIEW_VOLUME_CLIP_CTRL), 0x2 |
NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_RANGE_0_1);


@@ -789,7 +789,8 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
info->src.box.width == dst_width &&
info->src.box.height == dst_height &&
info->src.box.depth == 1 &&
dst->surface.level[info->dst.level].mode >= RADEON_SURF_MODE_1D) {
dst->surface.level[info->dst.level].mode >= RADEON_SURF_MODE_1D &&
(!dst->cmask.size || !dst->dirty_level_mask) /* dst cannot be fast-cleared */) {
r600_blitter_begin(ctx, R600_COLOR_RESOLVE);
util_blitter_custom_resolve_color(rctx->blitter,
info->dst.resource, info->dst.level,


@@ -1235,6 +1235,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
{
int i;
if (rctx->current_render_cond)
return;
for (i = 0; i < fb->nr_cbufs; i++) {
struct r600_texture *tex;
unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;


@@ -689,7 +689,8 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
info->src.box.height == dst_height &&
info->src.box.depth == 1 &&
dst->surface.level[info->dst.level].mode >= RADEON_SURF_MODE_1D &&
!(dst->surface.flags & RADEON_SURF_SCANOUT)) {
!(dst->surface.flags & RADEON_SURF_SCANOUT) &&
(!dst->cmask.size || !dst->dirty_level_mask) /* dst cannot be fast-cleared */) {
si_blitter_begin(ctx, SI_COLOR_RESOLVE);
util_blitter_custom_resolve_color(sctx->blitter,
info->dst.resource, info->dst.level,


@@ -48,7 +48,7 @@ AM_LDFLAGS = \
-module \
-no-undefined \
-avoid-version \
-Wl,--version-script=$(top_srcdir)/src/gallium/targets/egl-static/egl.sym
-Wl,--version-script=$(top_srcdir)/src/gallium/targets/egl-static/egl.sym \
$(GC_SECTIONS) \
$(LD_NO_UNDEFINED)


@@ -3652,11 +3652,15 @@ ast_declarator_list::hir(exec_list *instructions,
* instruction stream.
*/
exec_list initializer_instructions;
/* Examine var name here since var may get deleted in the next call */
bool var_is_gl_id = (strncmp(var->name, "gl_", 3) == 0);
ir_variable *earlier =
get_variable_being_redeclared(var, decl->get_location(), state,
false /* allow_all_redeclarations */);
if (earlier != NULL) {
if (strncmp(var->name, "gl_", 3) == 0 &&
if (var_is_gl_id &&
earlier->data.how_declared == ir_var_declared_in_block) {
_mesa_glsl_error(&loc, state,
"`%s' has already been redeclared using "


@@ -73,11 +73,15 @@ apple_visual_create_pfobj(CGLPixelFormatObj * pfobj, const struct glx_config * m
GLint vsref = 0;
CGLError error = 0;
/* Request an OpenGL 3.2 profile if one is available */
if(apple_cgl.version_major > 1 || (apple_cgl.version_major == 1 && apple_cgl.version_minor >= 3)) {
attr[numattr++] = kCGLPFAOpenGLProfile;
attr[numattr++] = kCGLOGLPVersion_3_2_Core;
}
/* Request an OpenGL 3.2 profile if one is available and supported */
attr[numattr++] = kCGLPFAOpenGLProfile;
attr[numattr++] = kCGLOGLPVersion_3_2_Core;
/* Test for kCGLPFAOpenGLProfile support at runtime and roll it back if not supported */
attr[numattr] = 0;
error = apple_cgl.choose_pixel_format(attr, pfobj, &vsref);
if (error == kCGLBadAttribute)
numattr -= 2;
if (offscreen) {
apple_glx_diagnostic


@@ -249,6 +249,10 @@ glx_display_free(struct glx_display *priv)
if (priv->dri2Display)
(*priv->dri2Display->destroyDisplay) (priv->dri2Display);
priv->dri2Display = NULL;
if (priv->dri3Display)
(*priv->dri3Display->destroyDisplay) (priv->dri3Display);
priv->dri3Display = NULL;
#endif
free((char *) priv);


@@ -113,7 +113,7 @@ __glapi_gentable_set_remaining_noop(struct _glapi_table *disp) {
struct _glapi_table *
_glapi_create_table_from_handle(void *handle, const char *symbol_prefix) {
struct _glapi_table *disp = calloc(1, sizeof(struct _glapi_table));
struct _glapi_table *disp = calloc(1, _glapi_get_dispatch_table_size() * sizeof(_glapi_proc));
char symboln[512];
if(!disp)


@@ -87,6 +87,63 @@
/* those link to libglapi.a should provide the entry points */
#define _GLAPI_SKIP_PROTO_ENTRY_POINTS
#endif
/* These prototypes are necessary because GLES1 library builds will create
* dispatch functions for them. We can't directly include GLES/gl.h because
* it would conflict with the previously-included GL/gl.h. Since the GLES1 ABI is
* not expected to ever add more functions, the path of least resistance is to
* just duplicate the prototypes for the functions that aren't already in
* desktop OpenGL.
*/
#include <GLES/glplatform.h>
GL_API void GL_APIENTRY glClearDepthf (GLclampf depth);
GL_API void GL_APIENTRY glClipPlanef (GLenum plane, const GLfloat *equation);
GL_API void GL_APIENTRY glFrustumf (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
GL_API void GL_APIENTRY glGetClipPlanef (GLenum pname, GLfloat eqn[4]);
GL_API void GL_APIENTRY glOrthof (GLfloat left, GLfloat right, GLfloat bottom, GLfloat top, GLfloat zNear, GLfloat zFar);
GL_API void GL_APIENTRY glAlphaFuncx (GLenum func, GLclampx ref);
GL_API void GL_APIENTRY glClearColorx (GLclampx red, GLclampx green, GLclampx blue, GLclampx alpha);
GL_API void GL_APIENTRY glClearDepthx (GLclampx depth);
GL_API void GL_APIENTRY glClipPlanex (GLenum plane, const GLfixed *equation);
GL_API void GL_APIENTRY glColor4x (GLfixed red, GLfixed green, GLfixed blue, GLfixed alpha);
GL_API void GL_APIENTRY glDepthRangex (GLclampx zNear, GLclampx zFar);
GL_API void GL_APIENTRY glFogx (GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glFogxv (GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glFrustumx (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
GL_API void GL_APIENTRY glGetClipPlanex (GLenum pname, GLfixed eqn[4]);
GL_API void GL_APIENTRY glGetFixedv (GLenum pname, GLfixed *params);
GL_API void GL_APIENTRY glGetLightxv (GLenum light, GLenum pname, GLfixed *params);
GL_API void GL_APIENTRY glGetMaterialxv (GLenum face, GLenum pname, GLfixed *params);
GL_API void GL_APIENTRY glGetTexEnvxv (GLenum env, GLenum pname, GLfixed *params);
GL_API void GL_APIENTRY glGetTexParameterxv (GLenum target, GLenum pname, GLfixed *params);
GL_API void GL_APIENTRY glLightModelx (GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glLightModelxv (GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glLightx (GLenum light, GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glLightxv (GLenum light, GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glLineWidthx (GLfixed width);
GL_API void GL_APIENTRY glLoadMatrixx (const GLfixed *m);
GL_API void GL_APIENTRY glMaterialx (GLenum face, GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glMaterialxv (GLenum face, GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glMultMatrixx (const GLfixed *m);
GL_API void GL_APIENTRY glMultiTexCoord4x (GLenum target, GLfixed s, GLfixed t, GLfixed r, GLfixed q);
GL_API void GL_APIENTRY glNormal3x (GLfixed nx, GLfixed ny, GLfixed nz);
GL_API void GL_APIENTRY glOrthox (GLfixed left, GLfixed right, GLfixed bottom, GLfixed top, GLfixed zNear, GLfixed zFar);
GL_API void GL_APIENTRY glPointParameterx (GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glPointParameterxv (GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glPointSizex (GLfixed size);
GL_API void GL_APIENTRY glPolygonOffsetx (GLfixed factor, GLfixed units);
GL_API void GL_APIENTRY glRotatex (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
GL_API void GL_APIENTRY glSampleCoveragex (GLclampx value, GLboolean invert);
GL_API void GL_APIENTRY glScalex (GLfixed x, GLfixed y, GLfixed z);
GL_API void GL_APIENTRY glTexEnvx (GLenum target, GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glTexEnvxv (GLenum target, GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glTexParameterx (GLenum target, GLenum pname, GLfixed param);
GL_API void GL_APIENTRY glTexParameterxv (GLenum target, GLenum pname, const GLfixed *params);
GL_API void GL_APIENTRY glTranslatex (GLfixed x, GLfixed y, GLfixed z);
GL_API void GL_APIENTRY glPointSizePointerOES (GLenum type, GLsizei stride, const GLvoid *pointer);
#include "glapi/glapitemp.h"
#endif /* USE_X86_ASM */


@@ -242,10 +242,25 @@ _mesa_meta_setup_blit_shader(struct gl_context *ctx,
GLenum target,
struct blit_shader_table *table)
{
const char *vs_source;
char *fs_source;
char *vs_source, *fs_source;
void *const mem_ctx = ralloc_context(NULL);
struct blit_shader *shader = choose_blit_shader(target, table);
const char *vs_input, *vs_output, *fs_input, *vs_preprocess, *fs_preprocess;
if (ctx->Const.GLSLVersion < 130) {
vs_preprocess = "";
vs_input = "attribute";
vs_output = "varying";
fs_preprocess = "#extension GL_EXT_texture_array : enable";
fs_input = "varying";
} else {
vs_preprocess = "#version 130";
vs_input = "in";
vs_output = "out";
fs_preprocess = "#version 130";
fs_input = "in";
shader->func = "texture";
}
assert(shader != NULL);
@@ -254,57 +269,30 @@ _mesa_meta_setup_blit_shader(struct gl_context *ctx,
return;
}
if (ctx->Const.GLSLVersion < 130) {
vs_source =
"attribute vec2 position;\n"
"attribute vec4 textureCoords;\n"
"varying vec4 texCoords;\n"
"void main()\n"
"{\n"
" texCoords = textureCoords;\n"
" gl_Position = vec4(position, 0.0, 1.0);\n"
"}\n";
fs_source = ralloc_asprintf(mem_ctx,
"#extension GL_EXT_texture_array : enable\n"
"#extension GL_ARB_texture_cube_map_array: enable\n"
"uniform %s texSampler;\n"
"varying vec4 texCoords;\n"
"void main()\n"
"{\n"
" gl_FragColor = %s(texSampler, %s);\n"
" gl_FragDepth = gl_FragColor.x;\n"
"}\n",
shader->type,
shader->func, shader->texcoords);
}
else {
vs_source = ralloc_asprintf(mem_ctx,
"#version 130\n"
"in vec2 position;\n"
"in vec4 textureCoords;\n"
"out vec4 texCoords;\n"
"void main()\n"
"{\n"
" texCoords = textureCoords;\n"
" gl_Position = vec4(position, 0.0, 1.0);\n"
"}\n");
fs_source = ralloc_asprintf(mem_ctx,
"#version 130\n"
"#extension GL_ARB_texture_cube_map_array: enable\n"
"uniform %s texSampler;\n"
"in vec4 texCoords;\n"
"out vec4 out_color;\n"
"\n"
"void main()\n"
"{\n"
" out_color = texture(texSampler, %s);\n"
" gl_FragDepth = out_color.x;\n"
"}\n",
shader->type,
shader->texcoords);
}
vs_source = ralloc_asprintf(mem_ctx,
"%s\n"
"%s vec2 position;\n"
"%s vec4 textureCoords;\n"
"%s vec4 texCoords;\n"
"void main()\n"
"{\n"
" texCoords = textureCoords;\n"
" gl_Position = vec4(position, 0.0, 1.0);\n"
"}\n",
vs_preprocess, vs_input, vs_input, vs_output);
fs_source = ralloc_asprintf(mem_ctx,
"%s\n"
"#extension GL_ARB_texture_cube_map_array: enable\n"
"uniform %s texSampler;\n"
"%s vec4 texCoords;\n"
"void main()\n"
"{\n"
" gl_FragColor = %s(texSampler, %s);\n"
" gl_FragDepth = gl_FragColor.x;\n"
"}\n",
fs_preprocess, shader->type, fs_input,
shader->func, shader->texcoords);
_mesa_meta_compile_and_link_program(ctx, vs_source, fs_source,
ralloc_asprintf(mem_ctx, "%s blit",
@@ -2860,13 +2848,13 @@ copytexsubimage_using_blit_framebuffer(struct gl_context *ctx, GLuint dims,
* are too strict for CopyTexImage. We know meta will be fine with format
* changes.
*/
_mesa_meta_BlitFramebuffer(ctx, x, y,
x + width, y + height,
xoffset, yoffset,
xoffset + width, yoffset + height,
mask, GL_NEAREST);
mask = _mesa_meta_BlitFramebuffer(ctx, x, y,
x + width, y + height,
xoffset, yoffset,
xoffset + width, yoffset + height,
mask, GL_NEAREST);
ctx->Meta->Blit.no_ctsi_fallback = false;
success = true;
success = mask == 0x0;
out:
_mesa_lock_texture(ctx, texObj);


@@ -270,7 +270,7 @@ struct blit_state
struct fb_tex_blit_state
{
GLint baseLevelSave, maxLevelSave;
GLuint sampler, samplerSave;
GLuint sampler, samplerSave, stencilSamplingSave;
GLuint tempTex;
};
@@ -407,7 +407,7 @@ _mesa_meta_fb_tex_blit_begin(const struct gl_context *ctx,
struct fb_tex_blit_state *blit);
extern void
_mesa_meta_fb_tex_blit_end(const struct gl_context *ctx, GLenum target,
_mesa_meta_fb_tex_blit_end(struct gl_context *ctx, GLenum target,
struct fb_tex_blit_state *blit);
extern GLboolean
@@ -422,12 +422,20 @@ _mesa_meta_setup_sampler(struct gl_context *ctx,
const struct gl_texture_object *texObj,
GLenum target, GLenum filter, GLuint srcLevel);
extern void
extern GLbitfield
_mesa_meta_BlitFramebuffer(struct gl_context *ctx,
GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
GLbitfield mask, GLenum filter);
extern void
_mesa_meta_and_swrast_BlitFramebuffer(struct gl_context *ctx,
GLint srcX0, GLint srcY0,
GLint srcX1, GLint srcY1,
GLint dstX0, GLint dstY0,
GLint dstX1, GLint dstY1,
GLbitfield mask, GLenum filter);
extern void
_mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers);


@@ -273,7 +273,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
samples);
} else {
ralloc_asprintf_append(&sample_resolve,
" out_color = sample_%d_0 / %f;\n",
" gl_FragColor = sample_%d_0 / %f;\n",
samples, (float)samples);
}
}
@@ -328,7 +328,10 @@ setup_glsl_blit_framebuffer(struct gl_context *ctx,
/* target = GL_TEXTURE_RECTANGLE is not supported in GLES 3.0 */
assert(_mesa_is_desktop_gl(ctx) || target == GL_TEXTURE_2D);
_mesa_meta_setup_vertex_objects(&blit->VAO, &blit->VBO, true, 2, 2, 0);
unsigned texcoord_size = 2 + (src_rb->Depth > 1 ? 1 : 0);
_mesa_meta_setup_vertex_objects(&blit->VAO, &blit->VBO, true,
2, texcoord_size, 0);
if (target == GL_TEXTURE_2D_MULTISAMPLE ||
target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY) {
@@ -441,6 +444,7 @@ blitframebuffer_texture(struct gl_context *ctx,
fb_tex_blit.baseLevelSave = texObj->BaseLevel;
fb_tex_blit.maxLevelSave = texObj->MaxLevel;
fb_tex_blit.stencilSamplingSave = texObj->StencilSampling;
if (glsl_version) {
setup_glsl_blit_framebuffer(ctx, blit, rb, target);
@@ -533,12 +537,16 @@ blitframebuffer_texture(struct gl_context *ctx,
verts[0].tex[0] = s0;
verts[0].tex[1] = t0;
verts[0].tex[2] = readAtt->Zoffset;
verts[1].tex[0] = s1;
verts[1].tex[1] = t0;
verts[1].tex[2] = readAtt->Zoffset;
verts[2].tex[0] = s1;
verts[2].tex[1] = t1;
verts[2].tex[2] = readAtt->Zoffset;
verts[3].tex[0] = s0;
verts[3].tex[1] = t1;
verts[3].tex[2] = readAtt->Zoffset;
_mesa_BufferSubData(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
}
@@ -567,7 +575,7 @@ _mesa_meta_fb_tex_blit_begin(const struct gl_context *ctx,
}
void
_mesa_meta_fb_tex_blit_end(const struct gl_context *ctx, GLenum target,
_mesa_meta_fb_tex_blit_end(struct gl_context *ctx, GLenum target,
struct fb_tex_blit_state *blit)
{
/* Restore texture object state, the texture binding will
@@ -576,6 +584,16 @@ _mesa_meta_fb_tex_blit_end(const struct gl_context *ctx, GLenum target,
if (target != GL_TEXTURE_RECTANGLE_ARB) {
_mesa_TexParameteri(target, GL_TEXTURE_BASE_LEVEL, blit->baseLevelSave);
_mesa_TexParameteri(target, GL_TEXTURE_MAX_LEVEL, blit->maxLevelSave);
if (ctx->Extensions.ARB_stencil_texturing) {
const struct gl_texture_object *texObj =
_mesa_get_current_tex_object(ctx, target);
if (texObj->StencilSampling != blit->stencilSamplingSave)
_mesa_TexParameteri(target, GL_DEPTH_STENCIL_TEXTURE_MODE,
blit->stencilSamplingSave ?
GL_STENCIL_INDEX : GL_DEPTH_COMPONENT);
}
}
_mesa_BindSampler(ctx->Texture.CurrentUnit, blit->samplerSave);
@@ -644,7 +662,7 @@ _mesa_meta_setup_sampler(struct gl_context *ctx,
* Meta implementation of ctx->Driver.BlitFramebuffer() in terms
* of texture mapping and polygon rendering.
*/
void
GLbitfield
_mesa_meta_BlitFramebuffer(struct gl_context *ctx,
GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
@@ -669,7 +687,7 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
/* Multisample texture blit support requires texture multisample. */
if (ctx->ReadBuffer->Visual.samples > 0 &&
!ctx->Extensions.ARB_texture_multisample) {
goto fallback;
return mask;
}
/* Clip a copy of the blit coordinates. If these differ from the input
@@ -678,7 +696,7 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
if (!_mesa_clip_blit(ctx, &clip.srcX0, &clip.srcY0, &clip.srcX1, &clip.srcY1,
&clip.dstX0, &clip.dstY0, &clip.dstX1, &clip.dstY1)) {
/* clipped/scissored everything away */
return;
return 0;
}
/* Only scissor affects blit, but we're doing to set a custom scissor if
@@ -705,10 +723,6 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
filter, dstFlipX, dstFlipY,
use_glsl_version, false)) {
mask &= ~GL_COLOR_BUFFER_BIT;
if (mask == 0x0) {
_mesa_meta_end(ctx);
return;
}
}
}
@@ -718,10 +732,6 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
filter, dstFlipX, dstFlipY,
use_glsl_version, true)) {
mask &= ~GL_DEPTH_BUFFER_BIT;
if (mask == 0x0) {
_mesa_meta_end(ctx);
return;
}
}
}
@@ -731,11 +741,7 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
_mesa_meta_end(ctx);
fallback:
if (mask && !ctx->Meta->Blit.no_ctsi_fallback) {
_swrast_BlitFramebuffer(ctx, srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1, mask, filter);
}
return mask;
}
void
@@ -753,3 +759,24 @@ _mesa_meta_glsl_blit_cleanup(struct blit_state *blit)
_mesa_DeleteTextures(1, &blit->depthTex.TexObj);
blit->depthTex.TexObj = 0;
}
void
_mesa_meta_and_swrast_BlitFramebuffer(struct gl_context *ctx,
GLint srcX0, GLint srcY0,
GLint srcX1, GLint srcY1,
GLint dstX0, GLint dstY0,
GLint dstX1, GLint dstY1,
GLbitfield mask, GLenum filter)
{
mask = _mesa_meta_BlitFramebuffer(ctx,
srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1,
mask, filter);
if (mask == 0x0)
return;
_swrast_BlitFramebuffer(ctx,
srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1,
mask, filter);
}


@@ -507,7 +507,7 @@ intelInitContext(struct intel_context *intel,
_mesa_meta_init(ctx);
intel->hw_stencil = mesaVis->stencilBits && mesaVis->depthBits == 24;
intel->hw_stencil = mesaVis && mesaVis->stencilBits && mesaVis->depthBits == 24;
intel->hw_stipple = 1;
intel->RenderIndex = ~0;


@@ -741,10 +741,10 @@ intel_blit_framebuffer(struct gl_context *ctx,
return;
_mesa_meta_BlitFramebuffer(ctx,
srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1,
mask, filter);
_mesa_meta_and_swrast_BlitFramebuffer(ctx,
srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1,
mask, filter);
}
/**


@@ -606,6 +606,7 @@
#define BRW_TEXCOORDMODE_CUBE 3
#define BRW_TEXCOORDMODE_CLAMP_BORDER 4
#define BRW_TEXCOORDMODE_MIRROR_ONCE 5
#define GEN8_TEXCOORDMODE_HALF_BORDER 6
#define BRW_THREAD_PRIORITY_NORMAL 0
#define BRW_THREAD_PRIORITY_HIGH 1
@@ -1694,7 +1695,7 @@ enum brw_message_target {
/* GEN7/DW1: */
# define GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT 12
/* GEN7/DW2: */
# define HSW_SF_LINE_STIPPLE_ENABLE 14
# define HSW_SF_LINE_STIPPLE_ENABLE (1 << 14)
# define GEN8_SF_SMOOTH_POINT_ENABLE (1 << 13)


@@ -192,33 +192,44 @@ static const struct brw_device_info brw_device_info_hsw_gt3 = {
},
};
/* Thread counts and URB limits are placeholders, and may not be accurate. */
#define GEN8_FEATURES \
.gen = 8, \
.has_hiz_and_separate_stencil = true, \
.must_use_separate_stencil = true, \
.has_llc = true, \
.has_pln = true, \
.max_vs_threads = 280, \
.max_gs_threads = 256, \
.max_wm_threads = 408, \
.urb = { \
.size = 128, \
.min_vs_entries = 64, \
.max_vs_entries = 1664, \
.max_gs_entries = 640, \
}
.max_vs_threads = 504, \
.max_gs_threads = 504, \
.max_wm_threads = 384 \
static const struct brw_device_info brw_device_info_bdw_gt1 = {
GEN8_FEATURES, .gt = 1,
.urb = {
.size = 192,
.min_vs_entries = 64,
.max_vs_entries = 2560,
.max_gs_entries = 960,
}
};
static const struct brw_device_info brw_device_info_bdw_gt2 = {
GEN8_FEATURES, .gt = 2,
.urb = {
.size = 384,
.min_vs_entries = 64,
.max_vs_entries = 2560,
.max_gs_entries = 960,
}
};
static const struct brw_device_info brw_device_info_bdw_gt3 = {
GEN8_FEATURES, .gt = 3,
.urb = {
.size = 384,
.min_vs_entries = 64,
.max_vs_entries = 2560,
.max_gs_entries = 960,
}
};
/* Thread counts and URB limits are placeholders, and may not be accurate.


@@ -77,21 +77,40 @@ is_coalesce_candidate(const fs_inst *inst, const int *virtual_grf_sizes)
static bool
can_coalesce_vars(brw::fs_live_variables *live_intervals,
- const exec_list *instructions, const fs_inst *inst, int ip,
+ const exec_list *instructions, const fs_inst *inst,
int var_to, int var_from)
{
if (!live_intervals->vars_interfere(var_from, var_to))
return true;
- assert(ip >= live_intervals->start[var_to]);
+ /* We know that the live ranges of A (var_from) and B (var_to)
+ * interfere because of the ->vars_interfere() call above. If the end
+ * of B's live range is after the end of A's range, then we know two
+ * things:
+ * - the start of B's live range must be in A's live range (since we
+ * already know the two ranges interfere, this is the only remaining
+ * possibility)
+ * - the interference isn't of the form we're looking for (where B is
+ * entirely inside A)
+ */
+ if (live_intervals->end[var_to] > live_intervals->end[var_from])
+ return false;
- fs_inst *scan_inst;
- for (scan_inst = (fs_inst *)inst->next;
- !scan_inst->is_tail_sentinel() && ip <= live_intervals->end[var_to];
- scan_inst = (fs_inst *)scan_inst->next, ip++) {
- if (scan_inst->opcode == BRW_OPCODE_WHILE)
+ int scan_ip = -1;
+ foreach_list(n, instructions) {
+ fs_inst *scan_inst = (fs_inst *)n;
+ scan_ip++;
+ if (scan_inst->is_control_flow())
return false;
+ if (scan_ip <= live_intervals->start[var_to])
+ continue;
+ if (scan_ip > live_intervals->end[var_to])
+ break;
if (scan_inst->dst.equals(inst->dst) ||
scan_inst->dst.equals(inst->src[0]))
return false;
@@ -114,11 +133,9 @@ fs_visitor::register_coalesce()
fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
int var_to[MAX_SAMPLER_MESSAGE_SIZE];
int var_from[MAX_SAMPLER_MESSAGE_SIZE];
- int ip = -1;
foreach_list(node, &this->instructions) {
fs_inst *inst = (fs_inst *)node;
- ip++;
if (!is_coalesce_candidate(inst, virtual_grf_sizes))
continue;
@@ -157,7 +174,7 @@ fs_visitor::register_coalesce()
var_to[i] = live_intervals->var_from_vgrf[reg_to] + reg_to_offset[i];
var_from[i] = live_intervals->var_from_vgrf[reg_from] + i;
- if (!can_coalesce_vars(live_intervals, &instructions, inst, ip,
+ if (!can_coalesce_vars(live_intervals, &instructions, inst,
var_to[i], var_from[i])) {
can_coalesce = false;
reg_from = -1;
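The comment in can_coalesce_vars() above boils down to interval arithmetic: vars_interfere() already established that the two live ranges overlap, so if var_to also ends no later than var_from, its entire range must be nested inside var_from's, and only then is the instruction scan worth doing. A small standalone sketch of that containment test, with made-up instruction positions:

    #include <stdbool.h>

    /* Sketch: 'from' is A, 'to' is B, ranges are [start, end] in instruction
     * positions.  Returns true when B lies entirely inside A, the only
     * overlapping case the coalescing pass keeps scanning for.
     * Example: A = [10, 50], B = [20, 30] -> true;
     *          A = [10, 50], B = [40, 60] -> false (B ends after A).
     */
    static bool
    range_nested_inside(int start_from, int end_from, int start_to, int end_to)
    {
       bool overlap = start_to <= end_from && start_from <= end_to;
       return overlap && end_to <= end_from;
    }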

View File

@@ -221,15 +221,18 @@ fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
!y.is_valid_3src() ||
!a.is_valid_3src()) {
/* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
+ fs_reg y_times_a = fs_reg(this, glsl_type::float_type);
fs_reg one_minus_a = fs_reg(this, glsl_type::float_type);
+ fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);
+ emit(MUL(y_times_a, y, a));
fs_reg negative_a = a;
negative_a.negate = !a.negate;
emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
- fs_inst *mul = emit(MUL(reg_null_f, y, a));
- mul->writes_accumulator = true;
- emit(MAC(dst, x, one_minus_a));
+ emit(MUL(x_times_one_minus_a, x, one_minus_a));
+ emit(ADD(dst, x_times_one_minus_a, y_times_a));
} else {
/* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
* we need to reorder the operands.
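The replacement sequence is the standard linear-interpolation identity written so that no instruction implicitly writes the accumulator: lerp(x, y, a) = x*(1 - a) + y*a, built from two independent MULs, an ADD that forms (1 - a) from a negated source, and a final ADD into the destination. The same arithmetic for a single channel, as an illustrative plain-C sketch (not EU code):

    #include <assert.h>

    /* Sketch of the emitted math for one float channel. */
    static float
    lerp_without_accumulator(float x, float y, float a)
    {
       float y_times_a = y * a;                      /* MUL                 */
       float one_minus_a = -a + 1.0f;                /* ADD, negated source */
       float x_times_one_minus_a = x * one_minus_a;  /* MUL                 */
       return x_times_one_minus_a + y_times_a;       /* ADD into dst        */
    }

    int main(void)
    {
       /* lerp(2, 10, 0.25) = 2*0.75 + 10*0.25 = 4 */
       assert(lerp_without_accumulator(2.0f, 10.0f, 0.25f) == 4.0f);
       return 0;
    }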
@@ -1480,15 +1483,28 @@ fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
return coordinate;
}
- scale_x = fs_reg(UNIFORM, uniforms);
- scale_y = fs_reg(UNIFORM, uniforms + 1);
GLuint index = _mesa_add_state_reference(params,
(gl_state_index *)tokens);
- stage_prog_data->param[uniforms++] =
- &prog->Parameters->ParameterValues[index][0].f;
- stage_prog_data->param[uniforms++] =
- &prog->Parameters->ParameterValues[index][1].f;
+ /* Try to find existing copies of the texrect scale uniforms. */
+ for (unsigned i = 0; i < uniforms; i++) {
+ if (stage_prog_data->param[i] ==
+ &prog->Parameters->ParameterValues[index][0].f) {
+ scale_x = fs_reg(UNIFORM, i);
+ scale_y = fs_reg(UNIFORM, i + 1);
+ break;
+ }
+ }
+ /* If we didn't already set them up, do so now. */
+ if (scale_x.file == BAD_FILE) {
+ scale_x = fs_reg(UNIFORM, uniforms);
+ scale_y = fs_reg(UNIFORM, uniforms + 1);
+ stage_prog_data->param[uniforms++] =
+ &prog->Parameters->ParameterValues[index][0].f;
+ stage_prog_data->param[uniforms++] =
+ &prog->Parameters->ParameterValues[index][1].f;
+ }
}
/* The 965 requires the EU to do the normalization of GL rectangle
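The loop added above keeps rescale_texcoord() from appending a fresh pair of texrect scale uniforms every time the same state reference is hit; it reuses an existing param slot when one already points at the scale value and only falls through to the append path otherwise. The find-or-append pattern in isolation, with hypothetical names standing in for the stage_prog_data->param array and the uniform count:

    /* Sketch: return the index of an existing slot whose entry matches
     * 'value', or append it at the end.
     */
    static unsigned
    find_or_add_param(const float **params, unsigned *num_params,
                      const float *value)
    {
       for (unsigned i = 0; i < *num_params; i++) {
          if (params[i] == value)
             return i;               /* reuse the existing uniform slot */
       }
       params[*num_params] = value;  /* no match: append a new slot */
       return (*num_params)++;
    }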

View File

@@ -391,6 +391,7 @@ set_read_rb_tex_image(struct gl_context *ctx, struct fb_tex_blit_state *blit,
blit->baseLevelSave = tex_obj->BaseLevel;
blit->maxLevelSave = tex_obj->MaxLevel;
blit->stencilSamplingSave = tex_obj->StencilSampling;
blit->sampler = _mesa_meta_setup_sampler(ctx, tex_obj, *target,
GL_NEAREST, level);
}

View File

@@ -42,6 +42,13 @@ brw_get_graphics_reset_status(struct gl_context *ctx)
*/
assert(brw->hw_ctx != NULL);
/* A reset status other than NO_ERROR was returned last time. I915 returns
* nonzero active/pending only if reset has been encountered and completed.
* Return NO_ERROR from now on.
*/
if (brw->reset_count != 0)
return GL_NO_ERROR;
err = drm_intel_get_reset_stats(brw->hw_ctx, &reset_count, &active,
&pending);
if (err)
@@ -50,18 +57,19 @@ brw_get_graphics_reset_status(struct gl_context *ctx)
/* A reset was observed while a batch from this context was executing.
* Assume that this context was at fault.
*/
- if (active != 0)
+ if (active != 0) {
+ brw->reset_count = reset_count;
return GL_GUILTY_CONTEXT_RESET_ARB;
+ }
/* A reset was observed while a batch from this context was in progress,
* but the batch was not executing. In this case, assume that the context
* was not at fault.
*/
- if (pending != 0)
+ if (pending != 0) {
+ brw->reset_count = reset_count;
return GL_INNOCENT_CONTEXT_RESET_ARB;
+ /* FINISHME: Should we report anything if reset_count > brw->reset_count?
+ */
+ }
return GL_NO_ERROR;
}
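On the application side this feeds the GL_ARB_robustness query: once a guilty or innocent reset has been reported, the stored reset_count makes later calls return GL_NO_ERROR again instead of re-reporting the same event. A minimal sketch of how a robustness-aware application polls it (assuming a context created with reset notification and the usual extension-loading done elsewhere):

    #include <stdbool.h>
    #include <GL/gl.h>
    #include <GL/glext.h>   /* GL_ARB_robustness enums and prototype */

    static bool
    gpu_reset_happened(void)
    {
       switch (glGetGraphicsResetStatusARB()) {
       case GL_NO_ERROR:
          return false;                      /* nothing to do               */
       case GL_GUILTY_CONTEXT_RESET_ARB:     /* our batch caused the hang   */
       case GL_INNOCENT_CONTEXT_RESET_ARB:   /* another context's batch did */
       case GL_UNKNOWN_CONTEXT_RESET_ARB:
       default:
          return true;                       /* tear down and recreate      */
       }
    }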

View File

@@ -870,8 +870,7 @@ fs_instruction_scheduler::calculate_deps()
} else if (inst->dst.is_accumulator() && gen6plus) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file != BAD_FILE) {
add_barrier_deps(n);
}
@@ -998,8 +997,7 @@ fs_instruction_scheduler::calculate_deps()
}
} else if (inst->dst.is_accumulator() && gen6plus) {
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file != BAD_FILE) {
add_barrier_deps(n);
}
@@ -1115,8 +1113,7 @@ vec4_instruction_scheduler::calculate_deps()
} else if (inst->dst.is_accumulator() && gen6plus) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file != BAD_FILE) {
add_barrier_deps(n);
}
@@ -1208,8 +1205,7 @@ vec4_instruction_scheduler::calculate_deps()
last_fixed_grf_write = n;
} else if (inst->dst.is_accumulator() && gen6plus) {
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file != BAD_FILE) {
add_barrier_deps(n);
}

View File

@@ -243,7 +243,8 @@ void gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
void gen8_init_vtable_surface_functions(struct brw_context *brw);
/* brw_wm_sampler_state.c */
- uint32_t translate_wrap_mode(GLenum wrap, bool using_nearest);
+ uint32_t translate_wrap_mode(struct brw_context *brw,
+ GLenum wrap, bool using_nearest);
void upload_default_color(struct brw_context *brw,
struct gl_sampler_object *sampler,
int unit,

View File

@@ -1171,13 +1171,17 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
/* Earlier generations don't support three source operations, so we
* need to emit x*(1-a) + y*a.
*/
- dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
- one_minus_a.writemask = dst.writemask;
+ dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
+ dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
+ dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
+ y_times_a.writemask = dst.writemask;
+ one_minus_a.writemask = dst.writemask;
+ x_times_one_minus_a.writemask = dst.writemask;
+ emit(MUL(y_times_a, y, a));
emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
- vec4_instruction *mul = emit(MUL(dst_null_f(), y, a));
- mul->writes_accumulator = true;
- emit(MAC(dst, x, src_reg(one_minus_a)));
+ emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
+ emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
}
}

View File

@@ -352,7 +352,8 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
if (alpha_depth || (brw->gen < 8 && !brw->is_haswell))
key->swizzles[s] = brw_get_texture_swizzle(ctx, t);
- if (sampler->MinFilter != GL_NEAREST &&
+ if (brw->gen < 8 &&
+ sampler->MinFilter != GL_NEAREST &&
sampler->MagFilter != GL_NEAREST) {
if (sampler->WrapS == GL_CLAMP)
key->gl_clamp_mask[0] |= 1 << s;

View File

@@ -46,7 +46,7 @@
uint32_t
- translate_wrap_mode(GLenum wrap, bool using_nearest)
+ translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
{
switch( wrap ) {
case GL_REPEAT:
@@ -55,9 +55,16 @@ translate_wrap_mode(GLenum wrap, bool using_nearest)
/* GL_CLAMP is the weird mode where coordinates are clamped to
* [0.0, 1.0], so linear filtering of coordinates outside of
* [0.0, 1.0] give you half edge texel value and half border
- * color. The fragment shader will clamp the coordinates, and
- * we set clamp_border here, which gets the result desired. We
- * just use clamp(_to_edge) for nearest, because for nearest
+ * color.
+ *
+ * Gen8+ supports this natively.
+ */
+ if (brw->gen >= 8)
+ return GEN8_TEXCOORDMODE_HALF_BORDER;
+ /* On Gen4-7.5, we clamp the coordinates in the fragment shader
+ * and set clamp_border here, which gets the result desired.
+ * We just use clamp(_to_edge) for nearest, because for nearest
* clamping to 1.0 gives border color instead of the desired
* edge texels.
*/
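A worked example makes the comment concrete. With GL_CLAMP and GL_LINEAR, a coordinate clamped to the texture edge sits half a texel outside the outermost texel centre, so the filter returns an even mix of the edge texel and the border color; GEN8_TEXCOORDMODE_HALF_BORDER produces exactly that in hardware, while older parts get it from the shader clamp plus the clamp-to-border mode. Roughly, for the left edge of a 1D texture of width w (illustrative sketch only):

    #include <math.h>

    /* Sketch: GL_CLAMP + GL_LINEAR result at s = 0 for a 1D texture. */
    static float
    gl_clamp_linear_left_edge(float edge_texel, float border_color, int w)
    {
       float s = 0.0f;                 /* coordinate already clamped to [0,1] */
       float u = s * (float)w - 0.5f;  /* texel space: -0.5 at the edge       */
       float frac = u - floorf(u);     /* 0.5, halfway to the border texel    */
       /* Texel -1 lies outside the image and resolves to the border color. */
       return (1.0f - frac) * border_color + frac * edge_texel;
    }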
@@ -276,11 +283,11 @@ static void brw_update_sampler_state(struct brw_context *brw,
}
}
- sampler->ss1.r_wrap_mode = translate_wrap_mode(gl_sampler->WrapR,
+ sampler->ss1.r_wrap_mode = translate_wrap_mode(brw, gl_sampler->WrapR,
using_nearest);
- sampler->ss1.s_wrap_mode = translate_wrap_mode(gl_sampler->WrapS,
+ sampler->ss1.s_wrap_mode = translate_wrap_mode(brw, gl_sampler->WrapS,
using_nearest);
- sampler->ss1.t_wrap_mode = translate_wrap_mode(gl_sampler->WrapT,
+ sampler->ss1.t_wrap_mode = translate_wrap_mode(brw, gl_sampler->WrapT,
using_nearest);
if (brw->gen >= 6 &&

View File

@@ -45,6 +45,7 @@ bool
brw_color_buffer_write_enabled(struct brw_context *brw)
{
struct gl_context *ctx = &brw->ctx;
/* BRW_NEW_FRAGMENT_PROGRAM */
const struct gl_fragment_program *fp = brw->fragment_program;
int i;

View File

@@ -103,11 +103,11 @@ gen7_update_sampler_state(struct brw_context *brw, int unit, int ss_index,
}
}
- sampler->ss3.r_wrap_mode = translate_wrap_mode(gl_sampler->WrapR,
+ sampler->ss3.r_wrap_mode = translate_wrap_mode(brw, gl_sampler->WrapR,
using_nearest);
- sampler->ss3.s_wrap_mode = translate_wrap_mode(gl_sampler->WrapS,
+ sampler->ss3.s_wrap_mode = translate_wrap_mode(brw, gl_sampler->WrapS,
using_nearest);
- sampler->ss3.t_wrap_mode = translate_wrap_mode(gl_sampler->WrapT,
+ sampler->ss3.t_wrap_mode = translate_wrap_mode(brw, gl_sampler->WrapT,
using_nearest);
/* Cube-maps on 965 and later must use the same wrap mode for all 3

View File

@@ -215,7 +215,7 @@ gen8_upload_ps_blend(struct brw_context *brw)
/* _NEW_BUFFERS */
struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
- /* _NEW_BUFFERS | _NEW_COLOR */
+ /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
if (brw_color_buffer_write_enabled(brw))
dw1 |= GEN8_PS_BLEND_HAS_WRITEABLE_RT;
@@ -290,7 +290,7 @@ gen8_upload_ps_blend(struct brw_context *brw)
const struct brw_tracked_state gen8_ps_blend = {
.dirty = {
.mesa = _NEW_BUFFERS | _NEW_COLOR | _NEW_MULTISAMPLE,
- .brw = BRW_NEW_CONTEXT,
+ .brw = BRW_NEW_CONTEXT | BRW_NEW_FRAGMENT_PROGRAM,
.cache = 0,
},
.emit = gen8_upload_ps_blend
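The new dirty bit is needed because gen8_upload_ps_blend() now consults brw_color_buffer_write_enabled(), which reads brw->fragment_program; an atom is only re-run when one of its listed flags is raised, so without BRW_NEW_FRAGMENT_PROGRAM the writeable-RT bit could go stale across fragment program changes. Schematically, the tracked-state scheme works like this (simplified sketch, not the driver's exact loop):

    #include <stdint.h>

    struct brw_context;   /* opaque here */

    /* Simplified sketch of i965 tracked-state atoms: every piece of state an
     * emit() callback reads must appear in its dirty bits, or the packet can
     * be skipped while that state changes.
     */
    struct example_tracked_state {
       struct { uint64_t mesa; uint64_t brw; uint64_t cache; } dirty;
       void (*emit)(struct brw_context *brw);
    };

    static void
    run_atoms(struct brw_context *brw, const struct example_tracked_state *atoms,
              int count, uint64_t mesa, uint64_t brw_bits, uint64_t cache)
    {
       for (int i = 0; i < count; i++) {
          if ((atoms[i].dirty.mesa & mesa) ||
              (atoms[i].dirty.brw & brw_bits) ||
              (atoms[i].dirty.cache & cache))
             atoms[i].emit(brw);   /* something the atom depends on changed */
       }
    }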

View File

@@ -73,16 +73,17 @@ gen8_fs_generator::generate_fb_write(fs_inst *ir)
if (ir->target > 0 && c->key.replicate_alpha) {
/* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
- OR(vec1(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD)),
- vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
- brw_imm_ud(1 << 11));
+ gen8_instruction *inst =
+ OR(get_element_ud(brw_message_reg(ir->base_mrf), 0),
+ vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+ brw_imm_ud(1 << 11));
+ gen8_set_mask_control(inst, BRW_MASK_DISABLE);
}
if (ir->target > 0) {
/* Set the render target index for choosing BLEND_STATE. */
- MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
- BRW_REGISTER_TYPE_UD),
- brw_imm_ud(ir->target));
+ MOV_RAW(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
+ brw_imm_ud(ir->target));
}
}

View File

@@ -254,7 +254,8 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.EXT_framebuffer_multisample = true;
ctx->Extensions.EXT_transform_feedback = true;
- ctx->Extensions.EXT_framebuffer_multisample_blit_scaled = true;
+ if (brw->gen < 8)
+ ctx->Extensions.EXT_framebuffer_multisample_blit_scaled = true;
ctx->Extensions.ARB_blend_func_extended = !driQueryOptionb(&brw->optionCache, "disable_blend_func_extended");
ctx->Extensions.ARB_draw_buffers_blend = true;
ctx->Extensions.ARB_ES3_compatibility = true;

View File

@@ -865,6 +865,8 @@ intel_blit_framebuffer(struct gl_context *ctx,
GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
GLbitfield mask, GLenum filter)
{
struct brw_context *brw = brw_context(ctx);
/* Page 679 of OpenGL 4.4 spec says:
* "Added BlitFramebuffer to commands affected by conditional rendering in
* section 10.10 (Bug 9562)."
@@ -872,14 +874,14 @@ intel_blit_framebuffer(struct gl_context *ctx,
if (!_mesa_check_conditional_render(ctx))
return;
- mask = brw_blorp_framebuffer(brw_context(ctx),
+ mask = brw_blorp_framebuffer(brw,
srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1,
mask, filter);
if (mask == 0x0)
return;
- if (mask & GL_STENCIL_BUFFER_BIT) {
+ if (brw->gen >= 8 && (mask & GL_STENCIL_BUFFER_BIT)) {
brw_meta_fbo_stencil_blit(brw_context(ctx),
srcX0, srcY0, srcX1, srcY1,
dstX0, dstY0, dstX1, dstY1);
@@ -896,11 +898,17 @@ intel_blit_framebuffer(struct gl_context *ctx,
if (mask == 0x0)
return;
+ mask = _mesa_meta_BlitFramebuffer(ctx,
+ srcX0, srcY0, srcX1, srcY1,
+ dstX0, dstY0, dstX1, dstY1,
+ mask, filter);
+ if (mask == 0x0)
+ return;
- _mesa_meta_BlitFramebuffer(ctx,
- srcX0, srcY0, srcX1, srcY1,
- dstX0, dstY0, dstX1, dstY1,
- mask, filter);
+ _swrast_BlitFramebuffer(ctx,
+ srcX0, srcY0, srcX1, srcY1,
+ dstX0, dstY0, dstX1, dstY1,
+ mask, filter);
}
/**

View File

@@ -155,5 +155,5 @@ nouveau_driver_functions_init(struct dd_function_table *functions)
functions->DrawPixels = _mesa_meta_DrawPixels;
functions->CopyPixels = _mesa_meta_CopyPixels;
functions->Bitmap = _mesa_meta_Bitmap;
- functions->BlitFramebuffer = _mesa_meta_BlitFramebuffer;
+ functions->BlitFramebuffer = _mesa_meta_and_swrast_BlitFramebuffer;
}

View File

@@ -873,7 +873,7 @@ void radeon_fbo_init(struct radeon_context *radeon)
radeon->glCtx.Driver.RenderTexture = radeon_render_texture;
radeon->glCtx.Driver.FinishRenderTexture = radeon_finish_render_texture;
radeon->glCtx.Driver.ValidateFramebuffer = radeon_validate_framebuffer;
- radeon->glCtx.Driver.BlitFramebuffer = _mesa_meta_BlitFramebuffer;
+ radeon->glCtx.Driver.BlitFramebuffer = _mesa_meta_and_swrast_BlitFramebuffer;
radeon->glCtx.Driver.EGLImageTargetRenderbufferStorage =
radeon_image_target_renderbuffer_storage;
}

View File

@@ -841,7 +841,7 @@ xmesa_init_driver_functions( XMesaVisual xmvisual,
if (TEST_META_FUNCS) {
driver->Clear = _mesa_meta_Clear;
driver->CopyPixels = _mesa_meta_CopyPixels;
- driver->BlitFramebuffer = _mesa_meta_BlitFramebuffer;
+ driver->BlitFramebuffer = _mesa_meta_and_swrast_BlitFramebuffer;
driver->DrawPixels = _mesa_meta_DrawPixels;
driver->Bitmap = _mesa_meta_Bitmap;
}

View File

@@ -500,6 +500,12 @@ _mesa_framebuffer_renderbuffer(struct gl_context *ctx,
}
else {
remove_attachment(ctx, att);
if (attachment == GL_DEPTH_STENCIL_ATTACHMENT) {
/* detach stencil (depth was detached above) */
att = get_attachment(ctx, fb, GL_STENCIL_ATTACHMENT_EXT);
assert(att);
remove_attachment(ctx, att);
}
}
invalidate_framebuffer(fb);
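The added branch fixes detaching a combined attachment: passing renderbuffer 0 for GL_DEPTH_STENCIL_ATTACHMENT appears to have removed only the depth side, leaving the stencil attachment in place. From the API, the affected sequence looks like this (sketch, assuming a bound framebuffer object and a current context):

    /* Attach, then fully detach, a packed depth/stencil renderbuffer. */
    GLuint rb;
    glGenRenderbuffers(1, &rb);
    glBindRenderbuffer(GL_RENDERBUFFER, rb);
    glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8, 256, 256);

    glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
                              GL_RENDERBUFFER, rb);

    /* With this change, the call below detaches both depth and stencil. */
    glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT,
                              GL_RENDERBUFFER, 0);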

View File

@@ -847,6 +847,16 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
v->value_int = ctx->Array.VAO->IndexBufferObj->Name;
break;
/* ARB_vertex_array_bgra */
case GL_COLOR_ARRAY_SIZE:
array = &ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_COLOR0];
v->value_int = array->Format == GL_BGRA ? GL_BGRA : array->Size;
break;
case GL_SECONDARY_COLOR_ARRAY_SIZE:
array = &ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_COLOR1];
v->value_int = array->Format == GL_BGRA ? GL_BGRA : array->Size;
break;
/* ARB_copy_buffer */
case GL_COPY_READ_BUFFER:
v->value_int = ctx->CopyReadBuffer->Name;
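These LOC_CUSTOM getters implement the ARB_vertex_array_bgra side of glGet: when the color array was specified with size GL_BGRA, the query must report GL_BGRA rather than a numeric component count. A small compatibility-profile sketch of the behaviour being queried (assuming a current context):

    /* Specify a BGRA color array, then read the size back. */
    static const GLubyte colors[4 * 4];   /* 4 vertices, BGRA bytes */
    GLint size = 0;

    glEnableClientState(GL_COLOR_ARRAY);
    glColorPointer(GL_BGRA, GL_UNSIGNED_BYTE, 0, colors);
    glGetIntegerv(GL_COLOR_ARRAY_SIZE, &size);
    /* size is now GL_BGRA (0x80E1) rather than 4. */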

View File

@@ -52,7 +52,7 @@ def print_header():
(prime_factor, prime_step)
def print_params(params):
print "static struct value_desc values[] = {"
print "static const struct value_desc values[] = {"
for p in params:
print " { %s, %s }," % (p[0], p[1])

View File

@@ -199,7 +199,7 @@ descriptor=[
[ "NORMAL_ARRAY_TYPE", "ARRAY_ENUM(VertexAttrib[VERT_ATTRIB_NORMAL].Type), NO_EXTRA" ],
[ "NORMAL_ARRAY_STRIDE", "ARRAY_INT(VertexAttrib[VERT_ATTRIB_NORMAL].Stride), NO_EXTRA" ],
[ "COLOR_ARRAY", "ARRAY_BOOL(VertexAttrib[VERT_ATTRIB_COLOR0].Enabled), NO_EXTRA" ],
[ "COLOR_ARRAY_SIZE", "ARRAY_INT(VertexAttrib[VERT_ATTRIB_COLOR0].Size), NO_EXTRA" ],
[ "COLOR_ARRAY_SIZE", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
[ "COLOR_ARRAY_TYPE", "ARRAY_ENUM(VertexAttrib[VERT_ATTRIB_COLOR0].Type), NO_EXTRA" ],
[ "COLOR_ARRAY_STRIDE", "ARRAY_INT(VertexAttrib[VERT_ATTRIB_COLOR0].Stride), NO_EXTRA" ],
[ "TEXTURE_COORD_ARRAY", "LOC_CUSTOM, TYPE_BOOLEAN, offsetof(struct gl_client_array, Enabled), NO_EXTRA" ],
@@ -552,7 +552,7 @@ descriptor=[
[ "SECONDARY_COLOR_ARRAY", "ARRAY_BOOL(VertexAttrib[VERT_ATTRIB_COLOR1].Enabled), NO_EXTRA" ],
[ "SECONDARY_COLOR_ARRAY_TYPE", "ARRAY_ENUM(VertexAttrib[VERT_ATTRIB_COLOR1].Type), NO_EXTRA" ],
[ "SECONDARY_COLOR_ARRAY_STRIDE", "ARRAY_INT(VertexAttrib[VERT_ATTRIB_COLOR1].Stride), NO_EXTRA" ],
[ "SECONDARY_COLOR_ARRAY_SIZE", "ARRAY_INT(VertexAttrib[VERT_ATTRIB_COLOR1].Size), NO_EXTRA" ],
[ "SECONDARY_COLOR_ARRAY_SIZE", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
# GL_EXT_fog_coord
[ "CURRENT_FOG_COORDINATE", "CONTEXT_FLOAT(Current.Attrib[VERT_ATTRIB_FOG][0]), extra_flush_current" ],

View File

@@ -679,32 +679,36 @@ st_translate_fragment_program(struct st_context *st,
outputsWritten &= ~(1 << FRAG_RESULT_STENCIL);
}
+ if (outputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
+ fs_output_semantic_name[fs_num_outputs] = TGSI_SEMANTIC_SAMPLEMASK;
+ fs_output_semantic_index[fs_num_outputs] = 0;
+ outputMapping[FRAG_RESULT_SAMPLE_MASK] = fs_num_outputs;
+ fs_num_outputs++;
+ outputsWritten &= ~(1 << FRAG_RESULT_SAMPLE_MASK);
+ }
/* handle remaining outputs (color) */
for (attr = 0; attr < FRAG_RESULT_MAX; attr++) {
if (outputsWritten & BITFIELD64_BIT(attr)) {
- int semantic = TGSI_SEMANTIC_COLOR;
switch (attr) {
case FRAG_RESULT_DEPTH:
case FRAG_RESULT_STENCIL:
+ case FRAG_RESULT_SAMPLE_MASK:
/* handled above */
assert(0);
break;
case FRAG_RESULT_COLOR:
- write_all = GL_TRUE;
- break;
- case FRAG_RESULT_SAMPLE_MASK:
- semantic = TGSI_SEMANTIC_SAMPLEMASK;
write_all = GL_TRUE; /* fallthrough */
default:
+ assert(attr == FRAG_RESULT_COLOR ||
+ (FRAG_RESULT_DATA0 <= attr && attr < FRAG_RESULT_MAX));
+ fs_output_semantic_name[fs_num_outputs] = TGSI_SEMANTIC_COLOR;
+ fs_output_semantic_index[fs_num_outputs] = numColors;
+ outputMapping[attr] = fs_num_outputs;
+ numColors++;
+ break;
}
- assert(attr == FRAG_RESULT_COLOR ||
- attr == FRAG_RESULT_SAMPLE_MASK ||
- (FRAG_RESULT_DATA0 <= attr && attr < FRAG_RESULT_MAX));
- fs_output_semantic_name[fs_num_outputs] = semantic;
- fs_output_semantic_index[fs_num_outputs] = numColors;
- outputMapping[attr] = fs_num_outputs;
- numColors++;
fs_num_outputs++;
}
}