mesa: Bump version number to 8.0 (final)

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
mesa: fix maximum allowed proxy texture size condition
2012-02-09 14:26:15 -08:00 · 2012-02-09 13:16:07 -08:00 · 2012-02-07 16:18:32 -08:00 · 2012-02-07 16:18:32 -08:00 · 2012-02-07 16:17:33 -08:00 · 2012-02-07 10:26:17 -08:00
231 changed files with 4446 additions and 9587 deletions
--- a/2
+++ b/2
@@ -184,7 +184,7 @@ ultrix-gcc:

 # Rules for making release tarballs

-PACKAGE_VERSION=8.0-devel
+PACKAGE_VERSION=8.0
 PACKAGE_DIR = Mesa-$(PACKAGE_VERSION)
 PACKAGE_NAME = MesaLib-$(PACKAGE_VERSION)

--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -70,7 +70,6 @@ INTEL_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_intel)
 NOUVEAU_LIBS = $(shell $(PKG_CONFIG) --libs libdrm_nouveau)
 NOUVEAU_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_nouveau)

-LIBDRM_RADEON_LIBS = $(shell $(PKG_CONFIG) --libs libdrm_radeon)
-LIBDRM_RADEON_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_radeon)
-RADEON_CFLAGS = "-DHAVE_LIBDRM_RADEON=1 $(LIBDRM_RADEON_CFLAGS)"
+RADEON_LIBS = $(shell $(PKG_CONFIG) --libs libdrm_radeon)
+RADEON_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_radeon)
 RADEON_LDFLAGS = $(LIBDRM_RADEON_LIBS)
--- a/configure.ac
+++ b/configure.ac
@@ -88,13 +88,13 @@ AC_COMPILE_IFELSE(
       not clang
 #endif
 ]])],
-[CLANG=yes], [CLANG=no])
+[acv_mesa_CLANG=yes], [acv_mesa_CLANG=no])

-AC_MSG_RESULT([$CLANG])
+AC_MSG_RESULT([$acv_mesa_CLANG])

 dnl If we're using GCC, make sure that it is at least version 3.3.0.  Older
 dnl versions are explictly not supported.
-if test "x$GCC" = xyes -a "x$CLANG" = xno; then
+if test "x$GCC" = xyes -a "x$acv_mesa_CLANG" = xno; then
    AC_MSG_CHECKING([whether gcc version is sufficient])
    major=0
    minor=0
@@ -662,7 +662,7 @@ AC_ARG_ENABLE([gallium_gbm],
    [enable_gallium_gbm=auto])

 # Option for Gallium drivers
-GALLIUM_DRIVERS_DEFAULT="r300,r600,swrast"
+GALLIUM_DRIVERS_DEFAULT="r300,r600,svga,swrast"

 AC_ARG_WITH([gallium-drivers],
    [AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
@@ -1566,7 +1566,7 @@ if test "x$enable_gallium_g3dvl" = xyes; then
 fi

 if test "x$enable_xvmc" = xyes; then
-    PKG_CHECK_MODULES([XVMC], [xvmc >= 1.0.6 xorg-server])
+    PKG_CHECK_MODULES([XVMC], [xvmc >= 1.0.6])
    GALLIUM_STATE_TRACKERS_DIRS="$GALLIUM_STATE_TRACKERS_DIRS xorg/xvmc"
    HAVE_ST_XVMC="yes"
 fi
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -63,6 +63,7 @@ a:visited {
 <LI><A HREF="extensions.html" target="MainFrame">Mesa Extensions</A>
 <LI><A HREF="mangling.html" target="MainFrame">Function Name Mangling</A>
 <LI><A href="llvmpipe.html" target="MainFrame">Gallium llvmpipe driver</A>
+<LI><A href="vmware-guest.html" target="MainFrame">VMware SVGA3D guest driver</a>
 <LI><A href="postprocess.html" target="MainFrame">Gallium post-processing</A>
 <LI><A href="viewperf.html" target="MainFrame">Viewperf Issues</A>
 </ul>
--- a/docs/intro.html
+++ b/docs/intro.html
@@ -132,12 +132,26 @@ June 2007: Mesa 7.0 is released, implementing the OpenGL 2.1 specification
 and OpenGL Shading Language.
 </p>

+<p>
+2008: Keith Whitwell and other Tungsten Graphics employees develop
+<a href="http://en.wikipedia.org/wiki/Gallium3D"  target="_parent">Gallium</a>
+- a new GPU abstraction layer.  The latest Mesa drivers are based on
+Gallium and other APIs such as OpenVG are implemented on top of Gallium.
+</p>

 <p>
-Ongoing: Mesa is used as the core of many hardware OpenGL drivers for
-the XFree86 and X.org X servers within the
-<A href="http://dri.freedesktop.org/" target="_parent">DRI project</A>.
-I continue to enhance Mesa with new extensions and features.
+February 2012: Mesa 8.0 is released, implementing the OpenGL 3.0 specification
+and version 1.30 of the OpenGL Shading Language.
+</p>
+
+<p>
+Ongoing: Mesa is the OpenGL implementation for several types of hardware
+made by Intel, AMD and NVIDIA, plus the VMware virtual GPU.
+There are also several software-based renderers: swrast (the legacy
+Mesa rasterizer), softpipe (a gallium reference driver) and llvmpipe
+(LLVM/JIT-based high-speed rasterizer).
+Work continues on the drivers and core Mesa to implement newer versions
+of the OpenGL specification.
 </p>


@@ -151,6 +165,15 @@ of the OpenGL specification is implemented.
 </p>


+<H2>Version 8.x features</H2>
+<p>
+Version 8.x of Mesa implements the OpenGL 3.0 API.
+The developers at Intel deserve a lot of credit for implementing most
+of the OpenGL 3.0 features in core Mesa, the GLSL compiler as well as
+the i965 driver.
+</p>
+
+
 <H2>Version 7.x features</H2>
 <p>
 Version 7.x of Mesa implements the OpenGL 2.1 API.  The main feature
--- a/docs/relnotes-8.0.html
+++ b/docs/relnotes-8.0.html
@@ -10,7 +10,7 @@

 <body bgcolor="#eeeeee">

-<H1>Mesa 8.0 Release Notes / (release date TBD)</H1>
+<H1>Mesa 8.0 Release Notes / February 9, 2012</H1>

 <p>
 Mesa 8.0 is a new development release.
--- a/docs/vmware-guest.html
+++ b/docs/vmware-guest.html
@@ -0,0 +1,194 @@
+<html>
+
+<title>VMware guest GL driver</title>
+
+<link rel="stylesheet" type="text/css" href="mesa.css"></head>
+
+<body>
+
+
+<h1>VMware guest GL driver</h1>
+
+<p>
+This page describes how to build, install and use the VMware guest GL driver
+(aka the SVGA or SVGA3D driver) for Linux using the latest source code.
+This driver gives a Linux virtual machine access to the host's GPU for
+hardware-accelerated 3D.
+VMware Workstation running on Linux or Windows and VMware Fusion running on
+MacOS are all supported.
+</p>
+
+<p>
+End users shouldn't have to go through all these steps once the driver is
+included in newer Linux distributions.
+</p>
+
+<p>
+For more information about the X components see these wiki pages at x.org:
+</p>
+<ul>
+<li><a href="http://wiki.x.org/wiki/vmware" target="_parent">
+Driver Overview</a>
+<li><a href="http://wiki.x.org/wiki/vmware/vmware3D" target="_parent">
+xf86-video-vmware Details</a>
+</ul>
+
+
+<h2>Components</h2>
+
+The components involved in this include:
+<ul>
+<li>Linux kernel module: vmwgfx
+<li>X server 2D driver: xf86-video-vmware
+<li>User-space libdrm library
+<li>Mesa/gallium OpenGL driver: "svga"
+</ul>
+
+
+<h2>Prerequisites</h2>
+
+<ul>
+<li>Kernel version at least 2.6.25 
+<li>Xserver version at least 1.7 
+<li>Ubuntu: For Ubuntu you need to install a number of build dependencies.
+  <pre>
+  sudo apt-get install git-core
+  sudo apt-get install automake libtool libpthread-stubs0-dev
+  sudo apt-get install xserver-xorg-dev x11proto-xinerama-dev
+  sudo apt-get build-dep libgl1-mesa-dri libxcb-glx0-dev
+  </pre>
+<li>Fedora: For Fedora you also need to install a number of build dependencies. 
+  <pre>
+  sudo yum install mesa-libGL-devel xorg-x11-server-devel xorg-x11-util-macros
+  sudo yum install automake gcc libtool expat-devel kernel-devel git-core
+  </pre>
+</ul>
+
+<p>
+Depending on your Linux distro, other packages may be needed.
+The configure scripts should tell you what's missing.
+</p>
+
+
+
+<h2>Getting the Latest Source Code</h2>
+
+Begin by saving your current directory location:
+  <pre>
+  export TOP=$PWD
+  </pre>
+
+<ul>
+<li>Mesa/Gallium master branch. This code is used to build libGL, and the direct rendering svga driver for libGL, vmwgfx_dri.so, and the X acceleration library libxatracker.so.x.x.x. 
+  <pre>
+  git clone git://anongit.freedesktop.org/git/mesa/mesa
+  </pre>
+<li>VMware Linux guest kernel module. Note that this repo contains the complete DRM and TTM code. The vmware-specific driver is really only the files prefixed with vmwgfx. 
+  <pre>
+  git clone git://anongit.freedesktop.org/git/mesa/vmwgfx
+  </pre>
+
+<li>libdrm, a user-space library that interfaces with drm. Most distros ship with this library. The safest bet is really to replace the system one. Optionally you can point LIBDRM_CFLAGS and LIBDRM_LIBS to the libdrm-2.4.22 package in toolchain. But here, we replace:
+  <pre>
+  git clone git://anongit.freedesktop.org/git/mesa/drm
+  </pre>
+<li>xf86-video-vmware. The chainloading driver, vmware_drv.so, the legacy driver vmwlegacy_drv.so, and the vmwgfx driver vmwgfx_drv.so. 
+  <pre>
+  git clone git://anongit.freedesktop.org/git/xorg/driver/xf86-video-vmware
+  </pre>
+</ul>
+
+
+<h2>Building the Code</h2>
+
+<ul>
+<li>Build libdrm: If you're on a 32-bit system, you should skip the --libdir configure option. Note also the comment about toolchain libdrm above. 
+  <pre>
+  cd $TOP/drm
+  ./autogen.sh --prefix=/usr --enable-vmwgfx-experimental-api --libdir=/usr/lib64
+  make
+  sudo make install
+  </pre>
+<li>Build Mesa and the vmwgfx_dri.so driver, the vmwgfx_drv.so xorg driver, and the X acceleration library libxatracker.
+The vmwgfx_dri.so is used by the OpenGL libraries during direct rendering,
+and by the Xorg server during accelerated indirect GL rendering.
+The libxatracker library is used exclusively by the X server to do render,
+copy and video acceleration:
+<br>
+The following configure options don't build the EGL system.
+<br>
+As before, if you're on a 32-bit system, you should skip the --libdir
+configure option.
+  <pre>
+  cd $TOP/mesa
+  ./autogen.sh --prefix=/usr --libdir=/usr/lib64 --with-gallium-drivers=svga --with-dri-drivers= --enable-xa
+  make
+  sudo make install
+  </pre>
+
+Note that you may have to install other packages that Mesa depends upon
+if they're not installed in your system.  You should be told what's missing.
+<br>
+<br>
+
+<li>xf86-video-vmware: Now, once libxatracker is installed, we proceed with building and replacing the current Xorg driver. First check if your system is 32- or 64-bit. If you're building for a 32-bit system, you will not be needing the --libdir=/usr/lib64 option to autogen. 
+  <pre>
+  cd $TOP/xf86-video-vmware
+  ./autogen.sh --prefix=/usr --libdir=/usr/lib64
+  make
+  sudo make install
+  </pre>
+<li>vmwgfx kernel module. First make sure that any old version of this kernel module is removed from the system by issuing
+  <pre>
+  sudo rm /lib/modules/`uname -r`/kernel/drivers/gpu/drm/vmwgfx.ko*
+  </pre>
+Then 
+  <pre>
+  cd $TOP/vmwgfx
+  make
+  sudo make install
+  sudo cp 00-vmwgfx.rules /etc/udev/rules.d
+  sudo depmod -ae
+  </pre>
+</ul>
+
+
+Now try to load the kernel module by issuing
+  <pre>
+  sudo modprobe vmwgfx</pre>
+Then type 
+  <pre>
+  dmesg</pre>
+to watch the debug output. It should contain a number of lines prefixed with "[vmwgfx]". 
+
+<p>
+Then restart the Xserver (or reboot).
+The lines starting with "vmwlegacy" or "VMWARE" in the file /var/log/Xorg.0.log
+should now have been replaced with lines starting with "vmwgfx", indicating that
+the new Xorg driver is in use. 
+</p>
+
+
+<h2>Running OpenGL Programs</h2>
+
+<p>
+In a shell, run 'glxinfo' and look for the following to verify that the
+driver is working:
+</p>
+
+<pre>
+OpenGL vendor string: VMware, Inc.
+OpenGL renderer string: Gallium 0.4 on SVGA3D; build: RELEASE;
+OpenGL version string: 2.1 Mesa 8.0
+</pre>
+
+If you don't see this, try setting this environment variable:
+  <pre>
+  export LIBGL_DEBUG=verbose</pre>
+<p>then rerun glxinfo and examine the output for error messages.
+</p>
+
+
+
+</body>
+</html>
--- a/src/gallium/auxiliary/postprocess/postprocess.h
+++ b/src/gallium/auxiliary/postprocess/postprocess.h
@@ -72,8 +72,7 @@ void pp_free(struct pp_queue_t *);
 void pp_free_fbos(struct pp_queue_t *);
 void pp_debug(const char *, ...);
 struct program *pp_init_prog(struct pp_queue_t *, struct pipe_screen *);
-void pp_init_fbos(struct pp_queue_t *, unsigned int, unsigned int,
-                  struct pipe_resource *);
+void pp_init_fbos(struct pp_queue_t *, unsigned int, unsigned int);

 /* The filters */

--- a/src/gallium/auxiliary/postprocess/pp_init.c
+++ b/src/gallium/auxiliary/postprocess/pp_init.c
@@ -195,7 +195,7 @@ pp_debug(const char *fmt, ...)
 /** Allocate the temp FBOs. Called on makecurrent and resize. */
 void
 pp_init_fbos(struct pp_queue_t *ppq, unsigned int w,
-             unsigned int h, struct pipe_resource *indepth)
+             unsigned int h)
 {

   struct program *p = ppq->p;  /* The lazy will inherit the earth */
@@ -242,11 +242,7 @@ pp_init_fbos(struct pp_queue_t *ppq, unsigned int w,
         goto error;
   }

-   tmp_res.format = p->surf.format = indepth->format;
   tmp_res.bind = p->surf.usage = PIPE_BIND_DEPTH_STENCIL;
-   ppq->depth = indepth;
-   if (!ppq->depth)
-      goto error;

   tmp_res.format = p->surf.format = PIPE_FORMAT_S8_UINT_Z24_UNORM;

--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -42,14 +42,14 @@ void
 pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
       struct pipe_resource *out, struct pipe_resource *indepth)
 {
-
+   struct pipe_resource *refin = NULL, *refout = NULL;
   unsigned int i;

   if (in->width0 != ppq->p->framebuffer.width ||
       in->height0 != ppq->p->framebuffer.height) {
      pp_debug("Resizing the temp pp buffers\n");
      pp_free_fbos(ppq);
-      pp_init_fbos(ppq, in->width0, in->height0, indepth);
+      pp_init_fbos(ppq, in->width0, in->height0);
   }

   if (in == out && ppq->n_filters == 1) {
@@ -64,6 +64,11 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
      in = ppq->tmp[0];
   }

+   // Kept only for this frame.
+   pipe_resource_reference(&ppq->depth, indepth);
+   pipe_resource_reference(&refin, in);
+   pipe_resource_reference(&refout, out);
+
   switch (ppq->n_filters) {
   case 1:                     /* No temp buf */
      ppq->pp_queue[0] (ppq, in, out, 0);
@@ -93,6 +98,10 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,

      break;
   }
+
+   pipe_resource_reference(&ppq->depth, NULL);
+   pipe_resource_reference(&refin, NULL);
+   pipe_resource_reference(&refout, NULL);
 }


--- a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
@@ -938,6 +938,7 @@ nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
   }
   info->io.clipDistance = 0xff;
   info->io.pointSize = 0xff;
+   info->io.vertexId = 0xff;
   info->io.edgeFlagIn = 0xff;
   info->io.edgeFlagOut = 0xff;
   info->io.fragDepth = 0xff;
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
@@ -42,6 +42,7 @@ struct nv50_ir_varying
   unsigned mask     : 4; /* vec4 mask */
   unsigned linear   : 1; /* linearly interpolated if true (and not flat) */
   unsigned flat     : 1;
+   unsigned sc       : 1; /* special colour interpolation mode (SHADE_MODEL) */
   unsigned centroid : 1;
   unsigned patch    : 1; /* patch constant value */
   unsigned regular  : 1; /* driver-specific meaning (e.g. input in sreg) */
@@ -155,6 +156,7 @@ struct nv50_ir_prog_info
      uint8_t cullDistanceMask;  /* clip distance mode (1 bit per output) */
      int8_t genUserClip;        /* request user clip planes for ClipVertex */
      uint8_t pointSize;         /* output index for PointSize */
+      uint8_t vertexId;          /* system value index of VertexID */
      uint8_t edgeFlagIn;
      uint8_t edgeFlagOut;
      uint8_t fragDepth;         /* output index of FragDepth */
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
@@ -817,9 +817,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
               case TGSI_INTERPOLATE_CONSTANT:
                  info->in[i].flat = 1;
                  break;
+               case TGSI_INTERPOLATE_COLOR:
+                  info->in[i].sc = 1;
+                  break;
               case TGSI_INTERPOLATE_LINEAR:
-                  if (sn != TGSI_SEMANTIC_COLOR) // GL_NICEST
-                     info->in[i].linear = 1;
+                  info->in[i].linear = 1;
                  break;
               default:
                  break;
@@ -864,6 +866,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
      }
      break;
   case TGSI_FILE_SYSTEM_VALUE:
+      switch (sn) {
+      case TGSI_SEMANTIC_VERTEXID:
+         info->io.vertexId = first;
+         break;
+      default:
+         break;
+      }
      for (i = first; i <= last; ++i, ++si) {
         info->sv[i].sn = sn;
         info->sv[i].si = si;
@@ -1134,7 +1143,7 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
 static inline uint8_t
 translateInterpMode(const struct nv50_ir_varying *var, operation& op)
 {
-   uint8_t mode;
+   uint8_t mode = NV50_IR_INTERP_PERSPECTIVE;

   if (var->flat)
      mode = NV50_IR_INTERP_FLAT;
@@ -1142,9 +1151,11 @@ translateInterpMode(const struct nv50_ir_varying *var, operation& op)
   if (var->linear)
      mode = NV50_IR_INTERP_LINEAR;
   else
-      mode = NV50_IR_INTERP_PERSPECTIVE;
+   if (var->sc)
+      mode = NV50_IR_INTERP_SC;

-   op = (mode == NV50_IR_INTERP_PERSPECTIVE) ? OP_PINTERP : OP_LINTERP;
+   op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC)
+      ? OP_PINTERP : OP_LINTERP;

   if (var->centroid)
      mode |= NV50_IR_INTERP_CENTROID;
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -703,11 +703,6 @@ NVC0LoweringPass::visit(Instruction *i)
         assert(prog->getType() != Program::TYPE_FRAGMENT);
      }
      break;
-   case OP_PINTERP:
-      if (i->getSrc(0)->reg.data.offset >= 0x280 &&
-          i->getSrc(0)->reg.data.offset <  0x2c0)
-         i->setInterpolate(i->getSampleMode() | NV50_IR_INTERP_SC);
-      break;
   default:
      break;
   }   
--- a/src/gallium/drivers/nvc0/nvc0_3d.xml.h
+++ b/src/gallium/drivers/nvc0/nvc0_3d.xml.h
@@ -913,6 +913,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT			0x04000000
 #define NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT			0x08000000

+#define NVC0_3D_VERTEX_ID_REPLACE				0x0000161c
+#define NVC0_3D_VERTEX_ID_REPLACE_ENABLE			0x00000001
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE__MASK			0x00000ff0
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE__SHIFT			4
+
 #define NVC0_3D_VERTEX_DATA					0x00001640

 #define NVC0_3D_PRIM_RESTART_ENABLE				0x00001644
--- a/src/gallium/drivers/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nvc0/nvc0_context.h
@@ -134,9 +134,6 @@ struct nvc0_context {
   struct draw_context *draw;
 };

-#define NVC0_USING_EDGEFLAG(ctx) \
-   ((ctx)->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)
-
 static INLINE struct nvc0_context *
 nvc0_context(struct pipe_context *pipe)
 {
--- a/src/gallium/drivers/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nvc0/nvc0_program.c
@@ -107,7 +107,7 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)

   for (n = 0, i = 0; i < info->numInputs; ++i) {
      switch (info->in[i].sn) {
-      case TGSI_SEMANTIC_INSTANCEID:
+      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
      case TGSI_SEMANTIC_VERTEXID:
         info->in[i].mask = 0x1;
         info->in[i].slot[0] =
@@ -580,7 +580,11 @@ nvc0_program_translate(struct nvc0_program *prog)
   prog->relocs = info->bin.relocData;
   prog->max_gpr = MAX2(4, (info->bin.maxGPR + 1));

-   prog->vp.edgeflag = PIPE_MAX_ATTRIBS;
+   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+
+   if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
+      info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
+   prog->vp.edgeflag = info->io.edgeFlagIn;

   switch (prog->type) {
   case PIPE_SHADER_VERTEX:
--- a/src/gallium/drivers/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nvc0/nvc0_program.h
@@ -37,8 +37,9 @@ struct nvc0_program {
   struct {
      uint32_t clip_mode; /* clip/cull selection */
      uint8_t clip_enable; /* mask of defined clip planes */
-      uint8_t edgeflag;
      uint8_t num_ucps; /* also set to max if ClipDistance is used */
+      uint8_t edgeflag; /* attribute index of edgeflag input */
+      boolean need_vertex_id;
   } vp;
   struct {
      uint8_t early_z;
--- a/src/gallium/drivers/nvc0/nvc0_push.c
+++ b/src/gallium/drivers/nvc0/nvc0_push.c
@@ -21,6 +21,7 @@ struct push_context {
   struct translate *translate;

   boolean primitive_restart;
+   boolean need_vertex_id;
   uint32_t prim;
   uint32_t restart_index;
   uint32_t instance_id;
@@ -42,22 +43,23 @@ init_push_context(struct nvc0_context *nvc0, struct push_context *ctx)
   ctx->chan = nvc0->screen->base.channel;
   ctx->translate = nvc0->vertex->translate;

+   if (likely(nvc0->vertex->num_elements < 32))
+      ctx->need_vertex_id = nvc0->vertprog->vp.need_vertex_id;
+   else
+      ctx->need_vertex_id = FALSE;
+
+   ctx->edgeflag.buffer = -1;
   ctx->edgeflag.value = 0.5f;

-   if (NVC0_USING_EDGEFLAG(nvc0)) {
+   if (unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
      ve = &nvc0->vertex->element[nvc0->vertprog->vp.edgeflag].pipe;
-
      ctx->edgeflag.buffer = ve->vertex_buffer_index;
      ctx->edgeflag.offset = ve->src_offset;
-
      ctx->packet_vertex_limit = 1;
   } else {
-      ctx->edgeflag.buffer = -1;
-      ctx->edgeflag.offset = 0;
-      ctx->edgeflag.stride = 0;
-      ctx->edgeflag.data = NULL;
-
      ctx->packet_vertex_limit = nvc0->vertex->vtx_per_packet_max;
+      if (unlikely(ctx->need_vertex_id))
+         ctx->packet_vertex_limit = 1;
   }

   ctx->vertex_words = nvc0->vertex->vtx_size;
@@ -74,6 +76,17 @@ set_edgeflag(struct push_context *ctx, unsigned vtx_id)
   }
 }

+static INLINE void
+set_vertexid(struct push_context *ctx, uint32_t vtx_id)
+{
+#if 0
+   BEGIN_RING(ctx->chan, RING_3D(VERTEX_ID), 1); /* direct VERTEX_ID method: broken on nvc0 */
+#else
+   BEGIN_RING(ctx->chan, RING_3D(VERTEX_DATA), 1); /* so push the id as last attribute */
+#endif
+   OUT_RING  (ctx->chan, vtx_id); /* the single data word announced above */
+}
+
 static INLINE unsigned
 prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
 {
@@ -117,7 +130,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
      if (ctx->primitive_restart)
         nr = prim_restart_search_i08(elts, push, ctx->restart_index);

-      if (unlikely(ctx->edgeflag.buffer >= 0) && nr)
+      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;
@@ -126,8 +139,11 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)

      ctx->translate->run_elts8(ctx->translate, elts, nr, ctx->instance_id,
                                ctx->chan->cur);
-
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id) && likely(size))
+         set_vertexid(ctx, elts[0]);
+
      count -= nr;
      elts += nr;

@@ -155,7 +171,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
      if (ctx->primitive_restart)
         nr = prim_restart_search_i16(elts, push, ctx->restart_index);

-      if (unlikely(ctx->edgeflag.buffer >= 0) && nr)
+      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;
@@ -164,8 +180,11 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)

      ctx->translate->run_elts16(ctx->translate, elts, nr, ctx->instance_id,
                                 ctx->chan->cur);
-
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id))
+         set_vertexid(ctx, elts[0]);
+
      count -= nr;
      elts += nr;

@@ -193,7 +212,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
      if (ctx->primitive_restart)
         nr = prim_restart_search_i32(elts, push, ctx->restart_index);

-      if (unlikely(ctx->edgeflag.buffer >= 0) && nr)
+      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;
@@ -202,8 +221,11 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)

      ctx->translate->run_elts(ctx->translate, elts, nr, ctx->instance_id,
                               ctx->chan->cur);
-
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id))
+         set_vertexid(ctx, elts[0]);
+
      count -= nr;
      elts += nr;

@@ -233,6 +255,10 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
      ctx->translate->run(ctx->translate, start, push, ctx->instance_id,
                          ctx->chan->cur);
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id))
+         set_vertexid(ctx, start);
+
      count -= push;
      start += push;
   }
@@ -326,6 +352,16 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   ctx.instance_id = info->start_instance;
   ctx.prim = nvc0_prim_gl(info->mode);

+   if (unlikely(ctx.need_vertex_id)) {
+      const unsigned a = nvc0->vertex->num_elements;
+      BEGIN_RING(ctx.chan, RING_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
+      OUT_RING  (ctx.chan, (a << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT) |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
+      BEGIN_RING(ctx.chan, RING_3D(VERTEX_ID_REPLACE), 1);
+      OUT_RING  (ctx.chan, (((0x80 + a * 0x10) / 4) << 4) | 1);
+   }
+
   while (inst_count--) {
      BEGIN_RING(ctx.chan, RING_3D(VERTEX_BEGIN_GL), 1);
      OUT_RING  (ctx.chan, ctx.prim);
@@ -355,6 +391,16 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   if (unlikely(ctx.edgeflag.value == 0.0f))
      IMMED_RING(ctx.chan, RING_3D(EDGEFLAG_ENABLE), 1);

+   if (unlikely(ctx.need_vertex_id)) {
+      const unsigned a = nvc0->vertex->num_elements;
+      IMMED_RING(ctx.chan, RING_3D(VERTEX_ID_REPLACE), 0);
+      BEGIN_RING(ctx.chan, RING_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
+      OUT_RING  (ctx.chan,
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
+   }
+
   if (info->indexed)
      nouveau_resource_unmap(nv04_resource(nvc0->idxbuf.buffer));

--- a/src/gallium/drivers/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nvc0/nvc0_screen.c
@@ -69,15 +69,14 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
   switch (param) {
   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
-      return 64;
+      return 16 * PIPE_SHADER_TYPES; /* NOTE: should not count COMPUTE */
   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 13;
-   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 10;
   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 13;
+      return 15;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 12;
   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-      return 8192;
+      return 2048;
   case PIPE_CAP_MIN_TEXEL_OFFSET:
      return -8;
   case PIPE_CAP_MAX_TEXEL_OFFSET:
@@ -167,7 +166,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_MAX_INPUTS:
      if (shader == PIPE_SHADER_VERTEX)
         return 32;
-      return 0x300 / 16;
+      if (shader == PIPE_SHADER_FRAGMENT)
+         return (0x200 + 0x20 + 0x80) / 16; /* generic + colors + TexCoords */
+      return (0x200 + 0x40 + 0x80) / 16; /* without 0x60 for per-patch inputs */
   case PIPE_SHADER_CAP_MAX_CONSTS:
      return 65536 / 16;
   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
@@ -191,7 +192,11 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_INTEGERS:
      return 1;
   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+      return 16; /* would be 32 in linked (OpenGL-style) mode */
+      /*
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLER_VIEWS:
      return 32;
+      */
   case PIPE_SHADER_CAP_OUTPUT_READ:
      return 0; /* shader != PIPE_SHADER_TESSELLATION_CONTROL; */
   default:
@@ -208,12 +213,13 @@ nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
      return 10.0f;
   case PIPE_CAPF_MAX_POINT_WIDTH:
+      return 63.0f;
   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
-      return 64.0f;
+      return 63.375f;
   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
      return 16.0f;
   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-      return 4.0f;
+      return 15.0f;
   default:
      NOUVEAU_ERR("unknown PIPE_CAP %d\n", param);
      return 0.0f;
--- a/src/gallium/drivers/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nvc0/nvc0_vbo.c
@@ -263,7 +263,8 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0)
   struct nvc0_vertex_element *ve;
   unsigned i;

-   if (unlikely(vertex->need_conversion || NVC0_USING_EDGEFLAG(nvc0))) {
+   if (unlikely(vertex->need_conversion) ||
+       unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
      nvc0->vbo_fifo = ~0;
      nvc0->vbo_user = 0;
   } else {
--- a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
+++ b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
@@ -547,7 +547,7 @@ static void do_advanced_regalloc(struct regalloc_state * s)
 	struct ra_graph * graph;

 	/* Allocate the main ra data structure */
-	regs = ra_alloc_reg_set(s->C->max_temp_regs * RC_MASK_XYZW);
+	regs = ra_alloc_reg_set(NULL, s->C->max_temp_regs * RC_MASK_XYZW);

 	/* Get list of program variables */
 	variables = rc_get_variables(s->C);
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -38,6 +38,23 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+		/* prepend ALU_EXTENDED if we need more than 2 kcache sets */
+		if (cf->eg_alu_extended) {
+			bc->bytecode[id++] =
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(cf->kcache[2].bank) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(cf->kcache[3].bank) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(cf->kcache[2].mode);
+			bc->bytecode[id++] = EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_EXTENDED |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(cf->kcache[3].mode) |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(cf->kcache[2].addr) |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(cf->kcache[3].addr) |
+				S_SQ_CF_ALU_WORD1_EXT_BARRIER(1);
+		}
+
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
 			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
 			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -78,6 +78,10 @@
 #define   S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) & 0x3) << 30)
 #define   G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) >> 30) & 0x3)
 #define   C_SQ_CF_ALU_WORD0_KCACHE_MODE0                             0x3FFFFFFF
+#define     V_SQ_CF_KCACHE_NOP                                       0x00000000
+#define     V_SQ_CF_KCACHE_LOCK_1                                    0x00000001
+#define     V_SQ_CF_KCACHE_LOCK_2                                    0x00000002
+#define     V_SQ_CF_KCACHE_LOCK_LOOP_INDEX                           0x00000003
 #define P_SQ_CF_ALU_WORD1
 #define   S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) & 0x3) << 0)
 #define   G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) >> 0) & 0x3)
@@ -103,7 +107,50 @@
 #define   S_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) & 0x1) << 31)
 #define   G_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) >> 31) & 0x1)
 #define   C_SQ_CF_ALU_WORD1_BARRIER                                  0x7FFFFFFF
-/* extended TODO */
+
+#define P_SQ_CF_ALU_WORD0_EXT
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) & 0x3) << 4)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) >> 4) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0              0xFFFFFFCF
+#define	    V_SQ_CF_INDEX_NONE                                       0x00
+#define	    V_SQ_CF_INDEX_0                                          0x01
+#define	    V_SQ_CF_INDEX_1                                          0x02
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) & 0x3) << 6)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) >> 6) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1              0xFFFFFF3F
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) & 0x3) << 8)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) >> 8) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2              0xFFFFFCFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) & 0x3) << 10)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) >> 10) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3              0xFFFFF3FF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) & 0xF) << 22)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) >> 22) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2                         0xFC3FFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3                         0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) & 0x3) << 30)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2                         0x3FFFFFFF
+
+#define P_SQ_CF_ALU_WORD1_EXT
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) & 0x3) << 0)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) >> 0) & 0x3)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3                         0xFFFFFFFC
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) & 0xFF) << 2)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) >> 2) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2                         0xFFFFFC03
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) & 0xFF) << 10)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) >> 10) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3                         0xFFFC03FF
+#define   S_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD1_EXT_CF_INST                              0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) & 0x1) << 31)
+#define   G_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_EXT_BARRIER                              0x7FFFFFFF
+
 /* done */
 #define P_SQ_CF_ALLOC_EXPORT_WORD0
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) & 0x1FFF) << 0)
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -99,7 +99,9 @@ static const struct r600_reg evergreen_context_reg_list[] = {
 	{R_028058_DB_DEPTH_SIZE, 0, 0, 0},
 	{R_02805C_DB_DEPTH_SLICE, 0, 0, 0},
 	{R_028140_ALU_CONST_BUFFER_SIZE_PS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028144_ALU_CONST_BUFFER_SIZE_PS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028180_ALU_CONST_BUFFER_SIZE_VS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028184_ALU_CONST_BUFFER_SIZE_VS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028200_PA_SC_WINDOW_OFFSET, 0, 0, 0},
 	{R_028204_PA_SC_WINDOW_SCISSOR_TL, 0, 0, 0},
 	{R_028208_PA_SC_WINDOW_SCISSOR_BR, 0, 0, 0},
@@ -293,7 +295,9 @@ static const struct r600_reg evergreen_context_reg_list[] = {
 	{R_028924_SQ_GS_VERT_ITEMSIZE_2, 0, 0, 0},
 	{R_028928_SQ_GS_VERT_ITEMSIZE_3, 0, 0, 0},
 	{R_028940_ALU_CONST_CACHE_PS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028944_ALU_CONST_CACHE_PS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028980_ALU_CONST_CACHE_VS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028984_ALU_CONST_CACHE_VS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028A00_PA_SU_POINT_SIZE, 0, 0, 0},
 	{R_028A04_PA_SU_POINT_MINMAX, 0, 0, 0},
 	{R_028A08_PA_SU_LINE_CNTL, 0, 0, 0},
@@ -465,7 +469,9 @@ static const struct r600_reg cayman_context_reg_list[] = {
 	{R_028058_DB_DEPTH_SIZE, 0, 0, 0},
 	{R_02805C_DB_DEPTH_SLICE, 0, 0, 0},
 	{R_028140_ALU_CONST_BUFFER_SIZE_PS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028144_ALU_CONST_BUFFER_SIZE_PS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028180_ALU_CONST_BUFFER_SIZE_VS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028184_ALU_CONST_BUFFER_SIZE_VS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028200_PA_SC_WINDOW_OFFSET, 0, 0, 0},
 	{R_028204_PA_SC_WINDOW_SCISSOR_TL, 0, 0, 0},
 	{R_028208_PA_SC_WINDOW_SCISSOR_BR, 0, 0, 0},
@@ -658,7 +664,9 @@ static const struct r600_reg cayman_context_reg_list[] = {
 	{R_028924_SQ_GS_VERT_ITEMSIZE_2, 0, 0, 0},
 	{R_028928_SQ_GS_VERT_ITEMSIZE_3, 0, 0, 0},
 	{R_028940_ALU_CONST_CACHE_PS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028944_ALU_CONST_CACHE_PS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028980_ALU_CONST_CACHE_VS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028984_ALU_CONST_CACHE_VS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028A00_PA_SU_POINT_SIZE, 0, 0, 0},
 	{R_028A04_PA_SU_POINT_MINMAX, 0, 0, 0},
 	{R_028A08_PA_SU_LINE_CNTL, 0, 0, 0},
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -508,6 +508,10 @@ static uint32_t r600_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 		return V_028C70_COLOR_X24_8_32_FLOAT;

+	case PIPE_FORMAT_R32_UINT:
+	case PIPE_FORMAT_R32_SINT:
+		return V_028C70_COLOR_32;
+
 	case PIPE_FORMAT_R32_FLOAT:
 	case PIPE_FORMAT_Z32_FLOAT:
 		return V_028C70_COLOR_32_FLOAT;
@@ -902,6 +906,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	rs->clamp_fragment_color = state->clamp_fragment_color;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
+	rs->two_side = state->light_twoside;
+	rs->clip_plane_enable = state->clip_plane_enable;

 	clip_rule = state->scissor ? 0xAAAA : 0xFFFF;

@@ -939,8 +945,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 		S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(r600_translate_fill(state->fill_back)), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02881C_PA_CL_VS_OUT_CNTL,
-			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) |
-			S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex), 0xFFFFFFFF, NULL, 0);
+			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex),
+			S_02881C_USE_VTX_POINT_SIZE(1), NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028820_PA_CL_NANINF_CNTL, 0x00000000, 0xFFFFFFFF, NULL, 0);
 	/* point size 12.4 fixed point */
 	tmp = (unsigned)(state->point_size * 8.0);
@@ -987,9 +993,10 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028810_PA_CL_CLIP_CNTL,
-			S_028810_PS_UCP_MODE(3) | (state->clip_plane_enable & 63) |
-			S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
-			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip), 0xFFFFFFFF, NULL, 0);
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
+			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip),
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(1) |
+			S_028810_ZCLIP_FAR_DISABLE(1), NULL, 0);
 	return rstate;
 }

@@ -1204,6 +1211,7 @@ static void evergreen_set_clip_state(struct pipe_context *ctx,
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+	struct pipe_resource *cbuf;

 	if (rstate == NULL)
 		return;
@@ -1228,6 +1236,13 @@ static void evergreen_set_clip_state(struct pipe_context *ctx,
 	free(rctx->states[R600_PIPE_STATE_CLIP]);
 	rctx->states[R600_PIPE_STATE_CLIP] = rstate;
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
+
+	cbuf = pipe_user_buffer_create(ctx->screen,
+                                   state->ucp,
+                                   4*4*8, /* 8*4 floats */
+                                   PIPE_BIND_CONSTANT_BUFFER);
+	r600_set_constant_buffer(ctx, PIPE_SHADER_VERTEX, 1, cbuf);
+	pipe_resource_reference(&cbuf, NULL);
 }

 static void evergreen_set_polygon_stipple(struct pipe_context *ctx,
@@ -2462,6 +2477,16 @@ void evergreen_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader
 	r600_pipe_state_add_reg(rstate,
 				R_03A200_SQ_LOOP_CONST_0 + (32 * 4), 0x01000FFF,
 				0xFFFFFFFF, NULL, 0);
+
+	r600_pipe_state_add_reg(rstate,
+				R_02881C_PA_CL_VS_OUT_CNTL,
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA((rshader->clip_dist_write & 0x0F) != 0) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA((rshader->clip_dist_write & 0xF0) != 0) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(rshader->vs_out_misc_write),
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA(1) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA(1) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(1),
+				NULL, 0);
 }

 void evergreen_fetch_shader(struct pipe_context *ctx,
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -675,13 +675,6 @@
 #define   G_028814_MULTI_PRIM_IB_ENA(x)                (((x) >> 21) & 0x1)
 #define   C_028814_MULTI_PRIM_IB_ENA                   0xFFDFFFFF

-#define R_028004_DB_DEPTH_VIEW                       0x028004
-#define   S_028004_SLICE_START(x)                      (((x) & 0x7FF) << 0)
-#define   G_028004_SLICE_START(x)                      (((x) >> 0) & 0x7FF)
-#define   C_028004_SLICE_START                         0xFFFFF800
-#define   S_028004_SLICE_MAX(x)                        (((x) & 0x7FF) << 13)
-#define   G_028004_SLICE_MAX(x)                        (((x) >> 13) & 0x7FF)
-#define   C_028004_SLICE_MAX                           0xFF001FFF
 #define R_028D24_DB_HTILE_SURFACE                    0x028D24
 #define   S_028D24_HTILE_WIDTH(x)                      (((x) & 0x1) << 0)
 #define   G_028D24_HTILE_WIDTH(x)                      (((x) >> 0) & 0x1)
@@ -1469,6 +1462,12 @@
 #define   S_028004_ZPASS_INCREMENT_DISABLE        (((x) & 0x1) << 0)
 #define   S_028004_PERFECT_ZPASS_COUNTS(x)        (((x) & 0x1) << 1)
 #define R_028008_DB_DEPTH_VIEW                       0x00028008
+#define   S_028008_SLICE_START(x)                      (((x) & 0x7FF) << 0)
+#define   G_028008_SLICE_START(x)                      (((x) >> 0) & 0x7FF)
+#define   C_028008_SLICE_START                         0xFFFFF800
+#define   S_028008_SLICE_MAX(x)                        (((x) & 0x7FF) << 13)
+#define   G_028008_SLICE_MAX(x)                        (((x) >> 13) & 0x7FF)
+#define   C_028008_SLICE_MAX                           0xFF001FFF
 #define R_02800C_DB_RENDER_OVERRIDE                  0x0002800C
 #define   V_02800C_FORCE_OFF                         0
 #define   V_02800C_FORCE_ENABLE                      1
@@ -1524,7 +1523,9 @@
 #define R_028050_DB_Z_WRITE_BASE                     0x00028050
 #define R_028054_DB_STENCIL_WRITE_BASE               0x00028054
 #define R_028140_ALU_CONST_BUFFER_SIZE_PS_0          0x00028140
+#define R_028144_ALU_CONST_BUFFER_SIZE_PS_1          0x00028144
 #define R_028180_ALU_CONST_BUFFER_SIZE_VS_0          0x00028180
+#define R_028184_ALU_CONST_BUFFER_SIZE_VS_1          0x00028184
 #define R_028200_PA_SC_WINDOW_OFFSET                 0x00028200
 #define R_02820C_PA_SC_CLIPRECT_RULE                 0x0002820C
 #define R_028210_PA_SC_CLIPRECT_0_TL                 0x00028210
@@ -1701,7 +1702,9 @@
 #define R_028924_SQ_GS_VERT_ITEMSIZE_2               0x00028924
 #define R_028928_SQ_GS_VERT_ITEMSIZE_3               0x00028928
 #define R_028940_ALU_CONST_CACHE_PS_0                0x00028940
+#define R_028944_ALU_CONST_CACHE_PS_1                0x00028944
 #define R_028980_ALU_CONST_CACHE_VS_0                0x00028980
+#define R_028984_ALU_CONST_CACHE_VS_1                0x00028984
 #define R_028A04_PA_SU_POINT_MINMAX                  0x00028A04
 #define R_028A08_PA_SU_LINE_CNTL                     0x00028A08
 #define   S_028A08_WIDTH(x)                            (((x) & 0xFFFF) << 0)
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -91,6 +91,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
@@ -236,8 +237,18 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
 	return tex;
 }

-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class)
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
 {
+	if ((chip_class == R600) && (family != CHIP_RV670))
+		bc->ar_handling = AR_HANDLE_RV6XX;
+	else
+		bc->ar_handling = AR_HANDLE_NORMAL;
+
+	if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 &&
+					   family != CHIP_RS880))
+		bc->r6xx_nop_after_rel_dst = 1;
+	else
+		bc->r6xx_nop_after_rel_dst = 0;
 	LIST_INITHEAD(&bc->cf);
 	bc->chip_class = chip_class;
 }
@@ -249,8 +260,14 @@ static int r600_bytecode_add_cf(struct r600_bytecode *bc)
 	if (cf == NULL)
 		return -ENOMEM;
 	LIST_ADDTAIL(&cf->list, &bc->cf);
-	if (bc->cf_last)
+	if (bc->cf_last) {
 		cf->id = bc->cf_last->id + 2;
+		if (bc->cf_last->eg_alu_extended) {
+			/* take into account extended alu size */
+			cf->id += 2;
+			bc->ndw += 2;
+		}
+	}
 	bc->cf_last = cf;
 	bc->ncf++;
 	bc->ndw += 2;
@@ -428,7 +445,8 @@ static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *
 		return !alu->is_op3 && (
 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
-			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
+			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
 	case EVERGREEN:
 	case CAYMAN:
 	default:
@@ -444,7 +462,8 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 	case R600:
 	case R700:
 		return is_alu_reduction_inst(bc, alu) ||
-			is_alu_mova_inst(bc, alu);
+			(is_alu_mova_inst(bc, alu) && 
+			 (alu->inst != V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT));
 	case EVERGREEN:
 	case CAYMAN:
 	default:
@@ -452,6 +471,7 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 			is_alu_mova_inst(bc, alu) ||
 			(alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
 			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR ||
+			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_LOAD_P0 ||
 			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY ||
 			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW);
 	}
@@ -465,6 +485,7 @@ static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode
 	case R700:
 		if (!alu->is_op3)
 			return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
+				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT ||
 				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
 			        alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT ||
 				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
@@ -536,6 +557,19 @@ static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 		!is_alu_trans_unit_inst(bc, alu);
 }

+/* Non-zero when alu is a non-op3 NOP for bc's chip class. */
+static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
+{
+	unsigned nop_inst;
+
+	if (bc->chip_class == R600 || bc->chip_class == R700)
+		nop_inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+	else
+		nop_inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+
+	return !alu->is_op3 && alu->inst == nop_inst;
+}
+
 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
 			    struct r600_bytecode_alu *assignment[5])
 {
@@ -688,7 +722,7 @@ static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
 					return r;
 			}
 		} else if (is_cfile(sel)) {
-			r = reserve_cfile(bc, bs, sel, elem);
+			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
 			if (r)
 				return r;
 		}
@@ -715,7 +749,7 @@ static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
 				const_count++;
 		}
 		if (is_cfile(sel)) {
-			r = reserve_cfile(bc, bs, sel, elem);
+			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
 			if (r)
 				return r;
 		}
@@ -1037,6 +1071,10 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
 		alu = slots[i];
 		num_once_inst += is_alu_once_inst(bc, alu);

+		/* don't reschedule NOPs */
+		if (is_nop_inst(bc, alu))
+			return 0;
+
 		/* Let's check dst gpr. */
 		if (alu->dst.rel) {
 			if (have_mova)
@@ -1111,117 +1149,203 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
 	return 0;
 }

-/* This code handles kcache lines as single blocks of 32 constants. We could
- * probably do slightly better by recognizing that we actually have two
- * consecutive lines of 16 constants, but the resulting code would also be
- * somewhat more complicated. */
-static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+/* we'll keep kcache sets sorted by bank & addr */
+static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
+		struct r600_bytecode_kcache *kcache,
+		unsigned bank, unsigned line)
 {
-	struct r600_bytecode_kcache *kcache = bc->cf_last->kcache;
-	unsigned int required_lines;
-	unsigned int free_lines = 0;
-	unsigned int cache_line[3];
-	unsigned int count = 0;
-	unsigned int i, j;
-	int r;
+	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;

-	/* Collect required cache lines. */
-	for (i = 0; i < 3; ++i) {
-		boolean found = false;
-		unsigned int line;
+	for (i = 0; i < kcache_banks; i++) {
+		if (kcache[i].mode) {
+			int d;

-		if (alu->src[i].sel < 512)
+			if (kcache[i].bank < bank)
+				continue;
+
+			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
+					kcache[i].bank > bank) {
+				/* try to insert new line */
+				if (kcache[kcache_banks-1].mode) {
+					/* all sets are in use */
+					return -ENOMEM;
+				}
+
+				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
+				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+				kcache[i].bank = bank;
+				kcache[i].addr = line;
+				return 0;
+			}
+
+			d = line - kcache[i].addr;
+
+			if (d == -1) {
+				kcache[i].addr--;
+				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
+					/* we are prepending the line to the current set,
+					 * discarding the existing second line,
+					 * so we'll have to insert line+2 after it */
+					line += 2;
+					continue;
+				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
+					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+					return 0;
+				} else {
+					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
+					return -ENOMEM;
+				}
+			} else if (d == 1) {
+				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+				return 0;
+			} else if (d == 0)
+				return 0;
+		} else { /* free kcache set - use it */
+			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+			kcache[i].bank = bank;
+			kcache[i].addr = line;
+			return 0;
+		}
+	}
+	return -ENOMEM;
+}
+
+static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
+		struct r600_bytecode_kcache *kcache,
+		struct r600_bytecode_alu *alu)
+{
+	int i, r;
+
+	for (i = 0; i < 3; i++) {
+		unsigned bank, line, sel = alu->src[i].sel;
+
+		if (sel < 512)
 			continue;

-		line = ((alu->src[i].sel - 512) / 32) * 2;
+		bank = alu->src[i].kc_bank;
+		line = (sel-512)>>4;

-		for (j = 0; j < count; ++j) {
-			if (cache_line[j] == line) {
-				found = true;
-				break;
-			}
-		}
-
-		if (!found)
-			cache_line[count++] = line;
-	}
-
-	/* This should never actually happen. */
-	if (count >= 3) return -ENOMEM;
-
-	for (i = 0; i < 2; ++i) {
-		if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
-			++free_lines;
-		}
-	}
-
-	/* Filter lines pulled in by previous intructions. Note that this is
-	 * only for the required_lines count, we can't remove these from the
-	 * cache_line array since we may have to start a new ALU clause. */
-	for (i = 0, required_lines = count; i < count; ++i) {
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == cache_line[i]) {
-				--required_lines;
-				break;
-			}
-		}
-	}
-
-	/* Start a new ALU clause if needed. */
-	if (required_lines > free_lines) {
-		if ((r = r600_bytecode_add_cf(bc))) {
+		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
 			return r;
-		}
-		bc->cf_last->inst = type;
-		kcache = bc->cf_last->kcache;
 	}
+	return 0;
+}

-	/* Setup the kcache lines. */
-	for (i = 0; i < count; ++i) {
-		boolean found = false;
-
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == cache_line[i]) {
-				found = true;
-				break;
-			}
-		}
-
-		if (found) continue;
-
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
-				kcache[j].bank = 0;
-				kcache[j].addr = cache_line[i];
-				kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
-				break;
-			}
-		}
-	}
+static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
+		struct r600_bytecode_alu *alu,
+		struct r600_bytecode_kcache * kcache)
+{
+	int i, j;

 	/* Alter the src operands to refer to the kcache. */
 	for (i = 0; i < 3; ++i) {
 		static const unsigned int base[] = {128, 160, 256, 288};
-		unsigned int line;
+		unsigned int line, sel = alu->src[i].sel, found = 0;

-		if (alu->src[i].sel < 512)
+		if (sel < 512)
 			continue;

-		alu->src[i].sel -= 512;
-		line = (alu->src[i].sel / 32) * 2;
+		sel -= 512;
+		line = sel>>4;

-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == line) {
-				alu->src[i].sel &= 0x1f;
-				alu->src[i].sel += base[j];
-				break;
+		for (j = 0; j < 4 && !found; ++j) {
+			switch (kcache[j].mode) {
+			case V_SQ_CF_KCACHE_NOP:
+			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
+				R600_ERR("unexpected kcache line mode\n");
+				return -ENOMEM;
+			default:
+				if (kcache[j].bank == alu->src[i].kc_bank &&
+						kcache[j].addr <= line &&
+						line < kcache[j].addr + kcache[j].mode) {
+					alu->src[i].sel = sel - (kcache[j].addr<<4);
+					alu->src[i].sel += base[j];
+					found=1;
+			    }
 			}
 		}
 	}
+	return 0;
+}

+static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+{
+	struct r600_bytecode_kcache kcache_sets[4];
+	struct r600_bytecode_kcache *kcache = kcache_sets;
+	int r;
+
+	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
+
+	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+		/* can't alloc, need to start new clause */
+		if ((r = r600_bytecode_add_cf(bc))) {
+			return r;
+		}
+		bc->cf_last->inst = type;
+
+		/* retry with the new clause */
+		kcache = bc->cf_last->kcache;
+		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+			/* can't alloc again- should never happen */
+			return r;
+		}
+	} else {
+		/* update kcache sets */
+		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
+	}
+
+	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
+	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
+		if (bc->chip_class < EVERGREEN)
+			return -ENOMEM;
+		bc->cf_last->eg_alu_extended = 1;
+	}
+
+	return 0;
+}
+
+/* Add four NOPs (channels 0-3, "last" on the fourth); stop on first error. */
+static int insert_nop_r6xx(struct r600_bytecode *bc)
+{
+	int chan, r = 0;
+
+	for (chan = 0; chan < 4 && !r; chan++) {
+		struct r600_bytecode_alu nop;
+
+		memset(&nop, 0, sizeof(nop));
+		nop.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+		nop.src[0].chan = chan;
+		nop.dst.chan = chan;
+		nop.last = (chan == 3);
+		r = r600_bytecode_add_alu(bc, &nop);
+	}
+	return r;
+}
+
+/* load AR register from gpr (bc->ar_reg) with MOVA_GPR_INT */
+static int load_ar_r6xx(struct r600_bytecode *bc)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	if (bc->ar_loaded)
+		return 0;
+
+	/* hack to avoid making MOVA the last instruction in the clause */
+	if ((bc->cf_last->ndw>>1) >= 110)
+		bc->force_add_cf = 1;
+
+	memset(&alu, 0, sizeof(alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
+	alu.src[0].sel = bc->ar_reg;
+	alu.last = 1;
+	alu.index_mode = INDEX_MODE_LOOP;
+	r = r600_bytecode_add_alu(bc, &alu);
+	if (r)
+		return r;
+
+	/* no requirement to set uses waterfall on MOVA_GPR_INT */
+	bc->ar_loaded = 1;
 	return 0;
 }

@@ -1231,6 +1355,9 @@ static int load_ar(struct r600_bytecode *bc)
 	struct r600_bytecode_alu alu;
 	int r;

+	if (bc->ar_handling)
+		return load_ar_r6xx(bc);
+
 	if (bc->ar_loaded)
 		return 0;

@@ -1365,6 +1492,10 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytec
 		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
 		bc->cf_last->curr_bs_head = NULL;
 	}
+
+	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
+		return insert_nop_r6xx(bc); /* report a failed NOP insert to the caller */
+
 	return 0;
 }

@@ -1588,6 +1719,7 @@ static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecod
 				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
 				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
 				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
 				S_SQ_ALU_WORD0_LAST(alu->last);

 	if (alu->is_op3) {
@@ -1837,6 +1969,11 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 					if (r)
 						return r;
 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+					/* fail the build if a constant source cannot be mapped to a kcache set */
+					r = r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+					if (r)
+						return r;
+
 					switch(bc->chip_class) {
 					case EVERGREEN: /* eg alu is same encoding as r700 */
 					case CAYMAN:
@@ -1932,6 +2066,11 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 					if (r)
 						return r;
 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+					/* fail the build if a constant source cannot be mapped to a kcache set */
+					r = r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+					if (r)
+						return r;
+
 					switch(bc->chip_class) {
 					case R600:
 						r = r600_bytecode_alu_build(bc, alu, addr);
@@ -2072,6 +2208,19 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+				if (cf->eg_alu_extended) {
+					fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
+					fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
+					fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
+					fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
+					id++;
+					fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
+					fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
+					fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
+					fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
+					id++;
+				}
+
 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
 				fprintf(stderr, "ADDR:%d ", cf->addr);
 				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
@@ -2275,7 +2424,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
 			fprintf(stderr, "REL:%d ", alu->src[1].rel);
 			fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
-			fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
+			fprintf(stderr, "NEG:%d ", alu->src[1].neg);
+			fprintf(stderr, "IM:%d) ", alu->index_mode);
 			fprintf(stderr, "LAST:%d)\n", alu->last);
 			id++;
 			fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
@@ -2554,7 +2704,7 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru
 	}

 	memset(&bc, 0, sizeof(bc));
-	r600_bytecode_init(&bc, rctx->chip_class);
+	r600_bytecode_init(&bc, rctx->chip_class, rctx->family);

 	for (i = 0; i < ve->count; i++) {
 		if (elements[i].instance_divisor > 1) {
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -32,6 +32,7 @@ struct r600_bytecode_alu_src {
 	unsigned			neg;
 	unsigned			abs;
 	unsigned			rel;
+	unsigned			kc_bank;
 	uint32_t			value;
 };

@@ -54,6 +55,7 @@ struct r600_bytecode_alu {
 	unsigned			bank_swizzle;
 	unsigned			bank_swizzle_force;
 	unsigned			omod;
+	unsigned                        index_mode;
 };

 struct r600_bytecode_tex {
@@ -143,8 +145,9 @@ struct r600_bytecode_cf {
 	unsigned			cond;
 	unsigned			pop_count;
 	unsigned			cf_addr; /* control flow addr */
-	struct r600_bytecode_kcache		kcache[2];
+	struct r600_bytecode_kcache		kcache[4];
 	unsigned			r6xx_uses_waterfall;
+	unsigned			eg_alu_extended;
 	struct list_head		alu;
 	struct list_head		tex;
 	struct list_head		vtx;
@@ -176,6 +179,10 @@ struct r600_cf_callstack {
 	int				max;
 };

+#define AR_HANDLE_NORMAL 0
+#define AR_HANDLE_RV6XX 1 /* except RV670 */
+
+
 struct r600_bytecode {
 	enum chip_class			chip_class;
 	int				type;
@@ -194,13 +201,15 @@ struct r600_bytecode {
 	struct r600_cf_callstack	callstack[SQ_MAX_CALL_DEPTH];
 	unsigned	ar_loaded;
 	unsigned	ar_reg;
+	unsigned        ar_handling;
+	unsigned        r6xx_nop_after_rel_dst;
 };

 /* eg_asm.c */
 int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf);

 /* r600_asm.c */
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class);
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family);
 void r600_bytecode_clear(struct r600_bytecode *bc);
 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu);
 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx);
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -408,9 +408,13 @@ static const struct r600_reg r600_context_reg_list[] = {
 	{R_028128_CB_CLEAR_BLUE, 0, 0, 0},
 	{R_02812C_CB_CLEAR_ALPHA, 0, 0, 0},
 	{R_028140_ALU_CONST_BUFFER_SIZE_PS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028144_ALU_CONST_BUFFER_SIZE_PS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028180_ALU_CONST_BUFFER_SIZE_VS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028184_ALU_CONST_BUFFER_SIZE_VS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028940_ALU_CONST_CACHE_PS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028944_ALU_CONST_CACHE_PS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028980_ALU_CONST_CACHE_VS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028984_ALU_CONST_CACHE_VS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_02823C_CB_SHADER_MASK, 0, 0, 0},
 	{R_028238_CB_TARGET_MASK, 0, 0, 0},
 	{R_028410_SX_ALPHA_TEST_CONTROL, 0, 0, 0},
@@ -1326,15 +1330,20 @@ void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *
 			if (block->pm4_bo_index[j]) {
 				/* find relocation */
 				struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
-				block->pm4[reloc->bo_pm4_index] =
-					r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
-				r600_context_bo_flush(ctx,
-						      reloc->flush_flags,
-						      reloc->flush_mask,
-						      reloc->bo);
+				if (reloc->bo) {
+					block->pm4[reloc->bo_pm4_index] =
+							r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
+					r600_context_bo_flush(ctx,
+							reloc->flush_flags,
+							reloc->flush_mask,
+							reloc->bo);
+				} else {
+					block->pm4[reloc->bo_pm4_index] = 0;
+				}
 				nbo--;
 				if (nbo == 0)
 					break;
+
 			}
 		}
 		ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -492,7 +492,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_MAX_CONSTS:
 		return R600_MAX_CONST_BUFFER_SIZE;
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-		return R600_MAX_CONST_BUFFERS;
+		return R600_MAX_CONST_BUFFERS-1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
 		return 0; /* FIXME */
 	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
@@ -505,8 +505,6 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_SUBROUTINES:
 		return 0;
 	case PIPE_SHADER_CAP_INTEGERS:
-		if (rscreen->chip_class == EVERGREEN)
-			return 1;
 		return 0;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 		return 16;
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -39,7 +39,7 @@
 #include "r600_shader.h"
 #include "r600_resource.h"

-#define R600_MAX_CONST_BUFFERS 1
+#define R600_MAX_CONST_BUFFERS 2
 #define R600_MAX_CONST_BUFFER_SIZE 4096

 #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -108,7 +108,9 @@ struct r600_pipe_rasterizer {
 	boolean				clamp_vertex_color;
 	boolean				clamp_fragment_color;
 	boolean				flatshade;
+	boolean				two_side;
 	unsigned			sprite_coord_enable;
+	unsigned                        clip_plane_enable;
 	float				offset_units;
 	float				offset_scale;
 };
@@ -218,6 +220,9 @@ struct r600_pipe_context {
 	/* shader information */
 	boolean				clamp_vertex_color;
 	boolean				clamp_fragment_color;
+	boolean				two_side;
+	unsigned			user_clip_plane_enable;
+	unsigned			clip_dist_enable;
 	unsigned			sprite_coord_enable;
 	boolean				export_16bpc;
 	unsigned			alpha_ref;
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -191,6 +191,10 @@ struct r600_shader_ctx {
 	boolean                                 input_linear;
 	boolean                                 input_perspective;
 	int					num_interp_gpr;
+	int					face_gpr;
+	int					colors_used;
+	boolean                 clip_vertex_write;
+	unsigned                cv_output;
 };

 struct r600_shader_tgsi_instruction {
@@ -374,12 +378,6 @@ static int r600_spi_sid(struct r600_shader_io * io)
 			/* For generic params simply use sid from tgsi */
 			index = io->sid;
 		} else {
-
-			/* FIXME: two-side rendering is broken in r600g, this will
-			 * keep old functionality */
-			if (name == TGSI_SEMANTIC_BCOLOR)
-				name = TGSI_SEMANTIC_COLOR;
-
 			/* For non-generic params - pack name and sid into 8 bits */
 			index = 0x80 | (name<<3) | (io->sid);
 		}
@@ -393,6 +391,51 @@ static int r600_spi_sid(struct r600_shader_io * io)
 	return index;
 };

+/* On EG, turn input 'index' into an interpolated input: give it the next LDS
+ * slot and call the matching evergreen_interp_* helper.  Inputs whose spi_sid
+ * is 0 are left untouched.  Returns 0 or the helper's result. */
+static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
+{
+	struct r600_shader_io *in = &ctx->shader->input[index];
+
+	if (!in->spi_sid)
+		return 0;
+
+	in->lds_pos = ctx->shader->nlds++;
+	if (in->interpolate > 0)
+		return evergreen_interp_alu(ctx, index);
+	return evergreen_interp_flat(ctx, index);
+}
+
+/* Two-sided colour: per channel, emit CNDGT(face_gpr, front, back) writing
+ * the result over the front colour GPR.  Returns 0 or the add_alu error. */
+static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
+{
+	const int front_sel = ctx->shader->input[front].gpr;
+	const int back_sel = ctx->shader->input[back].gpr;
+	int chan, err;
+
+	for (chan = 0; chan < 4; chan++) {
+		struct r600_bytecode_alu alu;
+		memset(&alu, 0, sizeof(alu));
+		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
+		alu.is_op3 = 1;
+		alu.last = (chan == 3);
+		alu.dst.sel = front_sel;
+		alu.dst.chan = chan;
+		alu.dst.write = 1;
+		alu.src[0].sel = ctx->face_gpr;
+		alu.src[1].sel = front_sel;
+		alu.src[1].chan = chan;
+		alu.src[2].sel = back_sel;
+		alu.src[2].chan = chan;
+		err = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 static int tgsi_declaration(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
@@ -408,15 +451,15 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->input[i].interpolate = d->Declaration.Interpolate;
 		ctx->shader->input[i].centroid = d->Declaration.Centroid;
 		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
-		if (ctx->type == TGSI_PROCESSOR_FRAGMENT && ctx->bc->chip_class >= EVERGREEN) {
-			/* turn input into interpolate on EG */
-			if (ctx->shader->input[i].spi_sid) {
-				ctx->shader->input[i].lds_pos = ctx->shader->nlds++;
-				if (ctx->shader->input[i].interpolate > 0) {
-					evergreen_interp_alu(ctx, i);
-				} else {
-					evergreen_interp_flat(ctx, i);
-				}
+		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+			if (ctx->shader->input[i].name == TGSI_SEMANTIC_FACE)
+				ctx->face_gpr = ctx->shader->input[i].gpr;
+			else if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR)
+				ctx->colors_used++;
+			if (ctx->bc->chip_class >= EVERGREEN) {
+				r = evergreen_interp_input(ctx, i);
+				if (r)
+					return r;
 			}
 		}
 		break;
@@ -427,6 +470,21 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
 		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
 		ctx->shader->output[i].interpolate = d->Declaration.Interpolate;
+		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
+		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+			switch (d->Semantic.Name) {
+			case TGSI_SEMANTIC_CLIPDIST:
+				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
+				break;
+			case TGSI_SEMANTIC_PSIZE:
+				ctx->shader->vs_out_misc_write = 1;
+				break;
+			case TGSI_SEMANTIC_CLIPVERTEX:
+				ctx->clip_vertex_write = TRUE;
+				ctx->cv_output = i;
+				break;
+			}
+		}
 		break;
 	case TGSI_FILE_CONSTANT:
 	case TGSI_FILE_TEMPORARY:
@@ -690,6 +748,47 @@ static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
 	return 0;
 }

+/* Two-sided lighting setup: add a FACE input if none was declared, then for
+ * every declared COLOR input append a BCOLOR copy and emit the colour select. */
+static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
+{
+	struct r600_shader *sh = ctx->shader;
+	struct r600_shader_io *io;
+	const int declared = sh->ninput;
+	int idx, bidx, err;
+	/* the extra inputs live in the gprs right after the existing inputs and are
+	 * dead once the select is done, so nothing is reserved or re-offset */
+	int next_gpr = ctx->file_offset[TGSI_FILE_INPUT] +
+			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
+
+	if (ctx->face_gpr == -1) {
+		io = &sh->input[sh->ninput++];
+		io->name = TGSI_SEMANTIC_FACE;
+		io->spi_sid = 0;
+		io->gpr = next_gpr++;
+		ctx->face_gpr = io->gpr;
+	}
+
+	for (idx = 0; idx < declared; idx++) {
+		if (sh->input[idx].name != TGSI_SEMANTIC_COLOR)
+			continue;
+
+		bidx = sh->ninput++;
+		io = &sh->input[bidx];
+		memcpy(io, &sh->input[idx], sizeof(*io));
+		io->name = TGSI_SEMANTIC_BCOLOR;
+		io->spi_sid = r600_spi_sid(io);
+		io->gpr = next_gpr++;
+
+		err = (ctx->bc->chip_class >= EVERGREEN) ? evergreen_interp_input(ctx, bidx) : 0;
+		if (!err)
+			err = select_twoside_color(ctx, idx, bidx);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pipe_shader *pipeshader)
 {
 	struct r600_shader *shader = &pipeshader->shader;
@@ -701,11 +800,12 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	struct r600_bytecode_output output[32];
 	unsigned output_done, noutput;
 	unsigned opcode;
-	int i, j, r = 0, pos0;
+	int i, j, k, r = 0;
+	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;

 	ctx.bc = &shader->bc;
 	ctx.shader = shader;
-	r600_bytecode_init(ctx.bc, rctx->chip_class);
+	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
 	ctx.tokens = tokens;
 	tgsi_scan_shader(tokens, &ctx.info);
 	tgsi_parse_init(&ctx.parse, tokens);
@@ -713,6 +813,12 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	shader->processor_type = ctx.type;
 	ctx.bc->type = shader->processor_type;

+	ctx.face_gpr = -1;
+	ctx.colors_used = 0;
+	ctx.clip_vertex_write = 0;
+
+	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
+
 	shader->clamp_color = (((ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->clamp_fragment_color) ||
 		((ctx.type == TGSI_PROCESSOR_VERTEX) && rctx->clamp_vertex_color));

@@ -791,6 +897,37 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 			if (r)
 				goto out_err;
 			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			break;
+		case TGSI_TOKEN_TYPE_PROPERTY:
+			property = &ctx.parse.FullToken.FullProperty;
+			switch (property->Property.PropertyName) {
+			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
+				if (property->u[0].Data == 1)
+					shader->fs_write_all = TRUE;
+				break;
+			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
+				if (property->u[0].Data == 1)
+					shader->vs_prohibit_ucps = TRUE;
+				break;
+			}
+			break;
+		default:
+			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
+			r = -EINVAL;
+			goto out_err;
+		}
+	}
+
+	if (shader->two_side && ctx.colors_used) {
+		if ((r = process_twoside_color_inputs(&ctx)))
+			return r;
+	}
+
+	tgsi_parse_init(&ctx.parse, tokens);
+	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
+		tgsi_parse_token(&ctx.parse);
+		switch (ctx.parse.FullToken.Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			r = tgsi_is_supported(&ctx);
 			if (r)
@@ -814,22 +951,57 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 			if (r)
 				goto out_err;
 			break;
-		case TGSI_TOKEN_TYPE_PROPERTY:
-			property = &ctx.parse.FullToken.FullProperty;
-			if (property->Property.PropertyName == TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS) {
-				if (property->u[0].Data == 1)
-					shader->fs_write_all = TRUE;
-			}
-			break;
 		default:
-			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
-			r = -EINVAL;
-			goto out_err;
+			break;
 		}
 	}

 	noutput = shader->noutput;

+	if (ctx.clip_vertex_write) {
+		/* need to convert a clipvertex write into clipdistance writes and not export
+		   the clip vertex anymore */
+
+		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
+		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
+		shader->output[noutput].gpr = ctx.temp_reg;
+		noutput++;
+		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
+		shader->output[noutput].gpr = ctx.temp_reg+1;
+		noutput++;
+
+		/* reset spi_sid for clipvertex output to avoid confusing spi */
+		shader->output[ctx.cv_output].spi_sid = 0;
+
+		shader->clip_dist_write = 0xFF;
+
+		for (i = 0; i < 8; i++) {
+			int oreg = i >> 2;
+			int ochan = i & 3;
+
+			for (j = 0; j < 4; j++) {
+				struct r600_bytecode_alu alu;
+				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
+				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
+				alu.src[0].chan = j;
+
+				alu.src[1].sel = 512 + i;
+				alu.src[1].kc_bank = 1;
+				alu.src[1].chan = j;
+
+				alu.dst.sel = ctx.temp_reg + oreg;
+				alu.dst.chan = j;
+				alu.dst.write = (j == ochan);
+				if (j == 3)
+					alu.last = 1;
+				r = r600_bytecode_add_alu(ctx.bc, &alu);
+				if (r)
+					return r;
+			}
+		}
+	}
+
 	/* clamp color outputs */
 	if (shader->clamp_color) {
 		for (i = 0; i < noutput; i++) {
@@ -949,68 +1121,86 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	}

 	/* export output */
-	j = 0;
-	for (i = 0, pos0 = 0; i < noutput; i++) {
-		memset(&output[i], 0, sizeof(struct r600_bytecode_output));
-		output[i + j].gpr = shader->output[i].gpr;
-		output[i + j].elem_size = 3;
-		output[i + j].swizzle_x = 0;
-		output[i + j].swizzle_y = 1;
-		output[i + j].swizzle_z = 2;
-		output[i + j].swizzle_w = 3;
-		output[i + j].burst_count = 1;
-		output[i + j].barrier = 1;
-		output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
-		output[i + j].array_base = i - pos0;
-		output[i + j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+	for (i = 0, j = 0; i < noutput; i++, j++) {
+		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+		output[j].gpr = shader->output[i].gpr;
+		output[j].elem_size = 3;
+		output[j].swizzle_x = 0;
+		output[j].swizzle_y = 1;
+		output[j].swizzle_z = 2;
+		output[j].swizzle_w = 3;
+		output[j].burst_count = 1;
+		output[j].barrier = 1;
+		output[j].type = -1;
+		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
 		switch (ctx.type) {
 		case TGSI_PROCESSOR_VERTEX:
-			if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
-				output[i + j].array_base = 60;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
-				/* position doesn't count in array_base */
-				pos0++;
-			}
-			if (shader->output[i].name == TGSI_SEMANTIC_PSIZE) {
-				output[i + j].array_base = 61;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
-				/* position doesn't count in array_base */
-				pos0++;
+			switch (shader->output[i].name) {
+			case TGSI_SEMANTIC_POSITION:
+				output[j].array_base = next_pos_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				break;
+
+			case TGSI_SEMANTIC_PSIZE:
+				output[j].array_base = next_pos_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				break;
+			case TGSI_SEMANTIC_CLIPVERTEX:
+				j--;
+				break;
+			case TGSI_SEMANTIC_CLIPDIST:
+				output[j].array_base = next_pos_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				/* spi_sid is 0 for clipdistance outputs that were generated
+				 * for clipvertex - we don't need to pass them to PS */
+				if (shader->output[i].spi_sid) {
+					j++;
+					/* duplicate it as PARAM to pass to the pixel shader */
+					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
+					output[j].array_base = next_param_base++;
+					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+				}
+				break;
+			case TGSI_SEMANTIC_FOG:
+				output[j].swizzle_y = 4; /* 0 */
+				output[j].swizzle_z = 4; /* 0 */
+				output[j].swizzle_w = 5; /* 1 */
+				break;
 			}
 			break;
 		case TGSI_PROCESSOR_FRAGMENT:
 			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
-				output[i + j].array_base = shader->output[i].sid;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[j].array_base = next_pixel_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
-					for (j = 1; j < shader->nr_cbufs; j++) {
-						memset(&output[i + j], 0, sizeof(struct r600_bytecode_output));
-						output[i + j].gpr = shader->output[i].gpr;
-						output[i + j].elem_size = 3;
-						output[i + j].swizzle_x = 0;
-						output[i + j].swizzle_y = 1;
-						output[i + j].swizzle_z = 2;
-						output[i + j].swizzle_w = 3;
-						output[i + j].burst_count = 1;
-						output[i + j].barrier = 1;
-						output[i + j].array_base = shader->output[i].sid + j;
-						output[i + j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
-						output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+					for (k = 1; k < shader->nr_cbufs; k++) {
+						j++;
+						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+						output[j].gpr = shader->output[i].gpr;
+						output[j].elem_size = 3;
+						output[j].swizzle_x = 0;
+						output[j].swizzle_y = 1;
+						output[j].swizzle_z = 2;
+						output[j].swizzle_w = 3;
+						output[j].burst_count = 1;
+						output[j].barrier = 1;
+						output[j].array_base = next_pixel_base++;
+						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 					}
-					j = shader->nr_cbufs-1;
 				}
 			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
-				output[i + j].array_base = 61;
-				output[i + j].swizzle_x = 2;
-				output[i + j].swizzle_y = 7;
-				output[i + j].swizzle_z = output[i + j].swizzle_w = 7;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[j].array_base = 61;
+				output[j].swizzle_x = 2;
+				output[j].swizzle_y = 7;
+				output[j].swizzle_z = output[j].swizzle_w = 7;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
-				output[i + j].array_base = 61;
-				output[i + j].swizzle_x = 7;
-				output[i + j].swizzle_y = 1;
-				output[i + j].swizzle_z = output[i + j].swizzle_w = 7;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[j].array_base = 61;
+				output[j].swizzle_x = 7;
+				output[j].swizzle_y = 1;
+				output[j].swizzle_z = output[j].swizzle_w = 7;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else {
 				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
 				r = -EINVAL;
@@ -1022,48 +1212,49 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 			r = -EINVAL;
 			goto out_err;
 		}
+
+		if (output[j].type==-1) {
+			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+			output[j].array_base = next_param_base++;
+		}
 	}
-	noutput += j;
+
 	/* add fake param output for vertex shader if no param is exported */
-	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
-		for (i = 0, pos0 = 0; i < noutput; i++) {
-			if (output[i].type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) {
-				pos0 = 1;
-				break;
-			}
-		}
-		if (!pos0) {
-			memset(&output[i], 0, sizeof(struct r600_bytecode_output));
-			output[i].gpr = 0;
-			output[i].elem_size = 3;
-			output[i].swizzle_x = 7;
-			output[i].swizzle_y = 7;
-			output[i].swizzle_z = 7;
-			output[i].swizzle_w = 7;
-			output[i].burst_count = 1;
-			output[i].barrier = 1;
-			output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
-			output[i].array_base = 0;
-			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
-			noutput++;
-		}
+	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
+			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+			output[j].gpr = 0;
+			output[j].elem_size = 3;
+			output[j].swizzle_x = 7;
+			output[j].swizzle_y = 7;
+			output[j].swizzle_z = 7;
+			output[j].swizzle_w = 7;
+			output[j].burst_count = 1;
+			output[j].barrier = 1;
+			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+			output[j].array_base = 0;
+			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+			j++;
 	}
+
 	/* add fake pixel export */
-	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && !noutput) {
-		memset(&output[0], 0, sizeof(struct r600_bytecode_output));
-		output[0].gpr = 0;
-		output[0].elem_size = 3;
-		output[0].swizzle_x = 7;
-		output[0].swizzle_y = 7;
-		output[0].swizzle_z = 7;
-		output[0].swizzle_w = 7;
-		output[0].burst_count = 1;
-		output[0].barrier = 1;
-		output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
-		output[0].array_base = 0;
-		output[0].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
-		noutput++;
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && j == 0) {
+		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+		output[j].gpr = 0;
+		output[j].elem_size = 3;
+		output[j].swizzle_x = 7;
+		output[j].swizzle_y = 7;
+		output[j].swizzle_z = 7;
+		output[j].swizzle_w = 7;
+		output[j].burst_count = 1;
+		output[j].barrier = 1;
+		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+		output[j].array_base = 0;
+		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+		j++;
 	}
+
+	noutput = j;
+
 	/* set export done on last export of each type */
 	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
 		if (ctx.bc->chip_class < CAYMAN) {
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -34,6 +34,7 @@ struct r600_shader_io {
 	unsigned		interpolate;
 	boolean                 centroid;
 	unsigned		lds_pos; /* for evergreen */
+	unsigned		write_mask;
 };

 struct r600_shader {
@@ -46,8 +47,14 @@ struct r600_shader {
 	struct r600_shader_io	output[32];
 	boolean			uses_kill;
 	boolean			fs_write_all;
+	boolean			vs_prohibit_ucps;
 	boolean			clamp_color;
+	boolean			two_side;
 	unsigned		nr_cbufs;
+	/* bit n is set if the shader writes gl_ClipDistance[n] */
+	unsigned		clip_dist_write;
+	/* flag is set if the shader writes VS_OUT_MISC_VEC (e.g. for PSIZE) */
+	boolean			vs_out_misc_write;
 };

 #endif
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -471,4 +471,11 @@
 #define SQ_ALU_SCL_122                           0x00000001
 #define SQ_ALU_SCL_212                           0x00000002
 #define SQ_ALU_SCL_221                           0x00000003
+
+#define   INDEX_MODE_AR_X 0
+#define   INDEX_MODE_AR_Y 1
+#define   INDEX_MODE_AR_Z 2
+#define   INDEX_MODE_AR_W 3
+#define   INDEX_MODE_LOOP 4
+
 #endif
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -509,6 +509,10 @@ static uint32_t r600_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 		return V_0280A0_COLOR_X24_8_32_FLOAT;

+	case PIPE_FORMAT_R32_UINT:
+	case PIPE_FORMAT_R32_SINT:
+		return V_0280A0_COLOR_32;
+
 	case PIPE_FORMAT_R32_FLOAT:
 	case PIPE_FORMAT_Z32_FLOAT:
 		return V_0280A0_COLOR_32_FLOAT;
@@ -954,6 +958,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	rs->clamp_fragment_color = state->clamp_fragment_color;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
+	rs->two_side = state->light_twoside;
+	rs->clip_plane_enable = state->clip_plane_enable;

 	clip_rule = state->scissor ? 0xAAAA : 0xFFFF;
 	/* offset */
@@ -990,8 +996,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 		S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(r600_translate_fill(state->fill_back)), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02881C_PA_CL_VS_OUT_CNTL,
-			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) |
-			S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex), 0xFFFFFFFF, NULL, 0);
+			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex),
+			S_02881C_USE_VTX_POINT_SIZE(1), NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028820_PA_CL_NANINF_CNTL, 0x00000000, 0xFFFFFFFF, NULL, 0);
 	/* point size 12.4 fixed point */
 	tmp = (unsigned)(state->point_size * 8.0);
@@ -1030,10 +1036,10 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate, R_028DFC_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028810_PA_CL_CLIP_CNTL,
-			S_028810_PS_UCP_MODE(3) | (state->clip_plane_enable & 63) |
-			S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
-			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip), 0xFFFFFFFF, NULL, 0);
-
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
+			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip),
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(1) |
+			S_028810_ZCLIP_FAR_DISABLE(1), NULL, 0);
 	return rstate;
 }

@@ -1311,6 +1317,7 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+	struct pipe_resource * cbuf;

 	if (rstate == NULL)
 		return;
@@ -1335,6 +1342,13 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 	free(rctx->states[R600_PIPE_STATE_CLIP]);
 	rctx->states[R600_PIPE_STATE_CLIP] = rstate;
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
+
+	cbuf = pipe_user_buffer_create(ctx->screen,
+                                   state->ucp,
+                                   4*4*8, /* 8*4 floats */
+                                   PIPE_BIND_CONSTANT_BUFFER);
+	r600_set_constant_buffer(ctx, PIPE_SHADER_VERTEX, 1, cbuf);
+	pipe_resource_reference(&cbuf, NULL);
 }

 static void r600_set_polygon_stipple(struct pipe_context *ctx,
@@ -2069,7 +2083,7 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 	struct r600_shader *rshader = &shader->shader;
 	unsigned i, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z, spi_ps_in_control_1, db_shader_control;
 	int pos_index = -1, face_index = -1;
-	unsigned tmp, sid;
+	unsigned tmp, sid, ufi = 0;

 	rstate->nregs = 0;

@@ -2147,6 +2161,10 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 			S_0286D0_FRONT_FACE_ADDR(rshader->input[face_index].gpr);
 	}

+	/* HW bug in original R600 */
+	if (rctx->family == CHIP_R600)
+		ufi = 1;
+
 	r600_pipe_state_add_reg(rstate, R_0286CC_SPI_PS_IN_CONTROL_0, spi_ps_in_control_0, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286D0_SPI_PS_IN_CONTROL_1, spi_ps_in_control_1, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286D8_SPI_INPUT_Z, spi_input_z, 0xFFFFFFFF, NULL, 0);
@@ -2156,7 +2174,8 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 	r600_pipe_state_add_reg(rstate,
 				R_028850_SQ_PGM_RESOURCES_PS,
 				S_028850_NUM_GPRS(rshader->bc.ngpr) |
-				S_028850_STACK_SIZE(rshader->bc.nstack),
+				S_028850_STACK_SIZE(rshader->bc.nstack) |
+				S_028850_UNCACHED_FIRST_INST(ufi),
 				0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate,
 				R_028854_SQ_PGM_EXPORTS_PS,
@@ -2234,6 +2253,16 @@ void r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader *shad
 	r600_pipe_state_add_reg(rstate,
 				R_03E200_SQ_LOOP_CONST_0 + (32 * 4), 0x01000FFF,
 				0xFFFFFFFF, NULL, 0);
+
+	r600_pipe_state_add_reg(rstate,
+				R_02881C_PA_CL_VS_OUT_CNTL,
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA((rshader->clip_dist_write & 0x0F) != 0) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA((rshader->clip_dist_write & 0xF0) != 0) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(rshader->vs_out_misc_write),
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA(1) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA(1) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(1),
+				NULL, 0);
 }

 void r600_fetch_shader(struct pipe_context *ctx,
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -103,6 +103,7 @@ void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 	rctx->clamp_fragment_color = rs->clamp_fragment_color;

 	rctx->sprite_coord_enable = rs->sprite_coord_enable;
+	rctx->two_side = rs->two_side;

 	rctx->rasterizer = rs;

@@ -352,11 +353,11 @@ void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
 	case PIPE_SHADER_VERTEX:
 		rctx->vs_const_buffer.nregs = 0;
 		r600_pipe_state_add_reg(&rctx->vs_const_buffer,
-					R_028180_ALU_CONST_BUFFER_SIZE_VS_0,
+					R_028180_ALU_CONST_BUFFER_SIZE_VS_0 + index * 4,
 					ALIGN_DIVUP(buffer->width0 >> 4, 16),
 					0xFFFFFFFF, NULL, 0);
 		r600_pipe_state_add_reg(&rctx->vs_const_buffer,
-					R_028980_ALU_CONST_CACHE_VS_0,
+					R_028980_ALU_CONST_CACHE_VS_0 + index * 4,
 					offset >> 8, 0xFFFFFFFF, rbuffer, RADEON_USAGE_READ);
 		r600_context_pipe_state_set(&rctx->ctx, &rctx->vs_const_buffer);

@@ -549,6 +550,30 @@ static int r600_shader_rebuild(struct pipe_context * ctx, struct r600_pipe_shade
 static void r600_update_derived_state(struct r600_pipe_context *rctx)
 {
 	struct pipe_context * ctx = (struct pipe_context*)rctx;
+	struct r600_pipe_state rstate;
+	unsigned user_clip_plane_enable;
+	unsigned clip_dist_enable;
+
+	if (rctx->vs_shader->shader.clip_dist_write || rctx->vs_shader->shader.vs_prohibit_ucps)
+		user_clip_plane_enable = 0;
+	else
+		user_clip_plane_enable = rctx->rasterizer->clip_plane_enable & 0x3F;
+
+	clip_dist_enable = rctx->rasterizer->clip_plane_enable & rctx->vs_shader->shader.clip_dist_write;
+	rstate.nregs = 0;
+
+	if (user_clip_plane_enable != rctx->user_clip_plane_enable) {
+		r600_pipe_state_add_reg(&rstate, R_028810_PA_CL_CLIP_CNTL, user_clip_plane_enable , 0x3F, NULL, 0);
+		rctx->user_clip_plane_enable = user_clip_plane_enable;
+	}
+
+	if (clip_dist_enable != rctx->clip_dist_enable) {
+		r600_pipe_state_add_reg(&rstate, R_02881C_PA_CL_VS_OUT_CNTL, clip_dist_enable, 0xFF, NULL, 0);
+		rctx->clip_dist_enable = clip_dist_enable;
+	}
+
+	if (rstate.nregs)
+		r600_context_pipe_state_set(&rctx->ctx, &rstate);

 	if (!rctx->blitter->running) {
 		if (rctx->have_depth_fb || rctx->have_depth_texture)
@@ -564,6 +589,7 @@ static void r600_update_derived_state(struct r600_pipe_context *rctx)
 	}

 	if ((rctx->ps_shader->shader.clamp_color != rctx->clamp_fragment_color) ||
+	    (rctx->ps_shader->shader.two_side != rctx->two_side) ||
 	    ((rctx->chip_class >= EVERGREEN) && rctx->ps_shader->shader.fs_write_all &&
 	     (rctx->ps_shader->shader.nr_cbufs != rctx->nr_cbufs))) {
 		r600_shader_rebuild(&rctx->context, rctx->ps_shader);
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -869,6 +869,7 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 	const struct util_format_description *desc;
 	boolean uniform = TRUE;
 	static int r600_enable_s3tc = -1;
+	bool is_srgb_valid = FALSE;

 	int i;
 	const uint32_t sign_bit[4] = {
@@ -980,14 +981,17 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 		case PIPE_FORMAT_DXT1_SRGB:
 		case PIPE_FORMAT_DXT1_SRGBA:
 			result = FMT_BC1;
+			is_srgb_valid = TRUE;
 			goto out_word4;
 		case PIPE_FORMAT_DXT3_RGBA:
 		case PIPE_FORMAT_DXT3_SRGBA:
 			result = FMT_BC2;
+			is_srgb_valid = TRUE;
 			goto out_word4;
 		case PIPE_FORMAT_DXT5_RGBA:
 		case PIPE_FORMAT_DXT5_SRGBA:
 			result = FMT_BC3;
+			is_srgb_valid = TRUE;
 			goto out_word4;
 		default:
 			goto out_unknown;
@@ -1095,6 +1099,7 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 				goto out_word4;
 			case 4:
 				result = FMT_8_8_8_8;
+				is_srgb_valid = TRUE;
 				goto out_word4;
 			}
 			goto out_unknown;
@@ -1158,6 +1163,9 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 	}

 out_word4:
+
+	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && !is_srgb_valid)
+		return ~0;
 	if (word4_p)
 		*word4_p = word4;
 	if (yuv_format_p)
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -3538,9 +3538,13 @@
 #define R_038018_RESOURCE0_WORD6                     0x038018

 #define R_028140_ALU_CONST_BUFFER_SIZE_PS_0          0x00028140
+#define R_028144_ALU_CONST_BUFFER_SIZE_PS_1          0x00028144
 #define R_028180_ALU_CONST_BUFFER_SIZE_VS_0          0x00028180
+#define R_028184_ALU_CONST_BUFFER_SIZE_VS_1          0x00028184
 #define R_028940_ALU_CONST_CACHE_PS_0                0x00028940
+#define R_028944_ALU_CONST_CACHE_PS_1                0x00028944
 #define R_028980_ALU_CONST_CACHE_VS_0                0x00028980
+#define R_028984_ALU_CONST_CACHE_VS_1                0x00028984

 #define R_03CFF0_SQ_VTX_BASE_VTX_LOC                 0x03CFF0
 #define R_03CFF4_SQ_VTX_START_INST_LOC               0x03CFF4
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -45,7 +45,7 @@
 #include "sp_fence.h"
 #include "sp_public.h"

-DEBUG_GET_ONCE_BOOL_OPTION(use_llvm, "SOFTPIPE_USE_LLVM", FALSE);
+DEBUG_GET_ONCE_BOOL_OPTION(use_llvm, "SOFTPIPE_USE_LLVM", FALSE)

 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
@@ -121,7 +121,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
      return 1;
   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-      return 64; /* matches core Mesa defaults */
+      return 256; /* for GL3 */
   case PIPE_CAP_MIN_TEXEL_OFFSET:
      return -8;
   case PIPE_CAP_MAX_TEXEL_OFFSET:
@@ -138,7 +138,9 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
 static int
 softpipe_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_shader_cap param)
 {
+#ifdef HAVE_LLVM
   struct softpipe_screen *sp_screen = softpipe_screen(screen);
+#endif
   switch(shader)
   {
   case PIPE_SHADER_FRAGMENT:
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -88,7 +88,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
      vinfo->num_attribs = 0;
      for (i = 0; i < fsInfo->num_inputs; i++) {
         int src;
-         enum interp_mode interp;
+         enum interp_mode interp = INTERP_LINEAR;

         switch (fsInfo->input_interpolate[i]) {
         case TGSI_INTERPOLATE_CONSTANT:
@@ -105,7 +105,6 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
            break;
         default:
            assert(0);
-            interp = INTERP_LINEAR;
         }

         switch (fsInfo->input_semantic_name[i]) {
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -454,16 +454,19 @@ svga_texture_create(struct pipe_screen *screen,
   }

   /* 
-    * XXX: Never pass the SVGA3D_SURFACE_HINT_RENDERTARGET hint. Mesa cannot
+    * Note: Previously we never passed the
+    * SVGA3D_SURFACE_HINT_RENDERTARGET hint. Mesa cannot
    * know beforehand whether a texture will be used as a rendertarget or not
    * and it always requests PIPE_BIND_RENDER_TARGET, therefore
    * passing the SVGA3D_SURFACE_HINT_RENDERTARGET here defeats its purpose.
+    *
+    * However, this was changed since other state trackers
+    * (XA for example) use it accurately and certain device versions
+    * rely on it in certain situations to render correctly.
    */
-#if 0
   if((template->bind & PIPE_BIND_RENDER_TARGET) &&
      !util_format_is_s3tc(template->format))
      tex->key.flags |= SVGA3D_SURFACE_HINT_RENDERTARGET;
-#endif
   
   if(template->bind & PIPE_BIND_DEPTH_STENCIL)
      tex->key.flags |= SVGA3D_SURFACE_HINT_DEPTHSTENCIL;
--- a/src/gallium/drivers/svga/svga_state_framebuffer.c
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -477,7 +477,7 @@ emit_clip_planes( struct svga_context *svga,

   /* TODO: just emit directly from svga_set_clip_state()?
    */
-   for (i = 0; i < 6; i++) {
+   for (i = 0; i < SVGA3D_MAX_CLIP_PLANES; i++) {
      /* need to express the plane in D3D-style coordinate space.
       * GL coords get converted to D3D coords with the matrix:
       * [ 1  0  0  0 ]
--- a/src/gallium/drivers/svga/svga_state_rss.c
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -244,8 +244,8 @@ static int emit_rss( struct svga_context *svga,
      EMIT_RS_FLOAT( svga, bias, DEPTHBIAS, fail );
   }

-   if (dirty & SVGA_NEW_CLIP) {
-      /* the number of clip planes is how many planes to enable */
+   if (dirty & SVGA_NEW_RAST) {
+      /* bitmask of the enabled clip planes */
      unsigned enabled = svga->curr.rast->templ.clip_plane_enable;
      EMIT_RS( svga, enabled, CLIPPLANEENABLE, fail );
   }
@@ -285,7 +285,6 @@ struct svga_tracked_state svga_hw_rss =

   (SVGA_NEW_BLEND |
    SVGA_NEW_BLEND_COLOR |
-    SVGA_NEW_CLIP |
    SVGA_NEW_DEPTH_STENCIL |
    SVGA_NEW_STENCIL_REF |
    SVGA_NEW_RAST |
--- a/src/gallium/state_trackers/dri/common/dri_context.c
+++ b/src/gallium/state_trackers/dri/common/dri_context.c
@@ -232,8 +232,7 @@ dri_make_current(__DRIcontext * cPriv,
   if (draw->textures[ST_ATTACHMENT_BACK_LEFT] && draw->textures[ST_ATTACHMENT_DEPTH_STENCIL]
      && ctx->pp)
         pp_init_fbos(ctx->pp, draw->textures[ST_ATTACHMENT_BACK_LEFT]->width0,
-            draw->textures[ST_ATTACHMENT_BACK_LEFT]->height0,
-            draw->textures[ST_ATTACHMENT_DEPTH_STENCIL]);
+            draw->textures[ST_ATTACHMENT_BACK_LEFT]->height0);

   return GL_TRUE;
 }
--- a/src/gallium/state_trackers/dri/common/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/common/dri_drawable.c
@@ -53,6 +53,7 @@ dri_st_framebuffer_validate(struct st_framebuffer_iface *stfbi,
   unsigned statt_mask, new_mask;
   boolean new_stamp;
   int i;
+   unsigned int lastStamp;

   statt_mask = 0x0;
   for (i = 0; i < count; i++)
@@ -66,23 +67,26 @@ dri_st_framebuffer_validate(struct st_framebuffer_iface *stfbi,
    * client stamp.  It has the value of the server stamp when last
    * checked.
    */
-   new_stamp = (drawable->texture_stamp != drawable->dPriv->lastStamp);
+   do {
+      lastStamp = drawable->dPriv->lastStamp;
+      new_stamp = (drawable->texture_stamp != lastStamp);

-   if (new_stamp || new_mask || screen->broken_invalidate) {
-      if (new_stamp && drawable->update_drawable_info)
-         drawable->update_drawable_info(drawable);
+      if (new_stamp || new_mask || screen->broken_invalidate) {
+         if (new_stamp && drawable->update_drawable_info)
+            drawable->update_drawable_info(drawable);

-      drawable->allocate_textures(drawable, statts, count);
+         drawable->allocate_textures(drawable, statts, count);

-      /* add existing textures */
-      for (i = 0; i < ST_ATTACHMENT_COUNT; i++) {
-         if (drawable->textures[i])
-            statt_mask |= (1 << i);
+         /* add existing textures */
+         for (i = 0; i < ST_ATTACHMENT_COUNT; i++) {
+            if (drawable->textures[i])
+               statt_mask |= (1 << i);
+         }
+
+         drawable->texture_stamp = lastStamp;
+         drawable->texture_mask = statt_mask;
      }
-
-      drawable->texture_stamp = drawable->dPriv->lastStamp;
-      drawable->texture_mask = statt_mask;
-   }
+   } while (lastStamp != drawable->dPriv->lastStamp);

   if (!out)
      return TRUE;
--- a/src/gallium/state_trackers/dri/drm/dri2.c
+++ b/src/gallium/state_trackers/dri/drm/dri2.c
@@ -316,6 +316,9 @@ dri2_allocate_buffer(__DRIscreen *sPriv,

   switch (format) {
      case 32:
+         pf = PIPE_FORMAT_B8G8R8A8_UNORM;
+         break;
+      case 24:
         pf = PIPE_FORMAT_B8G8R8X8_UNORM;
         break;
      case 16:
--- a/src/gallium/state_trackers/egl/wayland/native_shm.c
+++ b/src/gallium/state_trackers/egl/wayland/native_shm.c
@@ -94,11 +94,10 @@ wayland_create_shm_buffer(struct wayland_display *display,

   switch (surface->color_format) {
   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      format = (surface->premultiplied_alpha) ?
-         WL_SHM_FORMAT_PREMULTIPLIED_ARGB32 : WL_SHM_FORMAT_ARGB32;
+      format = WL_SHM_FORMAT_ARGB8888;
      break;
   case PIPE_FORMAT_B8G8R8X8_UNORM:
-      format = WL_SHM_FORMAT_XRGB32;
+      format = WL_SHM_FORMAT_XRGB8888;
      break;
   default:
      return NULL;
@@ -116,7 +115,7 @@ shm_handle_format(void *data, struct wl_shm *shm, uint32_t format)
   struct wayland_shm_display *shmdpy = data;

   switch (format) {
-   case WL_SHM_FORMAT_ARGB32:
+   case WL_SHM_FORMAT_ARGB8888:
      shmdpy->base.formats |= HAS_ARGB8888;
      break;
   case WL_SHM_FORMAT_XRGB8888:
--- a/src/gallium/state_trackers/vega/path.c
+++ b/src/gallium/state_trackers/vega/path.c
@@ -367,6 +367,8 @@ static struct polygon_array * path_get_fill_polygons(struct path *p, struct matr
   void *coords = (VGfloat *)p->control_points->data;
   struct array *array;

+   memset(data, 0, sizeof(data));
+
   if (p->fill_polys.polygon_array.array)
   {
      if (memcmp( &p->fill_polys.matrix,
--- a/src/gallium/state_trackers/xa/xa_tracker.h
+++ b/src/gallium/state_trackers/xa/xa_tracker.h
@@ -36,8 +36,8 @@

 #include <stdint.h>

-#define XA_TRACKER_VERSION_MAJOR 0
-#define XA_TRACKER_VERSION_MINOR 6
+#define XA_TRACKER_VERSION_MAJOR 1
+#define XA_TRACKER_VERSION_MINOR 0
 #define XA_TRACKER_VERSION_PATCH 0

 #define XA_FLAG_SHARED         (1 << 0)
--- a/src/gallium/state_trackers/xorg/xvmc/subpicture.c
+++ b/src/gallium/state_trackers/xorg/xvmc/subpicture.c
@@ -29,7 +29,6 @@

 #include <X11/Xlibint.h>
 #include <X11/extensions/XvMClib.h>
-#include <xorg/fourcc.h>

 #include "pipe/p_screen.h"
 #include "pipe/p_video_decoder.h"
@@ -46,6 +45,8 @@
 #include "xvmc_private.h"

 #define FOURCC_RGB 0x0000003
+#define FOURCC_AI44 0x34344941
+#define FOURCC_IA44 0x34344149

 static enum pipe_format XvIDToPipe(int xvimage_id)
 {
--- a/src/gallium/targets/xa-vmwgfx/Makefile
+++ b/src/gallium/targets/xa-vmwgfx/Makefile
@@ -3,8 +3,8 @@ include $(TOP)/configs/current

 ##### MACROS #####

-XA_MAJOR = 0
-XA_MINOR = 6
+XA_MAJOR = 1
+XA_MINOR = 0
 XA_TINY = 0
 XA_CFLAGS = -Wall -pedantic

--- a/src/gbm/main/gbm.c
+++ b/src/gbm/main/gbm.c
@@ -48,6 +48,10 @@ struct gbm_device *devices[16];

 static int device_num = 0;

+/** Returns the file descriptor for the gbm device
+ *
+ * \return The fd that the struct gbm_device was created with
+ */
 GBM_EXPORT int
 gbm_device_get_fd(struct gbm_device *gbm)
 {
@@ -55,12 +59,29 @@ gbm_device_get_fd(struct gbm_device *gbm)
 }

 /* FIXME: maybe superfluous, use udev subclass from the fd? */
+/** Get the backend name for the given gbm device
+ *
+ * \return The backend name string - this belongs to the device and must not
+ * be freed
+ */
 GBM_EXPORT const char *
 gbm_device_get_backend_name(struct gbm_device *gbm)
 {
   return gbm->name;
 }

+/** Test if a format is supported for a given set of usage flags.
+ *
+ * \param gbm The created buffer manager
+ * \param format The format to test
+ * \param usage A bitmask of the usages to test the format against
+ * \return 1 if the format is supported otherwise 0
+ *
+ * \sa enum gbm_bo_flags for the list of flags that the format can be
+ * tested against
+ *
+ * \sa enum gbm_bo_format for the list of formats
+ */
 int
 gbm_device_is_format_supported(struct gbm_device *gbm,
                               enum gbm_bo_format format,
@@ -69,6 +90,10 @@ gbm_device_is_format_supported(struct gbm_device *gbm,
   return gbm->is_format_supported(gbm, format, usage);
 }

+/** Destroy the gbm device and free all resources associated with it.
+ *
+ * \param gbm The device created using gbm_create_device()
+ */
 GBM_EXPORT void
 gbm_device_destroy(struct gbm_device *gbm)
 {
@@ -103,6 +128,18 @@ _gbm_mesa_get_device(int fd)
   return gbm;
 }

+/** Create a gbm device for allocating buffers
+ *
+ * The file descriptor passed in is used by the backend to communicate with
+ * platform for allocating the memory. For allocations using DRI this would be
+ * the file descriptor returned when opening a device such as \c
+ * /dev/dri/card0
+ *
+ * \param fd The file descriptor for a backend-specific device
+ * \return The newly created struct gbm_device. The resources associated with
+ * the device should be freed with gbm_device_destroy() when it is no longer
+ * needed. If the creation of the device failed NULL will be returned.
+ */
 GBM_EXPORT struct gbm_device *
 gbm_create_device(int fd)
 {
@@ -131,36 +168,85 @@ gbm_create_device(int fd)
   return gbm;
 }

+/** Get the width of the buffer object
+ *
+ * \param bo The buffer object
+ * \return The width of the allocated buffer object
+ *
+ */
 GBM_EXPORT unsigned int
 gbm_bo_get_width(struct gbm_bo *bo)
 {
   return bo->width;
 }

+/** Get the height of the buffer object
+ *
+ * \param bo The buffer object
+ * \return The height of the allocated buffer object
+ */
 GBM_EXPORT unsigned int
 gbm_bo_get_height(struct gbm_bo *bo)
 {
   return bo->height;
 }

+/** Get the stride of the buffer object
+ *
+ * This is calculated by the backend when it does the allocation in
+ * gbm_bo_create()
+ *
+ * \param bo The buffer object
+ * \return The stride of the allocated buffer object
+ */
 GBM_EXPORT uint32_t
 gbm_bo_get_pitch(struct gbm_bo *bo)
 {
   return bo->pitch;
 }

+/** Get the handle of the buffer object
+ *
+ * This is stored in the platform generic union gbm_bo_handle type. However
+ * the format of this handle is platform specific.
+ *
+ * \param bo The buffer object
+ * \return Returns the handle of the allocated buffer object
+ */
 GBM_EXPORT union gbm_bo_handle
 gbm_bo_get_handle(struct gbm_bo *bo)
 {
   return bo->handle;
 }

+/**
+ * Destroys the given buffer object and frees all resources associated with
+ * it.
+ *
+ * \param bo The buffer object
+ */
 GBM_EXPORT void
 gbm_bo_destroy(struct gbm_bo *bo)
 {
   bo->gbm->bo_destroy(bo);
 }

+/**
+ * Allocate a buffer object for the given dimensions
+ *
+ * \param gbm The gbm device returned from gbm_create_device()
+ * \param width The width for the buffer
+ * \param height The height for the buffer
+ * \param format The format to use for the buffer
+ * \param usage The union of the usage flags for this buffer
+ *
+ * \return A newly allocated buffer that should be freed with gbm_bo_destroy()
+ * when no longer needed. If an error occurs during allocation %NULL will be
+ * returned.
+ *
+ * \sa enum gbm_bo_format for the list of formats
+ * \sa enum gbm_bo_flags for the list of usage flags
+ */
 GBM_EXPORT struct gbm_bo *
 gbm_bo_create(struct gbm_device *gbm,
              uint32_t width, uint32_t height,
@@ -176,6 +262,24 @@ gbm_bo_create(struct gbm_device *gbm,
   return gbm->bo_create(gbm, width, height, format, usage);
 }

+/**
+ * Create a buffer object representing the contents of an EGLImage
+ *
+ * \param gbm The gbm device returned from gbm_create_device()
+ * \param egl_dpy The EGLDisplay on which the EGLImage was created
+ * \param egl_image The EGLImage to create the buffer from
+ * \param width The width to use in the creation of the buffer object
+ * \param height The height to use in the creation of the buffer object
+ * \param usage The union of the usage flags for this buffer
+ *
+ * \return A newly allocated buffer object that should be freed with
+ * gbm_bo_destroy() when no longer needed.
+ *
+ * \sa enum gbm_bo_flags for the list of usage flags
+ *
+ * \note The expectation is that this function will use an efficient method
+ * for making the contents of the EGLImage available as a buffer object.
+ */
 GBM_EXPORT struct gbm_bo *
 gbm_bo_create_from_egl_image(struct gbm_device *gbm,
                             void *egl_dpy, void *egl_image,
--- a/src/gbm/main/gbm.h
+++ b/src/gbm/main/gbm.h
@@ -37,9 +37,28 @@ extern "C" {

 #include <stdint.h>

+/**
+ * \file gbm.h
+ * \brief Generic Buffer Manager
+ */
+
 struct gbm_device;
 struct gbm_bo;

+/**
+ * \mainpage The Generic Buffer Manager
+ *
+ * This module provides an abstraction that the caller can use to request a
+ * buffer from the underlying memory management system for the platform.
+ *
+ * This allows the creation of portable code whilst still allowing access to
+ * the underlying memory manager.
+ */
+
+/**
+ * Abstraction representing the handle to a buffer allocated by the
+ * manager
+ */
 union gbm_bo_handle {
   void *ptr;
   int32_t s32;
@@ -48,14 +67,36 @@ union gbm_bo_handle {
   uint64_t u64;
 };

+/** Format of the allocated buffer */
 enum gbm_bo_format {
-   GBM_BO_FORMAT_XRGB8888,
-   GBM_BO_FORMAT_ARGB8888,
+   /** RGB with 8 bits per channel in a 32 bit value */
+   GBM_BO_FORMAT_XRGB8888, 
+   /** ARGB with 8 bits per channel in a 32 bit value */
+   GBM_BO_FORMAT_ARGB8888
 };

+/**
+ * Flags to indicate the intended use for the buffer - these are passed into
+ * gbm_bo_create(). The caller must set the union of all the flags that are
+ * appropriate
+ *
+ * \sa Use gbm_device_is_format_supported() to check if the combination of format
+ * and use flags is supported
+ */
 enum gbm_bo_flags {
+   /**
+    * Buffer is going to be presented to the screen using an API such as KMS
+    */
   GBM_BO_USE_SCANOUT      = (1 << 0),
+   /**
+    * Buffer is going to be used as cursor - the dimensions for the buffer
+    * must be 64x64 if this flag is passed.
+    */
   GBM_BO_USE_CURSOR_64X64 = (1 << 1),
+   /**
+    * Buffer is to be used for rendering - for example it is going to be used
+    * as the storage for a color buffer
+    */
   GBM_BO_USE_RENDERING    = (1 << 2),
 };

--- a/src/gbm/main/gbmint.h
+++ b/src/gbm/main/gbmint.h
@@ -38,6 +38,16 @@
 #define GBM_EXPORT
 #endif

+/**
+ * \file gbmint.h
+ * \brief Internal implementation details of gbm
+ */
+
+/**
+ * The device used for the memory allocation.
+ *
+ * The members of this structure should not be accessed directly
+ */
 struct gbm_device {
   /* Hack to make a gbm_device detectable by its first element. */
   struct gbm_device *(*dummy)(int);
@@ -63,6 +73,11 @@ struct gbm_device {
   void (*bo_destroy)(struct gbm_bo *bo);
 };

+/**
+ * The allocated buffer object.
+ *
+ * The members in this structure should not be accessed directly.
+ */
 struct gbm_bo {
   struct gbm_device *gbm;
   uint32_t width;
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -54,6 +54,7 @@
 #include "glsl_parser_extras.h"
 #include "ast.h"
 #include "glsl_types.h"
+#include "program/hash_table.h"
 #include "ir.h"

 void
@@ -3405,7 +3406,7 @@ ast_jump_statement::hir(exec_list *instructions,
 			  "continue may only appear in a loop");
      } else if (mode == ast_break &&
 		 state->loop_nesting_ast == NULL &&
-		 state->switch_nesting_ast == NULL) {
+		 state->switch_state.switch_nesting_ast == NULL) {
 	 YYLTYPE loc = this->get_location();

 	 _mesa_glsl_error(& loc, state,
@@ -3423,11 +3424,11 @@ ast_jump_statement::hir(exec_list *instructions,
 							  state);
 	 }

-	 if (state->is_switch_innermost &&
+	 if (state->switch_state.is_switch_innermost &&
 	     mode == ast_break) {
 	    /* Force break out of switch by setting is_break switch state.
 	     */
-	    ir_variable *const is_break_var = state->is_break_var;
+	    ir_variable *const is_break_var = state->switch_state.is_break_var;
 	    ir_dereference_variable *const deref_is_break_var =
 	       new(ctx) ir_dereference_variable(is_break_var);
 	    ir_constant *const true_val = new(ctx) ir_constant(true);
@@ -3530,25 +3531,25 @@ ast_switch_statement::hir(exec_list *instructions,

   /* Track the switch-statement nesting in a stack-like manner.
    */
-   ir_variable *saved_test_var = state->test_var;
-   ir_variable *saved_is_fallthru_var = state->is_fallthru_var;
-   
-   bool save_is_switch_innermost = state->is_switch_innermost;
-   ast_switch_statement *saved_nesting_ast = state->switch_nesting_ast;
+   struct glsl_switch_state saved = state->switch_state;

-   state->is_switch_innermost = true;
-   state->switch_nesting_ast = this;
+   state->switch_state.is_switch_innermost = true;
+   state->switch_state.switch_nesting_ast = this;
+   state->switch_state.labels_ht = hash_table_ctor(0, hash_table_pointer_hash,
+						   hash_table_pointer_compare);
+   state->switch_state.previous_default = NULL;

   /* Initalize is_fallthru state to false.
    */
   ir_rvalue *const is_fallthru_val = new (ctx) ir_constant(false);
-   state->is_fallthru_var = new(ctx) ir_variable(glsl_type::bool_type,
-					        "switch_is_fallthru_tmp",
-					        ir_var_temporary);
-   instructions->push_tail(state->is_fallthru_var);
+   state->switch_state.is_fallthru_var =
+      new(ctx) ir_variable(glsl_type::bool_type,
+			   "switch_is_fallthru_tmp",
+			   ir_var_temporary);
+   instructions->push_tail(state->switch_state.is_fallthru_var);

   ir_dereference_variable *deref_is_fallthru_var =
-      new(ctx) ir_dereference_variable(state->is_fallthru_var);
+      new(ctx) ir_dereference_variable(state->switch_state.is_fallthru_var);
   instructions->push_tail(new(ctx) ir_assignment(deref_is_fallthru_var,
 						  is_fallthru_val,
 						  NULL));
@@ -3556,13 +3557,13 @@ ast_switch_statement::hir(exec_list *instructions,
   /* Initalize is_break state to false.
    */
   ir_rvalue *const is_break_val = new (ctx) ir_constant(false);
-   state->is_break_var = new(ctx) ir_variable(glsl_type::bool_type,
-					      "switch_is_break_tmp",
-					      ir_var_temporary);
-   instructions->push_tail(state->is_break_var);
+   state->switch_state.is_break_var = new(ctx) ir_variable(glsl_type::bool_type,
+							   "switch_is_break_tmp",
+							   ir_var_temporary);
+   instructions->push_tail(state->switch_state.is_break_var);

   ir_dereference_variable *deref_is_break_var =
-      new(ctx) ir_dereference_variable(state->is_break_var);
+      new(ctx) ir_dereference_variable(state->switch_state.is_break_var);
   instructions->push_tail(new(ctx) ir_assignment(deref_is_break_var,
 						  is_break_val,
 						  NULL));
@@ -3575,254 +3576,294 @@ ast_switch_statement::hir(exec_list *instructions,
    */
   body->hir(instructions, state);

-   /* Restore previous nesting before returning.
-    */
-   state->switch_nesting_ast = saved_nesting_ast;
-   state->is_switch_innermost = save_is_switch_innermost;
+   hash_table_dtor(state->switch_state.labels_ht);

-   state->test_var = saved_test_var;
-   state->is_fallthru_var = saved_is_fallthru_var;
+   state->switch_state = saved;

-   /* Switch statements do not have r-values.
-    */
-   return NULL;
-}
+     /* Switch statements do not have r-values.
+      */
+     return NULL;
+  }


-void
-ast_switch_statement::test_to_hir(exec_list *instructions,
-				  struct _mesa_glsl_parse_state *state)
-{
-   void *ctx = state;
+  void
+  ast_switch_statement::test_to_hir(exec_list *instructions,
+				    struct _mesa_glsl_parse_state *state)
+  {
+     void *ctx = state;

-   /* Cache value of test expression.
-    */
-   ir_rvalue *const test_val =
-      test_expression->hir(instructions,
-			   state);
+     /* Cache value of test expression.
+      */
+     ir_rvalue *const test_val =
+	test_expression->hir(instructions,
+			     state);

-   state->test_var = new(ctx) ir_variable(glsl_type::int_type,
-					  "switch_test_tmp",
-					  ir_var_temporary);
-   ir_dereference_variable *deref_test_var =
-      new(ctx) ir_dereference_variable(state->test_var);
+     state->switch_state.test_var = new(ctx) ir_variable(glsl_type::int_type,
+							 "switch_test_tmp",
+							 ir_var_temporary);
+     ir_dereference_variable *deref_test_var =
+	new(ctx) ir_dereference_variable(state->switch_state.test_var);

-   instructions->push_tail(state->test_var);
-   instructions->push_tail(new(ctx) ir_assignment(deref_test_var,
-						  test_val,
-						  NULL));
-}
+     instructions->push_tail(state->switch_state.test_var);
+     instructions->push_tail(new(ctx) ir_assignment(deref_test_var,
+						    test_val,
+						    NULL));
+  }


-ir_rvalue *
-ast_switch_body::hir(exec_list *instructions,
-		     struct _mesa_glsl_parse_state *state)
-{
-   if (stmts != NULL)
-      stmts->hir(instructions, state);
-      
-   /* Switch bodies do not have r-values.
-    */
-   return NULL;
-}
+  ir_rvalue *
+  ast_switch_body::hir(exec_list *instructions,
+		       struct _mesa_glsl_parse_state *state)
+  {
+     if (stmts != NULL)
+	stmts->hir(instructions, state);
+
+     /* Switch bodies do not have r-values.
+      */
+     return NULL;
+  }


-ir_rvalue *
-ast_case_statement_list::hir(exec_list *instructions,
-			     struct _mesa_glsl_parse_state *state)
-{
-   foreach_list_typed (ast_case_statement, case_stmt, link, & this->cases)
-      case_stmt->hir(instructions, state);
-         
-   /* Case statements do not have r-values.
-    */
-   return NULL;
-}
+  ir_rvalue *
+  ast_case_statement_list::hir(exec_list *instructions,
+			       struct _mesa_glsl_parse_state *state)
+  {
+     foreach_list_typed (ast_case_statement, case_stmt, link, & this->cases)
+	case_stmt->hir(instructions, state);
+
+     /* Case statements do not have r-values.
+      */
+     return NULL;
+  }


-ir_rvalue *
-ast_case_statement::hir(exec_list *instructions,
-			struct _mesa_glsl_parse_state *state)
-{
-   labels->hir(instructions, state);
-   
-   /* Conditionally set fallthru state based on break state.
-    */
-   ir_constant *const false_val = new(state) ir_constant(false);
-   ir_dereference_variable *const deref_is_fallthru_var =
-      new(state) ir_dereference_variable(state->is_fallthru_var);
-   ir_dereference_variable *const deref_is_break_var =
-      new(state) ir_dereference_variable(state->is_break_var);
-   ir_assignment *const reset_fallthru_on_break =
-      new(state) ir_assignment(deref_is_fallthru_var,
-			       false_val,
-			       deref_is_break_var);
-   instructions->push_tail(reset_fallthru_on_break);
+  ir_rvalue *
+  ast_case_statement::hir(exec_list *instructions,
+			  struct _mesa_glsl_parse_state *state)
+  {
+     labels->hir(instructions, state);

-   /* Guard case statements depending on fallthru state.
-    */
-   ir_dereference_variable *const deref_fallthru_guard =
-      new(state) ir_dereference_variable(state->is_fallthru_var);
-   ir_if *const test_fallthru = new(state) ir_if(deref_fallthru_guard);
-   
-   foreach_list_typed (ast_node, stmt, link, & this->stmts)
-      stmt->hir(& test_fallthru->then_instructions, state);
+     /* Conditionally set fallthru state based on break state.
+      */
+     ir_constant *const false_val = new(state) ir_constant(false);
+     ir_dereference_variable *const deref_is_fallthru_var =
+	new(state) ir_dereference_variable(state->switch_state.is_fallthru_var);
+     ir_dereference_variable *const deref_is_break_var =
+	new(state) ir_dereference_variable(state->switch_state.is_break_var);
+     ir_assignment *const reset_fallthru_on_break =
+	new(state) ir_assignment(deref_is_fallthru_var,
+				 false_val,
+				 deref_is_break_var);
+     instructions->push_tail(reset_fallthru_on_break);

-   instructions->push_tail(test_fallthru);
-         
-   /* Case statements do not have r-values.
-    */
-   return NULL;
-}
+     /* Guard case statements depending on fallthru state.
+      */
+     ir_dereference_variable *const deref_fallthru_guard =
+	new(state) ir_dereference_variable(state->switch_state.is_fallthru_var);
+     ir_if *const test_fallthru = new(state) ir_if(deref_fallthru_guard);
+
+     foreach_list_typed (ast_node, stmt, link, & this->stmts)
+	stmt->hir(& test_fallthru->then_instructions, state);
+
+     instructions->push_tail(test_fallthru);
+
+     /* Case statements do not have r-values.
+      */
+     return NULL;
+  }


-ir_rvalue *
-ast_case_label_list::hir(exec_list *instructions,
-			 struct _mesa_glsl_parse_state *state)
-{
-   foreach_list_typed (ast_case_label, label, link, & this->labels)
-      label->hir(instructions, state);
-         
-   /* Case labels do not have r-values.
-    */
-   return NULL;
-}
+  ir_rvalue *
+  ast_case_label_list::hir(exec_list *instructions,
+			   struct _mesa_glsl_parse_state *state)
+  {
+     foreach_list_typed (ast_case_label, label, link, & this->labels)
+	label->hir(instructions, state);
+
+     /* Case labels do not have r-values.
+      */
+     return NULL;
+  }


-ir_rvalue *
-ast_case_label::hir(exec_list *instructions,
-		    struct _mesa_glsl_parse_state *state)
-{
-   void *ctx = state;
+  ir_rvalue *
+  ast_case_label::hir(exec_list *instructions,
+		      struct _mesa_glsl_parse_state *state)
+  {
+     void *ctx = state;

-   ir_dereference_variable *deref_fallthru_var =
-      new(ctx) ir_dereference_variable(state->is_fallthru_var);
-   
-   ir_rvalue *const true_val = new(ctx) ir_constant(true);
+     ir_dereference_variable *deref_fallthru_var =
+	new(ctx) ir_dereference_variable(state->switch_state.is_fallthru_var);

-   /* If not default case, ...
-    */
-   if (this->test_value != NULL) {
-      /* Conditionally set fallthru state based on
-       * comparison of cached test expression value to case label.
-       */
-      ir_rvalue *const test_val = this->test_value->hir(instructions, state);
+     ir_rvalue *const true_val = new(ctx) ir_constant(true);

-      ir_dereference_variable *deref_test_var =
-	 new(ctx) ir_dereference_variable(state->test_var);
+     /* If not default case, ...
+      */
+     if (this->test_value != NULL) {
+	/* Conditionally set fallthru state based on
+	 * comparison of cached test expression value to case label.
+	 */
+	ir_rvalue *const label_rval = this->test_value->hir(instructions, state);
+	ir_constant *label_const = label_rval->constant_expression_value();

-      ir_rvalue *const test_cond = new(ctx) ir_expression(ir_binop_all_equal,
-							  glsl_type::bool_type,
-							  test_val,
-							  deref_test_var);
+	if (!label_const) {
+	   YYLTYPE loc = this->test_value->get_location();

-      ir_assignment *set_fallthru_on_test =
-	 new(ctx) ir_assignment(deref_fallthru_var,
-				true_val,
-				test_cond);
-   
-      instructions->push_tail(set_fallthru_on_test);
-   } else { /* default case */
-      /* Set falltrhu state.
-       */
-      ir_assignment *set_fallthru =
-	 new(ctx) ir_assignment(deref_fallthru_var,
-				true_val,
-				NULL);
-   
-      instructions->push_tail(set_fallthru);
-   }
-   
-   /* Case statements do not have r-values.
-    */
-   return NULL;
-}
+	   _mesa_glsl_error(& loc, state,
+			    "switch statement case label must be a "
+			    "constant expression");
+
+	   /* Stuff a dummy value in to allow processing to continue. */
+	   label_const = new(ctx) ir_constant(0);
+	} else {
+	   ast_expression *previous_label = (ast_expression *)
+	      hash_table_find(state->switch_state.labels_ht,
+			      (void *)(uintptr_t)label_const->value.u[0]);
+
+	   if (previous_label) {
+	      YYLTYPE loc = this->test_value->get_location();
+	      _mesa_glsl_error(& loc, state,
+			       "duplicate case value");
+
+	      loc = previous_label->get_location();
+	      _mesa_glsl_error(& loc, state,
+			       "this is the previous case label");
+	   } else {
+	      hash_table_insert(state->switch_state.labels_ht,
+				this->test_value,
+				(void *)(uintptr_t)label_const->value.u[0]);
+	   }
+	}
+
+	ir_dereference_variable *deref_test_var =
+	   new(ctx) ir_dereference_variable(state->switch_state.test_var);
+
+	ir_rvalue *const test_cond = new(ctx) ir_expression(ir_binop_all_equal,
+							    glsl_type::bool_type,
+							    label_const,
+							    deref_test_var);
+
+	ir_assignment *set_fallthru_on_test =
+	   new(ctx) ir_assignment(deref_fallthru_var,
+				  true_val,
+				  test_cond);
+
+	instructions->push_tail(set_fallthru_on_test);
+     } else { /* default case */
+	if (state->switch_state.previous_default) {
+	   /* Report the duplicate default label at its own location. */
+	   YYLTYPE loc = this->get_location();
+	   _mesa_glsl_error(& loc, state,
+			    "multiple default labels in one switch");
+
+	   /* Follow up with a second diagnostic located at the default label
+	    * recorded earlier in state->switch_state.previous_default. */
+	   loc = state->switch_state.previous_default->get_location();
+	   _mesa_glsl_error(& loc, state,
+			    "this is the first default label");
+	}
+	state->switch_state.previous_default = this;
+
+	/* Set fallthru state.
+	 */
+	ir_assignment *set_fallthru =
+	   new(ctx) ir_assignment(deref_fallthru_var,
+				  true_val,
+				  NULL);
+
+	instructions->push_tail(set_fallthru);
+     }
+
+     /* Case statements do not have r-values.
+      */
+     return NULL;
+  }


-void
-ast_iteration_statement::condition_to_hir(ir_loop *stmt,
-					  struct _mesa_glsl_parse_state *state)
-{
-   void *ctx = state;
+  void
+  ast_iteration_statement::condition_to_hir(ir_loop *stmt,
+					    struct _mesa_glsl_parse_state *state)
+  {
+     void *ctx = state;

-   if (condition != NULL) {
-      ir_rvalue *const cond =
-	 condition->hir(& stmt->body_instructions, state);
+     if (condition != NULL) {
+	ir_rvalue *const cond =
+	   condition->hir(& stmt->body_instructions, state);

-      if ((cond == NULL)
-	  || !cond->type->is_boolean() || !cond->type->is_scalar()) {
-	 YYLTYPE loc = condition->get_location();
+	if ((cond == NULL)
+	    || !cond->type->is_boolean() || !cond->type->is_scalar()) {
+	   YYLTYPE loc = condition->get_location();

-	 _mesa_glsl_error(& loc, state,
-			  "loop condition must be scalar boolean");
-      } else {
-	 /* As the first code in the loop body, generate a block that looks
-	  * like 'if (!condition) break;' as the loop termination condition.
-	  */
-	 ir_rvalue *const not_cond =
-	    new(ctx) ir_expression(ir_unop_logic_not, glsl_type::bool_type, cond,
-				   NULL);
+	   _mesa_glsl_error(& loc, state,
+			    "loop condition must be scalar boolean");
+	} else {
+	   /* As the first code in the loop body, generate a block that looks
+	    * like 'if (!condition) break;' as the loop termination condition.
+	    */
+	   ir_rvalue *const not_cond =
+	      new(ctx) ir_expression(ir_unop_logic_not, glsl_type::bool_type, cond,
+				     NULL);

-	 ir_if *const if_stmt = new(ctx) ir_if(not_cond);
+	   ir_if *const if_stmt = new(ctx) ir_if(not_cond);

-	 ir_jump *const break_stmt =
-	    new(ctx) ir_loop_jump(ir_loop_jump::jump_break);
+	   ir_jump *const break_stmt =
+	      new(ctx) ir_loop_jump(ir_loop_jump::jump_break);

-	 if_stmt->then_instructions.push_tail(break_stmt);
-	 stmt->body_instructions.push_tail(if_stmt);
-      }
-   }
-}
+	   if_stmt->then_instructions.push_tail(break_stmt);
+	   stmt->body_instructions.push_tail(if_stmt);
+	}
+     }
+  }


-ir_rvalue *
-ast_iteration_statement::hir(exec_list *instructions,
-			     struct _mesa_glsl_parse_state *state)
-{
-   void *ctx = state;
+  ir_rvalue *
+  ast_iteration_statement::hir(exec_list *instructions,
+			       struct _mesa_glsl_parse_state *state)
+  {
+     void *ctx = state;

-   /* For-loops and while-loops start a new scope, but do-while loops do not.
-    */
-   if (mode != ast_do_while)
-      state->symbols->push_scope();
+     /* For-loops and while-loops start a new scope, but do-while loops do not.
+      */
+     if (mode != ast_do_while)
+	state->symbols->push_scope();

-   if (init_statement != NULL)
-      init_statement->hir(instructions, state);
+     if (init_statement != NULL)
+	init_statement->hir(instructions, state);

-   ir_loop *const stmt = new(ctx) ir_loop();
-   instructions->push_tail(stmt);
+     ir_loop *const stmt = new(ctx) ir_loop();
+     instructions->push_tail(stmt);

-   /* Track the current loop nesting.
-    */
-   ast_iteration_statement *nesting_ast = state->loop_nesting_ast;
+     /* Track the current loop nesting.
+      */
+     ast_iteration_statement *nesting_ast = state->loop_nesting_ast;

-   state->loop_nesting_ast = this;
+     state->loop_nesting_ast = this;

-   /* Likewise, indicate that following code is closest to a loop,
-    * NOT closest to a switch.
-    */
-   bool saved_is_switch_innermost = state->is_switch_innermost;
-   state->is_switch_innermost = false;
+     /* Likewise, indicate that following code is closest to a loop,
+      * NOT closest to a switch.
+      */
+     bool saved_is_switch_innermost = state->switch_state.is_switch_innermost;
+     state->switch_state.is_switch_innermost = false;

-   if (mode != ast_do_while)
-      condition_to_hir(stmt, state);
+     if (mode != ast_do_while)
+	condition_to_hir(stmt, state);

-   if (body != NULL)
-      body->hir(& stmt->body_instructions, state);
+     if (body != NULL)
+	body->hir(& stmt->body_instructions, state);

-   if (rest_expression != NULL)
-      rest_expression->hir(& stmt->body_instructions, state);
+     if (rest_expression != NULL)
+	rest_expression->hir(& stmt->body_instructions, state);

-   if (mode == ast_do_while)
-      condition_to_hir(stmt, state);
+     if (mode == ast_do_while)
+	condition_to_hir(stmt, state);

-   if (mode != ast_do_while)
-      state->symbols->pop_scope();
+     if (mode != ast_do_while)
+	state->symbols->pop_scope();

-   /* Restore previous nesting before returning.
-    */
-   state->loop_nesting_ast = nesting_ast;
-   state->is_switch_innermost = saved_is_switch_innermost;
+     /* Restore previous nesting before returning.
+      */
+     state->loop_nesting_ast = nesting_ast;
+     state->switch_state.is_switch_innermost = saved_is_switch_innermost;

   /* Loops do not have r-values.
    */
--- a/src/glsl/glcpp/glcpp-lex.l
+++ b/src/glsl/glcpp/glcpp-lex.l
@@ -70,7 +70,15 @@ HSPACE		[ \t]
 HASH		^{HSPACE}*#{HSPACE}*
 IDENTIFIER	[_a-zA-Z][_a-zA-Z0-9]*
 PUNCTUATION	[][(){}.&*~!/%<>^|;,=+-]
-OTHER		[^][(){}.&*~!/%<>^|;,=#[:space:]+-]+
+
+/* The OTHER class is simply a catch-all for things that the CPP
+parser just doesn't care about. Since flex regular expressions that
+match longer strings take priority over those matching shorter
+strings, we have to be careful to avoid OTHER matching and hiding
+something that CPP does care about. So we simply exclude all
+characters that appear in any other expressions. */
+
+OTHER		[^][_#[:space:]#a-zA-Z0-9(){}.&*~!/%<>^|;,=+-]

 DIGITS			[0-9][0-9]*
 DECIMAL_INTEGER		[1-9][0-9]*[uU]?
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -85,7 +85,6 @@ _token_create_ival (void *ctx, int type, int ival);
 static token_list_t *
 _token_list_create (void *ctx);

-/* Note: This function calls ralloc_steal on token. */
 static void
 _token_list_append (token_list_t *list, token_t *token);

@@ -763,8 +762,6 @@ _token_list_append (token_list_t *list, token_t *token)
 	node->token = token;
 	node->next = NULL;

-	ralloc_steal (list, token);
-
 	if (list->head == NULL) {
 		list->head = node;
 	} else {
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1671,6 +1671,7 @@ switch_statement:
 	SWITCH '(' expression ')' switch_body
 	{
 	   $$ = new(state) ast_switch_statement($3, $5);
+	   $$->set_location(yylloc);
 	}
 	;

@@ -1691,10 +1692,12 @@ case_label:
 	CASE expression ':'
 	{
 	   $$ = new(state) ast_case_label($2);
+	   $$->set_location(yylloc);
 	}
 	| DEFAULT ':'
 	{
 	   $$ = new(state) ast_case_label(NULL);
+	   $$->set_location(yylloc);
 	}
 	;

@@ -1705,6 +1708,7 @@ case_label_list:

 	   labels->labels.push_tail(& $1->link);
 	   $$ = labels;
+	   $$->set_location(yylloc);
 	}
 	| case_label_list case_label
 	{
@@ -1717,6 +1721,7 @@ case_statement:
 	case_label_list statement
 	{
 	   ast_case_statement *stmts = new(state) ast_case_statement($1);
+	   stmts->set_location(yylloc);

 	   stmts->stmts.push_tail(& $2->link);
 	   $$ = stmts;
@@ -1732,6 +1737,7 @@ case_statement_list:
 	case_statement
 	{
 	   ast_case_statement_list *cases= new(state) ast_case_statement_list();
+	   cases->set_location(yylloc);

 	   cases->cases.push_tail(& $1->link);
 	   $$ = cases;
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -51,7 +51,7 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *ctx,
   this->info_log = ralloc_strdup(mem_ctx, "");
   this->error = false;
   this->loop_nesting_ast = NULL;
-   this->switch_nesting_ast = NULL;
+   this->switch_state.switch_nesting_ast = NULL;

   this->num_builtins_to_link = 0;

@@ -114,6 +114,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *ctx,
   }

   this->supported_version_string = supported;
+
+   if (ctx->Const.ForceGLSLExtensionsWarn)
+      _mesa_glsl_process_extension("all", NULL, "warn", NULL, this);
 }

 const char *
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -42,6 +42,20 @@ enum _mesa_glsl_parser_targets {

 struct gl_context;

+struct glsl_switch_state {
+   /** Temporary variables needed for switch statement. */
+   ir_variable *test_var;
+   ir_variable *is_fallthru_var;
+   ir_variable *is_break_var;
+   class ast_switch_statement *switch_nesting_ast;
+
+   /** Table of constant values already used in case labels */
+   struct hash_table *labels_ht;
+   class ast_case_label *previous_default;
+
+   bool is_switch_innermost; // if switch stmt is closest to break, ...
+};
+
 struct _mesa_glsl_parse_state {
   _mesa_glsl_parse_state(struct gl_context *ctx, GLenum target,
 			  void *mem_ctx);
@@ -150,13 +164,8 @@ struct _mesa_glsl_parse_state {

   /** Loop or switch statement containing the current instructions. */
   class ast_iteration_statement *loop_nesting_ast;
-   class ast_switch_statement *switch_nesting_ast;
-   bool is_switch_innermost; // if switch stmt is closest to break, ...

-   /** Temporary variables needed for switch statement. */
-   ir_variable *test_var;
-   ir_variable *is_fallthru_var;
-   ir_variable *is_break_var;
+   struct glsl_switch_state switch_state;

   /** List of structures defined in user code. */
   const glsl_type **user_structures;
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -147,12 +147,12 @@ glsl_type::sampler_index() const
      return TEXTURE_RECT_INDEX;
   case GLSL_SAMPLER_DIM_BUF:
      assert(!"FINISHME: Implement ARB_texture_buffer_object");
-      break;
+      return TEXTURE_BUFFER_INDEX;
   case GLSL_SAMPLER_DIM_EXTERNAL:
      return TEXTURE_EXTERNAL_INDEX;
   default:
      assert(!"Should not get here.");
-      break;
+      return TEXTURE_BUFFER_INDEX;
   }
 }

--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -287,7 +287,7 @@ private:
 	 this->uniforms[id].sampler = ~0;
      }

-      this->uniforms[id].name = strdup(name);
+      this->uniforms[id].name = ralloc_strdup(this->uniforms, name);
      this->uniforms[id].type = base_type;
      this->uniforms[id].initialized = 0;
      this->uniforms[id].num_driver_storage = 0;
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1388,9 +1388,10 @@ public:
   static bool is_same(const tfeedback_decl &x, const tfeedback_decl &y);
   bool assign_location(struct gl_context *ctx, struct gl_shader_program *prog,
                        ir_variable *output_var);
+   bool accumulate_num_outputs(struct gl_shader_program *prog, unsigned *count);
   bool store(struct gl_context *ctx, struct gl_shader_program *prog,
              struct gl_transform_feedback_info *info, unsigned buffer,
-	      unsigned varying) const;
+              unsigned varying, const unsigned max_outputs) const;


   /**
@@ -1624,16 +1625,9 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
 }


-/**
- * Update gl_transform_feedback_info to reflect this tfeedback_decl.
- *
- * If an error occurs, the error is reported through linker_error() and false
- * is returned.
- */
 bool
-tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
-                      struct gl_transform_feedback_info *info,
-                      unsigned buffer, unsigned varying) const
+tfeedback_decl::accumulate_num_outputs(struct gl_shader_program *prog,
+                                       unsigned *count)
 {
   if (!this->is_assigned()) {
      /* From GL_EXT_transform_feedback:
@@ -1648,6 +1642,28 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
      return false;
   }

+   unsigned translated_size = this->size;
+   if (this->is_clip_distance_mesa)
+      translated_size = (translated_size + 3) / 4;
+
+   *count += translated_size * this->matrix_columns;
+
+   return true;
+}
+
+
+/**
+ * Update gl_transform_feedback_info to reflect this tfeedback_decl.
+ *
+ * If an error occurs, the error is reported through linker_error() and false
+ * is returned.
+ */
+bool
+tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
+                      struct gl_transform_feedback_info *info,
+                      unsigned buffer,
+                      unsigned varying, const unsigned max_outputs) const
+{
   /* From GL_EXT_transform_feedback:
    *   A program will fail to link if:
    *
@@ -1663,19 +1679,6 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
      return false;
   }

-   /* Verify that the checks on MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS
-    * and MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS are sufficient to prevent
-    * overflow of info->Outputs[].  In worst case we generate one entry in
-    * Outputs[] per component so a conservative check is to verify that the
-    * size of the array is greater than or equal to both
-    * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS and
-    * MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS.
-    */
-   assert(Elements(info->Outputs) >=
-          ctx->Const.MaxTransformFeedbackInterleavedComponents);
-   assert(Elements(info->Outputs) >=
-          ctx->Const.MaxTransformFeedbackSeparateComponents);
-
   unsigned translated_size = this->size;
   if (this->is_clip_distance_mesa)
      translated_size = (translated_size + 3) / 4;
@@ -1683,6 +1686,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
   for (unsigned index = 0; index < translated_size; ++index) {
      for (unsigned v = 0; v < this->matrix_columns; ++v) {
         unsigned num_components = this->vector_elements;
+         assert(info->NumOutputs < max_outputs);
         info->Outputs[info->NumOutputs].ComponentOffset = 0;
         if (this->is_clip_distance_mesa) {
            if (this->is_subscripted) {
@@ -1976,6 +1980,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
      prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS;

   ralloc_free(prog->LinkedTransformFeedback.Varyings);
+   ralloc_free(prog->LinkedTransformFeedback.Outputs);

   memset(&prog->LinkedTransformFeedback, 0,
          sizeof(prog->LinkedTransformFeedback));
@@ -1984,16 +1989,27 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
      separate_attribs_mode ? num_tfeedback_decls : 1;

   prog->LinkedTransformFeedback.Varyings =
-      rzalloc_array(prog->LinkedTransformFeedback.Varyings,
+      rzalloc_array(prog,
 		    struct gl_transform_feedback_varying_info,
 		    num_tfeedback_decls);

+   unsigned num_outputs = 0;
+   for (unsigned i = 0; i < num_tfeedback_decls; ++i)
+      if (!tfeedback_decls[i].accumulate_num_outputs(prog, &num_outputs))
+         return false;
+
+   prog->LinkedTransformFeedback.Outputs =
+      rzalloc_array(prog,
+                    struct gl_transform_feedback_output,
+                    num_outputs);
+
   for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
      unsigned buffer = separate_attribs_mode ? i : 0;
      if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
-                                    buffer, i))
+                                    buffer, i, num_outputs))
         return false;
   }
+   assert(prog->LinkedTransformFeedback.NumOutputs == num_outputs);

   return true;
 }
--- a/src/glsl/s_expression.cpp
+++ b/src/glsl/s_expression.cpp
@@ -23,6 +23,7 @@
 */

 #include <assert.h>
+#include <limits>
 #include "s_expression.h"

 s_symbol::s_symbol(const char *str, size_t n)
@@ -64,21 +65,28 @@ read_atom(void *ctx, const char *&src, char *&symbol_buffer)
   if (n == 0)
      return NULL; // no atom

-   // Check if the atom is a number.
-   char *float_end = NULL;
-   double f = glsl_strtod(src, &float_end);
-   if (float_end != src) {
-      char *int_end = NULL;
-      int i = strtol(src, &int_end, 10);
-      // If strtod matched more characters, it must have a decimal part
-      if (float_end > int_end)
-	 expr = new(ctx) s_float(f);
-      else
-	 expr = new(ctx) s_int(i);
+   // Check for the special symbol '+INF', which means +Infinity.  Note: C99
+   // requires strtod to parse '+INF' as +Infinity, but we still support some
+   // non-C99-compliant compilers (e.g. MSVC).
+   if (n == 4 && strncmp(src, "+INF", 4) == 0) {
+      expr = new(ctx) s_float(std::numeric_limits<float>::infinity());
   } else {
-      // Not a number; return a symbol.
-      symbol_buffer[n] = '\0';
-      expr = new(ctx) s_symbol(symbol_buffer, n);
+      // Check if the atom is a number.
+      char *float_end = NULL;
+      double f = glsl_strtod(src, &float_end);
+      if (float_end != src) {
+         char *int_end = NULL;
+         int i = strtol(src, &int_end, 10);
+         // If strtod matched more characters, it must have a decimal part
+         if (float_end > int_end)
+            expr = new(ctx) s_float(f);
+         else
+            expr = new(ctx) s_int(i);
+      } else {
+         // Not a number; return a symbol.
+         symbol_buffer[n] = '\0';
+         expr = new(ctx) s_symbol(symbol_buffer, n);
+      }
   }

   src += n;
--- a/src/mapi/glapi/glapi_nop.c
+++ b/src/mapi/glapi/glapi_nop.c
@@ -51,7 +51,11 @@ _glapi_set_warning_func(_glapi_proc func)
 {
 }

-#ifdef DEBUG
+/*
+ * When GLAPIENTRY is __stdcall (i.e. Windows), the stack is popped by the
+ * callee making the number/type of arguments significant.
+ */
+#if defined(_WIN32) || defined(DEBUG)

 /**
 * Called by each of the no-op GL entrypoints.
@@ -59,7 +63,7 @@ _glapi_set_warning_func(_glapi_proc func)
 static int
 Warn(const char *func)
 {
-#if !defined(_WIN32_WCE)
+#if defined(DEBUG) && !defined(_WIN32_WCE)
   if (getenv("MESA_DEBUG") || getenv("LIBGL_DEBUG")) {
      fprintf(stderr, "GL User Error: gl%s called without a rendering context\n",
              func);
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -119,8 +119,6 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
   driver->FreeTextureImageBuffer = _swrast_free_texture_image_buffer;
   driver->MapTextureImage = _swrast_map_teximage;
   driver->UnmapTextureImage = _swrast_unmap_teximage;
-   driver->MapTexture = NULL;
-   driver->UnmapTexture = NULL;
   driver->DrawTex = _mesa_meta_DrawTex;

   /* Vertex/fragment programs */
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -1451,7 +1451,12 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
   struct vertex verts[4];
   GLboolean newTex;

-   if (srcW > maxTexSize || srcH > maxTexSize) {
+   /* In addition to falling back if the blit size is larger than the maximum
+    * texture size, fall back if the source is multisampled.  This fallback can
+    * be removed once Mesa gets support for ARB_texture_multisample.
+    */
+   if (srcW > maxTexSize || srcH > maxTexSize
+       || ctx->ReadBuffer->Visual.samples > 0) {
      /* XXX avoid this fallback */
      _swrast_BlitFramebuffer(ctx, srcX0, srcY0, srcX1, srcY1,
                              dstX0, dstY0, dstX1, dstY1, mask, filter);
@@ -2920,8 +2925,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,

   /* setup texcoords (XXX what about border?) */
   setup_texture_coords(faceTarget,
-                        0.0f, 0.0f, /* width, height never used here */
                        slice,
+                        0, 0, /* width, height never used here */
                        verts[0].tex,
                        verts[1].tex,
                        verts[2].tex,
@@ -3071,6 +3076,8 @@ get_temp_image_type(struct gl_context *ctx, GLenum baseFormat)
   switch (baseFormat) {
   case GL_RGBA:
   case GL_RGB:
+   case GL_RG:
+   case GL_RED:
   case GL_ALPHA:
   case GL_LUMINANCE:
   case GL_LUMINANCE_ALPHA:
@@ -3086,7 +3093,8 @@ get_temp_image_type(struct gl_context *ctx, GLenum baseFormat)
   case GL_DEPTH_STENCIL:
      return GL_UNSIGNED_INT_24_8;
   default:
-      _mesa_problem(ctx, "Unexpected format in get_temp_image_type()");
+      _mesa_problem(ctx, "Unexpected format %d in get_temp_image_type()",
+		    baseFormat);
      return 0;
   }
 }
@@ -3123,6 +3131,11 @@ copy_tex_sub_image(struct gl_context *ctx,
      format = GL_RGBA;
   }

+   if (_mesa_is_format_integer_color(texImage->TexFormat)) {
+      _mesa_problem(ctx, "unsupported integer color copyteximage");
+      return;
+   }
+
   type = get_temp_image_type(ctx, format);
   bpp = _mesa_bytes_per_pixel(format, type);
   if (bpp <= 0) {
--- a/src/mesa/drivers/dri/common/depthtmp.h
+++ b/src/mesa/drivers/dri/common/depthtmp.h
@@ -1,218 +0,0 @@
-
-/*
- * Notes:
- * 1. These functions plug into the gl_renderbuffer structure.
- * 2. The 'values' parameter always points to GLuint values, regardless of
- *    the actual Z buffer depth.
- */
-
-
-#include "spantmp_common.h"
-
-#ifndef DBG
-#define DBG 0
-#endif
-
-#ifndef HAVE_HW_DEPTH_SPANS
-#define HAVE_HW_DEPTH_SPANS 0
-#endif
-
-#ifndef HAVE_HW_DEPTH_PIXELS
-#define HAVE_HW_DEPTH_PIXELS 0
-#endif
-
-static void TAG(WriteDepthSpan)( struct gl_context *ctx,
-                                 struct gl_renderbuffer *rb,
-                                 GLuint n, GLint x, GLint y,
-				 const void *values,
-				 const GLubyte mask[] )
-{
-   HW_WRITE_LOCK()
-      {
-         const VALUE_TYPE *depth = (const VALUE_TYPE *) values;
-	 GLint x1;
-	 GLint n1;
-	 LOCAL_DEPTH_VARS;
-
-	 y = Y_FLIP( y );
-
-#if HAVE_HW_DEPTH_SPANS
-	 (void) x1; (void) n1;
-
-	 if ( DBG ) fprintf( stderr, "WriteDepthSpan 0..%d (x1 %d)\n",
-			     (int)n, (int)x );
-
-	 WRITE_DEPTH_SPAN();
-#else
-	 HW_CLIPLOOP()
-	    {
-	       GLint i = 0;
-	       CLIPSPAN( x, y, n, x1, n1, i );
-
-	       if ( DBG ) fprintf( stderr, "WriteDepthSpan %d..%d (x1 %d) (mask %p)\n",
-				   (int)i, (int)n1, (int)x1, mask );
-
-	       if ( mask ) {
-		  for ( ; n1>0 ; i++, x1++, n1-- ) {
-		     if ( mask[i] ) WRITE_DEPTH( x1, y, depth[i] );
-		  }
-	       } else {
-		  for ( ; n1>0 ; i++, x1++, n1-- ) {
-		     WRITE_DEPTH( x1, y, depth[i] );
-		  }
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif
-      }
-   HW_WRITE_UNLOCK();
-
-   (void) ctx;
-}
-
-
-static void TAG(WriteDepthPixels)( struct gl_context *ctx,
-                                   struct gl_renderbuffer *rb,
-				   GLuint n,
-				   const GLint x[],
-				   const GLint y[],
-				   const void *values,
-				   const GLubyte mask[] )
-{
-   HW_WRITE_LOCK()
-      {
-         const VALUE_TYPE *depth = (const VALUE_TYPE *) values;
-	 GLuint i;
-	 LOCAL_DEPTH_VARS;
-
-	 if ( DBG ) fprintf( stderr, "WriteDepthPixels\n" );
-
-#if HAVE_HW_DEPTH_PIXELS
-	 (void) i;
-
-	 WRITE_DEPTH_PIXELS();
-#else
-	 HW_CLIPLOOP()
-	    {
-	       if ( mask ) {
-		  for ( i = 0 ; i < n ; i++ ) {
-		     if ( mask[i] ) {
-			const int fy = Y_FLIP( y[i] );
-			if ( CLIPPIXEL( x[i], fy ) )
-			   WRITE_DEPTH( x[i], fy, depth[i] );
-		     }
-		  }
-	       }
-	       else {
-		  for ( i = 0 ; i < n ; i++ ) {
-		     const int fy = Y_FLIP( y[i] );
-		     if ( CLIPPIXEL( x[i], fy ) )
-			WRITE_DEPTH( x[i], fy, depth[i] );
-		  }
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif
-      }
-   HW_WRITE_UNLOCK();
-
-   (void) ctx;
-}
-
-
-/* Read depth spans and pixels
- */
-static void TAG(ReadDepthSpan)( struct gl_context *ctx,
-                                struct gl_renderbuffer *rb,
-				GLuint n, GLint x, GLint y,
-				void *values )
-{
-   HW_READ_LOCK()
-      {
-         VALUE_TYPE *depth = (VALUE_TYPE *) values;
-	 GLint x1, n1;
-	 LOCAL_DEPTH_VARS;
-
-	 y = Y_FLIP( y );
-
-	 if ( DBG ) fprintf( stderr, "ReadDepthSpan\n" );
-
-#if HAVE_HW_DEPTH_SPANS
-	 (void) x1; (void) n1;
-
-	 READ_DEPTH_SPAN();
-#else
-	 HW_CLIPLOOP()
-	    {
-	       GLint i = 0;
-	       CLIPSPAN( x, y, n, x1, n1, i );
-	       for ( ; n1>0 ; i++, n1-- ) {
-		  READ_DEPTH( depth[i], x+i, y );
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif
-      }
-   HW_READ_UNLOCK();
-}
-
-static void TAG(ReadDepthPixels)( struct gl_context *ctx,
-                                  struct gl_renderbuffer *rb,
-                                  GLuint n,
-				  const GLint x[], const GLint y[],
-				  void *values )
-{
-   HW_READ_LOCK()
-      {
-         VALUE_TYPE *depth = (VALUE_TYPE *) values;
-	 GLuint i;
-	 LOCAL_DEPTH_VARS;
-
-	 if ( DBG ) fprintf( stderr, "ReadDepthPixels\n" );
-
-#if HAVE_HW_DEPTH_PIXELS
-	 (void) i;
-
-	 READ_DEPTH_PIXELS();
-#else
-	 HW_CLIPLOOP()
-	    {
-	       for ( i = 0 ; i < n ;i++ ) {
-		  int fy = Y_FLIP( y[i] );
-		  if ( CLIPPIXEL( x[i], fy ) )
-		     READ_DEPTH( depth[i], x[i], fy );
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif
-      }
-   HW_READ_UNLOCK();
-
-   (void) ctx;
-}
-
-
-/**
- * Initialize the given renderbuffer's span routines to point to
- * the depth/z functions we generated above.
- */
-static void TAG(InitDepthPointers)(struct gl_renderbuffer *rb)
-{
-   rb->GetRow = TAG(ReadDepthSpan);
-   rb->GetValues = TAG(ReadDepthPixels);
-   rb->PutRow = TAG(WriteDepthSpan);
-   rb->PutValues = TAG(WriteDepthPixels);
-}
-
-
-#if HAVE_HW_DEPTH_SPANS
-#undef WRITE_DEPTH_SPAN
-#undef WRITE_DEPTH_PIXELS
-#undef READ_DEPTH_SPAN
-#undef READ_DEPTH_PIXELS
-#else
-#undef WRITE_DEPTH
-#undef READ_DEPTH
-#endif
-#undef TAG
-#undef VALUE_TYPE
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -0,0 +1,10 @@
+<driconf>
+    <device screen="0" driver="i965">
+        <application executable="Sanctuary">
+            <option name="force_glsl_extensions_warn" value="true" />
+	</application>
+        <application executable="Tropics">
+            <option name="force_glsl_extensions_warn" value="true" />
+	</application>
+    </device>
+</driconf>
--- a/src/mesa/drivers/dri/common/spantmp2.h
+++ b/src/mesa/drivers/dri/common/spantmp2.h
@@ -1,777 +0,0 @@
-/*
- * Copyright 2000-2001 VA Linux Systems, Inc.
- * (C) Copyright IBM Corporation 2004
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * VA LINUX SYSTEM, IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file spantmp2.h
- *
- * Template file of span read / write functions.
- *
- * \author Keith Whitwell <keithw@tungstengraphics.com>
- * \author Gareth Hughes <gareth@nvidia.com>
- * \author Ian Romanick <idr@us.ibm.com>
- */
-
-#include "main/colormac.h"
-#include "spantmp_common.h"
-
-#ifndef DBG
-#define DBG 0
-#endif
-
-#ifndef HW_READ_CLIPLOOP
-#define HW_READ_CLIPLOOP()	HW_CLIPLOOP()
-#endif
-
-#ifndef HW_WRITE_CLIPLOOP
-#define HW_WRITE_CLIPLOOP()	HW_CLIPLOOP()
-#endif
-
-#if (SPANTMP_PIXEL_FMT == GL_RGB)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)
-
-/**
- ** GL_RGB, GL_UNSIGNED_SHORT_5_6_5
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )				\
-   PUT_VALUE(_x, _y, ((((int)r & 0xf8) << 8) |				\
-		      (((int)g & 0xfc) << 3) |				\
-		      (((int)b & 0xf8) >> 3)))				\
-
-#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )					\
-   do {									\
-      GLushort p = GET_VALUE(_x, _y);					\
-      rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8;				\
-      rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc;				\
-      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
-      rgba[3] = 0xff;							\
-   } while (0)
-
-#elif (SPANTMP_PIXEL_FMT == GL_RGB)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5_REV)
-
-/**
- ** GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )				\
-   PUT_VALUE(_x, _y, PACK_COLOR_565_REV( r, g, b ))
-
-#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )					\
-   do {									\
-      GLushort p = GET_VALUE(_x, _y);					\
-      p = p << 8 | p >> 8;						\
-      rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8;				\
-      rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc;				\
-      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
-      rgba[3] = 0xff;							\
-   } while (0)
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_4_4_4_4)
-
-/**
- ** GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )				\
-   PUT_VALUE(_x, _y, PACK_COLOR_4444_REV(a, r, g, b))			\
-
-#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )					\
-   do {									\
-      GLushort p = GET_VALUE(_x, _y);					\
-      rgba[0] = ((p >> 0) & 0xf) * 0x11;				\
-      rgba[1] = ((p >> 12) & 0xf) * 0x11;				\
-      rgba[2] = ((p >> 4) & 0xf) * 0x11;				\
-      rgba[3] = ((p >> 8) & 0xf) * 0x11;				\
-   } while (0)
-
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_4_4_4_4_REV)
-
-/**
- ** GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4_REV
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )				\
-   PUT_VALUE(_x, _y, PACK_COLOR_4444(a, r, g, b))			\
-
-#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )					\
-   do {									\
-      GLushort p = GET_VALUE(_x, _y);					\
-      rgba[0] = ((p >> 8) & 0xf) * 0x11;				\
-      rgba[1] = ((p >> 4) & 0xf) * 0x11;				\
-      rgba[2] = ((p >> 0) & 0xf) * 0x11;				\
-      rgba[3] = ((p >> 12) & 0xf) * 0x11;				\
-   } while (0)
-
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_1_5_5_5_REV)
-
-/**
- ** GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )				\
-   PUT_VALUE(_x, _y, PACK_COLOR_1555(a, r, g, b))			\
-
-#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )					\
-   do {									\
-      GLushort p = GET_VALUE(_x, _y);					\
-      rgba[0] = ((p >> 7) & 0xf8) * 255 / 0xf8;				\
-      rgba[1] = ((p >> 2) & 0xf8) * 255 / 0xf8;				\
-      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
-      rgba[3] = ((p >> 15) & 0x1) * 0xff;				\
-   } while (0)
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_1_5_5_5)
-
-/**
- ** GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )				\
-   PUT_VALUE(_x, _y, PACK_COLOR_1555_REV(a, r, g, b))			\
-
-#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )					\
-   do {									\
-      GLushort p = GET_VALUE(_x, _y);					\
-      p = p << 8 | p >> 8;						\
-      rgba[0] = ((p >> 7) & 0xf8) * 255 / 0xf8;				\
-      rgba[1] = ((p >> 2) & 0xf8) * 255 / 0xf8;				\
-      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
-      rgba[3] = ((p >> 15) & 0x1) * 0xff;				\
-   } while (0)
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-
-/**
- ** GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (     buf + (_x) * 4 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLuint *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLuint *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-# define WRITE_RGBA(_x, _y, r, g, b, a)                                 \
-   PUT_VALUE(_x, _y, ((r << 16) |					\
-		      (g << 8) |					\
-		      (b << 0) |					\
-		      (a << 24)))
-
-#define WRITE_PIXEL(_x, _y, p) PUT_VALUE(_x, _y, p)
-
-# if defined( USE_X86_ASM )
-#  define READ_RGBA(rgba, _x, _y)                                       \
-    do {                                                                \
-       GLuint p = GET_VALUE(_x, _y);					\
-       __asm__ __volatile__( "bswap	%0; rorl $8, %0"                \
-				: "=r" (p) : "0" (p) );                 \
-       ((GLuint *)rgba)[0] = p;                                         \
-    } while (0)
-# elif defined( MESA_BIG_ENDIAN )
-    /* On PowerPC with GCC 3.4.2 the shift madness below becomes a single
-     * rotlwi instruction.  It also produces good code on SPARC.
-     */
-#  define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLuint p = GET_VALUE(_x, _y);					\
-        GLuint t = p;                                                   \
-        *((uint32_t *) rgba) = (t >> 24) | (p << 8);                    \
-     } while (0)
-# else
-#  define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLuint p = GET_VALUE(_x, _y);					\
-	rgba[0] = (p >> 16) & 0xff;					\
-	rgba[1] = (p >>  8) & 0xff;					\
-	rgba[2] = (p >>  0) & 0xff;					\
-	rgba[3] = (p >> 24) & 0xff;					\
-     } while (0)
-# endif
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8)
-
-/**
- ** GL_BGRA, GL_UNSIGNED_INT_8_8_8_8
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (     buf + (_x) * 4 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLuint *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLuint *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-# define WRITE_RGBA(_x, _y, r, g, b, a)                                 \
-   PUT_VALUE(_x, _y, ((r << 8) |					\
-		      (g << 16) |					\
-		      (b << 24) |					\
-		      (a << 0)))
-
-#define WRITE_PIXEL(_x, _y, p) PUT_VALUE(_x, _y, p)
-
-# if defined( USE_X86_ASM )
-#  define READ_RGBA(rgba, _x, _y)                                       \
-    do {                                                                \
-       GLuint p = GET_VALUE(_x, _y);					\
-       __asm__ __volatile__( "rorl $8, %0"				\
-				: "=r" (p) : "0" (p) );                 \
-       ((GLuint *)rgba)[0] = p;                                         \
-    } while (0)
-# elif defined( MESA_BIG_ENDIAN )
-    /* On PowerPC with GCC 3.4.2 the shift madness below becomes a single
-     * rotlwi instruction.  It also produces good code on SPARC.
-     */
-#  define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLuint p = CPU_TO_LE32(GET_VALUE(_x, _y));                      \
-        GLuint t = p;                                                   \
-        *((uint32_t *) rgba) = (t >> 24) | (p << 8);                    \
-     } while (0)
-# else
-#  define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLuint p = GET_VALUE(_x, _y);					\
-	rgba[0] = (p >>  8) & 0xff;					\
-	rgba[1] = (p >> 16) & 0xff;					\
-	rgba[2] = (p >> 24) & 0xff;					\
-	rgba[3] = (p >>  0) & 0xff;					\
-     } while (0)
-# endif
-
-#elif (SPANTMP_PIXEL_FMT == GL_BGR) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-
-/**
- ** GL_BGR, GL_UNSIGNED_INT_8_8_8_8_REV
- **
- ** This is really for MESA_FORMAT_XRGB8888.  The spantmp code needs to be
- ** kicked to the curb, and we need to just code-gen this.
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (     buf + (_x) * 4 + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLuint *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLuint *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-# define WRITE_RGBA(_x, _y, r, g, b, a)					\
-   PUT_VALUE(_x, _y, ((r << 16) |					\
-		      (g << 8) |					\
-		      (b << 0) |					\
-		      (0xff << 24)))
-
-#define WRITE_PIXEL(_x, _y, p) PUT_VALUE(_x, _y, p)
-
-# if defined( USE_X86_ASM )
-#  define READ_RGBA(rgba, _x, _y)                                       \
-    do {                                                                \
-       GLuint p = GET_VALUE(_x, _y);					\
-       __asm__ __volatile__( "bswap	%0; rorl $8, %0"                \
-				: "=r" (p) : "0" (p) );                 \
-       ((GLuint *)rgba)[0] = p | 0xff000000;				\
-    } while (0)
-# elif defined( MESA_BIG_ENDIAN )
-    /* On PowerPC with GCC 3.4.2 the shift madness below becomes a single
-     * rotlwi instruction.  It also produces good code on SPARC.
-     */
-#  define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLuint p = GET_VALUE(_x, _y);					\
-        *((uint32_t *) rgba) = (p << 8) | 0xff;				\
-     } while (0)
-# else
-#  define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLuint p = GET_VALUE(_x, _y);					\
-	rgba[0] = (p >> 16) & 0xff;					\
-	rgba[1] = (p >>  8) & 0xff;					\
-	rgba[2] = (p >>  0) & 0xff;					\
-	rgba[3] = 0xff;							\
-     } while (0)
-# endif
-
-#elif (SPANTMP_PIXEL_FMT == GL_ALPHA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_BYTE)
-
-/**
- ** GL_ALPHA, GL_UNSIGNED_BYTE
- **/
-
-#ifndef GET_VALUE
-#ifndef GET_PTR
-#define GET_PTR(_x, _y) (     buf + (_x) + (_y) * pitch)
-#endif
-
-#define GET_VALUE(_x, _y) *(volatile GLubyte *)(GET_PTR(_x, _y))
-#define PUT_VALUE(_x, _y, _v) *(volatile GLubyte *)(GET_PTR(_x, _y)) = (_v)
-#endif /* GET_VALUE */
-
-# define WRITE_RGBA(_x, _y, r, g, b, a)                                 \
-   PUT_VALUE(_x, _y, a | (r & 0 /* quiet warnings */))
-
-#define WRITE_PIXEL(_x, _y, p) PUT_VALUE(_x, _y, p)
-
-#define READ_RGBA( rgba, _x, _y )				        \
-     do {								\
-        GLubyte p = GET_VALUE(_x, _y);					\
-	rgba[0] = 0;							\
-	rgba[1] = 0;							\
-	rgba[2] = 0;							\
-	rgba[3] = p;							\
-     } while (0)
-
-#else
-#error SPANTMP_PIXEL_FMT must be set to a valid value!
-#endif
-
-
-
-/**
- ** Assembly routines.
- **/
-
-#if defined( USE_MMX_ASM ) || defined( USE_SSE_ASM )
-#include "x86/read_rgba_span_x86.h"
-#include "x86/common_x86_asm.h"
-#endif
-
-static void TAG(WriteRGBASpan)( struct gl_context *ctx,
-                                struct gl_renderbuffer *rb,
-				GLuint n, GLint x, GLint y,
-				const void *values, const GLubyte mask[] )
-{
-   (void) ctx;
-
-   HW_WRITE_LOCK()
-      {
-         const GLubyte (*rgba)[4] = (const GLubyte (*)[4]) values;
-	 GLint x1;
-	 GLint n1;
-	 LOCAL_VARS;
-
-	 y = Y_FLIP(y);
-
-	 HW_WRITE_CLIPLOOP()
-	    {
-	       GLint i = 0;
-	       CLIPSPAN(x,y,n,x1,n1,i);
-
-	       if (DBG) fprintf(stderr, "WriteRGBASpan %d..%d (x1 %d)\n",
-				(int)i, (int)n1, (int)x1);
-
-	       if (mask)
-	       {
-		  for (;n1>0;i++,x1++,n1--)
-		     if (mask[i])
-			WRITE_RGBA( x1, y,
-				    rgba[i][0], rgba[i][1],
-				    rgba[i][2], rgba[i][3] );
-	       }
-	       else
-	       {
-		  for (;n1>0;i++,x1++,n1--)
-		     WRITE_RGBA( x1, y,
-				 rgba[i][0], rgba[i][1],
-				 rgba[i][2], rgba[i][3] );
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-      }
-   HW_WRITE_UNLOCK();
-}
-
-
-static void TAG(WriteRGBAPixels)( struct gl_context *ctx,
-                                  struct gl_renderbuffer *rb,
-                                  GLuint n, const GLint x[], const GLint y[],
-                                  const void *values, const GLubyte mask[] )
-{
-   (void) ctx;
-
-   HW_WRITE_LOCK()
-      {
-         const GLubyte (*rgba)[4] = (const GLubyte (*)[4]) values;
-	 GLint i;
-	 LOCAL_VARS;
-
-	 if (DBG) fprintf(stderr, "WriteRGBAPixels\n");
-
-	 HW_WRITE_CLIPLOOP()
-	    {
-	       if (mask)
-	       {
-	          for (i=0;i<n;i++)
-	          {
-		     if (mask[i]) {
-		        const int fy = Y_FLIP(y[i]);
-		        if (CLIPPIXEL(x[i],fy))
-			   WRITE_RGBA( x[i], fy,
-				       rgba[i][0], rgba[i][1],
-				       rgba[i][2], rgba[i][3] );
-		     }
-	          }
-	       }
-	       else
-	       {
-	          for (i=0;i<n;i++)
-	          {
-		     const int fy = Y_FLIP(y[i]);
-		     if (CLIPPIXEL(x[i],fy))
-			WRITE_RGBA( x[i], fy,
-				    rgba[i][0], rgba[i][1],
-				    rgba[i][2], rgba[i][3] );
-	          }
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-      }
-   HW_WRITE_UNLOCK();
-}
-
-
-static void TAG(ReadRGBASpan)( struct gl_context *ctx,
-                               struct gl_renderbuffer *rb,
-			       GLuint n, GLint x, GLint y, void *values)
-{
-   (void) ctx;
-
-   HW_READ_LOCK()
-      {
-         GLubyte (*rgba)[4] = (GLubyte (*)[4]) values;
-	 GLint x1,n1;
-	 LOCAL_VARS;
-
-	 y = Y_FLIP(y);
-
-	 if (DBG) fprintf(stderr, "ReadRGBASpan\n");
-
-	 HW_READ_CLIPLOOP()
-	    {
-	       GLint i = 0;
-	       CLIPSPAN(x,y,n,x1,n1,i);
-	       for (;n1>0;i++,x1++,n1--)
-		  READ_RGBA( rgba[i], x1, y );
-	    }
-         HW_ENDCLIPLOOP();
-      }
-   HW_READ_UNLOCK();
-}
-
-
-#if defined(GET_PTR) && \
-   defined(USE_MMX_ASM) && \
-   (((SPANTMP_PIXEL_FMT == GL_BGRA) && \
-	(SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)) || \
-    ((SPANTMP_PIXEL_FMT == GL_RGB) && \
-	(SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)))
-static void TAG2(ReadRGBASpan,_MMX)( struct gl_context *ctx,
-                                     struct gl_renderbuffer *rb,
-                                     GLuint n, GLint x, GLint y, void *values)
-{
-#ifndef USE_INNER_EMMS
-   /* The EMMS instruction is directly in-lined here because using GCC's
-    * built-in _mm_empty function was found to utterly destroy performance.
-    */
-   __asm__ __volatile__( "emms" );
-#endif
-
-   (void) ctx;
-
-   HW_READ_LOCK()
-     {
-        GLubyte (*rgba)[4] = (GLubyte (*)[4]) values;
-	GLint x1,n1;
-	LOCAL_VARS;
-
-	y = Y_FLIP(y);
-
-	if (DBG) fprintf(stderr, "ReadRGBASpan\n");
-
-	HW_READ_CLIPLOOP()
-	  {
-	     GLint i = 0;
-	     CLIPSPAN(x,y,n,x1,n1,i);
-
-	       {
-		  const void * src = GET_PTR( x1, y );
-#if (SPANTMP_PIXEL_FMT == GL_RGB) && \
-		  (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)
-		  _generic_read_RGBA_span_RGB565_MMX( src, rgba[i], n1 );
-#else
-		  _generic_read_RGBA_span_BGRA8888_REV_MMX( src, rgba[i], n1 );
-#endif
-	       }
-	  }
-	HW_ENDCLIPLOOP();
-     }
-   HW_READ_UNLOCK();
-#ifndef USE_INNER_EMMS
-   __asm__ __volatile__( "emms" );
-#endif
-}
-#endif
-
-
-#if defined(GET_PTR) &&	\
-   defined(USE_SSE_ASM) && \
-   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
-     (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-static void TAG2(ReadRGBASpan,_SSE2)( struct gl_context *ctx,
-                                      struct gl_renderbuffer *rb,
-                                      GLuint n, GLint x, GLint y,
-                                      void *values)
-{
-   (void) ctx;
-
-   HW_READ_LOCK()
-     {
-        GLubyte (*rgba)[4] = (GLubyte (*)[4]) values;
-	GLint x1,n1;
-	LOCAL_VARS;
-
-	y = Y_FLIP(y);
-
-	if (DBG) fprintf(stderr, "ReadRGBASpan\n");
-
-	HW_READ_CLIPLOOP()
-	  {
-	     GLint i = 0;
-	     CLIPSPAN(x,y,n,x1,n1,i);
-
-	       {
-		  const void * src = GET_PTR( x1, y );
-		  _generic_read_RGBA_span_BGRA8888_REV_SSE2( src, rgba[i], n1 );
-	       }
-	  }
-	HW_ENDCLIPLOOP();
-     }
-   HW_READ_UNLOCK();
-}
-#endif
-
-#if defined(GET_PTR) &&	\
-   defined(USE_SSE_ASM) && \
-   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
-     (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-static void TAG2(ReadRGBASpan,_SSE)( struct gl_context *ctx,
-                                     struct gl_renderbuffer *rb,
-                                     GLuint n, GLint x, GLint y,
-                                     void *values)
-{
-#ifndef USE_INNER_EMMS
-   /* The EMMS instruction is directly in-lined here because using GCC's
-    * built-in _mm_empty function was found to utterly destroy performance.
-    */
-   __asm__ __volatile__( "emms" );
-#endif
-
-   (void) ctx;
-
-   HW_READ_LOCK()
-     {
-        GLubyte (*rgba)[4] = (GLubyte (*)[4]) values;
-	GLint x1,n1;
-	LOCAL_VARS;
-
-	y = Y_FLIP(y);
-
-	if (DBG) fprintf(stderr, "ReadRGBASpan\n");
-
-	HW_READ_CLIPLOOP()
-	  {
-	     GLint i = 0;
-	     CLIPSPAN(x,y,n,x1,n1,i);
-
-	       {
-		  const void * src = GET_PTR( x1, y );
-		  _generic_read_RGBA_span_BGRA8888_REV_SSE( src, rgba[i], n1 );
-	       }
-	  }
-	HW_ENDCLIPLOOP();
-     }
-   HW_READ_UNLOCK();
-#ifndef USE_INNER_EMMS
-   __asm__ __volatile__( "emms" );
-#endif
-}
-#endif
-
-
-static void TAG(ReadRGBAPixels)( struct gl_context *ctx,
-                                 struct gl_renderbuffer *rb,
-				 GLuint n, const GLint x[], const GLint y[],
-				 void *values )
-{
-   (void) ctx;
-
-   HW_READ_LOCK()
-      {
-         GLubyte (*rgba)[4] = (GLubyte (*)[4]) values;
-	 GLint i;
-	 LOCAL_VARS;
-
-	 if (DBG) fprintf(stderr, "ReadRGBAPixels\n");
-
-	 HW_READ_CLIPLOOP()
-	    {
-               for (i=0;i<n;i++) {
-                  int fy = Y_FLIP( y[i] );
-                     if (CLIPPIXEL( x[i], fy ))
-                        READ_RGBA( rgba[i], x[i], fy );
-               }
-	    }
-	 HW_ENDCLIPLOOP();
-      }
-   HW_READ_UNLOCK();
-}
-
-static void TAG(InitPointers)(struct gl_renderbuffer *rb)
-{
-   rb->PutRow = TAG(WriteRGBASpan);
-   rb->PutValues = TAG(WriteRGBAPixels);
-   rb->GetValues = TAG(ReadRGBAPixels);
-
-#if defined(GET_PTR)
-#if defined(USE_SSE_ASM) && \
-   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
-     (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-   if ( cpu_has_xmm2 ) {
-      if (DBG) fprintf( stderr, "Using %s version of GetRow\n", "SSE2" );
-      rb->GetRow = TAG2(ReadRGBASpan, _SSE2);
-   }
-   else
-#endif
-#if defined(USE_SSE_ASM) && \
-   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
-     (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-   if ( cpu_has_xmm ) {
-      if (DBG) fprintf( stderr, "Using %s version of GetRow\n", "SSE" );
-      rb->GetRow = TAG2(ReadRGBASpan, _SSE);
-   }
-   else
-#endif
-#if defined(USE_MMX_ASM) && \
-   (((SPANTMP_PIXEL_FMT == GL_BGRA) && \
-	(SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)) || \
-    ((SPANTMP_PIXEL_FMT == GL_RGB) && \
-	(SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)))
-   if ( cpu_has_mmx ) {
-      if (DBG) fprintf( stderr, "Using %s version of GetRow\n", "MMX" );
-      rb->GetRow = TAG2(ReadRGBASpan, _MMX);
-   }
-   else
-#endif
-#endif /* GET_PTR */
-   {
-      if (DBG) fprintf( stderr, "Using %s version of GetRow\n", "C" );
-      rb->GetRow = TAG(ReadRGBASpan);
-   }
-
-}
-
-
-#undef WRITE_PIXEL
-#undef WRITE_RGBA
-#undef READ_RGBA
-#undef TAG
-#undef TAG2
-#undef GET_VALUE
-#undef PUT_VALUE
-#undef GET_PTR
-#undef SPANTMP_PIXEL_FMT
-#undef SPANTMP_PIXEL_TYPE
--- a/src/mesa/drivers/dri/common/spantmp_common.h
+++ b/src/mesa/drivers/dri/common/spantmp_common.h
@@ -1,78 +0,0 @@
-/*
- * Copyright 2000-2001 VA Linux Systems, Inc.
- * (C) Copyright IBM Corporation 2004
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * VA LINUX SYSTEM, IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file spantmp_common.h
- *
- * common macros for span read / write functions to be used in the depth,
- * stencil and pixel span templates.
- */
-
-#ifndef HW_WRITE_LOCK
-#define HW_WRITE_LOCK()		HW_LOCK()
-#endif
-
-#ifndef HW_WRITE_UNLOCK
-#define HW_WRITE_UNLOCK()	HW_UNLOCK()
-#endif
-
-#ifndef HW_READ_LOCK
-#define HW_READ_LOCK()		HW_LOCK()
-#endif
-
-#ifndef HW_READ_UNLOCK
-#define HW_READ_UNLOCK()	HW_UNLOCK()
-#endif
-
-#ifndef HW_CLIPLOOP
-#define HW_CLIPLOOP()							\
-   do {									\
-      int minx = 0;							\
-      int miny = 0;							\
-      int maxx = dPriv->w;						\
-      int maxy = dPriv->h;
-#endif
-
-#ifndef HW_ENDCLIPLOOP
-#define HW_ENDCLIPLOOP()						\
-   } while (0)
-#endif
-
-#ifndef CLIPPIXEL
-#define CLIPPIXEL( _x, _y )						\
-   ((_x >= minx) && (_x < maxx) && (_y >= miny) && (_y < maxy))
-#endif
-
-#ifndef CLIPSPAN
-#define CLIPSPAN( _x, _y, _n, _x1, _n1, _i )				\
-   if ( _y < miny || _y >= maxy /*|| _x + n < minx || _x >=maxx*/ ) {	\
-      _n1 = 0, _x1 = x;							\
-   } else {								\
-      _n1 = _n;								\
-      _x1 = _x;								\
-      if ( _x1 < minx ) _i += (minx-_x1), n1 -= (minx-_x1), _x1 = minx; \
-      if ( _x1 + _n1 >= maxx ) n1 -= (_x1 + n1 - maxx);		        \
-   }
-#endif
--- a/src/mesa/drivers/dri/common/stenciltmp.h
+++ b/src/mesa/drivers/dri/common/stenciltmp.h
@@ -1,186 +0,0 @@
-
-#include "spantmp_common.h"
-
-#ifndef DBG
-#define DBG 0
-#endif
-
-#ifndef HAVE_HW_STENCIL_SPANS
-#define HAVE_HW_STENCIL_SPANS 0
-#endif
-
-#ifndef HAVE_HW_STENCIL_PIXELS
-#define HAVE_HW_STENCIL_PIXELS 0
-#endif
-
-static void TAG(WriteStencilSpan)( struct gl_context *ctx,
-                                   struct gl_renderbuffer *rb,
-				   GLuint n, GLint x, GLint y,
-				   const void *values, const GLubyte mask[] )
-{
-   HW_WRITE_LOCK()
-      {
-         const GLubyte *stencil = (const GLubyte *) values;
-	 GLint x1;
-	 GLint n1;
-	 LOCAL_STENCIL_VARS;
-
-	 y = Y_FLIP(y);
-
-#if HAVE_HW_STENCIL_SPANS
-	 (void) x1; (void) n1;
-
-	 if (DBG) fprintf(stderr, "WriteStencilSpan 0..%d (x1 %d)\n",
-			  (int)n1, (int)x1);
-
-	 WRITE_STENCIL_SPAN();
-#else /* HAVE_HW_STENCIL_SPANS */
-	 HW_CLIPLOOP() 
-	    {
-	       GLint i = 0;
-	       CLIPSPAN(x,y,n,x1,n1,i);
-
-	       if (DBG) fprintf(stderr, "WriteStencilSpan %d..%d (x1 %d)\n",
-				(int)i, (int)n1, (int)x1);
-
-	       if (mask)
-	       {
-		  for (;n1>0;i++,x1++,n1--)
-		     if (mask[i])
-			WRITE_STENCIL( x1, y, stencil[i] );
-	       }
-	       else
-	       {
-		  for (;n1>0;i++,x1++,n1--)
-		     WRITE_STENCIL( x1, y, stencil[i] );
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif /* !HAVE_HW_STENCIL_SPANS */
-      }
-   HW_WRITE_UNLOCK();
-}
-
-
-static void TAG(WriteStencilPixels)( struct gl_context *ctx,
-                                     struct gl_renderbuffer *rb,
-				     GLuint n,
-				     const GLint x[], const GLint y[],
-				     const void *values, const GLubyte mask[] )
-{
-   HW_WRITE_LOCK()
-      {
-         const GLubyte *stencil = (const GLubyte *) values;
-	 GLuint i;
-	 LOCAL_STENCIL_VARS;
-
-	 if (DBG) fprintf(stderr, "WriteStencilPixels\n");
-
-#if HAVE_HW_STENCIL_PIXELS
-	 (void) i;
-
-	 WRITE_STENCIL_PIXELS();
-#else /* HAVE_HW_STENCIL_PIXELS */
-	 HW_CLIPLOOP()
-	    {
-	       for (i=0;i<n;i++)
-	       {
-		  if (mask[i]) {
-		     const int fy = Y_FLIP(y[i]);
-		     if (CLIPPIXEL(x[i],fy))
-			WRITE_STENCIL( x[i], fy, stencil[i] );
-		  }
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif /* !HAVE_HW_STENCIL_PIXELS */
-      }
-   HW_WRITE_UNLOCK();
-}
-
-
-/* Read stencil spans and pixels
- */
-static void TAG(ReadStencilSpan)( struct gl_context *ctx,
-                                  struct gl_renderbuffer *rb,
-				  GLuint n, GLint x, GLint y,
-				  void *values)
-{
-   HW_READ_LOCK()
-      {
-         GLubyte *stencil = (GLubyte *) values;
-	 GLint x1,n1;
-	 LOCAL_STENCIL_VARS;
-
-	 y = Y_FLIP(y);
-
-	 if (DBG) fprintf(stderr, "ReadStencilSpan\n");
-
-#if HAVE_HW_STENCIL_SPANS
-	 (void) x1; (void) n1;
-
-	 READ_STENCIL_SPAN();
-#else /* HAVE_HW_STENCIL_SPANS */
-	 HW_CLIPLOOP() 
-	    {
-	       GLint i = 0;
-	       CLIPSPAN(x,y,n,x1,n1,i);
-	       for (;n1>0;i++,n1--)
-		  READ_STENCIL( stencil[i], (x+i), y );
-	    }
-	 HW_ENDCLIPLOOP();
-#endif /* !HAVE_HW_STENCIL_SPANS */
-      }
-   HW_READ_UNLOCK();
-}
-
-static void TAG(ReadStencilPixels)( struct gl_context *ctx,
-                                    struct gl_renderbuffer *rb,
-                                    GLuint n, const GLint x[], const GLint y[],
-				    void *values )
-{
-   HW_READ_LOCK()
-      {
-         GLubyte *stencil = (GLubyte *) values;
-	 GLuint i;
-	 LOCAL_STENCIL_VARS;
-
-	 if (DBG) fprintf(stderr, "ReadStencilPixels\n");
- 
-#if HAVE_HW_STENCIL_PIXELS
-	 (void) i;
-
-	 READ_STENCIL_PIXELS();
-#else /* HAVE_HW_STENCIL_PIXELS */
-	 HW_CLIPLOOP()
-	    {
-	       for (i=0;i<n;i++) {
-		  int fy = Y_FLIP( y[i] );
-		  if (CLIPPIXEL( x[i], fy ))
-		     READ_STENCIL( stencil[i], x[i], fy );
-	       }
-	    }
-	 HW_ENDCLIPLOOP();
-#endif /* !HAVE_HW_STENCIL_PIXELS */
-      }
-   HW_READ_UNLOCK();
-}
-
-
-
-/**
- * Initialize the given renderbuffer's span routines to point to
- * the stencil functions we generated above.
- */
-static void TAG(InitStencilPointers)(struct gl_renderbuffer *rb)
-{
-   rb->GetRow = TAG(ReadStencilSpan);
-   rb->GetValues = TAG(ReadStencilPixels);
-   rb->PutRow = TAG(WriteStencilSpan);
-   rb->PutValues = TAG(WriteStencilPixels);
-}
-
-
-#undef WRITE_STENCIL
-#undef READ_STENCIL
-#undef TAG
--- a/src/mesa/drivers/dri/common/xmlpool/options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/options.h
@@ -626,3 +626,13 @@ DRI_CONF_OPT_BEGIN(always_flush_cache,bool,def) \
        DRI_CONF_DESC(fr,"Enable flushing GPU caches with each draw call") \
        DRI_CONF_DESC(sv,"Enable flushing GPU caches with each draw call") \
 DRI_CONF_OPT_END
+
+#define DRI_CONF_FORCE_GLSL_EXTENSIONS_WARN(def) \
+DRI_CONF_OPT_BEGIN(force_glsl_extensions_warn,bool,def) \
+        DRI_CONF_DESC(en,"Force GLSL extension default behavior to 'warn'") \
+        DRI_CONF_DESC(de,"Force GLSL extension default behavior to 'warn'") \
+        DRI_CONF_DESC(es,"Force GLSL extension default behavior to 'warn'") \
+        DRI_CONF_DESC(nl,"Force GLSL extension default behavior to 'warn'") \
+        DRI_CONF_DESC(fr,"Force GLSL extension default behavior to 'warn'") \
+        DRI_CONF_DESC(sv,"Force GLSL extension default behavior to 'warn'") \
+DRI_CONF_OPT_END
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -592,8 +592,11 @@ static uint32_t i830_render_target_format_for_mesa_format[MESA_FORMAT_COUNT] =
 };

 static bool
-i830_render_target_supported(struct intel_context *intel, gl_format format)
+i830_render_target_supported(struct intel_context *intel,
+			     struct gl_renderbuffer *rb)
 {
+   gl_format format = rb->Format;
+
   if (format == MESA_FORMAT_S8_Z24 ||
       format == MESA_FORMAT_X8_Z24 ||
       format == MESA_FORMAT_Z16) {
@@ -642,7 +645,7 @@ i830_set_draw_region(struct intel_context *intel,
            DSTORG_VERT_BIAS(0x8) | DEPTH_IS_Z);    /* .5 */

   if (irb != NULL) {
-      value |= i830_render_target_format_for_mesa_format[irb->Base.Format];
+      value |= i830_render_target_format_for_mesa_format[intel_rb_format(irb)];
   }

   if (depth_region && depth_region->cpp == 4) {
@@ -803,7 +806,7 @@ i830_update_draw_buffer(struct intel_context *intel)

   /* Check for stencil fallback. */
   if (irbStencil && irbStencil->mt) {
-      assert(irbStencil->Base.Format == MESA_FORMAT_S8_Z24);
+      assert(intel_rb_format(irbStencil) == MESA_FORMAT_S8_Z24);
      FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, false);
   } else if (irbStencil && !irbStencil->mt) {
      FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, true);
@@ -816,7 +819,7 @@ i830_update_draw_buffer(struct intel_context *intel)
    * we still need to set up the shared depth/stencil state so we can use it.
    */
   if (depthRegion == NULL && irbStencil && irbStencil->mt
-       && irbStencil->Base.Format == MESA_FORMAT_S8_Z24) {
+       && intel_rb_format(irbStencil) == MESA_FORMAT_S8_Z24) {
      depthRegion = irbStencil->mt->region;
   }

--- a/src/mesa/drivers/dri/i915/i915_tex_layout.c
+++ b/src/mesa/drivers/dri/i915/i915_tex_layout.c
@@ -129,7 +129,7 @@ i915_miptree_layout_cube(struct intel_mipmap_tree * mt)
      intel_miptree_set_level_info(mt, level,
 				   0, 0,
 				   lvlWidth, lvlHeight,
-				   1);
+				   6);
      lvlWidth /= 2;
      lvlHeight /= 2;
   }
@@ -337,7 +337,7 @@ i945_miptree_layout_cube(struct intel_mipmap_tree * mt)
   for (level = mt->first_level; level <= mt->last_level; level++) {
      intel_miptree_set_level_info(mt, level,
 				   0, 0,
-				   lvlWidth, lvlHeight, 1);
+				   lvlWidth, lvlHeight, 6);
      lvlWidth /= 2;
      lvlHeight /= 2;
   }
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -557,8 +557,11 @@ static uint32_t i915_render_target_format_for_mesa_format[MESA_FORMAT_COUNT] =
 };

 static bool
-i915_render_target_supported(struct intel_context *intel, gl_format format)
+i915_render_target_supported(struct intel_context *intel,
+			     struct gl_renderbuffer *rb)
 {
+   gl_format format = rb->Format;
+
   if (format == MESA_FORMAT_S8_Z24 ||
       format == MESA_FORMAT_X8_Z24 ||
       format == MESA_FORMAT_Z16) {
@@ -607,7 +610,7 @@ i915_set_draw_region(struct intel_context *intel,
            DSTORG_VERT_BIAS(0x8) |     /* .5 */
            LOD_PRECLAMP_OGL | TEX_DEFAULT_COLOR_OGL);
   if (irb != NULL) {
-      value |= i915_render_target_format_for_mesa_format[irb->Base.Format];
+      value |= i915_render_target_format_for_mesa_format[intel_rb_format(irb)];
   } else {
      value |= DV_PF_8888;
   }
@@ -775,7 +778,7 @@ i915_update_draw_buffer(struct intel_context *intel)

   /* Check for stencil fallback. */
   if (irbStencil && irbStencil->mt) {
-      assert(irbStencil->Base.Format == MESA_FORMAT_S8_Z24);
+      assert(intel_rb_format(irbStencil) == MESA_FORMAT_S8_Z24);
      FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, false);
   } else if (irbStencil && !irbStencil->mt) {
      FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, true);
@@ -788,7 +791,7 @@ i915_update_draw_buffer(struct intel_context *intel)
    * we still need to set up the shared depth/stencil state so we can use it.
    */
   if (depthRegion == NULL && irbStencil && irbStencil->mt
-       && irbStencil->Base.Format == MESA_FORMAT_S8_Z24) {
+       && intel_rb_format(irbStencil) == MESA_FORMAT_S8_Z24) {
      depthRegion = irbStencil->mt->region;
   }

--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -367,16 +367,12 @@ brwCreateContext(int api,

   brw_draw_init( brw );

-   brw->new_vs_backend = (getenv("INTEL_OLD_VS") == NULL);
   brw->precompile = driQueryOptionb(&intel->optionCache, "shader_precompile");

-   /* If we're using the new shader backend, we require integer uniforms
-    * stored as actual integers.
-    */
-   if (brw->new_vs_backend) {
-      ctx->Const.NativeIntegers = true;
-      ctx->Const.UniformBooleanTrue = 1;
-   }
+   ctx->Const.NativeIntegers = true;
+   ctx->Const.UniformBooleanTrue = 1;
+
+   ctx->Const.ForceGLSLExtensionsWarn = driQueryOptionb(&intel->optionCache, "force_glsl_extensions_warn");

   return true;
 }
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -645,7 +645,6 @@ struct brw_context
   bool has_negative_rhw_bug;
   bool has_aa_line_parameters;
   bool has_pln;
-   bool new_vs_backend;
   bool precompile;

   struct {
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1128,6 +1128,7 @@ enum brw_message_target {
 /* DW1 (for gen6) */
 # define GEN6_SF_NUM_OUTPUTS_SHIFT			22
 # define GEN6_SF_SWIZZLE_ENABLE				(1 << 21)
+# define GEN6_SF_POINT_SPRITE_UPPERLEFT			(0 << 20)
 # define GEN6_SF_POINT_SPRITE_LOWERLEFT			(1 << 20)
 # define GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT		11
 # define GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT		4
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -982,7 +982,13 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 			    inst->bits3.math.precision, &space);
 	    break;
 	case BRW_SFID_SAMPLER:
-	    if (gen >= 5) {
+	    if (gen >= 7) {
+		format (file, " (%d, %d, %d, %d)",
+			inst->bits3.sampler_gen7.binding_table_index,
+			inst->bits3.sampler_gen7.sampler,
+			inst->bits3.sampler_gen7.msg_type,
+			inst->bits3.sampler_gen7.simd_mode);
+	    } else if (gen >= 5) {
 		format (file, " (%d, %d, %d, %d)",
 			inst->bits3.sampler_gen5.binding_table_index,
 			inst->bits3.sampler_gen5.sampler,
@@ -1023,7 +1029,18 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	    break;

 	case BRW_SFID_DATAPORT_WRITE:
-	    if (gen >= 6) {
+	    if (gen >= 7) {
+		format (file, " (");
+
+		err |= control (file, "DP rc message type",
+				dp_rc_msg_type_gen6,
+				inst->bits3.gen7_dp.msg_type, &space);
+
+		format (file, ", %d, %d, %d)",
+			inst->bits3.gen7_dp.binding_table_index,
+			inst->bits3.gen7_dp.msg_control,
+			inst->bits3.gen7_dp.msg_type);
+	    } else if (gen == 6) {
 		format (file, " (");

 		err |= control (file, "DP rc message type",
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -887,6 +887,17 @@ ROUND(RNDE)

 /* Helpers for SEND instruction:
 */
+void brw_set_sampler_message(struct brw_compile *p,
+                             struct brw_instruction *insn,
+                             GLuint binding_table_index,
+                             GLuint sampler,
+                             GLuint msg_type,
+                             GLuint response_length,
+                             GLuint msg_length,
+                             GLuint header_present,
+                             GLuint simd_mode,
+                             GLuint return_format);
+
 void brw_set_dp_read_message(struct brw_compile *p,
 			     struct brw_instruction *insn,
 			     GLuint binding_table_index,
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -84,10 +84,18 @@ gen6_resolve_implied_move(struct brw_compile *p,
 static void
 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
 {
+   /* From the BSpec / ISA Reference / send - [DevIVB+]:
+    * "The send with EOT should use register space R112-R127 for <src>. This is
+    *  to enable loading of a new thread into the same slot while the message
+    *  with EOT for current thread is pending dispatch."
+    *
+    * Since we're pretending to have 16 MRFs anyway, we may as well use the
+    * registers required for messages with EOT.
+    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
-      reg->nr += 111;
+      reg->nr += GEN7_MRF_HACK_START;
   }
 }

@@ -645,16 +653,17 @@ brw_set_dp_read_message(struct brw_compile *p,
   }
 }

-static void brw_set_sampler_message(struct brw_compile *p,
-                                    struct brw_instruction *insn,
-                                    GLuint binding_table_index,
-                                    GLuint sampler,
-                                    GLuint msg_type,
-                                    GLuint response_length,
-                                    GLuint msg_length,
-                                    GLuint header_present,
-                                    GLuint simd_mode,
-				    GLuint return_format)
+void
+brw_set_sampler_message(struct brw_compile *p,
+                        struct brw_instruction *insn,
+                        GLuint binding_table_index,
+                        GLuint sampler,
+                        GLuint msg_type,
+                        GLuint response_length,
+                        GLuint msg_length,
+                        GLuint header_present,
+                        GLuint simd_mode,
+                        GLuint return_format)
 {
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -379,6 +379,7 @@ public:
      this->frag_depth = NULL;
      memset(this->outputs, 0, sizeof(this->outputs));
      this->first_non_payload_grf = 0;
+      this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

      this->current_annotation = NULL;
      this->base_ir = NULL;
@@ -583,6 +584,7 @@ public:
   ir_variable *frag_depth;
   fs_reg outputs[BRW_MAX_DRAW_BUFFERS];
   int first_non_payload_grf;
+   int max_grf;
   int urb_setup[FRAG_ATTRIB_MAX];
   bool kill_emitted;

--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -63,9 +63,9 @@ fs_visitor::assign_regs_trivial()
      assign_reg(hw_reg_mapping, &inst->src[1], reg_width);
   }

-   if (this->grf_used >= BRW_MAX_GRF) {
+   if (this->grf_used >= max_grf) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
-	   this->grf_used, BRW_MAX_GRF);
+	   this->grf_used, max_grf);
   }

 }
@@ -88,7 +88,7 @@ brw_alloc_reg_set_for_classes(struct brw_context *brw,
   ralloc_free(brw->wm.ra_reg_to_grf);
   brw->wm.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
   ralloc_free(brw->wm.regs);
-   brw->wm.regs = ra_alloc_reg_set(ra_reg_count);
+   brw->wm.regs = ra_alloc_reg_set(brw, ra_reg_count);
   ralloc_free(brw->wm.classes);
   brw->wm.classes = ralloc_array(brw, int, class_count + 1);

@@ -156,7 +156,7 @@ fs_visitor::assign_regs()
   int reg_width = c->dispatch_width / 8;
   int hw_reg_mapping[this->virtual_grf_next];
   int first_assigned_grf = ALIGN(this->first_non_payload_grf, reg_width);
-   int base_reg_count = (BRW_MAX_GRF - first_assigned_grf) / reg_width;
+   int base_reg_count = (max_grf - first_assigned_grf) / reg_width;
   int class_sizes[base_reg_count];
   int class_count = 0;

--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -209,8 +209,8 @@ brw_depthbuffer_format(struct brw_context *brw)
   if (!drb &&
       (srb = intel_get_renderbuffer(fb, BUFFER_STENCIL)) &&
       !srb->mt->stencil_mt &&
-       (srb->Base.Format == MESA_FORMAT_S8_Z24 ||
-	srb->Base.Format == MESA_FORMAT_Z32_FLOAT_X24S8)) {
+       (intel_rb_format(srb) == MESA_FORMAT_S8_Z24 ||
+	intel_rb_format(srb) == MESA_FORMAT_Z32_FLOAT_X24S8)) {
      drb = srb;
   }

@@ -223,17 +223,30 @@ brw_depthbuffer_format(struct brw_context *brw)
   case MESA_FORMAT_Z32_FLOAT:
      return BRW_DEPTHFORMAT_D32_FLOAT;
   case MESA_FORMAT_X8_Z24:
-      if (intel->gen >= 5)
+      if (intel->gen >= 6) {
 	 return BRW_DEPTHFORMAT_D24_UNORM_X8_UINT;
-      else /* Gen4 doesn't support X8; use S8 instead. */
+      } else {
+	 /* Use D24_UNORM_S8, not D24_UNORM_X8.
+	  *
+	  * D24_UNORM_X8 was not introduced until Gen5. (See the Ironlake PRM,
+	  * Volume 2, Part 1, Section 8.4.6 "Depth/Stencil Buffer State", Bits
+	  * 3DSTATE_DEPTH_BUFFER.Surface_Format).
+	  *
+	  * However, on Gen5, D24_UNORM_X8 may be used only if separate
+	  * stencil is enabled, and we never enable it. From the Ironlake PRM,
+	  * same section as above, Bit 3DSTATE_DEPTH_BUFFER.Separate_Stencil_Buffer_Enable:
+	  *     If this field is disabled, the Surface Format of the depth
+	  *     buffer cannot be D24_UNORM_X8_UINT.
+	  */
 	 return BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+      }
   case MESA_FORMAT_S8_Z24:
      return BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
   case MESA_FORMAT_Z32_FLOAT_X24S8:
      return BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT;
   default:
      _mesa_problem(ctx, "Unexpected depth format %s\n",
-		    _mesa_get_format_name(drb->Base.Format));
+		    _mesa_get_format_name(intel_rb_format(drb)));
      return BRW_DEPTHFORMAT_D16_UNORM;
   }
 }
@@ -341,8 +354,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 	        (1 << 27) | /* tiled surface */
 	        (BRW_SURFACE_2D << 29));
      OUT_BATCH(0);
-      OUT_BATCH(((stencil_irb->Base.Width - 1) << 6) |
-	         (stencil_irb->Base.Height - 1) << 19);
+      OUT_BATCH(((stencil_irb->Base.Base.Width - 1) << 6) |
+	         (stencil_irb->Base.Base.Height - 1) << 19);
      OUT_BATCH(0);
      OUT_BATCH(0);

@@ -376,8 +389,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 		offset);
      OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
-		((depth_irb->Base.Width - 1) << 6) |
-		((depth_irb->Base.Height - 1) << 19));
+		(((depth_irb->Base.Base.Width + tile_x) - 1) << 6) |
+		(((depth_irb->Base.Base.Height + tile_y) - 1) << 19));
      OUT_BATCH(0);

      if (intel->is_g4x || intel->gen >= 5)
@@ -756,7 +769,13 @@ static void upload_state_base_address( struct brw_context *brw )
 		 1); /* Instruction base address: shader kernels (incl. SIP) */

       OUT_BATCH(1); /* General state upper bound */
-       OUT_BATCH(1); /* Dynamic state upper bound */
+       /* Dynamic state upper bound.  Although the documentation says that
+	* programming it to zero will cause it to be ignored, that is a lie.
+	* If this isn't programmed to a real bound, the sampler border color
+	* pointer is rejected, causing border color to mysteriously fail.
+	*/
+       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+		 intel->batch.bo->size | 1);
       OUT_BATCH(1); /* Indirect object upper bound */
       OUT_BATCH(1); /* Instruction access upper bound */
       ADVANCE_BATCH();
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -229,6 +229,8 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
       * program constant) has to happen before creating this linkage.
       */
      _mesa_associate_uniform_storage(ctx, shProg, prog->Parameters);
+
+      _mesa_reference_program(ctx, &prog, NULL);
   }

   if (!brw_shader_precompile(ctx, shProg))
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -386,6 +386,8 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)

   DBG("%s\n", __FUNCTION__);

+   drm_intel_bo_unreference(cache->bo);
+   cache->bo = NULL;
   brw_clear_cache(brw, cache);
   free(cache->items);
   cache->items = NULL;
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -37,6 +37,17 @@
 /** Number of general purpose registers (VS, WM, etc) */
 #define BRW_MAX_GRF 128

+/**
+ * First GRF used for the MRF hack (an integer register number).
+ *
+ * On gen7, MRFs are no longer used, and contiguous GRFs are used instead.  We
+ * haven't converted our compiler to be aware of this, so it asks for MRFs and
+ * brw_eu_emit.c quietly converts them to be accesses of the top GRFs.  The
+ * register allocators have to be careful of this to avoid corrupting the "MRF"s
+ * with actual GRF allocations.
+ */
+#define GEN7_MRF_HACK_START 112
+
 /** Number of message register file registers */
 #define BRW_MAX_MRF 16

--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -77,6 +77,7 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
 	 brw_miptree_layout_texture_array(intel, mt);
 	 break;
      }
+      assert(mt->depth0 == 6);
      /* FALLTHROUGH */

   case GL_TEXTURE_3D: {
@@ -101,7 +102,6 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
      pack_x_nr = 1;

      for (level = mt->first_level ; level <= mt->last_level ; level++) {
-	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
 	 GLint x = 0;
 	 GLint y = 0;
 	 GLint q, j;
@@ -110,8 +110,8 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
 				      0, mt->total_height,
 				      width, height, depth);

-	 for (q = 0; q < nr_images;) {
-	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
+	 for (q = 0; q < depth; /* empty */) {
+	    for (j = 0; j < pack_x_nr && q < depth; j++, q++) {
 	       intel_miptree_set_image_offset(mt, level, q, x, y);
 	       x += pack_x_pitch;
 	    }
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -334,6 +334,7 @@ public:
   int virtual_grf_count;
   int virtual_grf_array_size;
   int first_non_payload_grf;
+   unsigned int max_grf;
   int *virtual_grf_def;
   int *virtual_grf_use;
   dst_reg userplane[MAX_CLIP_PLANES];
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -25,6 +25,7 @@

 extern "C" {
 #include "brw_eu.h"
+#include "main/macros.h"
 };

 using namespace brw;
@@ -95,6 +96,13 @@ vec4_visitor::setup_attributes(int payload_reg)

   prog_data->urb_read_length = (nr_attributes + 1) / 2;

+   unsigned vue_entries = MAX2(nr_attributes, c->vue_map.num_slots);
+
+   if (intel->gen == 6)
+      c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
+   else
+      c->prog_data.urb_entry_size = ALIGN(vue_entries, 4) / 4;
+
   return payload_reg + nr_attributes;
 }

@@ -639,6 +647,23 @@ vec4_visitor::generate_pull_constant_load(vec4_instruction *inst,
 					  struct brw_reg dst,
 					  struct brw_reg index)
 {
+   if (intel->gen == 7) {
+      gen6_resolve_implied_move(p, &index, inst->base_mrf);
+      brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, insn, dst);
+      brw_set_src0(p, insn, index);
+      brw_set_sampler_message(p, insn,
+                              SURF_INDEX_VERT_CONST_BUFFER,
+                              0, /* LD message ignores sampler unit */
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              1, /* rlen */
+                              1, /* mlen */
+                              false, /* no header */
+                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                              0);
+      return;
+   }
+
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -87,9 +87,9 @@ vec4_visitor::reg_allocate_trivial()
      assign(hw_reg_mapping, &inst->src[2]);
   }

-   if (prog_data->total_grf > BRW_MAX_GRF) {
+   if (prog_data->total_grf > max_grf) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
-	   prog_data->total_grf, BRW_MAX_GRF);
+	   prog_data->total_grf, max_grf);
   }
 }

@@ -108,7 +108,7 @@ brw_alloc_reg_set_for_classes(struct brw_context *brw,
   ralloc_free(brw->vs.ra_reg_to_grf);
   brw->vs.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
   ralloc_free(brw->vs.regs);
-   brw->vs.regs = ra_alloc_reg_set(ra_reg_count);
+   brw->vs.regs = ra_alloc_reg_set(brw, ra_reg_count);
   ralloc_free(brw->vs.classes);
   brw->vs.classes = ralloc_array(brw, int, class_count + 1);

@@ -144,7 +144,7 @@ vec4_visitor::reg_allocate()
 {
   int hw_reg_mapping[virtual_grf_count];
   int first_assigned_grf = this->first_non_payload_grf;
-   int base_reg_count = BRW_MAX_GRF - first_assigned_grf;
+   int base_reg_count = max_grf - first_assigned_grf;
   int class_sizes[base_reg_count];
   int class_count = 0;

--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1532,9 +1532,6 @@ vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,

   dst->writemask = (1 << type->vector_elements) - 1;

-   /* Do we need to worry about swizzling a swizzle? */
-   assert(src->swizzle == BRW_SWIZZLE_NOOP
-	  || src->swizzle == swizzle_for_size(type->vector_elements));
   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
@@ -1617,6 +1614,15 @@ vec4_visitor::visit(ir_assignment *ir)
 	 emit_bool_to_cond_code(ir->condition, &predicate);
      }

+      /* emit_block_move doesn't account for swizzles in the source register.
+       * This should be ok, since the source register is a structure or an
+       * array, and those can't be swizzled.  But double-check to be sure.
+       */
+      assert(src.swizzle ==
+             (ir->rhs->type->is_matrix()
+              ? swizzle_for_size(ir->rhs->type->vector_elements)
+              : BRW_SWIZZLE_NOOP));
+
      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }
@@ -2286,11 +2292,6 @@ vec4_visitor::emit_urb_writes()
       */
      inst->offset = (max_usable_mrf - base_mrf) / 2;
   }
-
-   if (intel->gen == 6)
-      c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
-   else
-      c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
 }

 src_reg
@@ -2600,11 +2601,9 @@ vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

-   this->uniforms = 0;
+   this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

-   this->variable_ht = hash_table_ctor(0,
-				       hash_table_pointer_hash,
-				       hash_table_pointer_compare);
+   this->uniforms = 0;
 }

 vec4_visitor::~vec4_visitor()
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -231,7 +231,7 @@ do_vs_prog(struct brw_context *brw,

   /* Emit GEN4 code.
    */
-   if (brw->new_vs_backend && prog) {
+   if (prog) {
      if (!brw_vs_emit(prog, &c)) {
 	 ralloc_free(mem_ctx);
 	 return false;
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -472,7 +472,8 @@ struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint typ
 struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint name);

 bool brw_color_buffer_write_enabled(struct brw_context *brw);
-bool brw_render_target_supported(struct intel_context *intel, gl_format format);
+bool brw_render_target_supported(struct intel_context *intel,
+				 struct gl_renderbuffer *rb);
 void brw_wm_payload_setup(struct brw_context *brw,
 			  struct brw_wm_compile *c);
 bool do_wm_prog(struct brw_context *brw,
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -553,15 +553,27 @@ brw_init_surface_formats(struct brw_context *brw)
   ctx->TextureFormatSupported[MESA_FORMAT_X8_Z24] = true;
   ctx->TextureFormatSupported[MESA_FORMAT_Z32_FLOAT] = true;
   ctx->TextureFormatSupported[MESA_FORMAT_Z32_FLOAT_X24S8] = true;
+   ctx->TextureFormatSupported[MESA_FORMAT_Z16] = true;
 }

 bool
-brw_render_target_supported(struct intel_context *intel, gl_format format)
+brw_render_target_supported(struct intel_context *intel,
+			    struct gl_renderbuffer *rb)
 {
   struct brw_context *brw = brw_context(&intel->ctx);
-   /* Not exactly true, as some of those formats are not renderable.
-    * But at least we know how to translate them.
+   gl_format format = rb->Format;
+
+   /* Many integer formats are promoted to RGBA (like XRGB8888 is), which means
+    * we would consider them renderable even though we don't have surface
+    * support for their alpha behavior and don't have the blending unit
+    * available to fake it like we do for XRGB8888.  Force them to be
+    * unsupported.
    */
+   if ((rb->_BaseFormat != GL_RGBA &&
+	rb->_BaseFormat != GL_RG &&
+	rb->_BaseFormat != GL_RED) && _mesa_is_format_integer_color(format))
+      return false;
+
   return brw->format_supported_as_render_target[format];
 }

@@ -698,11 +710,11 @@ brw_create_constant_surface(struct brw_context *brw,

   surf[1] = bo->offset; /* reloc */

-   surf[2] = (((w & 0x7f) - 1) << BRW_SURFACE_WIDTH_SHIFT |
-	      (((w >> 7) & 0x1fff) - 1) << BRW_SURFACE_HEIGHT_SHIFT);
+   surf[2] = ((w & 0x7f) << BRW_SURFACE_WIDTH_SHIFT |
+	      ((w >> 7) & 0x1fff) << BRW_SURFACE_HEIGHT_SHIFT);

-   surf[3] = ((((w >> 20) & 0x7f) - 1) << BRW_SURFACE_DEPTH_SHIFT |
-	      (width * 16 - 1) << BRW_SURFACE_PITCH_SHIFT);
+   surf[3] = (((w >> 20) & 0x7f) << BRW_SURFACE_DEPTH_SHIFT |
+	      (16 - 1) << BRW_SURFACE_PITCH_SHIFT); /* ignored */

   surf[4] = 0;
   surf[5] = 0;
@@ -729,7 +741,10 @@ brw_update_sol_surface(struct brw_context *brw,
                       uint32_t *out_offset, unsigned num_vector_components,
                       unsigned stride_dwords, unsigned offset_dwords)
 {
-   drm_intel_bo *bo = intel_buffer_object(buffer_obj)->buffer;
+   struct intel_context *intel = &brw->intel;
+   struct intel_buffer_object *intel_bo = intel_buffer_object(buffer_obj);
+   drm_intel_bo *bo =
+      intel_bufferobj_buffer(intel, intel_bo, INTEL_WRITE_PART);
   uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
                                    out_offset);
   uint32_t pitch_minus_1 = 4*stride_dwords - 1;
@@ -905,24 +920,25 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
   uint32_t *surf;
   uint32_t tile_x, tile_y;
   uint32_t format = 0;
+   gl_format rb_format = intel_rb_format(irb);

   surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
 			  6 * 4, 32, &brw->bind.surf_offset[unit]);

-   switch (irb->Base.Format) {
+   switch (rb_format) {
   case MESA_FORMAT_SARGB8:
      /* without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB
 	 surfaces to the blend/update as sRGB */
      if (ctx->Color.sRGBEnabled)
-	 format = brw_format_for_mesa_format(irb->Base.Format);
+	 format = brw_format_for_mesa_format(rb_format);
      else
 	 format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
      break;
   default:
-      format = brw->render_target_format[irb->Base.Format];
-      if (unlikely(!brw->format_supported_as_render_target[irb->Base.Format])) {
+      format = brw->render_target_format[rb_format];
+      if (unlikely(!brw->format_supported_as_render_target[rb_format])) {
 	 _mesa_problem(ctx, "%s: renderbuffer format %s unsupported\n",
-		       __FUNCTION__, _mesa_get_format_name(irb->Base.Format));
+		       __FUNCTION__, _mesa_get_format_name(rb_format));
      }
      break;
   }
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -57,19 +57,41 @@ gen6_upload_blend_state(struct brw_context *brw)
   memset(blend, 0, size);

   for (b = 0; b < nr_draw_buffers; b++) {
+      /* _NEW_BUFFERS */
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[b];
+      GLenum rb_type;
+      bool integer;
+
+      if (rb)
+	 rb_type = _mesa_get_format_datatype(rb->Format);
+      else
+	 rb_type = GL_UNSIGNED_NORMALIZED;
+
+      /* Used for implementing the following bit of GL_EXT_texture_integer:
+       *
+       *     "Per-fragment operations that require floating-point color
+       *      components, including multisample alpha operations, alpha test,
+       *      blending, and dithering, have no effect when the corresponding
+       *      colors are written to an integer color buffer."
+      */
+      integer = (rb_type == GL_INT || rb_type == GL_UNSIGNED_INT);
+
      /* _NEW_COLOR */
      if (ctx->Color.ColorLogicOpEnabled) {
-	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[b];
-	 /* _NEW_BUFFERS */
 	 /* Floating point RTs should have no effect from LogicOp,
-	  * except for disabling of blending
+	  * except for disabling of blending.
+	  *
+	  * From the Sandy Bridge PRM, Vol 2 Part 1, Section 8.1.11, "Logic Ops",
+	  *
+	  *     "Logic Ops are only supported on *_UNORM surfaces (excluding
+	  *      _SRGB variants), otherwise Logic Ops must be DISABLED."
 	  */
-	 if (rb && _mesa_get_format_datatype(rb->Format) != GL_FLOAT) {
+	 if (rb_type == GL_UNSIGNED_NORMALIZED) {
 	    blend[b].blend1.logic_op_enable = 1;
 	    blend[b].blend1.logic_op_func =
 	       intel_translate_logic_op(ctx->Color.LogicOp);
 	 }
-      } else if (ctx->Color.BlendEnabled & (1 << b)) {
+      } else if (ctx->Color.BlendEnabled & (1 << b) && !integer) {
 	 GLenum eqRGB = ctx->Color.Blend[0].EquationRGB;
 	 GLenum eqA = ctx->Color.Blend[0].EquationA;
 	 GLenum srcRGB = ctx->Color.Blend[0].SrcRGB;
@@ -121,7 +143,7 @@ gen6_upload_blend_state(struct brw_context *brw)
      blend[b].blend1.clamp_range = BRW_RENDERTARGET_CLAMPRANGE_FORMAT;

      /* _NEW_COLOR */
-      if (ctx->Color.AlphaEnabled) {
+      if (ctx->Color.AlphaEnabled && !integer) {
 	 blend[b].blend1.alpha_test_enable = 1;
 	 blend[b].blend1.alpha_test_func =
 	    intel_translate_compare_func(ctx->Color.AlphaFunc);
@@ -129,7 +151,7 @@ gen6_upload_blend_state(struct brw_context *brw)
      }

      /* _NEW_COLOR */
-      if (ctx->Color.DitherFlag) {
+      if (ctx->Color.DitherFlag && !integer) {
 	 blend[b].blend1.dither_enable = 1;
 	 blend[b].blend1.y_dither_offset = 0;
 	 blend[b].blend1.x_dither_offset = 0;
--- a/src/mesa/drivers/dri/i965/gen6_hiz.c
+++ b/src/mesa/drivers/dri/i965/gen6_hiz.c
@@ -109,6 +109,26 @@ static const uint32_t gen6_hiz_meta_save =

      MESA_META_SELECT_FEEDBACK;

+/**
+ * Select the framebuffer binding point used for HiZ operations.
+ *
+ * With EXT_framebuffer_blit on desktop GL, different buffers may be bound
+ * to GL_DRAW_FRAMEBUFFER and GL_READ_FRAMEBUFFER, so only the draw binding
+ * is used, taking care to not disrupt the read buffer.  Otherwise those
+ * enums do not exist and the bound framebuffer is both read and draw buffer.
+ */
+static void
+gen6_hiz_get_framebuffer_enum(struct gl_context *ctx,
+                              GLenum *bind_enum,
+                              GLenum *get_enum)
+{
+   const GLboolean split_bindings =
+      ctx->Extensions.EXT_framebuffer_blit && ctx->API == API_OPENGL;
+
+   *bind_enum = split_bindings ? GL_DRAW_FRAMEBUFFER : GL_FRAMEBUFFER;
+   *get_enum = split_bindings ? GL_DRAW_FRAMEBUFFER_BINDING : GL_FRAMEBUFFER_BINDING;
+}
+
 /**
 * Initialize static data needed for HiZ operations.
 */
@@ -117,10 +137,13 @@ gen6_hiz_init(struct brw_context *brw)
 {
   struct gl_context *ctx = &brw->intel.ctx;
   struct brw_hiz_state *hiz = &brw->hiz;
+   GLenum fb_bind_enum, fb_get_enum;

   if (hiz->fbo != 0)
      return;

+   gen6_hiz_get_framebuffer_enum(ctx, &fb_bind_enum, &fb_get_enum);
+
   /* Create depthbuffer.
    *
    * Until glRenderbufferStorage is called, the renderbuffer hash table
@@ -139,8 +162,8 @@ gen6_hiz_init(struct brw_context *brw)

   /* Setup FBO. */
   _mesa_GenFramebuffersEXT(1, &hiz->fbo);
-   _mesa_BindFramebufferEXT(GL_DRAW_FRAMEBUFFER, hiz->fbo);
-   _mesa_FramebufferRenderbufferEXT(GL_DRAW_FRAMEBUFFER,
+   _mesa_BindFramebufferEXT(fb_bind_enum, hiz->fbo);
+   _mesa_FramebufferRenderbufferEXT(fb_bind_enum,
                                    GL_DEPTH_ATTACHMENT,
                                    GL_RENDERBUFFER,
                                    hiz->depth_rb->Name);
@@ -207,7 +230,6 @@ gen6_hiz_setup_depth_buffer(struct brw_context *brw,

   rb->Format = mt->format;
   rb->_BaseFormat = _mesa_get_format_base_format(rb->Format);
-   rb->DataType = intel_mesa_format_to_rb_datatype(rb->Format);
   rb->InternalFormat = rb->_BaseFormat;
   rb->Width = mt->level[level].width;
   rb->Height = mt->level[level].height;
@@ -241,6 +263,7 @@ gen6_resolve_slice(struct intel_context *intel,
   struct gl_context *ctx = &intel->ctx;
   struct brw_context *brw = brw_context(ctx);
   struct brw_hiz_state *hiz = &brw->hiz;
+   GLenum fb_bind_enum, fb_get_enum;

   /* Do not recurse. */
   assert(!brw->hiz.op);
@@ -250,11 +273,13 @@ gen6_resolve_slice(struct intel_context *intel,
   assert(level <= mt->last_level);
   assert(layer < mt->level[level].depth);

+   gen6_hiz_get_framebuffer_enum(ctx, &fb_bind_enum, &fb_get_enum);
+
   /* Save state. */
   GLint save_drawbuffer;
   GLint save_renderbuffer;
   _mesa_meta_begin(ctx, gen6_hiz_meta_save);
-   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &save_drawbuffer);
+   _mesa_GetIntegerv(fb_get_enum, &save_drawbuffer);
   _mesa_GetIntegerv(GL_RENDERBUFFER_BINDING, &save_renderbuffer);

   /* Initialize context data for HiZ operations. */
@@ -272,7 +297,7 @@ gen6_resolve_slice(struct intel_context *intel,

   /* Setup FBO. */
   gen6_hiz_setup_depth_buffer(brw, mt, level, layer);
-   _mesa_BindFramebufferEXT(GL_DRAW_FRAMEBUFFER, hiz->fbo);
+   _mesa_BindFramebufferEXT(fb_bind_enum, hiz->fbo);


   /* A rectangle primitive (3DPRIM_RECTLIST) consists of only three vertices.
@@ -316,7 +341,7 @@ gen6_resolve_slice(struct intel_context *intel,
    */
   gen6_hiz_teardown_depth_buffer(hiz->depth_rb);
   _mesa_BindRenderbufferEXT(GL_RENDERBUFFER, save_renderbuffer);
-   _mesa_BindFramebufferEXT(GL_DRAW_FRAMEBUFFER, save_drawbuffer);
+   _mesa_BindFramebufferEXT(fb_bind_enum, save_drawbuffer);
   _mesa_meta_end(ctx);
 }

--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -129,6 +129,7 @@ upload_sf_state(struct brw_context *brw)
   float point_size;
   uint16_t attr_overrides[FRAG_ATTRIB_MAX];
   bool userclip_active;
+   uint32_t point_sprite_origin;

   /* _NEW_TRANSFORM */
   userclip_active = (ctx->Transform.ClipPlanesEnabled != 0);
@@ -258,8 +259,16 @@ upload_sf_state(struct brw_context *brw)
   /* Clamp to the hardware limits and convert to fixed point */
   dw4 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);

-   if (ctx->Point.SpriteOrigin == GL_LOWER_LEFT)
-      dw1 |= GEN6_SF_POINT_SPRITE_LOWERLEFT;
+   /*
+    * Window coordinates in an FBO are inverted, which means point
+    * sprite origin must be inverted, too.
+    */
+   if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+      point_sprite_origin = GEN6_SF_POINT_SPRITE_LOWERLEFT;
+   } else {
+      point_sprite_origin = GEN6_SF_POINT_SPRITE_UPPERLEFT;
+   }
+   dw1 |= point_sprite_origin;

   /* _NEW_LIGHT */
   if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
--- a/Show More
+++ b/Show More