darwin: Remove extra kCGLPFAColorSize attribute when requesting an offscreen context

https://xquartz.macosforge.org/trac/ticket/650 Signed-off-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com> (cherry picked from commit b4f34241ec)
darwin: Guard Core Profile usage behind a testing envvar
2014-05-31 03:45:34 -07:00 · 2014-05-24 20:42:38 -07:00 · 2014-05-24 20:42:37 -07:00 · 2014-05-24 20:42:34 -07:00 · 2014-05-20 10:55:44 -07:00 · 2014-05-20 01:40:35 -07:00
426 changed files with 10294 additions and 11323 deletions
--- a/8
+++ b/8
@@ -184,7 +184,7 @@ ultrix-gcc:

 # Rules for making release tarballs

-PACKAGE_VERSION=8.0-devel
+PACKAGE_VERSION=8.0.5
 PACKAGE_DIR = Mesa-$(PACKAGE_VERSION)
 PACKAGE_NAME = MesaLib-$(PACKAGE_VERSION)

@@ -199,6 +199,12 @@ EXTRA_FILES = \
 	src/glsl/glcpp/glcpp-lex.c			\
 	src/glsl/glcpp/glcpp-parse.c			\
 	src/glsl/glcpp/glcpp-parse.h			\
+	src/mesa/main/api_exec_es1.c			\
+	src/mesa/main/api_exec_es1_dispatch.h		\
+	src/mesa/main/api_exec_es1_remap_helper.h	\
+	src/mesa/main/api_exec_es2.c			\
+	src/mesa/main/api_exec_es2_dispatch.h		\
+	src/mesa/main/api_exec_es2_remap_helper.h	\
 	src/mesa/program/lex.yy.c			\
 	src/mesa/program/program_parse.tab.c		\
 	src/mesa/program/program_parse.tab.h
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,91 @@
+# These commits were cherry picked without using -x.
+bca6cd2d71ad944031edeacd129eb0031a89c08e scons: Remove Haiku one-offs for gallium drivers
+efd73f72d8f34a40d6a1cd279fffa48dc13b6e5b mapi/glapi: Never use a generic no-op entry-point on Windows.
+ab1195cf1127781909d5158c7de68f8732458d75 swrast: Fix implicit declaration warnings
+e2dce7f7ee3e7da9cbb0bb33307ecd79e824426d intel: Fix rendering from textures after RenderTexture().
+b4082f492b4b55df4c636445e47b97d1f1e4b5b2 r600g: add support for TN (trinity) APUs
+5beba3d0ba593b661451217a5ffcdf68644cc903 mesa: use _mesa_rebase_rgba_float/uint() in glGetTexImage code
+4a269a8dc0170c75ff22af3910786228727ea41e r300/compiler: Clear loop registers in vertex shaders w/o loops
+73249239cf71e3595ee19f3c1a02b8b0f58994cd r300/compiler: Copy all instruction attributes during local transfoms
+
+# There's no blorp in the 8.0 branch.
+32c7b2769cbe80ff56d1c73c4f9b62f13f577c8d i965/blorp: Clarify why width/height must be adjusted for Gen6 IMS surfaces.
+e14b1288ef5b5b6091facaecd42e86f0a8157f28 i965/blorp: Change gl_renderbuffer* params to intel_renderbuffer*.
+09b0fa8499d8035fa31ccb2b550056305fbd149b i965/blorp: store surface width/height in brw_blorp_mip_info.
+c130ce7b2b26b4b67d4bf2b6dd1044a200efe25d i965/blorp: store x and y offsets in brw_blorp_mip_info.
+3123f0621561549c4566248100661ef77cab2834 i965/blorp: Thread level and layer through brw_blorp_blit_miptrees().
+f04f219906e40a6647a10fd9c1928509fe25fb84 i965/blorp: Account for offsets when emitting SURFACE_STATE.
+1a75063d5f829547b75b60ae64bddf3905b4cb8f i965/blorp: don't reduce stencil alignment restrictions when multisampling.
+5fd67fac14d7f35c311eb5c671be8d4ae9b2ea37 i965/blorp: Reduce alignment restrictions for stencil blits.
+1a5d4f7cb2367c7863b28efbd78e9169114baf42 i965/blorp: Fix offsets and width/height for stencil blits.
+a33ce665a5827c598b85bb04d94b33e6a5e41c28 i965/blorp: Increase Y alignment for multisampled stencil blits.
+124b214f094fa63ff1ddb7e9f0a1c2e0ba8214fb i965/blorp: Fix sRGB MSAA resolves.
+e2249e8c4d06a85d6389ba1689e15d7e29aa4dff i965/blorp: Add support for blits between SRGB and linear formats.
+
+# The old generated ES1/ES2 dispatch code is still used in 8.0.
+aa129b0833052f613a6ec570aef092733769ee0e mesa: Don't set dispatch pointer for glPointSize in ES2
+850412b8ab272b9616da9a0df29e424b07bddde9 mesa: Don't set dispatch pointer for glGetDoublev in ES2
+11927bfc4a43aefbac5af35aae34d5cdf5d9e6bb mesa: Don't set dispatch pointer for glGetBufferSubData in ES2
+2a3a68e4c7b15860ac9398c5a56c0d6762573633 mesa: Don't set dispatch pointers for glClearDepth or glDepthRange in ES2
+1c0a44aaf5c095ca261d1ce11bb8a67dbbce54a2 mesa: Don't set dispatch pointers for glPointParameter[if][v] in ES2
+a83b01371e60356d2ed69c131bf9e0a0daba59a4 mesa: Don't set dispatch pointer for glResizeBuffersMESA in ES2
+7f7268d385cc1435264b8d3111e1596b2dae9183 mesa: Don't set dispatch pointer for glGetProgramivARB in ES2
+ee77061277b640d78befb43c26a3ffbe227e9244 mesa: Don't set dispatch pointer for glTexStorage in ES2
+3ef9e43865f38e9c8c5681768645513ce26e0488 mesa: Pass GL context to _mesa_create_save_table
+a13c07f7528c74fc433a7227777351110087b89d mesa: Don't set loopback dispatch pointers for most things in ES2 or core
+aa0f588e2d4c160879699180f0e7f4d3e52b55b9 mesa: Don't set vtxfmt dispatch pointers for many things in ES2 or core
+be66cf950e01d217b5341f8e56676dc5bf81ca47 mesa: Don't set shaderapi dispatch pointers for many things in ES2 or core
+6c01a0e770432eda0e29dbd7278a94efc688a6d3 mesa: Don't set uniform dispatch pointers for many things in ES2 or core
+8f0b81bf7ddcdf5715a3e00af67395b91f27a243 mesa: don't enable glVertexPointer() when using API_OPENGLES2.
+51b069e7aa81cdc8f38db71554ae3dd12ce0a6c4 meta: Don't save and restore fog state when there is no fog state
+
+# The GLSL GenerateMipmaps does not exist in 8.0.
+299acac849eb8506de9760c94c6e8e8b1046d909 _mesa_meta_GenerateMipmap: Support all texture targets by generating shaders at runtime
+15bf3103b48a5928321fe56fbb3ed28a0f314418 _mesa_meta_GenerateMipmap: Generate separate shaders for glsl 120 / 130
+679c93ff89c71cbd3b1d24e88abd38f00b8c1f02 meta: Don't _mesa_set_enable() invalid targets in ES 1.
+ab097dde0c958dd8b1c06a07ef8913512753760c meta: Remove unsafe global mem_ctx pointer
+3308c079bd00e9b9aa546f5214ce197a904d059b meta: Rearrange shader creation in setup_glsl_generate_mipmap
+0242381f06edb09dcf0eaacd6d26ccd8584700cc meta: Don't use GLSL 1.30 shader on OpenGL ES 2
+eb1d87fb945783448cc40ad43c9cd4d98002d424 meta: Add on demand compilation of per target shader programs
+
+# There's no dual-source blending in 8.0.
+354f2cb5c7330a7d43cf0b177daf758d2aa31e0a glsl: Generate compile errors for explicit blend indices < 0 or > 1.
+ea0d08872724b5e31e9e32db2338e15fdfdcc4de intel/i965: Disable SampleAlphaToOne if dual source blending enabled
+
+# The S3TC override code doesn't exist in 8.0.
+328961d95586931a17fe81ba816d362e8389c105 mesa: Don't override S3TC internalFormat if data is pre-compressed.
+
+# Causes too many regressions... it's not even 9.0
+413c4914129cd26ca87960852d8c0264c0fb29e7 intel: Improve teximage perf for Google Chrome paint rects (v3)
+b1d0fe022dc4826dadce014ab8fe062a82f75a16 intel: Fix segfault in intel_texsubimage_tiled_memcpy
+b5891286202987dfc2606ac716050c0ee426de11 intel: Fix yet-another-bug in intel_texsubimage_tiled_memcpy
+
+# Fixes bugs caused by patches not on 8.0.
+043f66204b1a190e18747c3befa8826c82dd87a7 glapi/glx: rename 'table' variable to 'disp_table'
+22897c74979aa02facdd5cd729db8dadf86924f5 intel: Don't call intelDestroyContext if there is no context to destroy
+87f26214d6bdeb439b30615ec53c293c5141cf11 i965: Don't free the intel_context structure when intelCreateContext fails.
+de958de71b1450952e021af4e729c87406353db6 i915: Don't free the intel_context structure when intelCreateContext fails.
+7fa0f10cd85ccb5afbc3a961164011de70970ff3 mesa: Flag _NEW_VARYING_VP_INPUTS when TexEnv programs are active.
+
+# GL_NV_draw_buffers doesn't exist in 8.0.
+23ff634c9c2eff744b5ddae7d1ba02bc1ef19ac5 gles2: Alias glReadBufferNV with desktop glReadBuffer
+
+# Introduces performance regressions for other games
+fa58644855e44830e0b91dc627703c236fa6712a r600g: fix abysmal performance in Reaction Quake
+
+# Candidates for 9.0 only
+61706915a3b5644faf7a5e67f47c9c593620bf8c gallium/u_blitter: fix stencil-only blits
+df5e2c058f73b72909fa99a2a189f5877525e3bf r600g: do not require MSAA renderbuffer support if not asked for
+2988fa940e1d8a4531fddff4d554eec1e6e04474 draw: fix non-indexed draw calls if there's an index buffer
+933faae2b8669f459e7ab27d6bcbfb6f4136b6d5 r600g: flush FMASK and CMASK when changing colorbuffers on Evergreen
+ed8d87c6a641efe8667c0ba580260ffaff5ffc7e radeonsi: add some new SI pci ids
+7da12426f7682ffc44ae40e31d1b5712521fbb70 build: Use AX_PTHREAD to detect pthreads
+9ed00075d8ea0ffaa675237e32b8611ad3064dbf build: Link libglapi with pthreads
+9dfca930d7fcfda6767d3be9b1690d010f08fea5 r600g: fix possible issue with stencil mipmap rendering
+93eba269351c6e256db3a4cc7c7018f5a3fae5a1 nouveau: use pre-calculated stride for resource_get_handle
+9a51edfb5af72a7a480f408f02d8ecd98c576b7b Re-add HAVE_PTHREADS preprocessor macro
+f42518962a08ce927e4ddd233d19d2661e135834 egl_dri2/x11: Fix eglPostSubBufferNV()
+a3b6b2d3055070da9bf7054fecfd0b171c398eb7 wayland: Destroy frame callback when destroying surface
+9785ae0973cc206afc36dbc7d5b9553f92d06b47 glsl_to_tgsi: fix dst register for texturing fetches.
+037b4f80384c72c12e31192d1a30411d4660972d r600g: fix lod bias/explicit lod with cube maps.
+eabbe5c45f5d05822c5f841628afa4008398d553 mesa: Don't glPopAttrib() GL_POINT_SPRITE_COORD_ORIGIN on < OpenGL-2.0
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# Script for generating a list of candidates for cherry-picking to a stable branch
+
+# Grep for commits with "cherry picked from commit" in the commit message.
+git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
+	grep "cherry picked from commit" |\
+	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
+
+# Grep for commits that were marked as a candidate for the stable tree.
+git log --reverse --pretty=%H -i --grep='^[[:space:]]*NOTE: This is a candidate' HEAD..origin/master |\
+while read sha
+do
+	# Check to see whether the patch is on the ignore list.
+	if [ -f bin/.cherry-ignore ] ; then
+		if grep -q ^$sha bin/.cherry-ignore ; then
+			continue
+		fi
+	fi
+
+	# Check to see if it has already been picked over.
+	if grep -q ^$sha already_picked ; then
+		continue
+	fi
+
+	git log -n1 --pretty=oneline $sha | cat
+done
+
+rm -f already_picked
--- a/bin/mklib
+++ b/bin/mklib
@@ -334,7 +334,9 @@ case $ARCH in
 	    # environment.  If so, pass -m32 flag to linker.
 	    set ${OBJECTS}
 	    ABI32=`file $1 | grep 32-bit`
-	    if [ "${ABI32}" -a `uname -m` = "x86_64" ] ; then
+	    ARM=`file $1 | grep ARM`
+	    # Do not add "-m32" option for arm.
+            if [ -z "$ARM" -a "${ABI32}" -a `uname -m` = "x86_64" ] ; then
 		OPTS="-m32 ${OPTS}"
 	    fi

@@ -391,7 +393,9 @@ case $ARCH in
 	    # environment.  If so, pass -m32 flag to linker.
 	    set ${OBJECTS}
 	    ABI32=`file $1 | grep 32-bit`
-	    if [ "${ABI32}" -a `uname -m` = "x86_64" ] ; then
+            ARM=`file $1 | grep ARM`
+	    # Do not add "-m32" option for arm.
+            if [ -z "$ARM" -a "${ABI32}" -a `uname -m` = "x86_64" ] ; then
 		OPTS="-m32 ${OPTS}"
 	    fi
            if [ "${ALTOPTS}" ] ; then
--- a/bin/shortlog_mesa.sh
+++ b/bin/shortlog_mesa.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# This script is used to generate the list of changes that
+# appears in the release notes files, with HTML formatting.
+
+
+typeset -i in_log=0
+
+git shortlog $* | while read l
+do
+    if [ $in_log -eq 0 ]; then
+	echo '<p>'$l'</p>'
+	echo '<ul>'
+	in_log=1
+    elif echo "$l" | egrep -q '^$' ; then
+	echo '</ul>'
+	echo
+	in_log=0
+    else
+        mesg=$(echo $l | sed 's/ (cherry picked from commit [0-9a-f]\+)//;s/\&/&amp;/g;s/</\&lt;/g;s/>/\&gt;/g')
+	echo '  <li>'${mesg}'</li>'
+    fi
+done
--- a/configs/darwin
+++ b/configs/darwin
@@ -9,8 +9,8 @@ INSTALL_DIR = /usr/X11
 X11_DIR = $(INSTALL_DIR)

 # Compiler and flags
-CC = gcc
-CXX = g++
+CC = $(shell xcrun -find cc)
+CXX = $(shell xcrun -find c++)
 PIC_FLAGS = -fPIC
 DEFINES =  -D_DARWIN_C_SOURCE -DPTHREADS -D_GNU_SOURCE \
 	   -DGLX_ALIAS_UNSUPPORTED \
@@ -24,11 +24,14 @@ DEFINES =  -D_DARWIN_C_SOURCE -DPTHREADS -D_GNU_SOURCE \
 # -DIN_DRI_DRIVER

 ARCH_FLAGS += $(RC_CFLAGS)
+INCLUDE_FLAGS = -I$(INSTALL_DIR)/include -I$(X11_DIR)/include
+OPT_FLAGS = -g3 -gdwarf-2 -Os -ffast-math -fno-strict-aliasing
+WARN_FLAGS = -Wall -Wmissing-prototypes

-CFLAGS =  -ggdb3 -Os -Wall -Wmissing-prototypes -std=c99 -ffast-math -fno-strict-aliasing -fvisibility=hidden \
-	-I$(INSTALL_DIR)/include -I$(X11_DIR)/include $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(ASM_FLAGS) $(DEFINES)
-CXXFLAGS =  -ggdb3 -Os -Wall -fno-strict-aliasing -fvisibility=hidden \
-	-I$(INSTALL_DIR)/include -I$(X11_DIR)/include $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(ASM_FLAGS) $(DEFINES)
+CFLAGS = -std=c99 -fvisibility=hidden \
+	$(OPT_FLAGS) $(WARN_FLAGS) $(INCLUDE_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(ASM_FLAGS) $(DEFINES) $(EXTRA_CFLAGS)
+CXXFLAGS = -fvisibility=hidden \
+	$(OPT_FLAGS) $(WARN_FLAGS) $(INCLUDE_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(ASM_FLAGS) $(DEFINES) $(EXTRA_CFLAGS)

 # Library names (actual file names)
 GL_LIB_NAME = lib$(GL_LIB).dylib
@@ -44,10 +47,10 @@ GLW_LIB_GLOB = lib$(GLW_LIB).*dylib
 OSMESA_LIB_GLOB = lib$(OSMESA_LIB).*dylib
 VG_LIB_GLOB = lib$(VG_LIB).*dylib

-GL_LIB_DEPS = -L$(INSTALL_DIR)/$(LIB_DIR) -L$(X11_DIR)/$(LIB_DIR) -lX11 -lXext -lm -lpthread
-OSMESA_LIB_DEPS =
-GLU_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GL_LIB)
-GLW_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GL_LIB) -L$(INSTALL_DIR)/$(LIB_DIR) -L$(X11_DIR)/$(LIB_DIR) -lX11 -lXt
+GL_LIB_DEPS = -L$(INSTALL_DIR)/$(LIB_DIR) -L$(X11_DIR)/$(LIB_DIR) -lX11-xcb -lxcb -lX11 -lXext $(EXTRA_LDFLAGS)
+OSMESA_LIB_DEPS = $(EXTRA_LDFLAGS)
+GLU_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GL_LIB) $(EXTRA_LDFLAGS)
+GLW_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GL_LIB) -L$(INSTALL_DIR)/$(LIB_DIR) -L$(X11_DIR)/$(LIB_DIR) -lX11 -lXt $(EXTRA_LDFLAGS)

 SRC_DIRS = glsl mapi/glapi mapi/vgapi glx/apple mesa gallium glu
 GLU_DIRS = sgi
--- a/configs/darwin-fat-intel
+++ b/configs/darwin-fat-intel
@@ -0,0 +1,7 @@
+# Configuration for Darwin / MacOS X, making 32bit and 64bit fat dynamic libs for intel
+
+RC_CFLAGS=-arch i386 -arch x86_64
+
+include $(TOP)/configs/darwin
+
+CONFIG_NAME = darwin-fat-intel
--- a/configs/default
+++ b/configs/default
@@ -10,7 +10,7 @@ CONFIG_NAME = default
 # Version info
 MESA_MAJOR=8
 MESA_MINOR=0
-MESA_TINY=0
+MESA_TINY=5
 MESA_VERSION = $(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY)

 # external projects.  This should be useless now that we use libdrm.
--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -70,7 +70,6 @@ INTEL_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_intel)
 NOUVEAU_LIBS = $(shell $(PKG_CONFIG) --libs libdrm_nouveau)
 NOUVEAU_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_nouveau)

-LIBDRM_RADEON_LIBS = $(shell $(PKG_CONFIG) --libs libdrm_radeon)
-LIBDRM_RADEON_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_radeon)
-RADEON_CFLAGS = "-DHAVE_LIBDRM_RADEON=1 $(LIBDRM_RADEON_CFLAGS)"
+RADEON_LIBS = $(shell $(PKG_CONFIG) --libs libdrm_radeon)
+RADEON_CFLAGS = $(shell $(PKG_CONFIG) --cflags libdrm_radeon)
 RADEON_LDFLAGS = $(LIBDRM_RADEON_LIBS)
--- a/configure.ac
+++ b/configure.ac
@@ -88,13 +88,13 @@ AC_COMPILE_IFELSE(
       not clang
 #endif
 ]])],
-[CLANG=yes], [CLANG=no])
+[acv_mesa_CLANG=yes], [acv_mesa_CLANG=no])

-AC_MSG_RESULT([$CLANG])
+AC_MSG_RESULT([$acv_mesa_CLANG])

 dnl If we're using GCC, make sure that it is at least version 3.3.0.  Older
 dnl versions are explictly not supported.
-if test "x$GCC" = xyes -a "x$CLANG" = xno; then
+if test "x$GCC" = xyes -a "x$acv_mesa_CLANG" = xno; then
    AC_MSG_CHECKING([whether gcc version is sufficient])
    major=0
    minor=0
@@ -168,9 +168,6 @@ esac
 dnl Add flags for gcc and g++
 if test "x$GCC" = xyes; then
    CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -std=c99"
-    if test "x$CLANG" = "xno"; then
-       CFLAGS="$CFLAGS -ffast-math"
-    fi

    # Enable -fvisibility=hidden if using a gcc that supports it
    save_CFLAGS="$CFLAGS"
@@ -662,7 +659,7 @@ AC_ARG_ENABLE([gallium_gbm],
    [enable_gallium_gbm=auto])

 # Option for Gallium drivers
-GALLIUM_DRIVERS_DEFAULT="r300,r600,swrast"
+GALLIUM_DRIVERS_DEFAULT="r300,r600,svga,swrast"

 AC_ARG_WITH([gallium-drivers],
    [AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
@@ -1566,7 +1563,7 @@ if test "x$enable_gallium_g3dvl" = xyes; then
 fi

 if test "x$enable_xvmc" = xyes; then
-    PKG_CHECK_MODULES([XVMC], [xvmc >= 1.0.6 xorg-server])
+    PKG_CHECK_MODULES([XVMC], [xvmc >= 1.0.6])
    GALLIUM_STATE_TRACKERS_DIRS="$GALLIUM_STATE_TRACKERS_DIRS xorg/xvmc"
    HAVE_ST_XVMC="yes"
 fi
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -52,7 +52,6 @@ a:visited {

 <b>User Topics</b>
 <ul>
-<li><a href="shading.html" target="MainFrame">Shading Language</a>
 <li><a href="egl.html" target="MainFrame">EGL</a>
 <li><a href="opengles.html" target="MainFrame">OpenGL ES</a>
 <li><a href="openvg.html" target="MainFrame">OpenVG / Vega</a>
@@ -63,6 +62,7 @@ a:visited {
 <LI><A HREF="extensions.html" target="MainFrame">Mesa Extensions</A>
 <LI><A HREF="mangling.html" target="MainFrame">Function Name Mangling</A>
 <LI><A href="llvmpipe.html" target="MainFrame">Gallium llvmpipe driver</A>
+<LI><A href="vmware-guest.html" target="MainFrame">VMware SVGA3D guest driver</a>
 <LI><A href="postprocess.html" target="MainFrame">Gallium post-processing</A>
 <LI><A href="viewperf.html" target="MainFrame">Viewperf Issues</A>
 </ul>
--- a/docs/intro.html
+++ b/docs/intro.html
@@ -132,12 +132,26 @@ June 2007: Mesa 7.0 is released, implementing the OpenGL 2.1 specification
 and OpenGL Shading Language.
 </p>

+<p>
+2008: Keith Whitwell and other Tungsten Graphics employees develop
+<a href="http://en.wikipedia.org/wiki/Gallium3D"  target="_parent">Gallium</a>
+- a new GPU abstraction layer.  The latest Mesa drivers are based on
+Gallium and other APIs such as OpenVG are implemented on top of Gallium.
+</p>

 <p>
-Ongoing: Mesa is used as the core of many hardware OpenGL drivers for
-the XFree86 and X.org X servers within the
-<A href="http://dri.freedesktop.org/" target="_parent">DRI project</A>.
-I continue to enhance Mesa with new extensions and features.
+February 2012: Mesa 8.0 is released, implementing the OpenGL 3.0 specification
+and version 1.30 of the OpenGL Shading Language.
+</p>
+
+<p>
+Ongoing: Mesa is the OpenGL implementation for several types of hardware
+made by Intel, AMD and NVIDIA, plus the VMware virtual GPU.
+There are also several software-based renderers: swrast (the legacy
+Mesa rasterizer), softpipe (a gallium reference driver) and llvmpipe
+(LLVM/JIT-based high-speed rasterizer).
+Work continues on the drivers and core Mesa to implement newer versions
+of the OpenGL specification.
 </p>


@@ -151,6 +165,15 @@ of the OpenGL specification is implemented.
 </p>


+<H2>Version 8.x features</H2>
+<p>
+Version 8.x of Mesa implements the OpenGL 3.0 API.
+The developers at Intel deserve a lot of credit for implementing most
+of the OpenGL 3.0 features in core Mesa, the GLSL compiler as well as
+the i965 driver.
+</p>
+
+
 <H2>Version 7.x features</H2>
 <p>
 Version 7.x of Mesa implements the OpenGL 2.1 API.  The main feature
--- a/docs/news.html
+++ b/docs/news.html
@@ -11,6 +11,32 @@
 <H1>News</H1>


+<h2>March 21, 2012</h2>
+
+<p>
+<a href="relnotes-8.0.2.html">Mesa 8.0.2</a> is released.
+This is a bug fix release.
+</p>
+
+
+<h2>February 16, 2012</h2>
+
+<p>
+<a href="relnotes-8.0.1.html">Mesa 8.0.1</a> is released.
+This is a bug fix release.
+</p>
+
+
+<h2>February 9, 2012</h2>
+
+<p>
+<a href="relnotes-8.0.html">Mesa 8.0</a> is released.
+This is the first version of Mesa to support OpenGL 3.0 and GLSL 1.30
+(with the i965 driver).
+See the release notes for more information about the release.
+</p>
+
+
 <h2>November 27, 2011</h2>

 <p>
--- a/docs/relnotes-8.0.1.html
+++ b/docs/relnotes-8.0.1.html
@@ -0,0 +1,153 @@
+<HTML>
+
+<head>
+<TITLE>Mesa Release Notes</TITLE>
+<link rel="stylesheet" type="text/css" href="mesa.css">
+<meta http-equiv="content-type" content="text/html; charset=utf-8" />
+</head>
+
+<BODY>
+
+<body bgcolor="#eeeeee">
+
+<H1>Mesa 8.0.1 Release Notes / February 16, 2012</H1>
+
+<p>
+Mesa 8.0.1 is a bug fix release which fixes bugs found since the 8.0 release.
+</p>
+<p>
+Mesa 8.0 implements the OpenGL 3.0 API, but the version reported by
+glGetString(GL_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.0.
+</p>
+<p>
+See the <a href="install.html">Compiling/Installing page</a> for prerequisites
+for DRI hardware acceleration.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+4855c2d93bd2ebd43f384bdcc92c9a27  MesaLib-8.0.1.tar.gz
+24eeebf66971809d8f40775a379b36c9  MesaLib-8.0.1.tar.bz2
+54e745d14dac5717f7f65b4e2d5c1df2  MesaLib-8.0.1.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28924">Bug 28924</a> - [ILK] piglit tex-border-1 fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=40864">Bug 40864</a> - [bisected pineview] oglc pxconv-gettex(basic.allCases) fails on pineview</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=43327">Bug 43327</a> - [bisected SNB] HiZ make many oglc cases regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44333">Bug 44333</a> - [bisected] Color distortion with xbmc mediaplayer</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44927">Bug 44927</a> - [SNB IVB regression] gl-117 abort when click</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45221">Bug 45221</a> - [bisected IVB] glean/fbo regression in stencil-only case</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45877">Bug 45877</a> - main/image.c:1597: _mesa_convert_colors: Assertion `dstType == 0x1406' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45578">Bug 45578</a> - main/image.c:1659: _mesa_convert_colors: Assertion `dstType == 0x1403' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45872">Bug 45872</a> - [bisected PNV] oglc mustpass(basic.stipple) regressed on pineview</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45876">Bug 45876</a> - [PNV]oglc texenv(basic.allCases) regressed on pineview</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45917">Bug 45917</a> - [PNV] Regression in Piglit test general/two-sided-lighting-separate-specular</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45943">Bug 45943</a> - [r300g] r300_emit.c:365:r300_emit_aa_state: Assertion `(aa-&gt;dest)-&gt;cs_buf' failed.</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-8.0..mesa-8.0.1
+</pre>
+
+<p>Alex Deucher (2):
+<ul>
+  <li>r600g: fix tex tile_type offset for cayman</li>
+  <li>r600g: 128 bit formats require tile_type = 1 on cayman</li>
+</ul></p>
+
+<p>Anuj Phogat (2):
+<ul>
+  <li>meta: Add pixel store/pack operations in decompress_texture_image</li>
+  <li>meta: Avoid FBO resizing/reallocating in decompress_texture_image</li>
+</ul></p>
+
+<p>Brian Paul (6):
+<ul>
+  <li>docs: add news item for 8.0 release</li>
+  <li>docs: update info about supported systems, GPUs, APIs</li>
+  <li>docs: add VMware link</li>
+  <li>docs: remove link to the GLSL compiler page</li>
+  <li>mesa: fix proxy texture target initialization</li>
+  <li>swrast: fix span color type selection</li>
+</ul></p>
+
+<p>Chad Versace (2):
+<ul>
+  <li>i965: Rewrite the HiZ op</li>
+  <li>i965: Remove file i965/junk, accidentally added in 7b36c68</li>
+</ul></p>
+
+<p>Dave Airlie (1):
+<ul>
+  <li>st/mesa: only resolve if number of samples is &gt; 1</li>
+</ul></p>
+
+<p>Eric Anholt (3):
+<ul>
+  <li>i965: Fix HiZ change compiler warning.</li>
+  <li>i965: Report the failure message when failing to compile the fragment shader.</li>
+  <li>i965/fs: Enable register spilling on gen7 too.</li>
+</ul></p>
+
+<p>Ian Romanick (4):
+<ul>
+  <li>docs: Add 8.0 MD5 checksums</li>
+  <li>glapi: Include GLES2 headers for ES2 extension functions</li>
+  <li>swrast: Only avoid empty _TexEnvPrograms</li>
+  <li>mesa: Bump version number to 8.0.1</li>
+</ul></p>
+
+<p>Kenneth Graunke (4):
+<ul>
+  <li>i965: Fix border color on Ironlake.</li>
+  <li>i965/fs: Add a new fs_inst::regs_written function.</li>
+  <li>i965/fs: Take # of components into account in try_rewrite_rhs_to_dst.</li>
+  <li>i965: Emit Ivybridge VS workaround flushes.</li>
+</ul></p>
+
+<p>Mathias Fröhlich (1):
+<ul>
+  <li>state_stracker: Fix access to uninitialized memory.</li>
+</ul></p>
+
+<p>Paul Berry (1):
+<ul>
+  <li>i915: Fix type of "specoffset" variable.</li>
+</ul></p>
+
+<p>Simon Farnsworth (1):
+<ul>
+  <li>r600g: Use a fake reloc to sleep for fences</li>
+</ul></p>
+
+</body>
+</html>
--- a/docs/relnotes-8.0.2.html
+++ b/docs/relnotes-8.0.2.html
@@ -0,0 +1,160 @@
+<HTML>
+
+<head>
+<TITLE>Mesa Release Notes</TITLE>
+<link rel="stylesheet" type="text/css" href="mesa.css">
+<meta http-equiv="content-type" content="text/html; charset=utf-8" />
+</head>
+
+<BODY>
+
+<body bgcolor="#eeeeee">
+
+<H1>Mesa 8.0.2 Release Notes / March 21, 2012</H1>
+
+<p>
+Mesa 8.0.2 is a bug fix release which fixes bugs found since the 8.0.1 release.
+</p>
+<p>
+Mesa 8.0.2 implements the OpenGL 3.0 API, but the version reported by
+glGetString(GL_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.0.
+</p>
+<p>
+See the <a href="install.html">Compiling/Installing page</a> for prerequisites
+for DRI hardware acceleration.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+70eb3dc74fbfcd72f6776268ee1db52e  MesaLib-8.0.2.tar.gz
+a368104e5700707048dc3e8691a9a7a1  MesaLib-8.0.2.tar.bz2
+d5e5cdb85d2afdbcd1c0623d3ed1c54d  MesaLib-8.0.2.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=38720">Bug 38720</a> - [SNB] Trine triggers a GPU hang</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=40059">Bug 40059</a> - [SNB] hang in "Amnesia: The Dark Descent" demo</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45216">Bug 45216</a> - [SNB] GPU hang in OilRush</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46631">Bug 46631</a> - It's really hard to hit the fast path for the fallback glReadPixels code</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46679">Bug 46679</a> - glReadPixels on a luminance texture returns the wrong values</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46311">Bug 46311</a> - Missing support of point size in Mesa core</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46665">Bug 46665</a> - [PNV] webgl conformance case max texture fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45975">Bug 45975</a> - [Gen4 + ILK] render with pointcoord will fail to render</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46666">Bug 46666</a> - [PNV] webgl conformance case NPOT case fails with TEXTURE_MIN_FILTER set to LINEAR</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-8.0.1..mesa-8.0.2
+</pre>
+
+<p>Brian Paul (7):
+<ul>
+      <li>svga: add null vs pointer check in update_need_pipeline()</li>
+      <li>util: add mutex lock in u_debug_memory.c code</li>
+      <li>mesa: add _mesa_rebase_rgba_float/uint() functions</li>
+      <li>mesa: use _mesa_rebase_rgba_float/uint() in glReadPixels code</li>
+      <li>mesa: use _mesa_rebase_rgba_float/uint() in glGetTexImage code</li>
+      <li>mesa: fix GL_LUMINANCE handling in glGetTexImage</li>
+      <li>docs: add links to 8.0.1 release notes</li>
+</ul></p>
+
+<p>Daniel Vetter (1):
+<ul>
+      <li>i965: fixup W-tile offset computation to take swizzling into account</li>
+</ul></p>
+
+<p>Dylan Noblesmith (1):
+<ul>
+      <li>mesa: add back glGetnUniform*v() overflow error reporting</li>
+</ul></p>
+
+<p>Ian Romanick (1):
+<ul>
+      <li>docs: Add 8.0.1 release md5sums</li>
+</ul></p>
+
+<p>Jakob Bornecrantz (3):
+<ul>
+      <li>mesa: Include mesa ES mapi generated files</li>
+      <li>mesa: Bump version number to 8.0.2</li>
+      <li>docs: Add 8.0.2 release notes</li>
+</ul></p>
+
+<p>Jeremy Huddleston (3):
+<ul>
+      <li>darwin: config file cleanups</li>
+      <li>darwin: Build create_context.c</li>
+      <li>darwin: Link against libxcb</li>
+</ul></p>
+
+<p>José Fonseca (1):
+<ul>
+      <li>svga: Clamp advertised PIPE_SHADER_CAP_MAX_TEMPS to SVGA3D_TEMPREG_MAX.</li>
+</ul></p>
+
+<p>Kenneth Graunke (2):
+<ul>
+      <li>i965: Only set Last Render Target Select on the last FB write.</li>
+      <li>i965: Fix Gen6+ dynamic state upper bound on older kernels.</li>
+</ul></p>
+
+<p>Marek Olšák (1):
+<ul>
+      <li>gallium/rtasm: properly detect SSE and SSE2</li>
+</ul></p>
+
+<p>Neil Roberts (1):
+<ul>
+      <li>mesa: Don't disable fast path for normalized types</li>
+</ul></p>
+
+<p>Tom Stellard (1):
+<ul>
+      <li>r300/compiler: Fix bug when lowering KILP on r300 cards</li>
+</ul></p>
+
+<p>Yuanhan Liu (6):
+<ul>
+      <li>mesa: let GL3 buf obj queries not depend on opengl major version</li>
+      <li>tnl: let _TNL_ATTRIB_POINTSIZE do not depend on ctx-&gt;VertexProgram._Enabled</li>
+      <li>i915: fix wrong rendering of gl_PointSize on Pineview</li>
+      <li>i915: move the FALLBACK_DRAW_OFFSET check outside the drawing rect check</li>
+      <li>i965: handle gl_PointCoord for Gen4 and Gen5 platforms</li>
+      <li>i915: fallback for NPOT cubemap texture</li>
+</ul></p>
+
+<p>Zack Rusin (3):
+<ul>
+      <li>svga: fix a crash happening before setting fragment shaders.</li>
+      <li>svga: Fix stencil op mapping</li>
+      <li>svga: fix the rasterizer state resets</li>
+</ul></p>
+
+
+</body>
+</html>
--- a/docs/relnotes-8.0.3.html
+++ b/docs/relnotes-8.0.3.html
@@ -0,0 +1,319 @@
+<HTML>
+
+<head>
+<TITLE>Mesa Release Notes</TITLE>
+<link rel="stylesheet" type="text/css" href="mesa.css">
+<meta http-equiv="content-type" content="text/html; charset=utf-8" />
+</head>
+
+<BODY>
+
+<body bgcolor="#eeeeee">
+
+<H1>Mesa 8.0.3 Release Notes / May 18, 2012</H1>
+
+<p>
+Mesa 8.0.3 is a bug fix release which fixes bugs found since the 8.0.2 release.
+</p>
+<p>
+Mesa 8.0.3 implements the OpenGL 3.0 API, but the version reported by
+glGetString(GL_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.0.
+</p>
+<p>
+See the <a href="install.html">Compiling/Installing page</a> for prerequisites
+for DRI hardware acceleration.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+b7210a6d6e4584bd852ab29294ce717e  MesaLib-8.0.3.tar.gz
+cc5ee15e306b8c15da6a478923797171  MesaLib-8.0.3.tar.bz2
+32773634a0f7e70a680505a81426eccf  MesaLib-8.0.3.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28138">Bug 28138</a> - [G45] Regnum Online, sparkling in in-game rendering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=30102">Bug 30102</a> - glean depthStencil test fails BadLength with indirect non-swrast rendering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=40361">Bug 40361</a> - Glitches on X3100 after upgrade to 7.11</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=41152">Bug 41152</a> - [glsl] Shader backend in Regnum Online does not work</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=41216">Bug 41216</a> - [bisected pineview]oglc filtercubemin(basic.sizedRGBA) fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=41372">Bug 41372</a> - i830_state.c PBO crash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=41495">Bug 41495</a> - i830: intel_get_vb_max / intel_batchbuffer_space mismatch.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44701">Bug 44701</a> - Regnum online textures flickering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44961">Bug 44961</a> - [bisected i965] oglc sRGB(Mipmap.1D_textures) regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44970">Bug 44970</a> - [i965]oglc max_values(negative.textureSize.textureCube) segfaults</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45214">Bug 45214</a> - Textures disappearing or missing in RegnumOnline OpenGL game</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45558">Bug 45558</a> - cannot render on a drawable of size equal the max framebuffer size</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45921">Bug 45921</a> - [r300g, bisected] Multiple piglit regressions after glsl_to_tgsi changes</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46303">Bug 46303</a> - [SNB] segfault in intel_miptree_release()</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46739">Bug 46739</a> - [snb-m-gt2+] compiz crashed with SIGSEGV in intel_miptree_release()</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46834">Bug 46834</a> - small performance when playing flightgear (swrast fallback through GTT mapping)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47126">Bug 47126</a> - tests/fbo/fbo-array.c:109: create_array_fbo: Assertion `glGetError() == 0' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48218">Bug 48218</a> - brw_fs_schedule_instructions.cpp segfault due to accessing not allocated last_mrf_write[16]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48545">Bug 48545</a> - LLVMpipe glReadPixels Firefox hits the slow path (WebGL rendering)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=49124">Bug 49124</a> - swrast/s_texfetch.c:1156: set_fetch_functions: Assertion `texImage-&gt;FetchTexel' failed.</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-8.0.2..mesa-8.0.3
+</pre>
+
+<p>Alban Browaeys (1):
+<ul>
+  <li>dri/i915: Fix off-by-one in i830 clip region size.</li>
+</ul></p>
+
+<p>Alex Deucher (2):
+<ul>
+  <li>r200: fix fog coordinate emit</li>
+  <li>radeon: fix fog coordinate emit</li>
+</ul></p>
+
+<p>Alexander von Gluck (4):
+<ul>
+  <li>llvmpipe: fix symbol conflict on Haiku</li>
+  <li>svga: fix typedef conflicts on Haiku</li>
+  <li>mesa: Don't use newlocale on Haiku</li>
+  <li>glsl: Don't use newlocale on Haiku</li>
+</ul></p>
+
+<p>Anuj Phogat (4):
+<ul>
+  <li>mesa: fix issues with texture border and array textures</li>
+  <li>mesa: Fix valid texture target test in _mesa_GetTexLevelParameteriv()</li>
+  <li>mesa: Fix the cause of piglit test fbo-array failure</li>
+  <li>intel: Fix a case when mapping large texture fails</li>
+</ul></p>
+
+<p>Brian Paul (17):
+<ul>
+  <li>mesa: add a couple fast-paths to fast_read_rgba_pixels_memcpy()</li>
+  <li>mesa/gdi: remove wmesa_set_renderbuffer_funcs() function</li>
+  <li>mesa/gdi: remove clear_color() function</li>
+  <li>mesa: bump version to 8.0.2 in configs/default</li>
+  <li>swrast: include s_fragprog.h to silence warnings</li>
+  <li>mesa: remove LSB-first pixel packing check in glReadPixels</li>
+  <li>mesa: fix error in _mesa_format_matches_format_and_type() for RGB888</li>
+  <li>mesa: add BGR888 code in _mesa_format_matches_format_and_type()</li>
+  <li>vbo: fix node_attrsz[] usage in vbo_bind_vertex_list()</li>
+  <li>mesa: add missing texture integer test in glTexSubImage()</li>
+  <li>mesa: add missing return after _mesa_error() in update_array()</li>
+  <li>glsl: propagate MaxUnrollIterations to the optimizer's loop unroller</li>
+  <li>st/mesa: set MaxUnrollIterations = 255</li>
+  <li>st/mesa: no-op glCopyPixels if source region is out of bounds</li>
+  <li>mesa: do more teximage error checking for generic compressed formats</li>
+  <li>mesa: fix/add error check in _mesa_ColorMaterial()</li>
+  <li>mesa: fix glMaterial / dlist bug</li>
+</ul></p>
+
+<p>Chad Versace (3):
+<ul>
+  <li>glsl: Fix Android build</li>
+  <li>main: Fix memory leak in _mesa_make_extension_string()</li>
+  <li>intel: Disable ARB_framebuffer_object in ES contexts</li>
+</ul></p>
+
+<p>Chris Wilson (1):
+<ul>
+  <li>i830: Compute initial number of vertices from remaining batch space</li>
+</ul></p>
+
+<p>Dave Airlie (4):
+<ul>
+  <li>mesa/format_unpack: add LUMINANCE 8/16 UINT/INT</li>
+  <li>glx/drisw: avoid segfaults when we fail to get visual</li>
+  <li>drisw: fix image stride calculation for 16-bit.</li>
+  <li>intel: fix TFP at 16-bpp</li>
+</ul></p>
+
+<p>Dylan Noblesmith (7):
+<ul>
+  <li>intel: fix null dereference processing HiZ buffer</li>
+  <li>util: fix undefined behavior</li>
+  <li>util: fix uninitialized table</li>
+  <li>egl: fix uninitialized values</li>
+  <li>st/vega: fix uninitialized values</li>
+  <li>egl-static: fix printf warning</li>
+  <li>i965: fix typo</li>
+</ul></p>
+
+<p>Eric Anholt (19):
+<ul>
+  <li>i965/fs: Jump from discard statements to the end of the program when done.</li>
+  <li>intel: Fix rendering from textures after RenderTexture().</li>
+  <li>mesa: Fix handling of glCopyBufferSubData() for src == dst.</li>
+  <li>i965/fs: Move GL_CLAMP handling to coordinate setup.</li>
+  <li>i965/fs: Implement GL_CLAMP behavior on texture rectangles on gen6+.</li>
+  <li>mesa: Fix push/pop of multisample coverage invert.</li>
+  <li>mesa: Include the multisample enables under GL_MULTISAMPLE_BIT attrib as well.</li>
+  <li>mesa: Fix display list handling for GL_ARB_draw_instanced.</li>
+  <li>mesa: Fix display lists for draw_elements_base_vertex with draw_instanced.</li>
+  <li>mesa: Add missing error check for first &lt; 0 in glDrawArraysInstanced().</li>
+  <li>i915: Fix piglit fbo-nodepth-test on i830.</li>
+  <li>intel: Return success when asked to allocate a 0-width/height renderbuffer.</li>
+  <li>mesa: Throw error on glGetActiveUniform inside Begin/End.</li>
+  <li>i965/vs: Fix up swizzle for dereference_array of matrices.</li>
+  <li>glsl: Fix indentation of switch code.</li>
+  <li>glsl: Let the constructor figure out the types of switch-related expressions.</li>
+  <li>glsl: Reject non-scalar switch expressions.</li>
+  <li>glsl: Fix assertion failure on handling switch on uint expressions.</li>
+  <li>mesa: Check for framebuffer completeness before looking at the rb.</li>
+</ul></p>
+
+<p>Eugeni Dodonov (1):
+<ul>
+  <li>intel: add PCI IDs for Ivy Bridge GT2 server variant</li>
+</ul></p>
+
+<p>Han Shen(沈涵) (1):
+<ul>
+  <li>bin/mklib: remove '-m32' for arm linux</li>
+</ul></p>
+
+<p>Ian Romanick (1):
+<ul>
+  <li>mesa: Bump version number to 8.0.3</li>
+</ul></p>
+
+<p>Jakob Bornecrantz (1):
+<ul>
+  <li>docs: Add 8.0.2 md5sums</li>
+</ul></p>
+
+<p>Jeremy Huddleston (7):
+<ul>
+  <li>darwin: Eliminate a pthread mutex leak</li>
+  <li>darwin: Fix an error message</li>
+  <li>darwin: Make reported errors more user-friendly</li>
+  <li>darwin: Use ASL for logging</li>
+  <li>darwin: Unlock our mutex before destroying it</li>
+  <li>darwin: Eliminate a possible race condition while destroying a surface</li>
+  <li>darwin: Address a build failure on Leopard and earlier OS versions</li>
+</ul></p>
+
+<p>Jon TURNEY (1):
+<ul>
+  <li>Have __glImageSize handle format GL_DEPTH_STENCIL_NV the same way as the server does</li>
+</ul></p>
+
+<p>Jonas Maebe (2):
+<ul>
+  <li>glapi: Correct size of allocated _glapi_table struct</li>
+  <li>apple: Fix a use after free</li>
+</ul></p>
+
+<p>Jordan Justen (1):
+<ul>
+  <li>mesa: Add primitive restart support to glArrayElement</li>
+</ul></p>
+
+<p>Kenneth Graunke (12):
+<ul>
+  <li>i965: Actually upload sampler state pointers for the VS unit on Gen6.</li>
+  <li>i965/fs: Fix FB writes that tried to use the non-existent m16 register.</li>
+  <li>vbo: Remove pedantic warning about 'end' beind out of bounds.</li>
+  <li>vbo: Ignore invalid element ranges which are outside VBO bounds.</li>
+  <li>vbo: Rework checking of 'end' against _MaxElement.</li>
+  <li>vbo: Eliminate short-circuiting in invalid-start case.</li>
+  <li>i965: Fix GPU hangs in the dummy fragment shader.</li>
+  <li>i965: Make the dummy fragment shader work in SIMD16 mode.</li>
+  <li>drirc: Add force_glsl_extensions_warn workaround for Unigine Heaven.</li>
+  <li>i965: Avoid explicit accumulator operands in SIMD16 mode on Gen7.</li>
+  <li>intel: Remove pointless software fallback for glBitmap on Gen6.</li>
+  <li>glsl: Fix broken constant expression handling for &lt;, &lt;=, &gt;, and &gt;=.</li>
+</ul></p>
+
+<p>Kurt Roeckx (2):
+<ul>
+  <li>i915: Compute maximum number of verts using the actual batchbuffer size.</li>
+  <li>i915: Fix i830 polygon stipple from PBOs.</li>
+</ul></p>
+
+<p>Marek Olšák (5):
+<ul>
+  <li>r300g/swtcl: don't print an error when getting ClipVertex</li>
+  <li>r300g/swtcl: don't enter u_vbuf_mgr</li>
+  <li>r300g/swtcl: don't expose shader subroutine support</li>
+  <li>r300g/swtcl: fix polygon offset</li>
+  <li>r300g/swtcl: fix crash when back color is present in vertex shader</li>
+</ul></p>
+
+<p>Mathias Fröhlich (1):
+<ul>
+  <li>glsl: Avoid excessive loop unrolling.</li>
+</ul></p>
+
+<p>Matt Turner (1):
+<ul>
+  <li>Remove -ffast-math from default CFLAGS</li>
+</ul></p>
+
+<p>Paul Berry (1):
+<ul>
+  <li>i915: Initialize swrast_texture_image structure fields.</li>
+</ul></p>
+
+<p>Roland Scheidegger (1):
+<ul>
+  <li>mesa: check_index_bounds off-by-one fix</li>
+</ul></p>
+
+<p>Tom Stellard (2):
+<ul>
+  <li>r300/compiler: Clear loop registers in vertex shaders w/o loops</li>
+  <li>r300/compiler: Copy all instruction attributes during local transfoms</li>
+</ul></p>
+
+<p>Vinson Lee (5):
+<ul>
+  <li>mesa: Fix memory leak in _mesa_get_uniform_location.</li>
+  <li>linker: Fix memory leak in count_uniform_size::visit_field.</li>
+  <li>swrast: Fix memory leaks in blit_linear.</li>
+  <li>ir_to_mesa: Fix uninitialized member in add_uniform_to_shader.</li>
+  <li>mesa: Fix memory leak in generate_mipmap_compressed.</li>
+</ul></p>
+
+<p>Yuanhan Liu (2):
+<ul>
+  <li>i915: set SPRITE_POINT_ENABLE bit correctly</li>
+  <li>i965: fix wrong cube/3D texture layout</li>
+</ul></p>
+
+</body>
+</html>
--- a/docs/relnotes-8.0.4.html
+++ b/docs/relnotes-8.0.4.html
@@ -0,0 +1,201 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="mesa.css">
+</head>
+<body>
+
+<h1>Mesa 8.0.4 Release Notes / July 10, 2012</h1>
+
+<p>
+Mesa 8.0.4 is a bug fix release which fixes bugs found since the 8.0.3 release.
+</p>
+<p>
+Mesa 8.0.4 implements the OpenGL 3.0 API, but the version reported by
+glGetString(GL_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.0.
+</p>
+<p>
+See the <a href="install.html">Compiling/Installing page</a> for prerequisites
+for DRI hardware acceleration.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+02b96082d2f1ad72e7385f4022afafb9  MesaLib-8.0.4.tar.gz
+d546f988adfdf986cff45b1efa2d8a46  MesaLib-8.0.4.tar.bz2
+1f0fdabe6e8019d4de6c16e20e74d163  MesaLib-8.0.4.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45967">Bug 45967</a> - piglit getteximage-invalid-format-for-packed-type regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47742">Bug 47742</a> - [softpipe] piglit fbo-generatemipmap-array regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48141">Bug 48141</a> - [vmwgfx] src/gallium/auxiliary/util/u_inlines.h:256:pipe_buffer_map_range: Assertion `offset + length &lt;= buffer-&gt;width0' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48472">Bug 48472</a> - GPU Lockup while running demo (rzr - the scene is dead) in wine</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50033">Bug 50033</a> - src/mesa/state_tracker/st_cb_fbo.c:379:st_render_texture: Assertion `strb-&gt;rtt_level &lt;= strb-&gt;texture-&gt;last_level' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50621">Bug 50621</a> - Mesa fails its test suite with a buffer overflow.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50298">Bug 50298</a> - [ILK IVB bisected]Ogles2conform GL/sin/sin_float_vert_xvary.test regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=51574">Bug 51574</a> - ir_loop_jump constructor assigns member variable to itself</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-8.0.3..mesa-8.0.4
+</pre>
+
+<p>Andreas Betz (1):</p>
+<ul>
+  <li>vega: fix 565 color unpacking bug</li>
+</ul>
+
+<p>Antoine Labour (2):</p>
+<ul>
+  <li>meta: Cleanup the resources we allocate.</li>
+  <li>mesa: Free uniforms correclty.</li>
+</ul>
+
+<p>Brian Paul (22):</p>
+<ul>
+  <li>docs: add link to 8.0.3 release notes</li>
+  <li>mesa: fix Z32_FLOAT -&gt; uint conversion functions</li>
+  <li>draw: fix primitive restart bug by using the index buffer offset</li>
+  <li>st/mesa: fix glDrawPixels(GL_DEPTH_COMPONENT) color output</li>
+  <li>svga: fix synchronization bug between sampler views and surfaces</li>
+  <li>mesa: new _mesa_error_check_format_and_type() function</li>
+  <li>mesa: add missing GL_UNSIGNED_INT_10F_11F_11F_REV case</li>
+  <li>mesa: fix missing return value in getteximage_error_check()</li>
+  <li>st/mesa: pass GL_MAP_INVALIDATE_RANGE_BIT to gallium drivers</li>
+  <li>svga: add 0.5 in float-&gt;int conversion of sample min/max lod</li>
+  <li>svga: fix min/max lod clamping</li>
+  <li>svga: change PIPE_CAPF_MAX_TEXTURE_LOD_BIAS from 16.0 to 15.0</li>
+  <li>st/mesa: add fallback pipe formats for (compressed) R, RG formats</li>
+  <li>st/mesa: copy num_immediates field when copying the immediates array</li>
+  <li>svga: move svga_texture() casts/calls in svga_surface_copy()</li>
+  <li>svga: reset vertex buffer offset in svga_release_user_upl_buffers()</li>
+  <li>st/mesa: don't set PIPE_BIND_DISPLAY_TARGET for user-created renderbuffers</li>
+  <li>st/mesa: use private pipe_sampler_view in decompress_with_blit()</li>
+  <li>st/mesa: add null pointer check in st_texture_image_map()</li>
+  <li>st/mesa: fix mipmap image size computation w.r.t. texture arrays</li>
+  <li>draw: fix missing immediates bug in polygon stipple code</li>
+  <li>st/mesa: fix max_offset computation for base vertex</li>
+</ul>
+
+<p>Christoph Bumiller (1):</p>
+<ul>
+  <li>nv50: handle NEG,ABS modifiers for short RCP encoding</li>
+</ul>
+
+<p>Dylan Noblesmith (1):</p>
+<ul>
+  <li>mesa: require GL_MAX_SAMPLES &gt;= 4 for GL 3.0</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>i965/vs: Fix texelFetchOffset()</li>
+</ul>
+
+<p>Ian Romanick (5):</p>
+<ul>
+  <li>docs: Add 8.0.3 release md5sums</li>
+  <li>glx/tests: Fix off-by-one error in allocating extension string buffer</li>
+  <li>glsl: Remove spurious printf messages</li>
+  <li>glsl: Fix pi/2 constant in acos built-in function</li>
+  <li>mesa: Bump version number to 8.0.4</li>
+</ul>
+
+<p>José Fonseca (2):</p>
+<ul>
+  <li>mesa: Avoid void * arithmetic.</li>
+  <li>draw: Ensure that prepare is always run after LLVM garbagge collection.</li>
+</ul>
+
+<p>Kenneth Graunke (15):</p>
+<ul>
+  <li>mesa: Check for a negative "size" parameter in glCopyBufferSubData().</li>
+  <li>i965: Fix brw_swap_cmod() for LE/GE comparisons.</li>
+  <li>glsl: Remove unused ir_loop_jump::loop pointer.</li>
+  <li>ralloc: Fix ralloc_parent() of memory allocated out of the NULL context.</li>
+  <li>mesa: Restore depth texture state on glPopAttrib(GL_TEXTURE_BIT).</li>
+  <li>glsl/builtins: Fix textureGrad() for Array samplers.</li>
+  <li>mesa: Unbind ARB_copy_buffer and transform feedback buffers on delete.</li>
+  <li>mesa: Support BindBuffer{Base,Offset,Range} with a buffer of 0.</li>
+  <li>mesa: Unbind ARB_transform_feedback2 binding points on Delete too.</li>
+  <li>meta: Fix GL_RENDERBUFFER binding in decompress_texture_image().</li>
+  <li>i965/fs: Fix texelFetchOffset() on pre-Gen7.</li>
+  <li>i965/vs: Fix texelFetchOffset() on pre-Gen7.</li>
+  <li>i965/fs: Fix user-defined FS outputs with less than four components.</li>
+  <li>glsl: Hook up loop_variable_state destructor to plug a memory leak.</li>
+  <li>glsl: Don't trust loop analysis in the presence of function calls.</li>
+</ul>
+
+<p>Kurt Roeckx (1):</p>
+<ul>
+  <li>i830: Fix crash for GL_STENCIL_TEST in i830Enable()</li>
+</ul>
+
+<p>Lukas Rössler (1):</p>
+<ul>
+  <li>glu: fix two Clang warnings</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>mesa: allow exposing GL3 without EXT_texture_integer</li>
+  <li>st/mesa: don't do srgb-&gt;linear conversion in decompress_with_blit</li>
+</ul>
+
+<p>Paul Seidler (1):</p>
+<ul>
+  <li>tests: include mesa headers</li>
+</ul>
+
+<p>Stéphane Marchesin (3):</p>
+<ul>
+  <li>glx: Handle a null reply in QueryVersion.</li>
+  <li>i915g: Don't invert signalled/unsignalled fences</li>
+  <li>i915g: Don't avoid flushing when we have a pending fence.</li>
+</ul>
+
+<p>Thomas Gstädtner (1):</p>
+<ul>
+  <li>gallium/targets: pass ldflags parameter to MKLIB</li>
+</ul>
+
+<p>Vadim Girlin (2):</p>
+<ul>
+  <li>st/mesa: set stObj-&gt;lastLevel in guess_and_alloc_texture</li>
+  <li>r600g: check gpr count limit</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>st/mesa: Fix uninitialized members in glsl_to_tgsi_visitor constructor.</li>
+</ul>
+
+</body>
+</html>
--- a/docs/relnotes-8.0.5.html
+++ b/docs/relnotes-8.0.5.html
@@ -0,0 +1,256 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="mesa.css">
+</head>
+<body>
+
+<h1>Mesa 8.0.5 Release Notes / October 24, 2012</h1>
+
+<p>
+Mesa 8.0.5 is a bug fix release which fixes bugs found since the 8.0.4 release.
+</p>
+<p>
+Mesa 8.0.5 implements the OpenGL 3.0 API, but the version reported by
+glGetString(GL_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.0.
+</p>
+<p>
+See the <a href="install.html">Compiling/Installing page</a> for prerequisites
+for DRI hardware acceleration.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+cda5d101f43b8784fa60bdeaca4056f2  MesaLib-8.0.5.tar.gz
+01305591073a76b65267f69f27d635a3  MesaLib-8.0.5.tar.bz2
+97f11c00cac8fb98aa0131990086dc8e  MesaLib-8.0.5.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44912">Bug 44912</a> - [bisected] WebGL conformance/textures/texture-mips tests fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46644">Bug 46644</a> - Sandybridge Mobile: ARBfp TXP with coords from fragment.color doesn't apply W divide</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46784">Bug 46784</a> - MAD using multiply written register fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47375">Bug 47375</a> - Blender crash on startup after upgrade to mesa 8.0.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48120">Bug 48120</a> - GL_EXT_texture_sRGB_decode still broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48628">Bug 48628</a> - [bisected ILK]Oglc fogexp(basic.allCases) regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=49772">Bug 49772</a> - [SNB]Oglc depth-stencil(basic.read.ds) regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=52129">Bug 52129</a> - [Bisected ILK]Piglit spec_ARB_shader_texture_lod_execution_glsl-fs-shadow2DGradARB-01  regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=52382">Bug 52382</a> - [ivb gt1] Severe image corruption and GPU Hang, too many PS threads</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=52563">Bug 52563</a> - build failure - struct radeon_renderbuffer has no member named Base</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53311">Bug 53311</a> - [Bisected IVB]Oglc transform_feedback(advanced.transformFeedback.points) Invalid argument</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53314">Bug 53314</a> - [llvmpipe] src/gallium/drivers/llvmpipe/lp_texture.c:920:llvmpipe_get_texture_tile_layout: Assertion `x &lt; lpr-&gt;tiles_per_row[level]' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53316">Bug 53316</a> - [llvmpipe] src/gallium/drivers/llvmpipe/lp_texture.c:601:llvmpipe_get_transfer: Assertion `resource' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53317">Bug 53317</a> - [llvmpipe] SIGSEGV src/gallium/auxiliary/gallivm/lp_bld_sample.c:99</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53318">Bug 53318</a> - [softpipe] sp_state_shader.c:194:softpipe_delete_fs_state: Assertion `var != softpipe-&gt;fs_variant' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53319">Bug 53319</a> - [softpipe] sp_texture.c:322:softpipe_get_transfer: Assertion `level &lt;= resource-&gt;last_level' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53618">Bug 53618</a> - [Bisected i915]Piglit glx_GLX_ARB_create_context_NULL_attribute_list Aborted</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53972">Bug 53972</a> - Black Mirror III: too dark</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=54183">Bug 54183</a> - [Bisected ILK regression]many piglit/oglc/ogles2 cases Segmentation fault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=54193">Bug 54193</a> - output_components uninitialized in fs_visitor::emit_fb_writes()</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=54301">Bug 54301</a> - [Bisected ILK regression]Piglit glx_GLX_ARB_create_context_forward-compatible_flag_with_3.0 Segmentation fault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=56211">Bug 56211</a> - src/mesa/state_tracker/st_cb_texture.c:1123:copy_image_data_to_texture: Assertion `u_minify(stImage-&gt;pt-&gt;height0, src_level) == stImage-&gt;base.Height' failed.</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-8.0.4..mesa-8.0.5
+</pre>
+
+<p>Alex Deucher (3):</p>
+<ul>
+  <li>r600g: 8.0.x support for Trinity</li>
+  <li>r600g: add new Sumo, Palm, BTC pci ids</li>
+  <li>r600g: add additional evergreen pci ids</li>
+</ul>
+
+<p>Andreas Boll (4):</p>
+<ul>
+  <li>docs/relnotes-8.0.4: fix html markup</li>
+  <li>mesa: fix html in shortlog_mesa.sh script</li>
+  <li>mesa: add get-pick-list.sh script into bin/</li>
+  <li>mesa: Bump version number to 8.0.5</li>
+</ul>
+
+<p>Brian Paul (18):</p>
+<ul>
+  <li>mesa: use _mesa_is_user_fbo() and _mesa_is_winsys_fbo() functions</li>
+  <li>intel: use _mesa_is_winsys/user_fbo() helpers</li>
+  <li>st/egl: fix uninitialized pointer bug</li>
+  <li>mesa: added Ian's shortlog_mesa.sh script in bin/</li>
+  <li>mesa: loosen small matrix determinant check</li>
+  <li>xlib: add X error handler around XGetImage() call</li>
+  <li>radeon: set swrast_renderbuffer::ColorType field when mapping renderbuffers</li>
+  <li>svga: fix invalid memory reference in needs_to_create_zero()</li>
+  <li>meta: fix glDrawPixels fallback test, stencil drawing</li>
+  <li>radeon: fix Base/base typo</li>
+  <li>st/mesa: fix glCopyTexSubImage crash</li>
+  <li>gallivm: fix crash in lp_sampler_static_state()</li>
+  <li>st/mesa: fix renderbuffer validation bug</li>
+  <li>softpipe: fix softpipe_delete_fs_state() failed assertion</li>
+  <li>mesa: raise GL_INVALID_OPERATION in glGenerateMipmap for missing base image</li>
+  <li>st/mesa: s/CALLOC/calloc/ to fix allocation bug</li>
+  <li>mesa: do internal format error checking for glTexStorage()</li>
+  <li>mesa: fix incorrect error for glCompressedSubTexImage</li>
+</ul>
+
+<p>Chad Versace (3):</p>
+<ul>
+  <li>mesa: Don't advertise GLES extensions in GL contexts</li>
+  <li>i830: Fix stack corruption</li>
+  <li>swrast: Fix implicit declaration warnings</li>
+</ul>
+
+<p>Chris Forbes (1):</p>
+<ul>
+  <li>mesa: fix dropped &amp;&amp; in glGetStringi()</li>
+</ul>
+
+<p>Christoph Bumiller (1):</p>
+<ul>
+  <li>st/mesa: call update_renderbuffer_surface for sRGB renderbuffers, too</li>
+</ul>
+
+<p>Eric Anholt (9):</p>
+<ul>
+  <li>i965/gen7: Reduce GT1 WM thread count according to updated BSpec.</li>
+  <li>i965/fs: Invalidate live intervals in passes that remove an instruction.</li>
+  <li>i965: Fix bug in the old FS backend's projtex() calculation.</li>
+  <li>i965: Add support for GL_SKIP_DECODE_EXT on other SRGB formats.</li>
+  <li>i965/vs: Convert EdgeFlagPointer values appropriately for the VS on gen4.</li>
+  <li>i965: Fix accumulator_contains() test to also reject swizzles of the dst.</li>
+  <li>mesa: Fix glPopAttrib() behavior on GL_FRAMEBUFFER_SRGB.</li>
+  <li>mesa: In conditional rendering fallback, check the query status.</li>
+  <li>i965: Drop the confusing saturate argument to math instruction setup.</li>
+</ul>
+
+<p>Ian Romanick (8):</p>
+<ul>
+  <li>docs: Add 8.0.4 release md5sums</li>
+  <li>Revert "i965: Avoid unnecessary recompiles for shaders that don't use dFdy()."</li>
+  <li>i965: Fix regression in depth texture rendering on pre-SNB</li>
+  <li>dri2: Fix bug in attribute handling for non-desktop OpenGL contexts</li>
+  <li>mesa: Generate an error when glCopyTexImage border is invalid</li>
+  <li>mesa/es: Validate glTexImage border in Mesa code rather than the ES wrapper</li>
+  <li>mesa: Allow glGetTexParameter of GL_TEXTURE_SRGB_DECODE_EXT</li>
+  <li>dri_util: Use calloc to allocate __DRIcontext</li>
+</ul>
+
+<p>Jonas Maebe (1):</p>
+<ul>
+  <li>darwin: do not create double-buffered offscreen pixel formats</li>
+</ul>
+
+<p>Jordan Justen (1):</p>
+<ul>
+  <li>intel: move error on create context to proper path</li>
+</ul>
+
+<p>José Fonseca (1):</p>
+<ul>
+  <li>mesa: disable MSVC global optimization in pack.c</li>
+</ul>
+
+<p>Kenneth Graunke (8):</p>
+<ul>
+  <li>mesa: Use GLdouble for depthMax in final unpack conversions.</li>
+  <li>i965/fs: Initialize output_components[] by filling it with zeros.</li>
+  <li>mesa: Prevent repeated glDeleteShader() from blowing away our refcounts.</li>
+  <li>i965: Support MESA_FORMAT_SIGNED_RGBA_16.</li>
+  <li>glsl: Fix #pragma invariant(all) language version check.</li>
+  <li>i965/vs: Don't clobber sampler message MRFs with subexpressions.</li>
+  <li>intel: Move finish_batch() call before MI_BATCH_BUFFER_END and padding.</li>
+  <li>i965/fs: Don't use brw-&gt;fragment_program in calculate_urb_setup().</li>
+</ul>
+
+<p>Maarten Lankhorst (1):</p>
+<ul>
+  <li>winsys/radeon: Remove unnecessary pipe_thread_destroy in radeon_drm_cs_destroy</li>
+</ul>
+
+<p>Marek Olšák (6):</p>
+<ul>
+  <li>mesa: remove assertions that do not allow compressed 2D_ARRAY textures</li>
+  <li>r300g: fix colormask with non-BGRA formats</li>
+  <li>r600g: fix RSQ of negative value on Cayman</li>
+  <li>r600g: fix EXP on Cayman</li>
+  <li>r600g: fix instance divisor on Cayman</li>
+  <li>gallium/u_blit: set dst format from pipe_resource, not pipe_surface</li>
+</ul>
+
+<p>Michel Dänzer (2):</p>
+<ul>
+  <li>st/mesa: Fix source miptree level for copying data to finalized miptree.</li>
+  <li>st/mesa: Fix assertions for copying texture image to finalized miptree.</li>
+</ul>
+
+<p>Niels Ole Salscheider (1):</p>
+<ul>
+  <li>st/mesa: index can be negative in the PROGRAM_CONSTANT case</li>
+</ul>
+
+<p>Paul Berry (5):</p>
+<ul>
+  <li>i965: Compute dFdy() correctly for FBOs.</li>
+  <li>mesa: Add UsesDFdy to struct gl_fragment_program.</li>
+  <li>i965: Avoid unnecessary recompiles for shaders that don't use dFdy().</li>
+  <li>i965/Gen6: Work around GPU hangs due to misaligned depth coordinate offsets.</li>
+  <li>i965/Gen7: Work around GPU hangs due to misaligned depth coordinate offsets.</li>
+</ul>
+
+<p>Stéphane Marchesin (1):</p>
+<ul>
+  <li>glsl/linker: Avoid buffer over-run in parcel_out_uniform_storage::visit_field</li>
+</ul>
+
+<p>Tapani Pälli (2):</p>
+<ul>
+  <li>xmlconfig: use __progname when building for Android</li>
+  <li>android: do not expose single buffered eglconfigs</li>
+</ul>
+
+<p>Vadim Girlin (1):</p>
+<ul>
+  <li>winsys/radeon: fix relocs caching</li>
+</ul>
+
+</body>
+</html>
--- a/docs/relnotes-8.0.html
+++ b/docs/relnotes-8.0.html
@@ -10,7 +10,7 @@

 <body bgcolor="#eeeeee">

-<H1>Mesa 8.0 Release Notes / (release date TBD)</H1>
+<H1>Mesa 8.0 Release Notes / February 9, 2012</H1>

 <p>
 Mesa 8.0 is a new development release.
@@ -30,7 +30,9 @@ for DRI hardware acceleration.

 <h2>MD5 checksums</h2>
 <pre>
-tbd
+3516fea6c28ce4a0fa9759e4894729a1  MesaLib-8.0.tar.gz
+1a5668fe72651a670611164cefc703b2  MesaLib-8.0.tar.bz2
+66f5a01a85530a91472a3acceb556db8  MesaLib-8.0.zip
 </pre>


--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -13,6 +13,10 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>

 <UL>
+<LI><A HREF="relnotes-8.0.4.html">8.0.4 release notes</A>
+<LI><A HREF="relnotes-8.0.3.html">8.0.3 release notes</A>
+<LI><A HREF="relnotes-8.0.2.html">8.0.2 release notes</A>
+<LI><A HREF="relnotes-8.0.1.html">8.0.1 release notes</A>
 <LI><A HREF="relnotes-8.0.html">8.0 release notes</A>
 <LI><A HREF="relnotes-7.11.html">7.11 release notes</A>
 <LI><A HREF="relnotes-7.10.3.html">7.10.3 release notes</A>
--- a/docs/systems.html
+++ b/docs/systems.html
@@ -9,34 +9,78 @@
 <H1>Supported Systems and Drivers</H1>

 <p>
-Mesa was originally designed for Unix/X11 systems and is still best
-supported on those systems. All you need is an ANSI C compiler and the
-X development environment to use Mesa.
+Mesa is primarily developed and used on Linux systems.
+But there's also support for Windows, other flavors of Unix and other
+systems such as Haiku.
+We're actively developing and maintaining several hardware and
+software drivers.
 </p>

 <p>
-The DRI hardware drivers for the X.org server and XFree86 provide
-hardware accelerated rendering for chips from ATI, Intel, and NVIDIA
-on Linux and FreeBSD.
+The primary API is OpenGL but there's also support for OpenGL ES 1
+and ES 2, OpenVG and the EGL interface.
 </p>

 <p>
-Drivers for other assorted platforms include:
-the Apple Macintosh and Windows.
+Hardware drivers include:
 </p>
+<ul>
+<li>Intel i965, i945, i915.
+    See <a href="http://intellinuxgraphics.org/index.html" target="_parent">
+    Intel's website</a>
+<li>AMD Radeon series
+<li>Some NVIDIA GPUs.
+<li>VMware virtual GPU
+</ul>

 <p>
-Details about particular drivers follows:
+Software drivers include:
+</p>
+<ul>
+<li><a href="llvmpipe.html">llvmpipe</a> - uses LLVM for x86 JIT code
+    generation and is multi-threaded
+<li>softpipe - a reference Gallium driver
+<li>swrast - the legacy/original Mesa software rasterizer
+</ul>
+
+<p>
+Additional driver information:
 </p>

 <UL>
 <li><a href="http://dri.freedesktop.org/" target="_parent"> DRI hardware
 drivers</a> for the X Window System
-<LI><a href="xlibdriver.html">Xlib software driver</a> for the X Window System
+<li><a href="xlibdriver.html">Xlib / swrast driver</a> for the X Window System
 and Unix-like operating systems
-<LI>Microsoft Windows <A HREF="README.WIN32">(README.WIN32)</A>
-<LI>DEC VMS <A HREF="README.VMS">(README.VMS)</A>
+<li><a href="README.WIN32">Microsoft Windows</a>
+<li><a href="vmware-guest.html">VMware</a> guest OS driver
 </UL>

+
+<h1>
+Deprecated Systems and Drivers
+</h1>
+
+<p>
+In the past there were other drivers for older GPUs and operating
+systems.
+These have been removed from the Mesa source tree and distribution.
+If anyone's interested though, the code can be found in the git repo.
+The list includes:
+</p>
+
+<ul>
+<li>3dfx/glide
+<li>Matrox
+<li>ATI R128
+<li>Savage
+<li>VIA Unichrome
+<li>SIS
+<li>3Dlabs gamma
+<li>DOS
+<li>fbdev
+<li>DEC/VMS
+</ul>
+
 </body>
 </html>
--- a/docs/vmware-guest.html
+++ b/docs/vmware-guest.html
@@ -0,0 +1,195 @@
+<html>
+
+<title>VMware guest GL driver</title>
+
+<link rel="stylesheet" type="text/css" href="mesa.css"></head>
+
+<body>
+
+
+<h1>VMware guest GL driver</h1>
+
+<p>
+This page describes how to build, install and use the
+<a href="http://www.vmware.com/" target="_parent">VMware</a> guest GL driver
+(aka the SVGA or SVGA3D driver) for Linux using the latest source code.
+This driver gives a Linux virtual machine access to the host's GPU for
+hardware-accelerated 3D.
+VMware Workstation running on Linux or Windows and VMware Fusion running on
+MacOS are all supported.
+</p>
+
+<p>
+End users shouldn't have to go through all these steps once the driver is
+included in newer Linux distributions.
+</p>
+
+<p>
+For more information about the X components see these wiki pages at x.org:
+</p>
+<ul>
+<li><a href="http://wiki.x.org/wiki/vmware" target="_parent">
+Driver Overview</a>
+<li><a href="http://wiki.x.org/wiki/vmware/vmware3D" target="_parent">
+xf86-video-vmware Details</a>
+</ul>
+
+
+<h2>Components</h2>
+
+The components involved in this include:
+<ul>
+<li>Linux kernel module: vmwgfx
+<li>X server 2D driver: xf86-video-vmware
+<li>User-space libdrm library
+<li>Mesa/gallium OpenGL driver: "svga"
+</ul>
+
+
+<h2>Prerequisites</h2>
+
+<ul>
+<li>Kernel version at least 2.6.25 
+<li>Xserver version at least 1.7 
+<li>Ubuntu: For Ubuntu you need to install a number of build dependencies.
+  <pre>
+  sudo apt-get install git-core
+  sudo apt-get install automake libtool libpthread-stubs0-dev
+  sudo apt-get install xserver-xorg-dev x11proto-xinerama-dev
+  sudo apt-get build-dep libgl1-mesa-dri libxcb-glx0-dev
+  </pre>
+<li>Fedora: For Fedora you also need to install a number of build dependencies. 
+  <pre>
+  sudo yum install mesa-libGL-devel xorg-x11-server-devel xorg-x11-util-macros
+  sudo yum install automake gcc libtool expat-devel kernel-devel git-core
+  </pre>
+</ul>
+
+<p>
+Depending on your Linux distro, other packages may be needed.
+The configure scripts should tell you what's missing.
+</p>
+
+
+
+<h2>Getting the Latest Source Code</h2>
+
+Begin by saving your current directory location:
+  <pre>
+  export TOP=$PWD
+  </pre>
+
+<ul>
+<li>Mesa/Gallium master branch. This code is used to build libGL, and the direct rendering svga driver for libGL, vmwgfx_dri.so, and the X acceleration library libxatracker.so.x.x.x. 
+  <pre>
+  git clone git://anongit.freedesktop.org/git/mesa/mesa
+  </pre>
+<li>VMware Linux guest kernel module. Note that this repo contains the complete DRM and TTM code. The vmware-specific driver is really only the files prefixed with vmwgfx. 
+  <pre>
+  git clone git://anongit.freedesktop.org/git/mesa/vmwgfx
+  </pre>
+
+<li>libdrm, a user-space library that interfaces with drm. Most distros ship with this library. The safest bet is really to replace the system one. Optionally you can point LIBDRM_CFLAGS and LIBDRM_LIBS to the libdrm-2.4.22 package in toolchain. But here, we replace:
+  <pre>
+  git clone git://anongit.freedesktop.org/git/mesa/drm
+  </pre>
+<li>xf86-video-vmware. The chainloading driver, vmware_drv.so, the legacy driver vmwlegacy_drv.so, and the vmwgfx driver vmwgfx_drv.so. 
+  <pre>
+  git clone git://anongit.freedesktop.org/git/xorg/driver/xf86-video-vmware
+  </pre>
+</ul>
+
+
+<h2>Building the Code</h2>
+
+<ul>
+<li>Build libdrm: If you're on a 32-bit system, you should skip the --libdir configure option. Note also the comment about toolchain libdrm above. 
+  <pre>
+  cd $TOP/drm
+  ./autogen.sh --prefix=/usr --enable-vmwgfx-experimental-api --libdir=/usr/lib64
+  make
+  sudo make install
+  </pre>
+<li>Build Mesa and the vmwgfx_dri.so driver, the vmwgfx_drv.so xorg driver, the X acceleration library libxatracker.
+The vmwgfx_dri.so is used by the OpenGL libraries during direct rendering,
+and by the Xorg server during accelerated indirect GL rendering.
+The libxatracker library is used exclusively by the X server to do render,
+copy and video acceleration:
+<br>
+The following configure options don't build the EGL system.
+<br>
+As before, if you're on a 32-bit system, you should skip the --libdir
+configure option.
+  <pre>
+  cd $TOP/mesa
+  ./autogen.sh --prefix=/usr --libdir=/usr/lib64 --with-gallium-drivers=svga --with-dri-drivers= --enable-xa
+  make
+  sudo make install
+  </pre>
+
+Note that you may have to install other packages that Mesa depends upon
+if they're not installed in your system.  You should be told what's missing.
+<br>
+<br>
+
+<li>xf86-video-vmware: Now, once libxatracker is installed, we proceed with building and replacing the current Xorg driver. First check if your system is 32- or 64-bit. If you're building for a 32-bit system, you will not be needing the --libdir=/usr/lib64 option to autogen. 
+  <pre>
+  cd $TOP/xf86-video-vmware
+  ./autogen.sh --prefix=/usr --libdir=/usr/lib64
+  make
+  sudo make install
+  </pre>
+<li>vmwgfx kernel module. First make sure that any old version of this kernel module is removed from the system by issuing
+  <pre>
+  sudo rm /lib/modules/`uname -r`/kernel/drivers/gpu/drm/vmwgfx.ko*
+  </pre>
+Then 
+  <pre>
+  cd $TOP/vmwgfx
+  make
+  sudo make install
+  sudo cp 00-vmwgfx.rules /etc/udev/rules.d
+  sudo depmod -ae
+  </pre>
+</ul>
+
+
+Now try to load the kernel module by issuing
+  <pre>
+  sudo modprobe vmwgfx</pre>
+Then type 
+  <pre>
+  dmesg</pre>
+to watch the debug output. It should contain a number of lines prefixed with "[vmwgfx]". 
+
+<p>
+Then restart the Xserver (or reboot).
+The lines starting with "vmwlegacy" or "VMWARE" in the file /var/log/Xorg.0.log
+should now have been replaced with lines starting with "vmwgfx", indicating that
+the new Xorg driver is in use. 
+</p>
+
+
+<h2>Running OpenGL Programs</h2>
+
+<p>
+In a shell, run 'glxinfo' and look for the following to verify that the
+driver is working:
+</p>
+
+<pre>
+OpenGL vendor string: VMware, Inc.
+OpenGL renderer string: Gallium 0.4 on SVGA3D; build: RELEASE;
+OpenGL version string: 2.1 Mesa 8.0
+</pre>
+
+If you don't see this, try setting this environment variable:
+  <pre>
+  export LIBGL_DEBUG=verbose</pre>
+then rerun glxinfo and examine the output for error messages.
+</p>
+
+
+
+</body>
+</html>
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -25,3 +25,4 @@ CHIPSET(0x0162, IVYBRIDGE_GT2, ivb_gt2)
 CHIPSET(0x0156, IVYBRIDGE_M_GT1, ivb_gt1)
 CHIPSET(0x0166, IVYBRIDGE_M_GT2, ivb_gt2)
 CHIPSET(0x015a, IVYBRIDGE_S_GT1, ivb_gt1)
+CHIPSET(0x016a, IVYBRIDGE_S_GT2, ivb_gt2)
--- a/include/pci_ids/r600_pci_ids.h
+++ b/include/pci_ids/r600_pci_ids.h
@@ -157,6 +157,7 @@ CHIPSET(0x68FE, CEDAR_68FE, CEDAR)

 CHIPSET(0x68C0, REDWOOD_68C0, REDWOOD)
 CHIPSET(0x68C1, REDWOOD_68C1, REDWOOD)
+CHIPSET(0x68C7, REDWOOD_68C7, REDWOOD)
 CHIPSET(0x68C8, REDWOOD_68C8, REDWOOD)
 CHIPSET(0x68C9, REDWOOD_68C9, REDWOOD)
 CHIPSET(0x68D8, REDWOOD_68D8, REDWOOD)
@@ -179,6 +180,8 @@ CHIPSET(0x6880, CYPRESS_6880, CYPRESS)
 CHIPSET(0x6888, CYPRESS_6888, CYPRESS)
 CHIPSET(0x6889, CYPRESS_6889, CYPRESS)
 CHIPSET(0x688A, CYPRESS_688A, CYPRESS)
+CHIPSET(0x688C, CYPRESS_688C, CYPRESS)
+CHIPSET(0x688D, CYPRESS_688D, CYPRESS)
 CHIPSET(0x6898, CYPRESS_6898, CYPRESS)
 CHIPSET(0x6899, CYPRESS_6899, CYPRESS)
 CHIPSET(0x689B, CYPRESS_689B, CYPRESS)
@@ -195,6 +198,7 @@ CHIPSET(0x9806, PALM_9806, PALM)
 CHIPSET(0x9807, PALM_9807, PALM)
 CHIPSET(0x9808, PALM_9808, PALM)
 CHIPSET(0x9809, PALM_9809, PALM)
+CHIPSET(0x980A, PALM_980A, PALM)

 CHIPSET(0x9640, SUMO_9640,  SUMO)
 CHIPSET(0x9641, SUMO_9641,  SUMO)
@@ -204,6 +208,7 @@ CHIPSET(0x9644, SUMO2_9644, SUMO2)
 CHIPSET(0x9645, SUMO2_9645, SUMO2)
 CHIPSET(0x9647, SUMO_9647,  SUMO)
 CHIPSET(0x9648, SUMO_9648,  SUMO)
+CHIPSET(0x9649, SUMO_9649,  SUMO)
 CHIPSET(0x964a, SUMO_964A,  SUMO)
 CHIPSET(0x964b, SUMO_964B,  SUMO)
 CHIPSET(0x964c, SUMO_964C,  SUMO)
@@ -239,6 +244,7 @@ CHIPSET(0x6729, BARTS_6729, BARTS)
 CHIPSET(0x6738, BARTS_6738, BARTS)
 CHIPSET(0x6739, BARTS_6739, BARTS)
 CHIPSET(0x673E, BARTS_673E, BARTS)
+
 CHIPSET(0x6740, TURKS_6740, TURKS)
 CHIPSET(0x6741, TURKS_6741, TURKS)
 CHIPSET(0x6742, TURKS_6742, TURKS)
@@ -249,6 +255,7 @@ CHIPSET(0x6746, TURKS_6746, TURKS)
 CHIPSET(0x6747, TURKS_6747, TURKS)
 CHIPSET(0x6748, TURKS_6748, TURKS)
 CHIPSET(0x6749, TURKS_6749, TURKS)
+CHIPSET(0x674A, TURKS_674A, TURKS)
 CHIPSET(0x6750, TURKS_6750, TURKS)
 CHIPSET(0x6751, TURKS_6751, TURKS)
 CHIPSET(0x6758, TURKS_6758, TURKS)
@@ -275,7 +282,33 @@ CHIPSET(0x6766, CAICOS_6766, CAICOS)
 CHIPSET(0x6767, CAICOS_6767, CAICOS)
 CHIPSET(0x6768, CAICOS_6768, CAICOS)
 CHIPSET(0x6770, CAICOS_6770, CAICOS)
+CHIPSET(0x6771, CAICOS_6771, CAICOS)
 CHIPSET(0x6772, CAICOS_6772, CAICOS)
 CHIPSET(0x6778, CAICOS_6778, CAICOS)
 CHIPSET(0x6779, CAICOS_6779, CAICOS)
 CHIPSET(0x677B, CAICOS_677B, CAICOS)
+
+CHIPSET(0x9900, ARUBA_9900, ARUBA)
+CHIPSET(0x9901, ARUBA_9901, ARUBA)
+CHIPSET(0x9903, ARUBA_9903, ARUBA)
+CHIPSET(0x9904, ARUBA_9904, ARUBA)
+CHIPSET(0x9905, ARUBA_9905, ARUBA)
+CHIPSET(0x9906, ARUBA_9906, ARUBA)
+CHIPSET(0x9907, ARUBA_9907, ARUBA)
+CHIPSET(0x9908, ARUBA_9908, ARUBA)
+CHIPSET(0x9909, ARUBA_9909, ARUBA)
+CHIPSET(0x990A, ARUBA_990A, ARUBA)
+CHIPSET(0x990F, ARUBA_990F, ARUBA)
+CHIPSET(0x9910, ARUBA_9910, ARUBA)
+CHIPSET(0x9913, ARUBA_9913, ARUBA)
+CHIPSET(0x9917, ARUBA_9917, ARUBA)
+CHIPSET(0x9918, ARUBA_9918, ARUBA)
+CHIPSET(0x9919, ARUBA_9919, ARUBA)
+CHIPSET(0x9990, ARUBA_9990, ARUBA)
+CHIPSET(0x9991, ARUBA_9991, ARUBA)
+CHIPSET(0x9992, ARUBA_9992, ARUBA)
+CHIPSET(0x9993, ARUBA_9993, ARUBA)
+CHIPSET(0x9994, ARUBA_9994, ARUBA)
+CHIPSET(0x99A0, ARUBA_99A0, ARUBA)
+CHIPSET(0x99A2, ARUBA_99A2, ARUBA)
+CHIPSET(0x99A4, ARUBA_99A4, ARUBA)
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -498,6 +498,14 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
      for (j = 0; dri2_dpy->driver_configs[j]; j++) {
         const EGLint surface_type = EGL_WINDOW_BIT | EGL_PBUFFER_BIT;
         struct dri2_egl_config *dri2_conf;
+         unsigned int double_buffered = 0;
+
+         dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[j],
+            __DRI_ATTRIB_DOUBLE_BUFFER, &double_buffered);
+
+         /* support only double buffered configs */
+         if (!double_buffered)
+            continue;

         dri2_conf = dri2_add_config(dpy, dri2_dpy->driver_configs[j],
               count + 1, visuals[i].size, surface_type, NULL,
@@ -523,17 +531,6 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
      /* there is no front buffer so no OpenGL */
      dri2_conf->base.RenderableType &= ~EGL_OPENGL_BIT;
      dri2_conf->base.Conformant &= ~EGL_OPENGL_BIT;
-
-      /*
-       * We want to make sure GL_DRAW_BUFFER for windows or pbuffers is always
-       * GL_BACK.  For EGL configs that do not have a double DRI config, clear
-       * the surface type.
-       *
-       * This is just to be on the safe side.  dri2_add_config never sets
-       * EGL_WINDOW_BIT or EGL_PBUFFER_BIT for such configs.
-       */
-      if (!dri2_conf->dri_double_config)
-         dri2_conf->base.SurfaceType = 0;
   }

   return (count != 0);
--- a/src/egl/main/eglimage.c
+++ b/src/egl/main/eglimage.c
@@ -45,7 +45,7 @@ _eglParseImageAttribList(_EGLImageAttribs *attrs, _EGLDisplay *dpy,

   (void) dpy;

-   memset(attrs, 0, sizeof(attrs));
+   memset(attrs, 0, sizeof(*attrs));
   attrs->ImagePreserved = EGL_FALSE;
   attrs->GLTextureLevel = 0;
   attrs->GLTextureZOffset = 0;
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -80,6 +80,21 @@ void draw_set_viewport_state( struct draw_context *draw,
 void draw_set_clip_state( struct draw_context *pipe,
                          const struct pipe_clip_state *clip );

+/**
+ * Sets the rasterization state used by the draw module.
+ * The rast_handle is used to pass the driver specific representation
+ * of the rasterization state. It's going to be used when the
+ * draw module sets the state back on the driver itself using the
+ * pipe::bind_rasterizer_state method.
+ *
+ * NOTE: if you're calling this function from within the pipe's
+ * bind_rasterizer_state you should always call it before binding
+ * the actual state - that's because the draw module can try to
+ * bind its own rasterizer state which would reset your newly
+ * set state. i.e. always do
+ * draw_set_rasterizer_state(driver->draw, state->pipe_state, state);
+ * driver->state.raster = state;
+ */
 void draw_set_rasterizer_state( struct draw_context *draw,
                                const struct pipe_rasterizer_state *raster,
                                void *rast_handle );
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -65,8 +65,13 @@ static void
 draw_llvm_garbage_collect_callback(void *cb_data)
 {
   struct draw_llvm *llvm = (struct draw_llvm *) cb_data;
+   struct draw_context *draw = llvm->draw;
   struct draw_llvm_variant_list_item *li;

+   /* Ensure prepare will be run and shaders recompiled */
+   assert(!draw->suspend_flushing);
+   draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
+
   /* free all shader variants */
   li = first_elem(&llvm->vs_variants_list);
   while (!at_end(&llvm->vs_variants_list, li)) {
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -165,11 +165,16 @@ pstip_transform_decl(struct tgsi_transform_context *ctx,
 }


+/**
+ * TGSI immediate declaration transform callback.
+ * We're just counting the number of immediates here.
+ */
 static void
 pstip_transform_immed(struct tgsi_transform_context *ctx,
                      struct tgsi_full_immediate *immed)
 {
   struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx;
+   ctx->emit_immediate(ctx, immed); /* emit to output shader */
   pctx->numImmed++;
 }

--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -329,25 +329,28 @@ draw_pt_arrays_restart(struct draw_context *draw,

   if (draw->pt.user.elts) {
      /* indexed prims (draw_elements) */
+      const char *elts =
+         (const char *) draw->pt.user.elts + draw->pt.index_buffer.offset;
+
      cur_start = start;
      cur_count = 0;

      switch (draw->pt.user.eltSize) {
      case 1:
         {
-            const ubyte *elt_ub = (const ubyte *) draw->pt.user.elts;
+            const ubyte *elt_ub = (const ubyte *) elts;
            PRIM_RESTART_LOOP(elt_ub);
         }
         break;
      case 2:
         {
-            const ushort *elt_us = (const ushort *) draw->pt.user.elts;
+            const ushort *elt_us = (const ushort *) elts;
            PRIM_RESTART_LOOP(elt_us);
         }
         break;
      case 4:
         {
-            const uint *elt_ui = (const uint *) draw->pt.user.elts;
+            const uint *elt_ui = (const uint *) elts;
            PRIM_RESTART_LOOP(elt_ui);
         }
         break;
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -99,15 +99,14 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
                        const struct pipe_sampler_view *view,
                        const struct pipe_sampler_state *sampler)
 {
-   const struct pipe_resource *texture = view->texture;
+   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

-   if(!texture)
+   if (!sampler || !view || !view->texture)
      return;

-   if(!sampler)
-      return;
+   texture = view->texture;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
--- a/src/gallium/auxiliary/postprocess/postprocess.h
+++ b/src/gallium/auxiliary/postprocess/postprocess.h
@@ -72,8 +72,7 @@ void pp_free(struct pp_queue_t *);
 void pp_free_fbos(struct pp_queue_t *);
 void pp_debug(const char *, ...);
 struct program *pp_init_prog(struct pp_queue_t *, struct pipe_screen *);
-void pp_init_fbos(struct pp_queue_t *, unsigned int, unsigned int,
-                  struct pipe_resource *);
+void pp_init_fbos(struct pp_queue_t *, unsigned int, unsigned int);

 /* The filters */

--- a/src/gallium/auxiliary/postprocess/pp_init.c
+++ b/src/gallium/auxiliary/postprocess/pp_init.c
@@ -195,7 +195,7 @@ pp_debug(const char *fmt, ...)
 /** Allocate the temp FBOs. Called on makecurrent and resize. */
 void
 pp_init_fbos(struct pp_queue_t *ppq, unsigned int w,
-             unsigned int h, struct pipe_resource *indepth)
+             unsigned int h)
 {

   struct program *p = ppq->p;  /* The lazy will inherit the earth */
@@ -242,11 +242,7 @@ pp_init_fbos(struct pp_queue_t *ppq, unsigned int w,
         goto error;
   }

-   tmp_res.format = p->surf.format = indepth->format;
   tmp_res.bind = p->surf.usage = PIPE_BIND_DEPTH_STENCIL;
-   ppq->depth = indepth;
-   if (!ppq->depth)
-      goto error;

   tmp_res.format = p->surf.format = PIPE_FORMAT_S8_UINT_Z24_UNORM;

--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -42,14 +42,14 @@ void
 pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
       struct pipe_resource *out, struct pipe_resource *indepth)
 {
-
+   struct pipe_resource *refin = NULL, *refout = NULL;
   unsigned int i;

   if (in->width0 != ppq->p->framebuffer.width ||
       in->height0 != ppq->p->framebuffer.height) {
      pp_debug("Resizing the temp pp buffers\n");
      pp_free_fbos(ppq);
-      pp_init_fbos(ppq, in->width0, in->height0, indepth);
+      pp_init_fbos(ppq, in->width0, in->height0);
   }

   if (in == out && ppq->n_filters == 1) {
@@ -64,6 +64,11 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
      in = ppq->tmp[0];
   }

+   // Kept only for this frame.
+   pipe_resource_reference(&ppq->depth, indepth);
+   pipe_resource_reference(&refin, in);
+   pipe_resource_reference(&refout, out);
+
   switch (ppq->n_filters) {
   case 1:                     /* No temp buf */
      ppq->pp_queue[0] (ppq, in, out, 0);
@@ -93,6 +98,10 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,

      break;
   }
+
+   pipe_resource_reference(&ppq->depth, NULL);
+   pipe_resource_reference(&refin, NULL);
+   pipe_resource_reference(&refout, NULL);
 }


--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -25,43 +25,43 @@
 *
 **************************************************************************/

-
-#include "util/u_debug.h"
+#include "pipe/p_config.h"
 #include "rtasm_cpu.h"

-
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-static boolean rtasm_sse_enabled(void)
+
+#include "util/u_debug.h"
+#include "util/u_cpu_detect.h"
+
+DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", FALSE);
+
+static struct util_cpu_caps *get_cpu_caps(void)
 {
-   static boolean firsttime = 1;
-   static boolean enabled;
-   
-   /* This gets called quite often at the moment:
-    */
-   if (firsttime) {
-      enabled =  !debug_get_bool_option("GALLIUM_NOSSE", FALSE);
-      firsttime = FALSE;
-   }
-   return enabled;
+   util_cpu_detect();
+   return &util_cpu_caps;
 }
-#endif

 int rtasm_cpu_has_sse(void)
 {
-   /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return rtasm_sse_enabled();
-#else
-   return 0;
-#endif
+   return !debug_get_option_nosse() && get_cpu_caps()->has_sse;
 }

 int rtasm_cpu_has_sse2(void) 
 {
-   /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return rtasm_sse_enabled();
-#else
-   return 0;
-#endif
+   return !debug_get_option_nosse() && get_cpu_caps()->has_sse2;
 }
+
+
+#else
+
+int rtasm_cpu_has_sse(void)
+{
+   return 0;
+}
+
+int rtasm_cpu_has_sse2(void)
+{
+   return 0;
+}
+
+#endif
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -370,7 +370,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
                      dstX0, dstY0, dstX1, dstY1);

   src_format = util_format_linear(src_tex->format);
-   dst_format = util_format_linear(dst->format);
+   dst_format = util_format_linear(dst->texture->format);

   /*
    * Check for simple case:  no format conversion, no flipping, no stretching,
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -38,6 +38,7 @@

 #include "os/os_memory.h"
 #include "os/os_memory_debug.h"
+#include "os/os_thread.h"

 #include "util/u_debug.h" 
 #include "util/u_debug_stack.h" 
@@ -72,6 +73,8 @@ struct debug_memory_footer

 static struct list_head list = { &list, &list };

+pipe_static_mutex(list_mutex);
+
 static unsigned long last_no = 0;


@@ -132,7 +135,9 @@ debug_malloc(const char *file, unsigned line, const char *function,
   ftr = footer_from_header(hdr);
   ftr->magic = DEBUG_MEMORY_MAGIC;
   
+   pipe_mutex_lock(list_mutex);
   LIST_ADDTAIL(&hdr->head, &list);
+   pipe_mutex_unlock(list_mutex);
   
   return data_from_header(hdr);
 }
@@ -164,7 +169,9 @@ debug_free(const char *file, unsigned line, const char *function,
      debug_assert(0);
   }

+   pipe_mutex_lock(list_mutex);
   LIST_DEL(&hdr->head);
+   pipe_mutex_unlock(list_mutex);
   hdr->magic = 0;
   ftr->magic = 0;
   
@@ -232,7 +239,9 @@ debug_realloc(const char *file, unsigned line, const char *function,
   new_ftr = footer_from_header(new_hdr);
   new_ftr->magic = DEBUG_MEMORY_MAGIC;
   
+   pipe_mutex_lock(list_mutex);
   LIST_REPLACE(&old_hdr->head, &new_hdr->head);
+   pipe_mutex_unlock(list_mutex);

   /* copy data */
   new_ptr = data_from_header(new_hdr);
--- a/src/gallium/auxiliary/util/u_double_list.h
+++ b/src/gallium/auxiliary/util/u_double_list.h
@@ -105,6 +105,11 @@ static INLINE void list_delinit(struct list_head *item)
 #define LIST_IS_EMPTY(__list)                   \
    ((__list)->next == (__list))

+/**
+ * Cast from a pointer to a member of a struct back to the containing struct.
+ *
+ * 'sample' MUST be initialized, or else the result is undefined!
+ */
 #ifndef container_of
 #define container_of(ptr, sample, member)				\
    (void *)((char *)(ptr)						\
@@ -112,29 +117,29 @@ static INLINE void list_delinit(struct list_head *item)
 #endif

 #define LIST_FOR_EACH_ENTRY(pos, head, member)				\
-   for (pos = container_of((head)->next, pos, member);			\
+   for (pos = NULL, pos = container_of((head)->next, pos, member);	\
 	&pos->member != (head);						\
 	pos = container_of(pos->member.next, pos, member))

 #define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
-   for (pos = container_of((head)->next, pos, member),			\
+   for (pos = NULL, pos = container_of((head)->next, pos, member),	\
 	storage = container_of(pos->member.next, pos, member);	\
 	&pos->member != (head);						\
 	pos = storage, storage = container_of(storage->member.next, storage, member))

 #define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member)	\
-   for (pos = container_of((head)->prev, pos, member),			\
+   for (pos = NULL, pos = container_of((head)->prev, pos, member),	\
 	storage = container_of(pos->member.prev, pos, member);		\
 	&pos->member != (head);						\
 	pos = storage, storage = container_of(storage->member.prev, storage, member))

 #define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member)		\
-   for (pos = container_of((start), pos, member);			\
+   for (pos = NULL, pos = container_of((start), pos, member);		\
 	&pos->member != (head);						\
 	pos = container_of(pos->member.next, pos, member))

 #define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member)		\
-   for (pos = container_of((start), pos, member);			\
+   for (pos = NULL, pos = container_of((start), pos, member);		\
 	&pos->member != (head);						\
 	pos = container_of(pos->member.prev, pos, member))

--- a/src/gallium/auxiliary/util/u_linkage.h
+++ b/src/gallium/auxiliary/util/u_linkage.h
@@ -49,15 +49,16 @@ unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, cons
 *
 * num_slots is the size of the layout array and hardware limit instead.
 *
- * efficient_slots == 0 or efficient_solts == num_slots are typical settings.
+ * efficient_slots == 0 or efficient_slots == num_slots are typical settings.
 */
 void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots);

 static INLINE void
-util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots)
+util_semantic_table_from_layout(unsigned char *table, size_t table_size, unsigned char *layout,
+                                unsigned char first_slot_value, unsigned char num_slots)
 {
-   int i;
-   memset(table, 0xff, sizeof(table));
+   unsigned char i;
+   memset(table, 0xff, table_size);

   for(i = 0; i < num_slots; ++i)
      table[layout[i]] = first_slot_value + i;
--- a/src/gallium/drivers/i915/i915_flush.c
+++ b/src/gallium/drivers/i915/i915_flush.c
@@ -45,7 +45,10 @@ static void i915_flush_pipe( struct pipe_context *pipe,

   draw_flush(i915->draw);

-   if (i915->batch->map == i915->batch->ptr) {
+   /* Only shortcut this if we have no fence, otherwise we must flush the
+    * empty batchbuffer to get our fence back.
+    */
+   if (!fence && i915->batch && (i915->batch->map == i915->batch->ptr)) {
      return;
   }

--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -368,7 +368,7 @@ i915_fence_signalled(struct pipe_screen *screen,
 {
   struct i915_screen *is = i915_screen(screen);

-   return is->iws->fence_signalled(is->iws, fence) == 0;
+   return is->iws->fence_signalled(is->iws, fence) == 1;
 }

 static boolean
@@ -378,7 +378,7 @@ i915_fence_finish(struct pipe_screen *screen,
 {
   struct i915_screen *is = i915_screen(screen);

-   return is->iws->fence_finish(is->iws, fence) == 0;
+   return is->iws->fence_finish(is->iws, fence) == 1;
 }


--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -792,7 +792,7 @@ lp_rast_finish( struct lp_rasterizer *rast )
 *   2. do work
 *   3. signal that we're done
 */
-static PIPE_THREAD_ROUTINE( thread_func, init_data )
+static PIPE_THREAD_ROUTINE( thread_function, init_data )
 {
   struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data;
   struct lp_rasterizer *rast = task->rast;
@@ -860,7 +860,7 @@ create_rast_threads(struct lp_rasterizer *rast)
   for (i = 0; i < rast->num_threads; i++) {
      pipe_semaphore_init(&rast->tasks[i].work_ready, 0);
      pipe_semaphore_init(&rast->tasks[i].work_done, 0);
-      rast->threads[i] = pipe_thread_create(thread_func,
+      rast->threads[i] = pipe_thread_create(thread_function,
                                            (void *) &rast->tasks[i]);
   }
 }
--- a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
@@ -938,6 +938,7 @@ nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
   }
   info->io.clipDistance = 0xff;
   info->io.pointSize = 0xff;
+   info->io.vertexId = 0xff;
   info->io.edgeFlagIn = 0xff;
   info->io.edgeFlagOut = 0xff;
   info->io.fragDepth = 0xff;
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
@@ -42,6 +42,7 @@ struct nv50_ir_varying
   unsigned mask     : 4; /* vec4 mask */
   unsigned linear   : 1; /* linearly interpolated if true (and not flat) */
   unsigned flat     : 1;
+   unsigned sc       : 1; /* special colour interpolation mode (SHADE_MODEL) */
   unsigned centroid : 1;
   unsigned patch    : 1; /* patch constant value */
   unsigned regular  : 1; /* driver-specific meaning (e.g. input in sreg) */
@@ -155,6 +156,7 @@ struct nv50_ir_prog_info
      uint8_t cullDistanceMask;  /* clip distance mode (1 bit per output) */
      int8_t genUserClip;        /* request user clip planes for ClipVertex */
      uint8_t pointSize;         /* output index for PointSize */
+      uint8_t vertexId;          /* system value index of VertexID */
      uint8_t edgeFlagIn;
      uint8_t edgeFlagOut;
      uint8_t fragDepth;         /* output index of FragDepth */
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
@@ -817,9 +817,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
               case TGSI_INTERPOLATE_CONSTANT:
                  info->in[i].flat = 1;
                  break;
+               case TGSI_INTERPOLATE_COLOR:
+                  info->in[i].sc = 1;
+                  break;
               case TGSI_INTERPOLATE_LINEAR:
-                  if (sn != TGSI_SEMANTIC_COLOR) // GL_NICEST
-                     info->in[i].linear = 1;
+                  info->in[i].linear = 1;
                  break;
               default:
                  break;
@@ -864,6 +866,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
      }
      break;
   case TGSI_FILE_SYSTEM_VALUE:
+      switch (sn) {
+      case TGSI_SEMANTIC_VERTEXID:
+         info->io.vertexId = first;
+         break;
+      default:
+         break;
+      }
      for (i = first; i <= last; ++i, ++si) {
         info->sv[i].sn = sn;
         info->sv[i].si = si;
@@ -1134,7 +1143,7 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
 static inline uint8_t
 translateInterpMode(const struct nv50_ir_varying *var, operation& op)
 {
-   uint8_t mode;
+   uint8_t mode = NV50_IR_INTERP_PERSPECTIVE;

   if (var->flat)
      mode = NV50_IR_INTERP_FLAT;
@@ -1142,9 +1151,11 @@ translateInterpMode(const struct nv50_ir_varying *var, operation& op)
   if (var->linear)
      mode = NV50_IR_INTERP_LINEAR;
   else
-      mode = NV50_IR_INTERP_PERSPECTIVE;
+   if (var->sc)
+      mode = NV50_IR_INTERP_SC;

-   op = (mode == NV50_IR_INTERP_PERSPECTIVE) ? OP_PINTERP : OP_LINTERP;
+   op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC)
+      ? OP_PINTERP : OP_LINTERP;

   if (var->centroid)
      mode |= NV50_IR_INTERP_CENTROID;
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -865,8 +865,10 @@ emit_flop(struct nv_pc *pc, struct nv_instruction *i)
   assert(SFILE(i, 0) == NV_FILE_GPR);

   if (!i->is_long) {
+      assert(i->opcode == NV_OP_RCP);
      emit_form_MUL(pc, i);
-      assert(i->opcode == NV_OP_RCP && !src0->mod);
+      if (src0->mod & NV_MOD_NEG) pc->emit[0] |= 0x00400000;
+      if (src0->mod & NV_MOD_ABS) pc->emit[0] |= 0x00008000;
      return;
   }

--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -703,11 +703,6 @@ NVC0LoweringPass::visit(Instruction *i)
         assert(prog->getType() != Program::TYPE_FRAGMENT);
      }
      break;
-   case OP_PINTERP:
-      if (i->getSrc(0)->reg.data.offset >= 0x280 &&
-          i->getSrc(0)->reg.data.offset <  0x2c0)
-         i->setInterpolate(i->getSampleMode() | NV50_IR_INTERP_SC);
-      break;
   default:
      break;
   }   
--- a/src/gallium/drivers/nvc0/nvc0_3d.xml.h
+++ b/src/gallium/drivers/nvc0/nvc0_3d.xml.h
@@ -913,6 +913,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT			0x04000000
 #define NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT			0x08000000

+#define NVC0_3D_VERTEX_ID_REPLACE				0x0000161c
+#define NVC0_3D_VERTEX_ID_REPLACE_ENABLE			0x00000001
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE__MASK			0x00000ff0
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE__SHIFT			4
+
 #define NVC0_3D_VERTEX_DATA					0x00001640

 #define NVC0_3D_PRIM_RESTART_ENABLE				0x00001644
--- a/src/gallium/drivers/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nvc0/nvc0_context.h
@@ -134,9 +134,6 @@ struct nvc0_context {
   struct draw_context *draw;
 };

-#define NVC0_USING_EDGEFLAG(ctx) \
-   ((ctx)->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)
-
 static INLINE struct nvc0_context *
 nvc0_context(struct pipe_context *pipe)
 {
--- a/src/gallium/drivers/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nvc0/nvc0_program.c
@@ -107,7 +107,7 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)

   for (n = 0, i = 0; i < info->numInputs; ++i) {
      switch (info->in[i].sn) {
-      case TGSI_SEMANTIC_INSTANCEID:
+      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
      case TGSI_SEMANTIC_VERTEXID:
         info->in[i].mask = 0x1;
         info->in[i].slot[0] =
@@ -580,7 +580,11 @@ nvc0_program_translate(struct nvc0_program *prog)
   prog->relocs = info->bin.relocData;
   prog->max_gpr = MAX2(4, (info->bin.maxGPR + 1));

-   prog->vp.edgeflag = PIPE_MAX_ATTRIBS;
+   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+
+   if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
+      info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
+   prog->vp.edgeflag = info->io.edgeFlagIn;

   switch (prog->type) {
   case PIPE_SHADER_VERTEX:
--- a/src/gallium/drivers/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nvc0/nvc0_program.h
@@ -37,8 +37,9 @@ struct nvc0_program {
   struct {
      uint32_t clip_mode; /* clip/cull selection */
      uint8_t clip_enable; /* mask of defined clip planes */
-      uint8_t edgeflag;
      uint8_t num_ucps; /* also set to max if ClipDistance is used */
+      uint8_t edgeflag; /* attribute index of edgeflag input */
+      boolean need_vertex_id;
   } vp;
   struct {
      uint8_t early_z;
--- a/src/gallium/drivers/nvc0/nvc0_push.c
+++ b/src/gallium/drivers/nvc0/nvc0_push.c
@@ -21,6 +21,7 @@ struct push_context {
   struct translate *translate;

   boolean primitive_restart;
+   boolean need_vertex_id;
   uint32_t prim;
   uint32_t restart_index;
   uint32_t instance_id;
@@ -42,22 +43,23 @@ init_push_context(struct nvc0_context *nvc0, struct push_context *ctx)
   ctx->chan = nvc0->screen->base.channel;
   ctx->translate = nvc0->vertex->translate;

+   if (likely(nvc0->vertex->num_elements < 32))
+      ctx->need_vertex_id = nvc0->vertprog->vp.need_vertex_id;
+   else
+      ctx->need_vertex_id = FALSE;
+
+   ctx->edgeflag.buffer = -1;
   ctx->edgeflag.value = 0.5f;

-   if (NVC0_USING_EDGEFLAG(nvc0)) {
+   if (unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
      ve = &nvc0->vertex->element[nvc0->vertprog->vp.edgeflag].pipe;
-
      ctx->edgeflag.buffer = ve->vertex_buffer_index;
      ctx->edgeflag.offset = ve->src_offset;
-
      ctx->packet_vertex_limit = 1;
   } else {
-      ctx->edgeflag.buffer = -1;
-      ctx->edgeflag.offset = 0;
-      ctx->edgeflag.stride = 0;
-      ctx->edgeflag.data = NULL;
-
      ctx->packet_vertex_limit = nvc0->vertex->vtx_per_packet_max;
+      if (unlikely(ctx->need_vertex_id))
+         ctx->packet_vertex_limit = 1;
   }

   ctx->vertex_words = nvc0->vertex->vtx_size;
@@ -74,6 +76,17 @@ set_edgeflag(struct push_context *ctx, unsigned vtx_id)
   }
 }

+static INLINE void
+set_vertexid(struct push_context *ctx, uint32_t vtx_id)
+{
+#if 0
+   BEGIN_RING(ctx->chan, RING_3D(VERTEX_ID), 1); /* broken on nvc0 */
+#else
+   BEGIN_RING(ctx->chan, RING_3D(VERTEX_DATA), 1); /* as last attribute */
+#endif
+   OUT_RING  (ctx->chan, vtx_id);
+}
+
 static INLINE unsigned
 prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
 {
@@ -117,7 +130,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
      if (ctx->primitive_restart)
         nr = prim_restart_search_i08(elts, push, ctx->restart_index);

-      if (unlikely(ctx->edgeflag.buffer >= 0) && nr)
+      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;
@@ -126,8 +139,11 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)

      ctx->translate->run_elts8(ctx->translate, elts, nr, ctx->instance_id,
                                ctx->chan->cur);
-
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id) && likely(size))
+         set_vertexid(ctx, elts[0]);
+
      count -= nr;
      elts += nr;

@@ -155,7 +171,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
      if (ctx->primitive_restart)
         nr = prim_restart_search_i16(elts, push, ctx->restart_index);

-      if (unlikely(ctx->edgeflag.buffer >= 0) && nr)
+      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;
@@ -164,8 +180,11 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)

      ctx->translate->run_elts16(ctx->translate, elts, nr, ctx->instance_id,
                                 ctx->chan->cur);
-
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id) && likely(size)) /* skip empty runs */
+         set_vertexid(ctx, elts[0]);
+
      count -= nr;
      elts += nr;

@@ -193,7 +212,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
      if (ctx->primitive_restart)
         nr = prim_restart_search_i32(elts, push, ctx->restart_index);

-      if (unlikely(ctx->edgeflag.buffer >= 0) && nr)
+      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;
@@ -202,8 +221,11 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)

      ctx->translate->run_elts(ctx->translate, elts, nr, ctx->instance_id,
                               ctx->chan->cur);
-
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id) && likely(size)) /* skip empty runs */
+         set_vertexid(ctx, elts[0]);
+
      count -= nr;
      elts += nr;

@@ -233,6 +255,10 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
      ctx->translate->run(ctx->translate, start, push, ctx->instance_id,
                          ctx->chan->cur);
      ctx->chan->cur += size;
+
+      if (unlikely(ctx->need_vertex_id))
+         set_vertexid(ctx, start);
+
      count -= push;
      start += push;
   }
@@ -326,6 +352,16 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   ctx.instance_id = info->start_instance;
   ctx.prim = nvc0_prim_gl(info->mode);

+   if (unlikely(ctx.need_vertex_id)) {
+      const unsigned a = nvc0->vertex->num_elements;
+      BEGIN_RING(ctx.chan, RING_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
+      OUT_RING  (ctx.chan, (a << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT) |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
+      BEGIN_RING(ctx.chan, RING_3D(VERTEX_ID_REPLACE), 1);
+      OUT_RING  (ctx.chan, (((0x80 + a * 0x10) / 4) << 4) | 1);
+   }
+
   while (inst_count--) {
      BEGIN_RING(ctx.chan, RING_3D(VERTEX_BEGIN_GL), 1);
      OUT_RING  (ctx.chan, ctx.prim);
@@ -355,6 +391,16 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
   if (unlikely(ctx.edgeflag.value == 0.0f))
      IMMED_RING(ctx.chan, RING_3D(EDGEFLAG_ENABLE), 1);

+   if (unlikely(ctx.need_vertex_id)) {
+      const unsigned a = nvc0->vertex->num_elements;
+      IMMED_RING(ctx.chan, RING_3D(VERTEX_ID_REPLACE), 0);
+      BEGIN_RING(ctx.chan, RING_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
+      OUT_RING  (ctx.chan,
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
+                 NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
+   }
+
   if (info->indexed)
      nouveau_resource_unmap(nv04_resource(nvc0->idxbuf.buffer));

--- a/src/gallium/drivers/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nvc0/nvc0_screen.c
@@ -69,15 +69,14 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
   switch (param) {
   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
-      return 64;
+      return 16 * PIPE_SHADER_TYPES; /* NOTE: should not count COMPUTE */
   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 13;
-   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 10;
   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 13;
+      return 15;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 12;
   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-      return 8192;
+      return 2048;
   case PIPE_CAP_MIN_TEXEL_OFFSET:
      return -8;
   case PIPE_CAP_MAX_TEXEL_OFFSET:
@@ -167,7 +166,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_MAX_INPUTS:
      if (shader == PIPE_SHADER_VERTEX)
         return 32;
-      return 0x300 / 16;
+      if (shader == PIPE_SHADER_FRAGMENT)
+         return (0x200 + 0x20 + 0x80) / 16; /* generic + colors + TexCoords */
+      return (0x200 + 0x40 + 0x80) / 16; /* without 0x60 for per-patch inputs */
   case PIPE_SHADER_CAP_MAX_CONSTS:
      return 65536 / 16;
   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
@@ -191,7 +192,11 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_INTEGERS:
      return 1;
   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+      return 16; /* would be 32 in linked (OpenGL-style) mode */
+      /*
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLER_VIEWS:
      return 32;
+      */
   case PIPE_SHADER_CAP_OUTPUT_READ:
      return 0; /* shader != PIPE_SHADER_TESSELLATION_CONTROL; */
   default:
@@ -208,12 +213,13 @@ nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
      return 10.0f;
   case PIPE_CAPF_MAX_POINT_WIDTH:
+      return 63.0f;
   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
-      return 64.0f;
+      return 63.375f;
   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
      return 16.0f;
   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-      return 4.0f;
+      return 15.0f;
   default:
      NOUVEAU_ERR("unknown PIPE_CAP %d\n", param);
      return 0.0f;
--- a/src/gallium/drivers/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nvc0/nvc0_vbo.c
@@ -263,7 +263,8 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0)
   struct nvc0_vertex_element *ve;
   unsigned i;

-   if (unlikely(vertex->need_conversion || NVC0_USING_EDGEFLAG(nvc0))) {
+   if (unlikely(vertex->need_conversion) ||
+       unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
      nvc0->vbo_fifo = ~0;
      nvc0->vbo_user = 0;
   } else {
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -977,7 +977,8 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 	if(fpc->fp->num_slots > num_texcoords)
 		return FALSE;
 	util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, num_texcoords);
-	util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, num_texcoords);
+	util_semantic_table_from_layout(fpc->generic_to_slot, sizeof fpc->generic_to_slot,
+                                        fpc->fp->slot_to_generic, 0, num_texcoords);

 	memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input));

--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -88,6 +88,12 @@ nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_TEXEL_OFFSET:
 	case PIPE_CAP_CONDITIONAL_RENDER:
 	case PIPE_CAP_TEXTURE_BARRIER:
+	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+	case PIPE_CAP_TGSI_CAN_COMPACT_VARYINGS:
+	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
                return 0;
 	case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
                return 0;
--- a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
+++ b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
@@ -547,7 +547,7 @@ static void do_advanced_regalloc(struct regalloc_state * s)
 	struct ra_graph * graph;

 	/* Allocate the main ra data structure */
-	regs = ra_alloc_reg_set(s->C->max_temp_regs * RC_MASK_XYZW);
+	regs = ra_alloc_reg_set(NULL, s->C->max_temp_regs * RC_MASK_XYZW);

 	/* Get list of program variables */
 	variables = rc_get_variables(s->C);
--- a/src/gallium/drivers/r300/compiler/radeon_program_alu.c
+++ b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
@@ -41,13 +41,16 @@

 static struct rc_instruction *emit1(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
-	struct rc_src_register SrcReg)
+	rc_opcode Opcode, struct rc_sub_instruction * base,
+	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);

+	if (base) {
+		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+	}
+
 	fpi->U.I.Opcode = Opcode;
-	fpi->U.I.SaturateMode = Saturate;
 	fpi->U.I.DstReg = DstReg;
 	fpi->U.I.SrcReg[0] = SrcReg;
 	return fpi;
@@ -55,13 +58,17 @@ static struct rc_instruction *emit1(

 static struct rc_instruction *emit2(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	rc_opcode Opcode, struct rc_sub_instruction * base,
+	struct rc_dst_register DstReg,
 	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);

+	if (base) {
+		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+	}
+
 	fpi->U.I.Opcode = Opcode;
-	fpi->U.I.SaturateMode = Saturate;
 	fpi->U.I.DstReg = DstReg;
 	fpi->U.I.SrcReg[0] = SrcReg0;
 	fpi->U.I.SrcReg[1] = SrcReg1;
@@ -70,14 +77,18 @@ static struct rc_instruction *emit2(

 static struct rc_instruction *emit3(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	rc_opcode Opcode, struct rc_sub_instruction * base,
+	struct rc_dst_register DstReg,
 	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
 	struct rc_src_register SrcReg2)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);

+	if (base) {
+		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+	}
+
 	fpi->U.I.Opcode = Opcode;
-	fpi->U.I.SaturateMode = Saturate;
 	fpi->U.I.DstReg = DstReg;
 	fpi->U.I.SrcReg[0] = SrcReg0;
 	fpi->U.I.SrcReg[1] = SrcReg1;
@@ -221,7 +232,7 @@ static void transform_ABS(struct radeon_compiler* c,
 	struct rc_src_register src = inst->U.I.SrcReg[0];
 	src.Abs = 1;
 	src.Negate = RC_MASK_NONE;
-	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
+	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
 	rc_remove_instruction(inst);
 }

@@ -240,7 +251,7 @@ static void transform_CEIL(struct radeon_compiler* c,

 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
-	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
 	rc_remove_instruction(inst);
 }
@@ -256,7 +267,7 @@ static void transform_CLAMP(struct radeon_compiler *c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
 		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
-	emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }
@@ -272,7 +283,7 @@ static void transform_DP2(struct radeon_compiler* c,
 	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
 	src1.Swizzle &= ~(63 << (3 * 2));
 	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
-	emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+	emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
 	rc_remove_instruction(inst);
 }

@@ -283,7 +294,7 @@ static void transform_DPH(struct radeon_compiler* c,
 	src0.Negate &= ~RC_MASK_W;
 	src0.Swizzle &= ~(7 << (3 * 3));
 	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
-	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }

@@ -294,7 +305,7 @@ static void transform_DPH(struct radeon_compiler* c,
 static void transform_DST(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
 	rc_remove_instruction(inst);
@@ -305,7 +316,7 @@ static void transform_FLR(struct radeon_compiler* c,
 {
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
-	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
 		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 	rc_remove_instruction(inst);
 }
@@ -379,14 +390,14 @@ static void transform_LIT(struct radeon_compiler* c,
 		swizzle_wwww(srctemp));

 	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
 		dstregtmpmask(temp, RC_MASK_Z),
 		negate(swizzle_xxxx(srctemp)),
 		swizzle_wwww(srctemp),
 		builtin_zero);

 	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
-	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
+	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
 		dstregtmpmask(temp, RC_MASK_XYW),
 		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));

@@ -401,7 +412,7 @@ static void transform_LRP(struct radeon_compiler* c,
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
 		dst,
 		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
-	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
+	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
 		inst->U.I.DstReg,
 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);

@@ -418,7 +429,7 @@ static void transform_POW(struct radeon_compiler* c,

 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
-	emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
+	emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);

 	rc_remove_instruction(inst);
 }
@@ -472,7 +483,7 @@ static void transform_SEQ(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);

 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);

 	rc_remove_instruction(inst);
@@ -481,7 +492,7 @@ static void transform_SEQ(struct radeon_compiler* c,
 static void transform_SFL(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
+	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
 	rc_remove_instruction(inst);
 }

@@ -491,7 +502,7 @@ static void transform_SGE(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);

 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);

 	rc_remove_instruction(inst);
@@ -503,7 +514,7 @@ static void transform_SGT(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);

 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);

 	rc_remove_instruction(inst);
@@ -515,7 +526,7 @@ static void transform_SLE(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);

 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);

 	rc_remove_instruction(inst);
@@ -527,7 +538,7 @@ static void transform_SLT(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);

 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);

 	rc_remove_instruction(inst);
@@ -539,7 +550,7 @@ static void transform_SNE(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);

 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);

 	rc_remove_instruction(inst);
@@ -604,7 +615,7 @@ static void transform_XPD(struct radeon_compiler* c,
 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
-	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
 		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
@@ -719,7 +730,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
 	src1.Negate &= ~RC_MASK_W;
 	src1.Swizzle &= ~(7 << (3 * 3));
 	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
-	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
 	rc_remove_instruction(inst);
 }

@@ -1043,22 +1054,22 @@ static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
 	unsigned srctmp)
 {
 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
-		emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
 			srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
-		emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
+		emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
 			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
 		struct rc_dst_register moddst = inst->U.I.DstReg;

 		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
 			moddst.WriteMask = RC_MASK_X;
-			emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
+			emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 		}
 		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
 			moddst.WriteMask = RC_MASK_Y;
-			emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
+			emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 		}
 	}
@@ -1165,35 +1176,95 @@ int radeonTransformDeriv(struct radeon_compiler* c,
 }

 /**
+ * Lower every KILP to a KIL whose operand carries the kill condition.
+ *
+ * IF Temp[0].x -> IF Temp[0].x
+ * ...          -> ...
+ * KILP         -> KIL -abs(Temp[0].x)
+ * ...          -> ...
+ * ENDIF        -> ENDIF
+ *
+ * === OR ===
+ *
 * IF Temp[0].x -\
 * KILP         - > KIL -abs(Temp[0].x)
 * ENDIF        -/
 *
- * This needs to be done in its own pass, because it modifies the instructions
- * before and after KILP.
+ * === OR ===
+ *
+ * IF Temp[0].x -> IF Temp[0].x
+ * ...          -> ...
+ * ELSE         -> ELSE
+ * ...          -> ...
+ * KILP         -> KIL -abs(Temp[0].x)
+ * ...          -> ...
+ * ENDIF        -> ENDIF
+ *
+ * === OR ===
+ *
+ * KILP         -> KIL -none.1111
+ *
+ * The last form also applies to a KILP that comes after an IF ... ENDIF
+ * block which is already closed: that IF does not guard the KILP.
+ *
+ * This needs to be done in its own pass, because it might modify the
+ * instructions before and after KILP.
 */
 void rc_transform_KILP(struct radeon_compiler * c, void *user)
 {
 	struct rc_instruction * inst;
 	for (inst = c->Program.Instructions.Next;
 			inst != &c->Program.Instructions; inst = inst->Next) {
+		struct rc_instruction * if_inst;
+		unsigned closed_ifs = 0;
+		unsigned in_if = 0;

 		if (inst->U.I.Opcode != RC_OPCODE_KILP)
 			continue;

+		/* Walk backwards to the innermost IF that is still open at
+		 * the KILP.  Every ENDIF met on the way closes one earlier
+		 * IF, which therefore must be skipped. */
+		for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
+						if_inst = if_inst->Prev) {
+
+			if (if_inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+				closed_ifs++;
+			} else if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
+				if (!closed_ifs) {
+					in_if = 1;
+					break;
+				}
+				closed_ifs--;
+			}
+		}
+
 		inst->U.I.Opcode = RC_OPCODE_KIL;

-		if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
-				|| inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
+		if (!in_if) {
 			inst->U.I.SrcReg[0] = negate(builtin_one);
 		} else {
-
+			/* This should work even if the KILP is inside the ELSE
+			 * block, because -0.0 is considered negative. */
 			inst->U.I.SrcReg[0] =
-				negate(absolute(inst->Prev->U.I.SrcReg[0]));
-			/* Remove IF */
-			rc_remove_instruction(inst->Prev);
-			/* Remove ENDIF */
-			rc_remove_instruction(inst->Next);
+				negate(absolute(if_inst->U.I.SrcReg[0]));
+
+			if (inst->Prev->U.I.Opcode == RC_OPCODE_IF
+				&& inst->Next->U.I.Opcode == RC_OPCODE_ENDIF) {
+
+				/* Optimize the special case:
+				 * IF Temp[0].x
+				 * KILP
+				 * ENDIF
+				 * KIL now carries the condition, so the
+				 * enclosing IF/ENDIF pair is redundant.
+				 */
+
+				/* Remove IF */
+				rc_remove_instruction(inst->Prev);
+				/* Remove ENDIF */
+				rc_remove_instruction(inst->Next);
+			}
 		}
 	}
 }
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -63,8 +63,13 @@ static void r300_blitter_begin(struct r300_context* r300, enum r300_blitter_op o
    util_blitter_save_vertex_shader(r300->blitter, r300->vs_state.state);
    util_blitter_save_viewport(r300->blitter, &r300->viewport);
    util_blitter_save_vertex_elements(r300->blitter, r300->velems);
-    util_blitter_save_vertex_buffers(r300->blitter, r300->vbuf_mgr->nr_vertex_buffers,
-                                     r300->vbuf_mgr->vertex_buffer);
+    if (r300->vbuf_mgr) {
+        util_blitter_save_vertex_buffers(r300->blitter, r300->vbuf_mgr->nr_vertex_buffers,
+                                         r300->vbuf_mgr->vertex_buffer);
+    } else {
+        util_blitter_save_vertex_buffers(r300->blitter, r300->swtcl_nr_vertex_buffers,
+                                         r300->swtcl_vertex_buffer);
+    }

    if (op & R300_SAVE_FRAMEBUFFER) {
        util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -419,17 +419,19 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
    r300_init_query_functions(r300);
    r300_init_state_functions(r300);
    r300_init_resource_functions(r300);
-    
+
    r300->context.create_video_decoder = vl_create_decoder;
    r300->context.create_video_buffer = vl_video_buffer_create;

-    r300->vbuf_mgr = u_vbuf_create(&r300->context, 1024 * 1024, 16,
+    if (r300->screen->caps.has_tcl) {
+        r300->vbuf_mgr = u_vbuf_create(&r300->context, 1024 * 1024, 16,
                                       PIPE_BIND_VERTEX_BUFFER |
                                       PIPE_BIND_INDEX_BUFFER,
                                       U_VERTEX_FETCH_DWORD_ALIGNED);
-    if (!r300->vbuf_mgr)
-        goto fail;
-    r300->vbuf_mgr->caps.format_fixed32 = 0;
+        if (!r300->vbuf_mgr)
+            goto fail;
+        r300->vbuf_mgr->caps.format_fixed32 = 0;
+    }

    r300->blitter = util_blitter_create(&r300->context);
    if (r300->blitter == NULL)
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -42,6 +42,16 @@ struct r300_fragment_shader;
 struct r300_vertex_shader;
 struct r300_stencilref_context;

+enum colormask_swizzle {
+    COLORMASK_BGRA,
+    COLORMASK_RGBA,
+    COLORMASK_RRRR,
+    COLORMASK_AAAA,
+    COLORMASK_GRRG,
+    COLORMASK_ARRA,
+    COLORMASK_NUM_SWIZZLES
+};
+
 struct r300_atom {
    /* Name, for debugging. */
    const char* name;
@@ -67,7 +77,7 @@ struct r300_aa_state {
 struct r300_blend_state {
    struct pipe_blend_state state;

-    uint32_t cb_clamp[8];
+    uint32_t cb_clamp[COLORMASK_NUM_SWIZZLES][8];
    uint32_t cb_noclamp[8];
    uint32_t cb_no_readwrite[8];
 };
@@ -321,6 +331,8 @@ struct r300_surface {

    /* Whether the CBZB clear is allowed on the surface. */
    boolean cbzb_allowed;
+
+    unsigned colormask_swizzle;
 };

 struct r300_texture_desc {
@@ -581,6 +593,9 @@ struct r300_context {
    void *dsa_decompress_zmask;

    struct u_vbuf *vbuf_mgr;
+    struct pipe_index_buffer swtcl_index_buffer;
+    struct pipe_vertex_buffer swtcl_vertex_buffer[PIPE_MAX_ATTRIBS];
+    unsigned swtcl_nr_vertex_buffers;

    struct util_slab_mempool pool_transfers;

--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -45,10 +45,12 @@ void r300_emit_blend_state(struct r300_context* r300,
    CS_LOCALS(r300);

    if (fb->nr_cbufs) {
-        if (fb->cbufs[0]->format == PIPE_FORMAT_R16G16B16A16_FLOAT)
+        if (fb->cbufs[0]->format == PIPE_FORMAT_R16G16B16A16_FLOAT) {
            WRITE_CS_TABLE(blend->cb_noclamp, size);
-        else
-            WRITE_CS_TABLE(blend->cb_clamp, size);
+        } else {
+            unsigned swz = r300_surface(fb->cbufs[0])->colormask_swizzle;
+            WRITE_CS_TABLE(blend->cb_clamp[swz], size);
+        }
    } else {
        WRITE_CS_TABLE(blend->cb_no_readwrite, size);
    }
@@ -1030,20 +1032,18 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
            R300_PVS_VF_MAX_VTX_NUM(12) |
            (r300screen->caps.is_r500 ? R500_TCL_STATE_OPTIMIZATION : 0));

-    /* Emit flow control instructions. */
-    if (code->num_fc_ops) {
-
-        OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
-        if (r300screen->caps.is_r500) {
-            OUT_CS_REG_SEQ(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0, code->num_fc_ops * 2);
-            OUT_CS_TABLE(code->fc_op_addrs.r500, code->num_fc_ops * 2);
-        } else {
-            OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_ADDRS_0, code->num_fc_ops);
-            OUT_CS_TABLE(code->fc_op_addrs.r300, code->num_fc_ops);
-        }
-        OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, code->num_fc_ops);
-        OUT_CS_TABLE(code->fc_loop_index, code->num_fc_ops);
+    /* Emit flow control instructions.  Even if there are no fc instructions,
+     * we still need to write the registers to make sure they are cleared. */
+    OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
+    if (r300screen->caps.is_r500) {
+        OUT_CS_REG_SEQ(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0, R300_VS_MAX_FC_OPS * 2);
+        OUT_CS_TABLE(code->fc_op_addrs.r500, R300_VS_MAX_FC_OPS * 2);
+    } else {
+        OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_ADDRS_0, R300_VS_MAX_FC_OPS);
+        OUT_CS_TABLE(code->fc_op_addrs.r300, R300_VS_MAX_FC_OPS);
    }
+    OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, R300_VS_MAX_FC_OPS);
+    OUT_CS_TABLE(code->fc_loop_index, R300_VS_MAX_FC_OPS);

    END_CS;
 }
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -818,7 +818,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
    struct pipe_transfer *ib_transfer = NULL;
    int i;
    void *indices = NULL;
-    boolean indexed = info->indexed && r300->vbuf_mgr->index_buffer.buffer;
+    boolean indexed = info->indexed && r300->swtcl_index_buffer.buffer;

    if (r300->skip_rendering) {
        return;
@@ -831,10 +831,10 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
            (indexed ? PREP_INDEXED : 0),
            indexed ? 256 : 6);

-    for (i = 0; i < r300->vbuf_mgr->nr_vertex_buffers; i++) {
-        if (r300->vbuf_mgr->vertex_buffer[i].buffer) {
+    for (i = 0; i < r300->swtcl_nr_vertex_buffers; i++) {
+        if (r300->swtcl_vertex_buffer[i].buffer) {
            void *buf = pipe_buffer_map(pipe,
-                                  r300->vbuf_mgr->vertex_buffer[i].buffer,
+                                  r300->swtcl_vertex_buffer[i].buffer,
                                  PIPE_TRANSFER_READ |
                                  PIPE_TRANSFER_UNSYNCHRONIZED,
                                  &vb_transfer[i]);
@@ -843,7 +843,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
    }

    if (indexed) {
-        indices = pipe_buffer_map(pipe, r300->vbuf_mgr->index_buffer.buffer,
+        indices = pipe_buffer_map(pipe, r300->swtcl_index_buffer.buffer,
                                  PIPE_TRANSFER_READ |
                                  PIPE_TRANSFER_UNSYNCHRONIZED, &ib_transfer);
    }
@@ -856,8 +856,8 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
    draw_flush(r300->draw);
    r300->draw_vbo_locked = FALSE;

-    for (i = 0; i < r300->vbuf_mgr->nr_vertex_buffers; i++) {
-        if (r300->vbuf_mgr->vertex_buffer[i].buffer) {
+    for (i = 0; i < r300->swtcl_nr_vertex_buffers; i++) {
+        if (r300->swtcl_vertex_buffer[i].buffer) {
            pipe_buffer_unmap(pipe, vb_transfer[i]);
            draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
        }
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -212,6 +212,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
        switch (param)
        {
        case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+        case PIPE_SHADER_CAP_SUBROUTINES:
            return 0;
        default:;
        }
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -169,15 +169,52 @@ static boolean blend_discard_if_src_alpha_color_1(unsigned srcRGB, unsigned srcA
            dstA == PIPE_BLENDFACTOR_ONE);
 }

+/* The hardware colormask is clunky and must be swizzled depending on the format.
+ * This was figured out by trial-and-error. */
 static unsigned bgra_cmask(unsigned mask)
 {
-    /* Gallium uses RGBA color ordering while R300 expects BGRA. */
-
    return ((mask & PIPE_MASK_R) << 2) |
           ((mask & PIPE_MASK_B) >> 2) |
           (mask & (PIPE_MASK_G | PIPE_MASK_A));
 }

+static unsigned rgba_cmask(unsigned mask)
+{
+    return mask & PIPE_MASK_RGBA;
+}
+
+static unsigned rrrr_cmask(unsigned mask)
+{
+    return (mask & PIPE_MASK_R) |
+           ((mask & PIPE_MASK_R) << 1) |
+           ((mask & PIPE_MASK_R) << 2) |
+           ((mask & PIPE_MASK_R) << 3);
+}
+
+static unsigned aaaa_cmask(unsigned mask)
+{
+    return ((mask & PIPE_MASK_A) >> 3) |
+           ((mask & PIPE_MASK_A) >> 2) |
+           ((mask & PIPE_MASK_A) >> 1) |
+           (mask & PIPE_MASK_A);
+}
+
+static unsigned grrg_cmask(unsigned mask)
+{
+    return ((mask & PIPE_MASK_R) << 1) |
+           ((mask & PIPE_MASK_R) << 2) |
+           ((mask & PIPE_MASK_G) >> 1) |
+           ((mask & PIPE_MASK_G) << 2);
+}
+
+static unsigned arra_cmask(unsigned mask)
+{
+    return ((mask & PIPE_MASK_R) << 1) |
+           ((mask & PIPE_MASK_R) << 2) |
+           ((mask & PIPE_MASK_A) >> 3) |
+           (mask & PIPE_MASK_A);
+}
+
 /* Create a new blend state based on the CSO blend state.
 *
 * This encompasses alpha blending, logic/raster ops, and blend dithering. */
@@ -190,9 +227,9 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
    uint32_t blend_control_noclamp = 0;    /* R300_RB3D_CBLEND: 0x4e04 */
    uint32_t alpha_blend_control = 0; /* R300_RB3D_ABLEND: 0x4e08 */
    uint32_t alpha_blend_control_noclamp = 0; /* R300_RB3D_ABLEND: 0x4e08 */
-    uint32_t color_channel_mask = 0;  /* R300_RB3D_COLOR_CHANNEL_MASK: 0x4e0c */
    uint32_t rop = 0;                 /* R300_RB3D_ROPCNTL: 0x4e18 */
    uint32_t dither = 0;              /* R300_RB3D_DITHER_CTL: 0x4e50 */
+    int i;
    CB_LOCALS;

    blend->state = *state;
@@ -331,20 +368,6 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
                (state->logicop_func) << R300_RB3D_ROPCNTL_ROP_SHIFT;
    }

-    /* Color channel masks for all MRTs. */
-    color_channel_mask = bgra_cmask(state->rt[0].colormask);
-    if (r300screen->caps.is_r500 && state->independent_blend_enable) {
-        if (state->rt[1].blend_enable) {
-            color_channel_mask |= bgra_cmask(state->rt[1].colormask) << 4;
-        }
-        if (state->rt[2].blend_enable) {
-            color_channel_mask |= bgra_cmask(state->rt[2].colormask) << 8;
-        }
-        if (state->rt[3].blend_enable) {
-            color_channel_mask |= bgra_cmask(state->rt[3].colormask) << 12;
-        }
-    }
-
    /* Neither fglrx nor classic r300 ever set this, regardless of dithering
     * state. Since it's an optional implementation detail, we can leave it
     * out and never dither.
@@ -358,14 +381,27 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
    */

    /* Build a command buffer. */
-    BEGIN_CB(blend->cb_clamp, 8);
-    OUT_CB_REG(R300_RB3D_ROPCNTL, rop);
-    OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
-    OUT_CB(blend_control);
-    OUT_CB(alpha_blend_control);
-    OUT_CB(color_channel_mask);
-    OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
-    END_CB;
+    {
+        unsigned (*func[COLORMASK_NUM_SWIZZLES])(unsigned) = {
+            bgra_cmask,
+            rgba_cmask,
+            rrrr_cmask,
+            aaaa_cmask,
+            grrg_cmask,
+            arra_cmask
+        };
+
+        for (i = 0; i < COLORMASK_NUM_SWIZZLES; i++) {
+            BEGIN_CB(blend->cb_clamp[i], 8);
+            OUT_CB_REG(R300_RB3D_ROPCNTL, rop);
+            OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
+            OUT_CB(blend_control);
+            OUT_CB(alpha_blend_control);
+            OUT_CB(func[i](state->rt[0].colormask));
+            OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
+            END_CB;
+        }
+    }

    /* Build a command buffer. */
    BEGIN_CB(blend->cb_noclamp, 8);
@@ -373,7 +409,7 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
    OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
    OUT_CB(blend_control_noclamp);
    OUT_CB(alpha_blend_control_noclamp);
-    OUT_CB(color_channel_mask);
+    OUT_CB(rgba_cmask(state->rt[0].colormask));
    OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
    END_CB;

@@ -1048,6 +1084,10 @@ static void* r300_create_rs_state(struct pipe_context* pipe,

    /* Override some states for Draw. */
    rs->rs_draw.sprite_coord_enable = 0; /* We can do this in HW. */
+    rs->rs_draw.offset_point = 0;
+    rs->rs_draw.offset_line = 0;
+    rs->rs_draw.offset_tri = 0;
+    rs->rs_draw.offset_clamp = 0;

 #ifdef PIPE_ARCH_LITTLE_ENDIAN
    vap_control_status = R300_VC_NO_SWAP;
@@ -1595,7 +1635,6 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
                                    const struct pipe_vertex_buffer* buffers)
 {
    struct r300_context* r300 = r300_context(pipe);
-    unsigned i;
    struct pipe_vertex_buffer dummy_vb = {0};

    /* There must be at least one vertex buffer set, otherwise it locks up. */
@@ -1605,18 +1644,13 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
        count = 1;
    }

-    u_vbuf_set_vertex_buffers(r300->vbuf_mgr, count, buffers);
-
    if (r300->screen->caps.has_tcl) {
-        /* HW TCL. */
-        for (i = 0; i < count; i++) {
-            if (buffers[i].buffer &&
-		!r300_resource(buffers[i].buffer)->b.user_ptr) {
-            }
-        }
+        u_vbuf_set_vertex_buffers(r300->vbuf_mgr, count, buffers);
        r300->vertex_arrays_dirty = TRUE;
    } else {
-        /* SW TCL. */
+        util_copy_vertex_buffers(r300->swtcl_vertex_buffer,
+                                 &r300->swtcl_nr_vertex_buffers,
+                                 buffers, count);
        draw_set_vertex_buffers(r300->draw, count, buffers);
    }
 }
@@ -1626,9 +1660,15 @@ static void r300_set_index_buffer(struct pipe_context* pipe,
 {
    struct r300_context* r300 = r300_context(pipe);

-    u_vbuf_set_index_buffer(r300->vbuf_mgr, ib);
-
-    if (!r300->screen->caps.has_tcl) {
+    if (r300->screen->caps.has_tcl) {
+        u_vbuf_set_index_buffer(r300->vbuf_mgr, ib);
+    } else {
+        if (ib) {
+            pipe_resource_reference(&r300->swtcl_index_buffer.buffer, ib->buffer);
+            memcpy(&r300->swtcl_index_buffer, ib, sizeof(*ib));
+        } else {
+            pipe_resource_reference(&r300->swtcl_index_buffer.buffer, NULL);
+        }
        draw_set_index_buffer(r300->draw, ib);
    }
 }
@@ -1702,11 +1742,11 @@ static void* r300_create_vertex_elements_state(struct pipe_context* pipe,
        return NULL;

    velems->count = count;
-    velems->vmgr_elements =
-        u_vbuf_create_vertex_elements(r300->vbuf_mgr, count, attribs,
-                                          velems->velem);

    if (r300_screen(pipe->screen)->caps.has_tcl) {
+        velems->vmgr_elements =
+            u_vbuf_create_vertex_elements(r300->vbuf_mgr, count, attribs,
+                                          velems->velem);
        /* Setup PSC.
         * The unused components will be replaced by (..., 0, 1). */
        r300_vertex_psc(velems);
@@ -1716,6 +1756,8 @@ static void* r300_create_vertex_elements_state(struct pipe_context* pipe,
                align(util_format_get_blocksize(velems->velem[i].src_format), 4);
            velems->vertex_size_dwords += velems->format_size[i] / 4;
        }
+    } else {
+        memcpy(velems->velem, attribs, count * sizeof(struct pipe_vertex_element));
    }

    return velems;
@@ -1733,9 +1775,9 @@ static void r300_bind_vertex_elements_state(struct pipe_context *pipe,

    r300->velems = velems;

-    u_vbuf_bind_vertex_elements(r300->vbuf_mgr, state, velems->vmgr_elements);
-
-    if (r300->draw) {
+    if (r300->screen->caps.has_tcl) {
+        u_vbuf_bind_vertex_elements(r300->vbuf_mgr, state, velems->vmgr_elements);
+    } else {
        draw_set_vertex_elements(r300->draw, velems->count, velems->velem);
        return;
    }
@@ -1750,7 +1792,9 @@ static void r300_delete_vertex_elements_state(struct pipe_context *pipe, void *s
    struct r300_context *r300 = r300_context(pipe);
    struct r300_vertex_element_state *velems = state;

-    u_vbuf_destroy_vertex_elements(r300->vbuf_mgr, velems->vmgr_elements);
+    if (r300->screen->caps.has_tcl) {
+        u_vbuf_destroy_vertex_elements(r300->vbuf_mgr, velems->vmgr_elements);
+    }
    FREE(state);
 }

@@ -1765,10 +1809,10 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
    vs->state.tokens = tgsi_dup_tokens(shader->tokens);

    if (r300->screen->caps.has_tcl) {
-        r300_init_vs_outputs(vs);
+        r300_init_vs_outputs(r300, vs);
        r300_translate_vertex_shader(r300, vs);
    } else {
-        r300_draw_init_vertex_shader(r300->draw, vs);
+        r300_draw_init_vertex_shader(r300, vs);
    }

    return vs;
@@ -1794,9 +1838,8 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
    if (r300->screen->caps.has_tcl) {
        unsigned fc_op_dwords = r300->screen->caps.is_r500 ? 3 : 2;
        r300_mark_atom_dirty(r300, &r300->vs_state);
-        r300->vs_state.size =
-                vs->code.length + 9 +
-        (vs->code.num_fc_ops ? vs->code.num_fc_ops * fc_op_dwords + 4 : 0);
+        r300->vs_state.size = vs->code.length + 9 +
+			(R300_VS_MAX_FC_OPS * fc_op_dwords + 4);

        r300_mark_atom_dirty(r300, &r300->vs_constants);
        r300->vs_constants.size =
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -704,10 +704,87 @@ static uint32_t r300_translate_out_fmt(enum pipe_format format)
    }
 }

+static uint32_t r300_translate_colormask_swizzle(enum pipe_format format)
+{
+    switch (format) {
+    case PIPE_FORMAT_A8_UNORM:
+    case PIPE_FORMAT_A8_SNORM:
+    case PIPE_FORMAT_A16_UNORM:
+    case PIPE_FORMAT_A16_SNORM:
+    case PIPE_FORMAT_A16_FLOAT:
+    case PIPE_FORMAT_A32_FLOAT:
+        return COLORMASK_AAAA;
+
+    case PIPE_FORMAT_I8_UNORM:
+    case PIPE_FORMAT_I8_SNORM:
+    case PIPE_FORMAT_L8_UNORM:
+    case PIPE_FORMAT_L8_SNORM:
+    case PIPE_FORMAT_R8_UNORM:
+    case PIPE_FORMAT_R8_SNORM:
+    case PIPE_FORMAT_R32_FLOAT:
+    case PIPE_FORMAT_L32_FLOAT:
+    case PIPE_FORMAT_I32_FLOAT:
+        return COLORMASK_RRRR;
+
+    case PIPE_FORMAT_L8A8_SNORM:
+    case PIPE_FORMAT_L8A8_UNORM:
+    case PIPE_FORMAT_L16A16_UNORM:
+    case PIPE_FORMAT_L16A16_SNORM:
+    case PIPE_FORMAT_L16A16_FLOAT:
+    case PIPE_FORMAT_L32A32_FLOAT:
+        return COLORMASK_ARRA;
+
+    case PIPE_FORMAT_R8G8_SNORM:
+    case PIPE_FORMAT_R8G8_UNORM:
+    case PIPE_FORMAT_R16G16_UNORM:
+    case PIPE_FORMAT_R16G16_SNORM:
+    case PIPE_FORMAT_R16G16_FLOAT:
+    case PIPE_FORMAT_R32G32_FLOAT:
+        return COLORMASK_GRRG;
+
+    case PIPE_FORMAT_B5G6R5_UNORM:
+    case PIPE_FORMAT_B5G5R5A1_UNORM:
+    case PIPE_FORMAT_B5G5R5X1_UNORM:
+    case PIPE_FORMAT_B4G4R4A4_UNORM:
+    case PIPE_FORMAT_B4G4R4X4_UNORM:
+    case PIPE_FORMAT_B8G8R8A8_UNORM:
+    /*case PIPE_FORMAT_B8G8R8A8_SNORM:*/
+    case PIPE_FORMAT_B8G8R8X8_UNORM:
+    /*case PIPE_FORMAT_B8G8R8X8_SNORM:*/
+    case PIPE_FORMAT_B10G10R10A2_UNORM:
+        return COLORMASK_BGRA;
+
+    case PIPE_FORMAT_R8G8B8X8_UNORM:
+    /*case PIPE_FORMAT_R8G8B8X8_SNORM:*/
+    case PIPE_FORMAT_R8G8B8A8_UNORM:
+    case PIPE_FORMAT_R8G8B8A8_SNORM:
+    case PIPE_FORMAT_R10G10B10A2_UNORM:
+    case PIPE_FORMAT_R10G10B10X2_SNORM:
+    case PIPE_FORMAT_R16_UNORM:
+    case PIPE_FORMAT_R16G16B16A16_UNORM:
+    case PIPE_FORMAT_R16_SNORM:
+    case PIPE_FORMAT_R16G16B16A16_SNORM:
+    case PIPE_FORMAT_R16_FLOAT:
+    case PIPE_FORMAT_R16G16B16A16_FLOAT:
+    case PIPE_FORMAT_R32G32B32A32_FLOAT:
+    case PIPE_FORMAT_L16_UNORM:
+    case PIPE_FORMAT_L16_SNORM:
+    case PIPE_FORMAT_L16_FLOAT:
+    case PIPE_FORMAT_I16_UNORM:
+    case PIPE_FORMAT_I16_SNORM:
+    case PIPE_FORMAT_I16_FLOAT:
+        return COLORMASK_RGBA;
+
+    default:
+        return ~0; /* Unsupported. */
+    }
+}
+
 boolean r300_is_colorbuffer_format_supported(enum pipe_format format)
 {
    return r300_translate_colorformat(format) != ~0 &&
-           r300_translate_out_fmt(format) != ~0;
+           r300_translate_out_fmt(format) != ~0 &&
+           r300_translate_colormask_swizzle(format) != ~0;
 }

 boolean r300_is_zs_format_supported(enum pipe_format format)
@@ -827,6 +904,8 @@ static void r300_texture_setup_fb_state(struct r300_surface *surf)
                R300_COLOR_TILE(tex->tex.macrotile[level]) |
                R300_COLOR_MICROTILE(tex->tex.microtile);
        surf->format = r300_translate_out_fmt(surf->base.format);
+        surf->colormask_swizzle =
+            r300_translate_colormask_swizzle(surf->base.format);
    }
 }

--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -36,6 +36,7 @@

 /* Convert info about VS output semantics into r300_shader_semantics. */
 static void r300_shader_read_vs_outputs(
+    struct r300_context *r300,
    struct tgsi_shader_info* info,
    struct r300_shader_semantics* vs_outputs)
 {
@@ -83,6 +84,14 @@ static void r300_shader_read_vs_outputs(
                fprintf(stderr, "r300 VP: cannot handle edgeflag output.\n");
                break;

+            case TGSI_SEMANTIC_CLIPVERTEX:
+                assert(index == 0);
+                /* Draw does clip vertex for us. */
+                if (r300->screen->caps.has_tcl) {
+                    fprintf(stderr, "r300 VP: cannot handle clip vertex output.\n");
+                }
+                break;
+
            default:
                fprintf(stderr, "r300 VP: unknown vertex output semantic: %i.\n",
                        info->output_semantic_name[i]);
@@ -160,10 +169,11 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
    c->code->outputs[outputs->wpos] = reg++;
 }

-void r300_init_vs_outputs(struct r300_vertex_shader *vs)
+void r300_init_vs_outputs(struct r300_context *r300,
+                          struct r300_vertex_shader *vs)
 {
    tgsi_scan_shader(vs->state.tokens, &vs->info);
-    r300_shader_read_vs_outputs(&vs->info, &vs->outputs);
+    r300_shader_read_vs_outputs(r300, &vs->info, &vs->outputs);
 }

 static void r300_dummy_vertex_shader(
@@ -187,7 +197,7 @@ static void r300_dummy_vertex_shader(
    ureg_destroy(ureg);

    shader->dummy = TRUE;
-    r300_init_vs_outputs(shader);
+    r300_init_vs_outputs(r300, shader);
    r300_translate_vertex_shader(r300, shader);
 }

--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -56,12 +56,13 @@ struct r300_vertex_shader {
    void *draw_vs;
 };

-void r300_init_vs_outputs(struct r300_vertex_shader *vs);
+void r300_init_vs_outputs(struct r300_context *r300,
+                          struct r300_vertex_shader *vs);

 void r300_translate_vertex_shader(struct r300_context *r300,
                                  struct r300_vertex_shader *vs);

-void r300_draw_init_vertex_shader(struct draw_context *draw,
+void r300_draw_init_vertex_shader(struct r300_context *r300,
                                  struct r300_vertex_shader *vs);

 #endif /* R300_VS_H */
--- a/src/gallium/drivers/r300/r300_vs_draw.c
+++ b/src/gallium/drivers/r300/r300_vs_draw.c
@@ -29,7 +29,7 @@
 *
 * Transformations:
 * 1) If the secondary color output is present, the primary color must be
- *    inserted before it.
+ *    present too.
 * 2) If any back-face color output is present, there must be all 4 color
 *    outputs and missing ones must be inserted.
 * 3) Insert a trailing texcoord output containing a copy of POS, for WPOS.
@@ -52,7 +52,6 @@ struct vs_transform_context {

    boolean color_used[2];
    boolean bcolor_used[2];
-    boolean temp_used[128];

    /* Index of the pos output, typically 0. */
    unsigned pos_output;
@@ -72,6 +71,8 @@ struct vs_transform_context {
    boolean first_instruction;
    /* End instruction processed? */
    boolean end_instruction;
+
+    boolean temp_used[1024];
 };

 static void emit_temp(struct tgsi_transform_context *ctx, unsigned reg)
@@ -102,9 +103,9 @@ static void emit_output(struct tgsi_transform_context *ctx,
    ++vsctx->num_outputs;
 }

-static void insert_output(struct tgsi_transform_context *ctx,
-                          struct tgsi_full_declaration *before,
-                          unsigned name, unsigned index, unsigned interp)
+static void insert_output_before(struct tgsi_transform_context *ctx,
+                                 struct tgsi_full_declaration *before,
+                                 unsigned name, unsigned index, unsigned interp)
 {
    struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
    unsigned i;
@@ -115,28 +116,29 @@ static void insert_output(struct tgsi_transform_context *ctx,
    }

    /* Insert the new output. */
-    emit_output(ctx, name, index, interp, before->Range.First);
+    emit_output(ctx, name, index, interp,
+                before->Range.First + vsctx->decl_shift);

    ++vsctx->decl_shift;
 }

-static void insert_trailing_bcolor(struct tgsi_transform_context *ctx,
-                                   struct tgsi_full_declaration *before)
+static void insert_output_after(struct tgsi_transform_context *ctx,
+                                struct tgsi_full_declaration *after,
+                                unsigned name, unsigned index, unsigned interp)
 {
    struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
+    unsigned i;

-    /* If BCOLOR0 is used, make sure BCOLOR1 is present too. Otherwise
-     * the rasterizer doesn't do the color selection correctly. */
-    if (vsctx->bcolor_used[0] && !vsctx->bcolor_used[1]) {
-        if (before) {
-            insert_output(ctx, before, TGSI_SEMANTIC_BCOLOR, 1,
-                          TGSI_INTERPOLATE_LINEAR);
-        } else {
-            emit_output(ctx, TGSI_SEMANTIC_BCOLOR, 1,
-                        TGSI_INTERPOLATE_LINEAR, vsctx->num_outputs);
-        }
-        vsctx->bcolor_used[1] = TRUE;
+    /* Make a place for the new output. */
+    for (i = after->Range.First+1; i < Elements(vsctx->out_remap); i++) {
+        ++vsctx->out_remap[i];
    }
+
+    /* Insert the new output. */
+    emit_output(ctx, name, index, interp,
+                after->Range.First + 1);
+
+    ++vsctx->decl_shift;
 }

 static void transform_decl(struct tgsi_transform_context *ctx,
@@ -153,41 +155,38 @@ static void transform_decl(struct tgsi_transform_context *ctx,

            case TGSI_SEMANTIC_COLOR:
                assert(decl->Semantic.Index < 2);
-                vsctx->color_used[decl->Semantic.Index] = TRUE;

                /* We must rasterize the first color if the second one is
                 * used, otherwise the rasterizer doesn't do the color
                 * selection correctly. Declare it, but don't write to it. */
                if (decl->Semantic.Index == 1 && !vsctx->color_used[0]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
                    vsctx->color_used[0] = TRUE;
                }
                break;

            case TGSI_SEMANTIC_BCOLOR:
                assert(decl->Semantic.Index < 2);
-                vsctx->bcolor_used[decl->Semantic.Index] = TRUE;

                /* We must rasterize all 4 colors if back-face colors are
                 * used, otherwise the rasterizer doesn't do the color
                 * selection correctly. Declare it, but don't write to it. */
                if (!vsctx->color_used[0]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
                    vsctx->color_used[0] = TRUE;
                }
                if (!vsctx->color_used[1]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 1,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_COLOR, 1,
+                                         TGSI_INTERPOLATE_LINEAR);
                    vsctx->color_used[1] = TRUE;
                }
                if (decl->Semantic.Index == 1 && !vsctx->bcolor_used[0]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_BCOLOR, 0,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_BCOLOR, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
                    vsctx->bcolor_used[0] = TRUE;
                }
-                /* One more case is handled in insert_trailing_bcolor. */
                break;

            case TGSI_SEMANTIC_GENERIC:
@@ -195,11 +194,6 @@ static void transform_decl(struct tgsi_transform_context *ctx,
                break;
        }

-        if (decl->Semantic.Name != TGSI_SEMANTIC_BCOLOR) {
-            /* Insert it as soon as possible. */
-            insert_trailing_bcolor(ctx, decl);
-        }
-
        /* Since we're inserting new outputs in between, the following outputs
         * should be moved to the right so that they don't overlap with
         * the newly added ones. */
@@ -214,6 +208,14 @@ static void transform_decl(struct tgsi_transform_context *ctx,
    }

    ctx->emit_declaration(ctx, decl);
+
+    /* Insert BCOLOR1 if needed. */
+    if (decl->Declaration.File == TGSI_FILE_OUTPUT &&
+        decl->Semantic.Name == TGSI_SEMANTIC_BCOLOR &&
+        !vsctx->bcolor_used[1]) {
+        insert_output_after(ctx, decl, TGSI_SEMANTIC_BCOLOR, 1,
+                            TGSI_INTERPOLATE_LINEAR);
+    }
 }

 static void transform_inst(struct tgsi_transform_context *ctx,
@@ -226,10 +228,6 @@ static void transform_inst(struct tgsi_transform_context *ctx,
    if (!vsctx->first_instruction) {
        vsctx->first_instruction = TRUE;

-        /* The trailing BCOLOR should be inserted before the code
-         * if it hasn't already been done so. */
-        insert_trailing_bcolor(ctx, NULL);
-
        /* Insert the generic output for WPOS. */
        emit_output(ctx, TGSI_SEMANTIC_GENERIC, vsctx->last_generic + 1,
                    TGSI_INTERPOLATE_PERSPECTIVE, vsctx->num_outputs);
@@ -309,14 +307,18 @@ static void transform_inst(struct tgsi_transform_context *ctx,
    ctx->emit_instruction(ctx, inst);
 }

-void r300_draw_init_vertex_shader(struct draw_context *draw,
+void r300_draw_init_vertex_shader(struct r300_context *r300,
                                  struct r300_vertex_shader *vs)
 {
+    struct draw_context *draw = r300->draw;
    struct pipe_shader_state new_vs;
+    struct tgsi_shader_info info;
    struct vs_transform_context transform;
    const uint newLen = tgsi_num_tokens(vs->state.tokens) + 100 /* XXX */;
    unsigned i;

+    tgsi_scan_shader(vs->state.tokens, &info);
+
    new_vs.tokens = tgsi_alloc_tokens(newLen);
    if (new_vs.tokens == NULL)
        return;
@@ -329,6 +331,22 @@ void r300_draw_init_vertex_shader(struct draw_context *draw,
    transform.base.transform_instruction = transform_inst;
    transform.base.transform_declaration = transform_decl;

+    for (i = 0; i < info.num_outputs; i++) {
+        unsigned index = info.output_semantic_index[i];
+
+        switch (info.output_semantic_name[i]) {
+            case TGSI_SEMANTIC_COLOR:
+                assert(index < 2);
+                transform.color_used[index] = TRUE;
+                break;
+
+            case TGSI_SEMANTIC_BCOLOR:
+                assert(index < 2);
+                transform.bcolor_used[index] = TRUE;
+                break;
+        }
+    }
+
    tgsi_transform_shader(vs->state.tokens,
                          (struct tgsi_token*)new_vs.tokens,
                          newLen, &transform.base);
@@ -350,7 +368,7 @@ void r300_draw_init_vertex_shader(struct draw_context *draw,
    vs->state.tokens = new_vs.tokens;

    /* Init the VS output table for the rasterizer. */
-    r300_init_vs_outputs(vs);
+    r300_init_vs_outputs(r300, vs);

    /* Make the last generic be WPOS. */
    vs->outputs.wpos = vs->outputs.generic[transform.last_generic + 1];
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -38,6 +38,23 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+		/* prepend ALU_EXTENDED if we need more than 2 kcache sets */
+		if (cf->eg_alu_extended) {
+			bc->bytecode[id++] =
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(cf->kcache[2].bank) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(cf->kcache[3].bank) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(cf->kcache[2].mode);
+			bc->bytecode[id++] = EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_EXTENDED |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(cf->kcache[3].mode) |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(cf->kcache[2].addr) |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(cf->kcache[3].addr) |
+				S_SQ_CF_ALU_WORD1_EXT_BARRIER(1);
+		}
+
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
 			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
 			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -78,6 +78,10 @@
 #define   S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) & 0x3) << 30)
 #define   G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) >> 30) & 0x3)
 #define   C_SQ_CF_ALU_WORD0_KCACHE_MODE0                             0x3FFFFFFF
+#define     V_SQ_CF_KCACHE_NOP                                       0x00000000
+#define     V_SQ_CF_KCACHE_LOCK_1                                    0x00000001
+#define     V_SQ_CF_KCACHE_LOCK_2                                    0x00000002
+#define     V_SQ_CF_KCACHE_LOCK_LOOP_INDEX                           0x00000003
 #define P_SQ_CF_ALU_WORD1
 #define   S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) & 0x3) << 0)
 #define   G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) >> 0) & 0x3)
@@ -103,7 +107,50 @@
 #define   S_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) & 0x1) << 31)
 #define   G_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) >> 31) & 0x1)
 #define   C_SQ_CF_ALU_WORD1_BARRIER                                  0x7FFFFFFF
-/* extended TODO */
+
+#define P_SQ_CF_ALU_WORD0_EXT
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) & 0x3) << 4)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) >> 4) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0              0xFFFFFFCF
+#define	    V_SQ_CF_INDEX_NONE                                       0x00
+#define	    V_SQ_CF_INDEX_0                                          0x01
+#define	    V_SQ_CF_INDEX_1                                          0x02
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) & 0x3) << 6)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) >> 6) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1              0xFFFFFF3F
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) & 0x3) << 8)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) >> 8) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2              0xFFFFFCFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) & 0x3) << 10)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) >> 10) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3              0xFFFFF3FF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) & 0xF) << 22)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) >> 22) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2                         0xFC3FFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3                         0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) & 0x3) << 30)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2                         0x3FFFFFFF
+
+#define P_SQ_CF_ALU_WORD1_EXT
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) & 0x3) << 0)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) >> 0) & 0x3)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3                         0xFFFFFFFC
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) & 0xFF) << 2)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) >> 2) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2                         0xFFFFFC03
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) & 0xFF) << 10)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) >> 10) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3                         0xFFFC03FF
+#define   S_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD1_EXT_CF_INST                              0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) & 0x1) << 31)
+#define   G_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_EXT_BARRIER                              0x7FFFFFFF
+
 /* done */
 #define P_SQ_CF_ALLOC_EXPORT_WORD0
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) & 0x1FFF) << 0)
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -99,7 +99,9 @@ static const struct r600_reg evergreen_context_reg_list[] = {
 	{R_028058_DB_DEPTH_SIZE, 0, 0, 0},
 	{R_02805C_DB_DEPTH_SLICE, 0, 0, 0},
 	{R_028140_ALU_CONST_BUFFER_SIZE_PS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028144_ALU_CONST_BUFFER_SIZE_PS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028180_ALU_CONST_BUFFER_SIZE_VS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028184_ALU_CONST_BUFFER_SIZE_VS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028200_PA_SC_WINDOW_OFFSET, 0, 0, 0},
 	{R_028204_PA_SC_WINDOW_SCISSOR_TL, 0, 0, 0},
 	{R_028208_PA_SC_WINDOW_SCISSOR_BR, 0, 0, 0},
@@ -293,7 +295,9 @@ static const struct r600_reg evergreen_context_reg_list[] = {
 	{R_028924_SQ_GS_VERT_ITEMSIZE_2, 0, 0, 0},
 	{R_028928_SQ_GS_VERT_ITEMSIZE_3, 0, 0, 0},
 	{R_028940_ALU_CONST_CACHE_PS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028944_ALU_CONST_CACHE_PS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028980_ALU_CONST_CACHE_VS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028984_ALU_CONST_CACHE_VS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028A00_PA_SU_POINT_SIZE, 0, 0, 0},
 	{R_028A04_PA_SU_POINT_MINMAX, 0, 0, 0},
 	{R_028A08_PA_SU_LINE_CNTL, 0, 0, 0},
@@ -465,7 +469,9 @@ static const struct r600_reg cayman_context_reg_list[] = {
 	{R_028058_DB_DEPTH_SIZE, 0, 0, 0},
 	{R_02805C_DB_DEPTH_SLICE, 0, 0, 0},
 	{R_028140_ALU_CONST_BUFFER_SIZE_PS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028144_ALU_CONST_BUFFER_SIZE_PS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028180_ALU_CONST_BUFFER_SIZE_VS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028184_ALU_CONST_BUFFER_SIZE_VS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028200_PA_SC_WINDOW_OFFSET, 0, 0, 0},
 	{R_028204_PA_SC_WINDOW_SCISSOR_TL, 0, 0, 0},
 	{R_028208_PA_SC_WINDOW_SCISSOR_BR, 0, 0, 0},
@@ -658,7 +664,9 @@ static const struct r600_reg cayman_context_reg_list[] = {
 	{R_028924_SQ_GS_VERT_ITEMSIZE_2, 0, 0, 0},
 	{R_028928_SQ_GS_VERT_ITEMSIZE_3, 0, 0, 0},
 	{R_028940_ALU_CONST_CACHE_PS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028944_ALU_CONST_CACHE_PS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028980_ALU_CONST_CACHE_VS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028984_ALU_CONST_CACHE_VS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028A00_PA_SU_POINT_SIZE, 0, 0, 0},
 	{R_028A04_PA_SU_POINT_MINMAX, 0, 0, 0},
 	{R_028A08_PA_SU_LINE_CNTL, 0, 0, 0},
@@ -922,7 +930,7 @@ int evergreen_context_init(struct r600_context *ctx, struct r600_screen *screen)
 	}

 	/* add blocks */
-	if (ctx->screen->family == CHIP_CAYMAN)
+	if (ctx->screen->family >= CHIP_CAYMAN)
 		r = r600_context_add_block(ctx, cayman_config_reg_list,
 					   Elements(cayman_config_reg_list), PKT3_SET_CONFIG_REG, EVERGREEN_CONFIG_REG_OFFSET);
 	else
@@ -930,7 +938,7 @@ int evergreen_context_init(struct r600_context *ctx, struct r600_screen *screen)
 					   Elements(evergreen_config_reg_list), PKT3_SET_CONFIG_REG, EVERGREEN_CONFIG_REG_OFFSET);
 	if (r)
 		goto out_err;
-	if (ctx->screen->family == CHIP_CAYMAN)
+	if (ctx->screen->family >= CHIP_CAYMAN)
 		r = r600_context_add_block(ctx, cayman_context_reg_list,
 					   Elements(cayman_context_reg_list), PKT3_SET_CONTEXT_REG, EVERGREEN_CONTEXT_REG_OFFSET);
 	else
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -508,6 +508,10 @@ static uint32_t r600_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 		return V_028C70_COLOR_X24_8_32_FLOAT;

+	case PIPE_FORMAT_R32_UINT:
+	case PIPE_FORMAT_R32_SINT:
+		return V_028C70_COLOR_32;
+
 	case PIPE_FORMAT_R32_FLOAT:
 	case PIPE_FORMAT_Z32_FLOAT:
 		return V_028C70_COLOR_32_FLOAT;
@@ -902,6 +906,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	rs->clamp_fragment_color = state->clamp_fragment_color;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
+	rs->two_side = state->light_twoside;
+	rs->clip_plane_enable = state->clip_plane_enable;

 	clip_rule = state->scissor ? 0xAAAA : 0xFFFF;

@@ -939,8 +945,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 		S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(r600_translate_fill(state->fill_back)), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02881C_PA_CL_VS_OUT_CNTL,
-			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) |
-			S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex), 0xFFFFFFFF, NULL, 0);
+			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex),
+			S_02881C_USE_VTX_POINT_SIZE(1), NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028820_PA_CL_NANINF_CNTL, 0x00000000, 0xFFFFFFFF, NULL, 0);
 	/* point size 12.4 fixed point */
 	tmp = (unsigned)(state->point_size * 8.0);
@@ -987,9 +993,10 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028810_PA_CL_CLIP_CNTL,
-			S_028810_PS_UCP_MODE(3) | (state->clip_plane_enable & 63) |
-			S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
-			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip), 0xFFFFFFFF, NULL, 0);
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
+			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip),
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(1) |
+			S_028810_ZCLIP_FAR_DISABLE(1), NULL, 0);
 	return rstate;
 }

@@ -1039,6 +1046,7 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 							struct pipe_resource *texture,
 							const struct pipe_sampler_view *state)
 {
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_sampler_view *view = CALLOC_STRUCT(r600_pipe_sampler_view);
 	struct r600_pipe_resource_state *rstate;
 	struct r600_resource_texture *tmp = (struct r600_resource_texture*)texture;
@@ -1085,6 +1093,11 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 		      util_format_get_blockwidth(state->format), 8);
 	array_mode = tmp->array_mode[0];
 	tile_type = tmp->tile_type;
+	/* 128 bit formats require tile type = 1 */
+	if (rctx->chip_class == CAYMAN) {
+		if (util_format_get_blocksize(state->format) >= 16)
+			tile_type = 1;
+	}

 	if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
 	        height = 1;
@@ -1100,8 +1113,11 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte

 	rstate->val[0] = (S_030000_DIM(r600_tex_dim(texture->target)) |
 			  S_030000_PITCH((pitch / 8) - 1) |
-			  S_030000_NON_DISP_TILING_ORDER(tile_type) |
 			  S_030000_TEX_WIDTH(texture->width0 - 1));
+	if (rctx->chip_class == CAYMAN)
+		rstate->val[0] |= CM_S_030000_NON_DISP_TILING_ORDER(tile_type);
+	else
+		rstate->val[0] |= S_030000_NON_DISP_TILING_ORDER(tile_type);
 	rstate->val[1] = (S_030004_TEX_HEIGHT(height - 1) |
 			  S_030004_TEX_DEPTH(depth - 1) |
 			  S_030004_ARRAY_MODE(array_mode));
@@ -1204,6 +1220,7 @@ static void evergreen_set_clip_state(struct pipe_context *ctx,
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+	struct pipe_resource *cbuf;

 	if (rstate == NULL)
 		return;
@@ -1228,6 +1245,13 @@ static void evergreen_set_clip_state(struct pipe_context *ctx,
 	free(rctx->states[R600_PIPE_STATE_CLIP]);
 	rctx->states[R600_PIPE_STATE_CLIP] = rstate;
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
+
+	cbuf = pipe_user_buffer_create(ctx->screen,
+                                   state->ucp,
+                                   4*4*8, /* 8*4 floats */
+                                   PIPE_BIND_CONSTANT_BUFFER);
+	r600_set_constant_buffer(ctx, PIPE_SHADER_VERTEX, 1, cbuf);
+	pipe_resource_reference(&cbuf, NULL);
 }

 static void evergreen_set_polygon_stipple(struct pipe_context *ctx,
@@ -1442,6 +1466,11 @@ static void evergreen_cb(struct r600_pipe_context *rctx, struct r600_pipe_state
 		tile_type = rtex->tile_type;
 	} else /* workaround for linear buffers */
 		tile_type = 1;
+	/* 128 bit formats require tile type = 1 */
+	if (rctx->chip_class == CAYMAN) {
+		if (util_format_get_blocksize(surf->base.format) >= 16)
+			tile_type = 1;
+	}

 	/* FIXME handle enabling of CB beyond BASE8 which has different offset */
 	r600_pipe_state_add_reg(rstate,
@@ -2462,6 +2491,16 @@ void evergreen_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader
 	r600_pipe_state_add_reg(rstate,
 				R_03A200_SQ_LOOP_CONST_0 + (32 * 4), 0x01000FFF,
 				0xFFFFFFFF, NULL, 0);
+
+	r600_pipe_state_add_reg(rstate,
+				R_02881C_PA_CL_VS_OUT_CNTL,
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA((rshader->clip_dist_write & 0x0F) != 0) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA((rshader->clip_dist_write & 0xF0) != 0) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(rshader->vs_out_misc_write),
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA(1) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA(1) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(1),
+				NULL, 0);
 }

 void evergreen_fetch_shader(struct pipe_context *ctx,
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -675,13 +675,6 @@
 #define   G_028814_MULTI_PRIM_IB_ENA(x)                (((x) >> 21) & 0x1)
 #define   C_028814_MULTI_PRIM_IB_ENA                   0xFFDFFFFF

-#define R_028004_DB_DEPTH_VIEW                       0x028004
-#define   S_028004_SLICE_START(x)                      (((x) & 0x7FF) << 0)
-#define   G_028004_SLICE_START(x)                      (((x) >> 0) & 0x7FF)
-#define   C_028004_SLICE_START                         0xFFFFF800
-#define   S_028004_SLICE_MAX(x)                        (((x) & 0x7FF) << 13)
-#define   G_028004_SLICE_MAX(x)                        (((x) >> 13) & 0x7FF)
-#define   C_028004_SLICE_MAX                           0xFF001FFF
 #define R_028D24_DB_HTILE_SURFACE                    0x028D24
 #define   S_028D24_HTILE_WIDTH(x)                      (((x) & 0x1) << 0)
 #define   G_028D24_HTILE_WIDTH(x)                      (((x) >> 0) & 0x1)
@@ -977,6 +970,9 @@
 #define   S_030000_NON_DISP_TILING_ORDER(x)            (((x) & 0x1) << 5)
 #define   G_030000_NON_DISP_TILING_ORDER(x)            (((x) >> 5) & 0x1)
 #define   C_030000_NON_DISP_TILING_ORDER               0xFFFFFFDF
+#define   CM_S_030000_NON_DISP_TILING_ORDER(x)         (((x) & 0x3) << 4)
+#define   CM_G_030000_NON_DISP_TILING_ORDER(x)         (((x) >> 4) & 0x3)
+#define   CM_C_030000_NON_DISP_TILING_ORDER            0xFFFFFFCF
 #define   S_030000_PITCH(x)                            (((x) & 0xFFF) << 6)
 #define   G_030000_PITCH(x)                            (((x) >> 6) & 0xFFF)
 #define   C_030000_PITCH                               0xFFFC003F
@@ -1469,6 +1465,12 @@
 #define   S_028004_ZPASS_INCREMENT_DISABLE        (((x) & 0x1) << 0)
 #define   S_028004_PERFECT_ZPASS_COUNTS(x)        (((x) & 0x1) << 1)
 #define R_028008_DB_DEPTH_VIEW                       0x00028008
+#define   S_028008_SLICE_START(x)                      (((x) & 0x7FF) << 0)
+#define   G_028008_SLICE_START(x)                      (((x) >> 0) & 0x7FF)
+#define   C_028008_SLICE_START                         0xFFFFF800
+#define   S_028008_SLICE_MAX(x)                        (((x) & 0x7FF) << 13)
+#define   G_028008_SLICE_MAX(x)                        (((x) >> 13) & 0x7FF)
+#define   C_028008_SLICE_MAX                           0xFF001FFF
 #define R_02800C_DB_RENDER_OVERRIDE                  0x0002800C
 #define   V_02800C_FORCE_OFF                         0
 #define   V_02800C_FORCE_ENABLE                      1
@@ -1524,7 +1526,9 @@
 #define R_028050_DB_Z_WRITE_BASE                     0x00028050
 #define R_028054_DB_STENCIL_WRITE_BASE               0x00028054
 #define R_028140_ALU_CONST_BUFFER_SIZE_PS_0          0x00028140
+#define R_028144_ALU_CONST_BUFFER_SIZE_PS_1          0x00028144
 #define R_028180_ALU_CONST_BUFFER_SIZE_VS_0          0x00028180
+#define R_028184_ALU_CONST_BUFFER_SIZE_VS_1          0x00028184
 #define R_028200_PA_SC_WINDOW_OFFSET                 0x00028200
 #define R_02820C_PA_SC_CLIPRECT_RULE                 0x0002820C
 #define R_028210_PA_SC_CLIPRECT_0_TL                 0x00028210
@@ -1701,7 +1705,9 @@
 #define R_028924_SQ_GS_VERT_ITEMSIZE_2               0x00028924
 #define R_028928_SQ_GS_VERT_ITEMSIZE_3               0x00028928
 #define R_028940_ALU_CONST_CACHE_PS_0                0x00028940
+#define R_028944_ALU_CONST_CACHE_PS_1                0x00028944
 #define R_028980_ALU_CONST_CACHE_VS_0                0x00028980
+#define R_028984_ALU_CONST_CACHE_VS_1                0x00028984
 #define R_028A04_PA_SU_POINT_MINMAX                  0x00028A04
 #define R_028A08_PA_SU_LINE_CNTL                     0x00028A08
 #define   S_028A08_WIDTH(x)                            (((x) & 0xFFFF) << 0)
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -66,6 +66,7 @@ enum radeon_family {
 	CHIP_TURKS,
 	CHIP_CAICOS,
 	CHIP_CAYMAN,
+	CHIP_ARUBA,
 	CHIP_LAST,
 };

--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -91,6 +91,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
@@ -236,8 +237,18 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
 	return tex;
 }

-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class)
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
 {
+	if ((chip_class == R600) && (family != CHIP_RV670))
+		bc->ar_handling = AR_HANDLE_RV6XX;
+	else
+		bc->ar_handling = AR_HANDLE_NORMAL;
+
+	if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 &&
+					   family != CHIP_RS880))
+		bc->r6xx_nop_after_rel_dst = 1;
+	else
+		bc->r6xx_nop_after_rel_dst = 0;
 	LIST_INITHEAD(&bc->cf);
 	bc->chip_class = chip_class;
 }
@@ -249,8 +260,14 @@ static int r600_bytecode_add_cf(struct r600_bytecode *bc)
 	if (cf == NULL)
 		return -ENOMEM;
 	LIST_ADDTAIL(&cf->list, &bc->cf);
-	if (bc->cf_last)
+	if (bc->cf_last) {
 		cf->id = bc->cf_last->id + 2;
+		if (bc->cf_last->eg_alu_extended) {
+			/* take into account extended alu size */
+			cf->id += 2;
+			bc->ndw += 2;
+		}
+	}
 	bc->cf_last = cf;
 	bc->ncf++;
 	bc->ndw += 2;
@@ -428,7 +445,8 @@ static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *
 		return !alu->is_op3 && (
 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
-			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
+			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
 	case EVERGREEN:
 	case CAYMAN:
 	default:
@@ -444,7 +462,8 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 	case R600:
 	case R700:
 		return is_alu_reduction_inst(bc, alu) ||
-			is_alu_mova_inst(bc, alu);
+			(is_alu_mova_inst(bc, alu) && 
+			 (alu->inst != V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT));
 	case EVERGREEN:
 	case CAYMAN:
 	default:
@@ -452,6 +471,7 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 			is_alu_mova_inst(bc, alu) ||
 			(alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
 			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR ||
+			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_LOAD_P0 ||
 			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY ||
 			 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW);
 	}
@@ -465,6 +485,7 @@ static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode
 	case R700:
 		if (!alu->is_op3)
 			return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
+				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT ||
 				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
 			        alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT ||
 				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
@@ -536,6 +557,19 @@ static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 		!is_alu_trans_unit_inst(bc, alu);
 }

+static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
+{
+	switch (bc->chip_class) {
+	case R600:
+	case R700:
+		return (!alu->is_op3 && alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
+	case EVERGREEN:
+	case CAYMAN:
+	default:
+		return (!alu->is_op3 && alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
+	}
+}		
+
 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
 			    struct r600_bytecode_alu *assignment[5])
 {
@@ -688,7 +722,7 @@ static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
 					return r;
 			}
 		} else if (is_cfile(sel)) {
-			r = reserve_cfile(bc, bs, sel, elem);
+			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
 			if (r)
 				return r;
 		}
@@ -715,7 +749,7 @@ static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
 				const_count++;
 		}
 		if (is_cfile(sel)) {
-			r = reserve_cfile(bc, bs, sel, elem);
+			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
 			if (r)
 				return r;
 		}
@@ -1037,6 +1071,10 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
 		alu = slots[i];
 		num_once_inst += is_alu_once_inst(bc, alu);

+		/* don't reschedule NOPs */
+		if (is_nop_inst(bc, alu))
+			return 0;
+
 		/* Let's check dst gpr. */
 		if (alu->dst.rel) {
 			if (have_mova)
@@ -1111,117 +1149,203 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
 	return 0;
 }

-/* This code handles kcache lines as single blocks of 32 constants. We could
- * probably do slightly better by recognizing that we actually have two
- * consecutive lines of 16 constants, but the resulting code would also be
- * somewhat more complicated. */
-static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+/* we'll keep kcache sets sorted by bank & addr */
+static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
+		struct r600_bytecode_kcache *kcache,
+		unsigned bank, unsigned line)
 {
-	struct r600_bytecode_kcache *kcache = bc->cf_last->kcache;
-	unsigned int required_lines;
-	unsigned int free_lines = 0;
-	unsigned int cache_line[3];
-	unsigned int count = 0;
-	unsigned int i, j;
-	int r;
+	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;

-	/* Collect required cache lines. */
-	for (i = 0; i < 3; ++i) {
-		boolean found = false;
-		unsigned int line;
+	for (i = 0; i < kcache_banks; i++) {
+		if (kcache[i].mode) {
+			int d;

-		if (alu->src[i].sel < 512)
+			if (kcache[i].bank < bank)
+				continue;
+
+			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
+					kcache[i].bank > bank) {
+				/* try to insert new line */
+				if (kcache[kcache_banks-1].mode) {
+					/* all sets are in use */
+					return -ENOMEM;
+				}
+
+				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
+				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+				kcache[i].bank = bank;
+				kcache[i].addr = line;
+				return 0;
+			}
+
+			d = line - kcache[i].addr;
+
+			if (d == -1) {
+				kcache[i].addr--;
+				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
+					/* we are prepending the line to the current set,
+					 * discarding the existing second line,
+					 * so we'll have to insert line+2 after it */
+					line += 2;
+					continue;
+				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
+					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+					return 0;
+				} else {
+					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
+					return -ENOMEM;
+				}
+			} else if (d == 1) {
+				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+				return 0;
+			} else if (d == 0)
+				return 0;
+		} else { /* free kcache set - use it */
+			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+			kcache[i].bank = bank;
+			kcache[i].addr = line;
+			return 0;
+		}
+	}
+	return -ENOMEM;
+}
+
+static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
+		struct r600_bytecode_kcache *kcache,
+		struct r600_bytecode_alu *alu)
+{
+	int i, r;
+
+	for (i = 0; i < 3; i++) {
+		unsigned bank, line, sel = alu->src[i].sel;
+
+		if (sel < 512)
 			continue;

-		line = ((alu->src[i].sel - 512) / 32) * 2;
+		bank = alu->src[i].kc_bank;
+		line = (sel-512)>>4;

-		for (j = 0; j < count; ++j) {
-			if (cache_line[j] == line) {
-				found = true;
-				break;
-			}
-		}
-
-		if (!found)
-			cache_line[count++] = line;
-	}
-
-	/* This should never actually happen. */
-	if (count >= 3) return -ENOMEM;
-
-	for (i = 0; i < 2; ++i) {
-		if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
-			++free_lines;
-		}
-	}
-
-	/* Filter lines pulled in by previous intructions. Note that this is
-	 * only for the required_lines count, we can't remove these from the
-	 * cache_line array since we may have to start a new ALU clause. */
-	for (i = 0, required_lines = count; i < count; ++i) {
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == cache_line[i]) {
-				--required_lines;
-				break;
-			}
-		}
-	}
-
-	/* Start a new ALU clause if needed. */
-	if (required_lines > free_lines) {
-		if ((r = r600_bytecode_add_cf(bc))) {
+		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
 			return r;
-		}
-		bc->cf_last->inst = type;
-		kcache = bc->cf_last->kcache;
 	}
+	return 0;
+}

-	/* Setup the kcache lines. */
-	for (i = 0; i < count; ++i) {
-		boolean found = false;
-
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == cache_line[i]) {
-				found = true;
-				break;
-			}
-		}
-
-		if (found) continue;
-
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
-				kcache[j].bank = 0;
-				kcache[j].addr = cache_line[i];
-				kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
-				break;
-			}
-		}
-	}
+static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
+		struct r600_bytecode_alu *alu,
+		struct r600_bytecode_kcache * kcache)
+{
+	int i, j;

 	/* Alter the src operands to refer to the kcache. */
 	for (i = 0; i < 3; ++i) {
 		static const unsigned int base[] = {128, 160, 256, 288};
-		unsigned int line;
+		unsigned int line, sel = alu->src[i].sel, found = 0;

-		if (alu->src[i].sel < 512)
+		if (sel < 512)
 			continue;

-		alu->src[i].sel -= 512;
-		line = (alu->src[i].sel / 32) * 2;
+		sel -= 512;
+		line = sel>>4;

-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == line) {
-				alu->src[i].sel &= 0x1f;
-				alu->src[i].sel += base[j];
-				break;
+		for (j = 0; j < 4 && !found; ++j) {
+			switch (kcache[j].mode) {
+			case V_SQ_CF_KCACHE_NOP:
+			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
+				R600_ERR("unexpected kcache line mode\n");
+				return -ENOMEM;
+			default:
+				if (kcache[j].bank == alu->src[i].kc_bank &&
+						kcache[j].addr <= line &&
+						line < kcache[j].addr + kcache[j].mode) {
+					alu->src[i].sel = sel - (kcache[j].addr<<4);
+					alu->src[i].sel += base[j];
+					found=1;
+			    }
 			}
 		}
 	}
+	return 0;
+}

+static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+{
+	struct r600_bytecode_kcache kcache_sets[4];
+	struct r600_bytecode_kcache *kcache = kcache_sets;
+	int r;
+
+	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
+
+	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+		/* can't alloc, need to start new clause */
+		if ((r = r600_bytecode_add_cf(bc))) {
+			return r;
+		}
+		bc->cf_last->inst = type;
+
+		/* retry with the new clause */
+		kcache = bc->cf_last->kcache;
+		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+			/* still can't allocate - this should never happen */
+			return r;
+		}
+	} else {
+		/* update kcache sets */
+		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
+	}
+
+	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
+	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
+		if (bc->chip_class < EVERGREEN)
+			return -ENOMEM;
+		bc->cf_last->eg_alu_extended = 1;
+	}
+
+	return 0;
+}
+
+static int insert_nop_r6xx(struct r600_bytecode *bc)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+		alu.src[0].chan = i;
+		alu.dst.chan = i;
+		alu.last = (i == 3);
+		r = r600_bytecode_add_alu(bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
+/* load AR register from gpr (bc->ar_reg) with MOVA_GPR_INT */
+static int load_ar_r6xx(struct r600_bytecode *bc)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	if (bc->ar_loaded)
+		return 0;
+
+	/* hack to avoid making MOVA the last instruction in the clause */
+	if ((bc->cf_last->ndw>>1) >= 110)
+		bc->force_add_cf = 1;
+
+	memset(&alu, 0, sizeof(alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
+	alu.src[0].sel = bc->ar_reg;
+	alu.last = 1;
+	alu.index_mode = INDEX_MODE_LOOP;
+	r = r600_bytecode_add_alu(bc, &alu);
+	if (r)
+		return r;
+
+	/* there is no requirement to set "uses waterfall" for MOVA_GPR_INT */
+	bc->ar_loaded = 1;
 	return 0;
 }

@@ -1231,6 +1355,9 @@ static int load_ar(struct r600_bytecode *bc)
 	struct r600_bytecode_alu alu;
 	int r;

+	if (bc->ar_handling)
+		return load_ar_r6xx(bc);
+
 	if (bc->ar_loaded)
 		return 0;

@@ -1365,6 +1492,10 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytec
 		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
 		bc->cf_last->curr_bs_head = NULL;
 	}
+
+	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
+		insert_nop_r6xx(bc);
+
 	return 0;
 }

@@ -1588,6 +1719,7 @@ static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecod
 				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
 				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
 				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
 				S_SQ_ALU_WORD0_LAST(alu->last);

 	if (alu->is_op3) {
@@ -1837,6 +1969,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 					if (r)
 						return r;
 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+
 					switch(bc->chip_class) {
 					case EVERGREEN: /* eg alu is same encoding as r700 */
 					case CAYMAN:
@@ -1932,6 +2066,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 					if (r)
 						return r;
 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+
 					switch(bc->chip_class) {
 					case R600:
 						r = r600_bytecode_alu_build(bc, alu, addr);
@@ -2072,6 +2208,19 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+				if (cf->eg_alu_extended) {
+					fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
+					fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
+					fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
+					fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
+					id++;
+					fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
+					fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
+					fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
+					fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
+					id++;
+				}
+
 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
 				fprintf(stderr, "ADDR:%d ", cf->addr);
 				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
@@ -2275,7 +2424,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
 			fprintf(stderr, "REL:%d ", alu->src[1].rel);
 			fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
-			fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
+			fprintf(stderr, "NEG:%d ", alu->src[1].neg);
+			fprintf(stderr, "IM:%d) ", alu->index_mode);
 			fprintf(stderr, "LAST:%d)\n", alu->last);
 			id++;
 			fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
@@ -2539,7 +2689,7 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru
 	unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
 	unsigned format, num_format, format_comp, endian;
 	u32 *bytecode;
-	int i, r;
+	int i, j, r;

 	/* Vertex element offsets need special handling. If the offset is
 	 * bigger than what we can put in the fetch instruction we need to
@@ -2554,28 +2704,44 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru
 	}

 	memset(&bc, 0, sizeof(bc));
-	r600_bytecode_init(&bc, rctx->chip_class);
+	r600_bytecode_init(&bc, rctx->chip_class, rctx->family);

 	for (i = 0; i < ve->count; i++) {
 		if (elements[i].instance_divisor > 1) {
-			struct r600_bytecode_alu alu;
-
-			memset(&alu, 0, sizeof(alu));
-			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
-			alu.src[0].sel = 0;
-			alu.src[0].chan = 3;
-
-			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
-			alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
-
-			alu.dst.sel = i + 1;
-			alu.dst.chan = 3;
-			alu.dst.write = 1;
-			alu.last = 1;
-
-			if ((r = r600_bytecode_add_alu(&bc, &alu))) {
-				r600_bytecode_clear(&bc);
-				return r;
+			if (rctx->chip_class == CAYMAN) {
+				for (j = 0; j < 4; j++) {
+					struct r600_bytecode_alu alu;
+					memset(&alu, 0, sizeof(alu));
+					alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
+					alu.src[0].sel = 0;
+					alu.src[0].chan = 3;
+					alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+					alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
+					alu.dst.sel = i + 1;
+					alu.dst.chan = j;
+					alu.dst.write = j == 3;
+					alu.last = j == 3;
+					if ((r = r600_bytecode_add_alu(&bc, &alu))) {
+						r600_bytecode_clear(&bc);
+						return r;
+					}
+				}
+			} else {
+				struct r600_bytecode_alu alu;
+				memset(&alu, 0, sizeof(alu));
+				alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
+				alu.src[0].sel = 0;
+				alu.src[0].chan = 3;
+				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+				alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
+				alu.dst.sel = i + 1;
+				alu.dst.chan = 3;
+				alu.dst.write = 1;
+				alu.last = 1;
+				if ((r = r600_bytecode_add_alu(&bc, &alu))) {
+					r600_bytecode_clear(&bc);
+					return r;
+				}
 			}
 		}
 	}
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -32,6 +32,7 @@ struct r600_bytecode_alu_src {
 	unsigned			neg;
 	unsigned			abs;
 	unsigned			rel;
+	unsigned			kc_bank;
 	uint32_t			value;
 };

@@ -54,6 +55,7 @@ struct r600_bytecode_alu {
 	unsigned			bank_swizzle;
 	unsigned			bank_swizzle_force;
 	unsigned			omod;
+	unsigned                        index_mode;
 };

 struct r600_bytecode_tex {
@@ -143,8 +145,9 @@ struct r600_bytecode_cf {
 	unsigned			cond;
 	unsigned			pop_count;
 	unsigned			cf_addr; /* control flow addr */
-	struct r600_bytecode_kcache		kcache[2];
+	struct r600_bytecode_kcache		kcache[4];
 	unsigned			r6xx_uses_waterfall;
+	unsigned			eg_alu_extended;
 	struct list_head		alu;
 	struct list_head		tex;
 	struct list_head		vtx;
@@ -176,6 +179,10 @@ struct r600_cf_callstack {
 	int				max;
 };

+#define AR_HANDLE_NORMAL 0 /* values for r600_bytecode.ar_handling; load_ar() defers to load_ar_r6xx() when the field is non-zero */
+#define AR_HANDLE_RV6XX 1 /* except RV670 */
+
+
 struct r600_bytecode {
 	enum chip_class			chip_class;
 	int				type;
@@ -194,13 +201,15 @@ struct r600_bytecode {
 	struct r600_cf_callstack	callstack[SQ_MAX_CALL_DEPTH];
 	unsigned	ar_loaded;
 	unsigned	ar_reg;
+	unsigned        ar_handling;
+	unsigned        r6xx_nop_after_rel_dst;
 };

 /* eg_asm.c */
 int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf);

 /* r600_asm.c */
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class);
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family);
 void r600_bytecode_clear(struct r600_bytecode *bc);
 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu);
 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx);
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -408,9 +408,13 @@ static const struct r600_reg r600_context_reg_list[] = {
 	{R_028128_CB_CLEAR_BLUE, 0, 0, 0},
 	{R_02812C_CB_CLEAR_ALPHA, 0, 0, 0},
 	{R_028140_ALU_CONST_BUFFER_SIZE_PS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028144_ALU_CONST_BUFFER_SIZE_PS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028180_ALU_CONST_BUFFER_SIZE_VS_0, REG_FLAG_DIRTY_ALWAYS, 0, 0},
+	{R_028184_ALU_CONST_BUFFER_SIZE_VS_1, REG_FLAG_DIRTY_ALWAYS, 0, 0},
 	{R_028940_ALU_CONST_CACHE_PS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028944_ALU_CONST_CACHE_PS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_028980_ALU_CONST_CACHE_VS_0, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
+	{R_028984_ALU_CONST_CACHE_VS_1, REG_FLAG_NEED_BO, S_0085F0_SH_ACTION_ENA(1), 0xFFFFFFFF},
 	{R_02823C_CB_SHADER_MASK, 0, 0, 0},
 	{R_028238_CB_TARGET_MASK, 0, 0, 0},
 	{R_028410_SX_ALPHA_TEST_CONTROL, 0, 0, 0},
@@ -1326,15 +1330,20 @@ void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *
 			if (block->pm4_bo_index[j]) {
 				/* find relocation */
 				struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
-				block->pm4[reloc->bo_pm4_index] =
-					r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
-				r600_context_bo_flush(ctx,
-						      reloc->flush_flags,
-						      reloc->flush_mask,
-						      reloc->bo);
+				if (reloc->bo) {
+					block->pm4[reloc->bo_pm4_index] =
+							r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
+					r600_context_bo_flush(ctx,
+							reloc->flush_flags,
+							reloc->flush_mask,
+							reloc->bo);
+				} else {
+					block->pm4[reloc->bo_pm4_index] = 0;
+				}
 				nbo--;
 				if (nbo == 0)
 					break;
+
 			}
 		}
 		ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -47,6 +47,7 @@
 #include "r600_resource.h"
 #include "r600_shader.h"
 #include "r600_pipe.h"
+#include "r600_hw_context_priv.h"

 /*
 * pipe_context
@@ -116,6 +117,14 @@ static struct r600_fence *r600_create_fence(struct r600_pipe_context *ctx)

 	rscreen->fences.data[fence->index] = 0;
 	r600_context_emit_fence(&ctx->ctx, rscreen->fences.bo, fence->index, 1);
+
+	/* Create a dummy BO so that fence_finish without a timeout can sleep waiting for completion */
+	fence->sleep_bo = (struct r600_resource*)
+			pipe_buffer_create(&ctx->ctx.screen->screen, PIPE_BIND_CUSTOM,
+					   PIPE_USAGE_STAGING, 1);
+	/* Add the fence as a dummy relocation. */
+	r600_context_bo_reloc(&ctx->ctx, fence->sleep_bo, RADEON_USAGE_READWRITE);
+
 out:
 	pipe_mutex_unlock(rscreen->fences.mutex);
 	return fence;
@@ -324,6 +333,7 @@ static const char *r600_get_family_name(enum radeon_family family)
 	case CHIP_TURKS: return "AMD TURKS";
 	case CHIP_CAICOS: return "AMD CAICOS";
 	case CHIP_CAYMAN: return "AMD CAYMAN";
+	case CHIP_ARUBA: return "AMD ARUBA";
 	default: return "AMD unknown";
 	}
 }
@@ -492,7 +502,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_MAX_CONSTS:
 		return R600_MAX_CONST_BUFFER_SIZE;
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-		return R600_MAX_CONST_BUFFERS;
+		return R600_MAX_CONST_BUFFERS-1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
 		return 0; /* FIXME */
 	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
@@ -505,8 +515,6 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_SUBROUTINES:
 		return 0;
 	case PIPE_SHADER_CAP_INTEGERS:
-		if (rscreen->chip_class == EVERGREEN)
-			return 1;
 		return 0;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 		return 16;
@@ -570,6 +578,7 @@ static void r600_fence_reference(struct pipe_screen *pscreen,
 	if (pipe_reference(&(*oldf)->reference, &newf->reference)) {
 		struct r600_screen *rscreen = (struct r600_screen *)pscreen;
 		pipe_mutex_lock(rscreen->fences.mutex);
+		pipe_resource_reference((struct pipe_resource**)&(*oldf)->sleep_bo, NULL);
 		LIST_ADDTAIL(&(*oldf)->head, &rscreen->fences.pool);
 		pipe_mutex_unlock(rscreen->fences.mutex);
 	}
@@ -603,6 +612,17 @@ static boolean r600_fence_finish(struct pipe_screen *pscreen,
 	}

 	while (rscreen->fences.data[rfence->index] == 0) {
+		/* Special-case infinite timeout - wait for the dummy BO to become idle */
+		if (timeout == PIPE_TIMEOUT_INFINITE) {
+			rscreen->ws->buffer_wait(rfence->sleep_bo->buf, RADEON_USAGE_READWRITE);
+			break;
+		}
+
+		/* The dummy BO will be busy until the CS including the fence has completed, or
+		 * the GPU is reset. Don't bother continuing to spin when the BO is idle. */
+		if (!rscreen->ws->buffer_is_busy(rfence->sleep_bo->buf, RADEON_USAGE_READWRITE))
+			break;
+
 		if (++spins % 256)
 			continue;
 #ifdef PIPE_OS_UNIX
@@ -612,11 +632,11 @@ static boolean r600_fence_finish(struct pipe_screen *pscreen,
 #endif
 		if (timeout != PIPE_TIMEOUT_INFINITE &&
 		    os_time_get() - start_time >= timeout) {
-			return FALSE;
+			break;
 		}
 	}

-	return TRUE;
+	return rscreen->fences.data[rfence->index] != 0;
 }

 static int r600_interpret_tiling(struct r600_screen *rscreen, uint32_t tiling_config)
@@ -758,7 +778,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 	}

 	/* setup class */
-	if (rscreen->family == CHIP_CAYMAN) {
+	if (rscreen->family >= CHIP_CAYMAN) {
 		rscreen->chip_class = CAYMAN;
 	} else if (rscreen->family >= CHIP_CEDAR) {
 		rscreen->chip_class = EVERGREEN;
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -39,7 +39,7 @@
 #include "r600_shader.h"
 #include "r600_resource.h"

-#define R600_MAX_CONST_BUFFERS 1
+#define R600_MAX_CONST_BUFFERS 2
 #define R600_MAX_CONST_BUFFER_SIZE 4096

 #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -108,7 +108,9 @@ struct r600_pipe_rasterizer {
 	boolean				clamp_vertex_color;
 	boolean				clamp_fragment_color;
 	boolean				flatshade;
+	boolean				two_side;
 	unsigned			sprite_coord_enable;
+	unsigned                        clip_plane_enable;
 	float				offset_units;
 	float				offset_scale;
 };
@@ -170,6 +172,7 @@ struct r600_textures_info {
 struct r600_fence {
 	struct pipe_reference		reference;
 	unsigned			index; /* in the shared bo */
+	struct r600_resource            *sleep_bo; /* dummy BO added as a relocation at fence creation; r600_fence_finish() waits on it or polls its busy state */
 	struct list_head		head;
 };

@@ -218,6 +221,9 @@ struct r600_pipe_context {
 	/* shader information */
 	boolean				clamp_vertex_color;
 	boolean				clamp_fragment_color;
+	boolean				two_side;
+	unsigned			user_clip_plane_enable;
+	unsigned			clip_dist_enable;
 	unsigned			sprite_coord_enable;
 	boolean				export_16bpc;
 	unsigned			alpha_ref;
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -191,6 +191,10 @@ struct r600_shader_ctx {
 	boolean                                 input_linear;
 	boolean                                 input_perspective;
 	int					num_interp_gpr;
+	int					face_gpr;
+	int					colors_used;
+	boolean                 clip_vertex_write;
+	unsigned                cv_output;
 };

 struct r600_shader_tgsi_instruction {
@@ -374,12 +378,6 @@ static int r600_spi_sid(struct r600_shader_io * io)
 			/* For generic params simply use sid from tgsi */
 			index = io->sid;
 		} else {
-
-			/* FIXME: two-side rendering is broken in r600g, this will
-			 * keep old functionality */
-			if (name == TGSI_SEMANTIC_BCOLOR)
-				name = TGSI_SEMANTIC_COLOR;
-
 			/* For non-generic params - pack name and sid into 8 bits */
 			index = 0x80 | (name<<3) | (io->sid);
 		}
@@ -393,6 +391,51 @@ static int r600_spi_sid(struct r600_shader_io * io)
 	return index;
 };

+/* Turn input `index` into an interpolated input on EG: when its spi_sid is non-zero, give it the next LDS slot (shader->nlds) and call evergreen_interp_alu() if interpolate > 0, else evergreen_interp_flat(). Returns that helper's result, or 0 when spi_sid is zero. */
+static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
+{
+	int r = 0;
+
+	if (ctx->shader->input[index].spi_sid) { /* inputs with spi_sid == 0 are left untouched */
+		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
+		if (ctx->shader->input[index].interpolate > 0) {
+			r = evergreen_interp_alu(ctx, index);
+		} else {
+			r = evergreen_interp_flat(ctx, index);
+		}
+	}
+	return r;
+}
+
+static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
+{
+	struct r600_bytecode_alu alu;
+	int i, r;
+	int gpr_front = ctx->shader->input[front].gpr;
+	int gpr_back = ctx->shader->input[back].gpr;
+	/* `front` and `back` are indices into shader->input[]. For each of the 4 channels queue an OP3 CNDGT with src0 = ctx->face_gpr, src1 = front color, src2 = back color, writing the result over the front color GPR (presumably front is kept when face > 0 - confirm against the ISA docs). Returns 0 or the first r600_bytecode_add_alu() error. */
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(alu));
+		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
+		alu.is_op3 = 1;
+		alu.dst.write = 1;
+		alu.dst.sel = gpr_front; /* result replaces the front color in place */
+		alu.src[0].sel = ctx->face_gpr; /* src0 channel stays 0 from the memset */
+		alu.src[1].sel = gpr_front;
+		alu.src[2].sel = gpr_back;
+
+		alu.dst.chan = i;
+		alu.src[1].chan = i;
+		alu.src[2].chan = i;
+		alu.last = (i==3); /* fourth channel closes the group */
+
+		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+			return r;
+	}
+
+	return 0;
+}
+
 static int tgsi_declaration(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
@@ -408,15 +451,15 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->input[i].interpolate = d->Declaration.Interpolate;
 		ctx->shader->input[i].centroid = d->Declaration.Centroid;
 		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
-		if (ctx->type == TGSI_PROCESSOR_FRAGMENT && ctx->bc->chip_class >= EVERGREEN) {
-			/* turn input into interpolate on EG */
-			if (ctx->shader->input[i].spi_sid) {
-				ctx->shader->input[i].lds_pos = ctx->shader->nlds++;
-				if (ctx->shader->input[i].interpolate > 0) {
-					evergreen_interp_alu(ctx, i);
-				} else {
-					evergreen_interp_flat(ctx, i);
-				}
+		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+			if (ctx->shader->input[i].name == TGSI_SEMANTIC_FACE)
+				ctx->face_gpr = ctx->shader->input[i].gpr;
+			else if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR)
+				ctx->colors_used++;
+			if (ctx->bc->chip_class >= EVERGREEN) {
+				r = evergreen_interp_input(ctx, i);
+				if (r)
+					return r;
 			}
 		}
 		break;
@@ -427,6 +470,21 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
 		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
 		ctx->shader->output[i].interpolate = d->Declaration.Interpolate;
+		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
+		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+			switch (d->Semantic.Name) {
+			case TGSI_SEMANTIC_CLIPDIST:
+				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
+				break;
+			case TGSI_SEMANTIC_PSIZE:
+				ctx->shader->vs_out_misc_write = 1;
+				break;
+			case TGSI_SEMANTIC_CLIPVERTEX:
+				ctx->clip_vertex_write = TRUE;
+				ctx->cv_output = i;
+				break;
+			}
+		}
 		break;
 	case TGSI_FILE_CONSTANT:
 	case TGSI_FILE_TEMPORARY:
@@ -690,6 +748,47 @@ static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
 	return 0;
 }

+static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
+{
+	int i, r, count = ctx->shader->ninput; /* count: number of inputs before any are appended below */
+	/* Two-sided color setup: append a FACE input if none was recorded, then for every existing COLOR input append a BCOLOR copy and queue the front/back selection. Returns 0 or the first error from a helper. */
+	/* additional inputs will be allocated right after the existing inputs,
+	 * we won't need them after the color selection, so we don't need to
+	 * reserve these gprs for the rest of the shader code and to adjust
+	 * output offsets etc. */
+	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
+			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
+
+	if (ctx->face_gpr == -1) { /* -1 means no FACE input has been recorded in face_gpr */
+		i = ctx->shader->ninput++;
+		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
+		ctx->shader->input[i].spi_sid = 0;
+		ctx->shader->input[i].gpr = gpr++;
+		ctx->face_gpr = ctx->shader->input[i].gpr;
+	}
+
+	for (i = 0; i < count; i++) { /* only the inputs that existed on entry are scanned */
+		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
+			int ni = ctx->shader->ninput++;
+			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io)); /* clone the COLOR input, then retarget the copy as BCOLOR */
+			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
+			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
+			ctx->shader->input[ni].gpr = gpr++;
+
+			if (ctx->bc->chip_class >= EVERGREEN) {
+				r = evergreen_interp_input(ctx, ni);
+				if (r)
+					return r;
+			}
+
+			r = select_twoside_color(ctx, i, ni); /* i = front (COLOR) input, ni = back (BCOLOR) input */
+			if (r)
+				return r;
+		}
+	}
+	return 0;
+}
+
 static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pipe_shader *pipeshader)
 {
 	struct r600_shader *shader = &pipeshader->shader;
@@ -701,11 +800,12 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	struct r600_bytecode_output output[32];
 	unsigned output_done, noutput;
 	unsigned opcode;
-	int i, j, r = 0, pos0;
+	int i, j, k, r = 0;
+	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;

 	ctx.bc = &shader->bc;
 	ctx.shader = shader;
-	r600_bytecode_init(ctx.bc, rctx->chip_class);
+	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
 	ctx.tokens = tokens;
 	tgsi_scan_shader(tokens, &ctx.info);
 	tgsi_parse_init(&ctx.parse, tokens);
@@ -713,6 +813,12 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	shader->processor_type = ctx.type;
 	ctx.bc->type = shader->processor_type;

+	ctx.face_gpr = -1;
+	ctx.colors_used = 0;
+	ctx.clip_vertex_write = 0;
+
+	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
+
 	shader->clamp_color = (((ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->clamp_fragment_color) ||
 		((ctx.type == TGSI_PROCESSOR_VERTEX) && rctx->clamp_vertex_color));

@@ -791,6 +897,37 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 			if (r)
 				goto out_err;
 			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			break;
+		case TGSI_TOKEN_TYPE_PROPERTY:
+			property = &ctx.parse.FullToken.FullProperty;
+			switch (property->Property.PropertyName) {
+			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
+				if (property->u[0].Data == 1)
+					shader->fs_write_all = TRUE;
+				break;
+			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
+				if (property->u[0].Data == 1)
+					shader->vs_prohibit_ucps = TRUE;
+				break;
+			}
+			break;
+		default:
+			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
+			r = -EINVAL;
+			goto out_err;
+		}
+	}
+
+	if (shader->two_side && ctx.colors_used) {
+		if ((r = process_twoside_color_inputs(&ctx)))
+			return r;
+	}
+
+	tgsi_parse_init(&ctx.parse, tokens);
+	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
+		tgsi_parse_token(&ctx.parse);
+		switch (ctx.parse.FullToken.Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			r = tgsi_is_supported(&ctx);
 			if (r)
@@ -814,22 +951,57 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 			if (r)
 				goto out_err;
 			break;
-		case TGSI_TOKEN_TYPE_PROPERTY:
-			property = &ctx.parse.FullToken.FullProperty;
-			if (property->Property.PropertyName == TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS) {
-				if (property->u[0].Data == 1)
-					shader->fs_write_all = TRUE;
-			}
-			break;
 		default:
-			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
-			r = -EINVAL;
-			goto out_err;
+			break;
 		}
 	}

 	noutput = shader->noutput;

+	if (ctx.clip_vertex_write) {
+		/* need to convert a clipvertex write into clipdistance writes and not export
+		   the clip vertex anymore */
+
+		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
+		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
+		shader->output[noutput].gpr = ctx.temp_reg;
+		noutput++;
+		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
+		shader->output[noutput].gpr = ctx.temp_reg+1;
+		noutput++;
+
+		/* reset spi_sid for clipvertex output to avoid confusing spi */
+		shader->output[ctx.cv_output].spi_sid = 0;
+
+		shader->clip_dist_write = 0xFF;
+
+		for (i = 0; i < 8; i++) {
+			int oreg = i >> 2;
+			int ochan = i & 3;
+
+			for (j = 0; j < 4; j++) {
+				struct r600_bytecode_alu alu;
+				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
+				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
+				alu.src[0].chan = j;
+
+				alu.src[1].sel = 512 + i;
+				alu.src[1].kc_bank = 1;
+				alu.src[1].chan = j;
+
+				alu.dst.sel = ctx.temp_reg + oreg;
+				alu.dst.chan = j;
+				alu.dst.write = (j == ochan);
+				if (j == 3)
+					alu.last = 1;
+				r = r600_bytecode_add_alu(ctx.bc, &alu);
+				if (r)
+					return r;
+			}
+		}
+	}
+
 	/* clamp color outputs */
 	if (shader->clamp_color) {
 		for (i = 0; i < noutput; i++) {
@@ -949,68 +1121,86 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	}

 	/* export output */
-	j = 0;
-	for (i = 0, pos0 = 0; i < noutput; i++) {
-		memset(&output[i], 0, sizeof(struct r600_bytecode_output));
-		output[i + j].gpr = shader->output[i].gpr;
-		output[i + j].elem_size = 3;
-		output[i + j].swizzle_x = 0;
-		output[i + j].swizzle_y = 1;
-		output[i + j].swizzle_z = 2;
-		output[i + j].swizzle_w = 3;
-		output[i + j].burst_count = 1;
-		output[i + j].barrier = 1;
-		output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
-		output[i + j].array_base = i - pos0;
-		output[i + j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+	for (i = 0, j = 0; i < noutput; i++, j++) {
+		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+		output[j].gpr = shader->output[i].gpr;
+		output[j].elem_size = 3;
+		output[j].swizzle_x = 0;
+		output[j].swizzle_y = 1;
+		output[j].swizzle_z = 2;
+		output[j].swizzle_w = 3;
+		output[j].burst_count = 1;
+		output[j].barrier = 1;
+		output[j].type = -1;
+		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
 		switch (ctx.type) {
 		case TGSI_PROCESSOR_VERTEX:
-			if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
-				output[i + j].array_base = 60;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
-				/* position doesn't count in array_base */
-				pos0++;
-			}
-			if (shader->output[i].name == TGSI_SEMANTIC_PSIZE) {
-				output[i + j].array_base = 61;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
-				/* position doesn't count in array_base */
-				pos0++;
+			switch (shader->output[i].name) {
+			case TGSI_SEMANTIC_POSITION:
+				output[j].array_base = next_pos_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				break;
+
+			case TGSI_SEMANTIC_PSIZE:
+				output[j].array_base = next_pos_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				break;
+			case TGSI_SEMANTIC_CLIPVERTEX:
+				j--;
+				break;
+			case TGSI_SEMANTIC_CLIPDIST:
+				output[j].array_base = next_pos_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				/* spi_sid is 0 for clipdistance outputs that were generated
+				 * for clipvertex - we don't need to pass them to PS */
+				if (shader->output[i].spi_sid) {
+					j++;
+					/* duplicate it as PARAM to pass to the pixel shader */
+					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
+					output[j].array_base = next_param_base++;
+					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+				}
+				break;
+			case TGSI_SEMANTIC_FOG:
+				output[j].swizzle_y = 4; /* 0 */
+				output[j].swizzle_z = 4; /* 0 */
+				output[j].swizzle_w = 5; /* 1 */
+				break;
 			}
 			break;
 		case TGSI_PROCESSOR_FRAGMENT:
 			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
-				output[i + j].array_base = shader->output[i].sid;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[j].array_base = next_pixel_base++;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
-					for (j = 1; j < shader->nr_cbufs; j++) {
-						memset(&output[i + j], 0, sizeof(struct r600_bytecode_output));
-						output[i + j].gpr = shader->output[i].gpr;
-						output[i + j].elem_size = 3;
-						output[i + j].swizzle_x = 0;
-						output[i + j].swizzle_y = 1;
-						output[i + j].swizzle_z = 2;
-						output[i + j].swizzle_w = 3;
-						output[i + j].burst_count = 1;
-						output[i + j].barrier = 1;
-						output[i + j].array_base = shader->output[i].sid + j;
-						output[i + j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
-						output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+					for (k = 1; k < shader->nr_cbufs; k++) {
+						j++;
+						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+						output[j].gpr = shader->output[i].gpr;
+						output[j].elem_size = 3;
+						output[j].swizzle_x = 0;
+						output[j].swizzle_y = 1;
+						output[j].swizzle_z = 2;
+						output[j].swizzle_w = 3;
+						output[j].burst_count = 1;
+						output[j].barrier = 1;
+						output[j].array_base = next_pixel_base++;
+						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 					}
-					j = shader->nr_cbufs-1;
 				}
 			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
-				output[i + j].array_base = 61;
-				output[i + j].swizzle_x = 2;
-				output[i + j].swizzle_y = 7;
-				output[i + j].swizzle_z = output[i + j].swizzle_w = 7;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[j].array_base = 61;
+				output[j].swizzle_x = 2;
+				output[j].swizzle_y = 7;
+				output[j].swizzle_z = output[j].swizzle_w = 7;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
-				output[i + j].array_base = 61;
-				output[i + j].swizzle_x = 7;
-				output[i + j].swizzle_y = 1;
-				output[i + j].swizzle_z = output[i + j].swizzle_w = 7;
-				output[i + j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[j].array_base = 61;
+				output[j].swizzle_x = 7;
+				output[j].swizzle_y = 1;
+				output[j].swizzle_z = output[j].swizzle_w = 7;
+				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else {
 				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
 				r = -EINVAL;
@@ -1022,48 +1212,49 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 			r = -EINVAL;
 			goto out_err;
 		}
+
+		if (output[j].type==-1) {
+			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+			output[j].array_base = next_param_base++;
+		}
 	}
-	noutput += j;
+
 	/* add fake param output for vertex shader if no param is exported */
-	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
-		for (i = 0, pos0 = 0; i < noutput; i++) {
-			if (output[i].type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) {
-				pos0 = 1;
-				break;
-			}
-		}
-		if (!pos0) {
-			memset(&output[i], 0, sizeof(struct r600_bytecode_output));
-			output[i].gpr = 0;
-			output[i].elem_size = 3;
-			output[i].swizzle_x = 7;
-			output[i].swizzle_y = 7;
-			output[i].swizzle_z = 7;
-			output[i].swizzle_w = 7;
-			output[i].burst_count = 1;
-			output[i].barrier = 1;
-			output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
-			output[i].array_base = 0;
-			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
-			noutput++;
-		}
+	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
+			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+			output[j].gpr = 0;
+			output[j].elem_size = 3;
+			output[j].swizzle_x = 7;
+			output[j].swizzle_y = 7;
+			output[j].swizzle_z = 7;
+			output[j].swizzle_w = 7;
+			output[j].burst_count = 1;
+			output[j].barrier = 1;
+			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+			output[j].array_base = 0;
+			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+			j++;
 	}
+
 	/* add fake pixel export */
-	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && !noutput) {
-		memset(&output[0], 0, sizeof(struct r600_bytecode_output));
-		output[0].gpr = 0;
-		output[0].elem_size = 3;
-		output[0].swizzle_x = 7;
-		output[0].swizzle_y = 7;
-		output[0].swizzle_z = 7;
-		output[0].swizzle_w = 7;
-		output[0].burst_count = 1;
-		output[0].barrier = 1;
-		output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
-		output[0].array_base = 0;
-		output[0].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
-		noutput++;
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && j == 0) {
+		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
+		output[j].gpr = 0;
+		output[j].elem_size = 3;
+		output[j].swizzle_x = 7;
+		output[j].swizzle_y = 7;
+		output[j].swizzle_z = 7;
+		output[j].swizzle_w = 7;
+		output[j].burst_count = 1;
+		output[j].barrier = 1;
+		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+		output[j].array_base = 0;
+		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
+		j++;
 	}
+
+	noutput = j;
+
 	/* set export done on last export of each type */
 	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
 		if (ctx.bc->chip_class < CAYMAN) {
@@ -1086,6 +1277,14 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 	if (ctx.bc->chip_class == CAYMAN)
 		cm_bytecode_add_cf_end(ctx.bc);

+	/* check GPR limit - we have 124 = 128 - 4
+	 * (4 are reserved as alu clause temporary registers) */
+	if (ctx.bc->ngpr > 124) {
+		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
+		r = -ENOMEM;
+		goto out_err;
+	}
+
 	free(ctx.literals);
 	tgsi_parse_free(&ctx.parse);
 	return 0;
@@ -1262,6 +1461,11 @@ static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
 		alu.inst = ctx->inst_info->r600_opcode;
 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
 			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
+
+			/* RSQ should take the absolute value of src */
+			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
+				r600_bytecode_src_set_abs(&alu.src[j]);
+			}
 		}
 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
@@ -2665,10 +2869,8 @@ static int tgsi_exp(struct r600_shader_ctx *ctx)

 				alu.dst.sel = ctx->temp_reg;
 				alu.dst.chan = i;
-				if (i == 0)
-					alu.dst.write = 1;
-				if (i == 2)
-					alu.last = 1;
+				alu.dst.write = i == 0;
+				alu.last = i == 2;
 				r = r600_bytecode_add_alu(ctx->bc, &alu);
 				if (r)
 					return r;
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -34,6 +34,7 @@ struct r600_shader_io {
 	unsigned		interpolate;
 	boolean                 centroid;
 	unsigned		lds_pos; /* for evergreen */
+	unsigned		write_mask;
 };

 struct r600_shader {
@@ -46,8 +47,14 @@ struct r600_shader {
 	struct r600_shader_io	output[32];
 	boolean			uses_kill;
 	boolean			fs_write_all;
+	boolean			vs_prohibit_ucps;
 	boolean			clamp_color;
+	boolean			two_side;
 	unsigned		nr_cbufs;
+	/* bit n is set if the shader writes gl_ClipDistance[n] */
+	unsigned		clip_dist_write;
+	/* flag is set if the shader writes VS_OUT_MISC_VEC (e.g. for PSIZE) */
+	boolean			vs_out_misc_write;
 };

 #endif
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -471,4 +471,11 @@
 #define SQ_ALU_SCL_122                           0x00000001
 #define SQ_ALU_SCL_212                           0x00000002
 #define SQ_ALU_SCL_221                           0x00000003
+
+#define   INDEX_MODE_AR_X 0
+#define   INDEX_MODE_AR_Y 1
+#define   INDEX_MODE_AR_Z 2
+#define   INDEX_MODE_AR_W 3
+#define   INDEX_MODE_LOOP 4
+
 #endif
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -509,6 +509,10 @@ static uint32_t r600_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 		return V_0280A0_COLOR_X24_8_32_FLOAT;

+	case PIPE_FORMAT_R32_UINT:
+	case PIPE_FORMAT_R32_SINT:
+		return V_0280A0_COLOR_32;
+
 	case PIPE_FORMAT_R32_FLOAT:
 	case PIPE_FORMAT_Z32_FLOAT:
 		return V_0280A0_COLOR_32_FLOAT;
@@ -954,6 +958,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	rs->clamp_fragment_color = state->clamp_fragment_color;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
+	rs->two_side = state->light_twoside;
+	rs->clip_plane_enable = state->clip_plane_enable;

 	clip_rule = state->scissor ? 0xAAAA : 0xFFFF;
 	/* offset */
@@ -990,8 +996,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 		S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
 		S_028814_POLYMODE_BACK_PTYPE(r600_translate_fill(state->fill_back)), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02881C_PA_CL_VS_OUT_CNTL,
-			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) |
-			S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex), 0xFFFFFFFF, NULL, 0);
+			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex),
+			S_02881C_USE_VTX_POINT_SIZE(1), NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028820_PA_CL_NANINF_CNTL, 0x00000000, 0xFFFFFFFF, NULL, 0);
 	/* point size 12.4 fixed point */
 	tmp = (unsigned)(state->point_size * 8.0);
@@ -1030,10 +1036,10 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate, R_028DFC_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp), 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028810_PA_CL_CLIP_CNTL,
-			S_028810_PS_UCP_MODE(3) | (state->clip_plane_enable & 63) |
-			S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
-			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip), 0xFFFFFFFF, NULL, 0);
-
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
+			S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip),
+			S_028810_PS_UCP_MODE(3) | S_028810_ZCLIP_NEAR_DISABLE(1) |
+			S_028810_ZCLIP_FAR_DISABLE(1), NULL, 0);
 	return rstate;
 }

@@ -1311,6 +1317,7 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+	struct pipe_resource * cbuf;

 	if (rstate == NULL)
 		return;
@@ -1335,6 +1342,13 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 	free(rctx->states[R600_PIPE_STATE_CLIP]);
 	rctx->states[R600_PIPE_STATE_CLIP] = rstate;
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
+
+	cbuf = pipe_user_buffer_create(ctx->screen,
+                                   state->ucp,
+                                   4*4*8, /* 8*4 floats */
+                                   PIPE_BIND_CONSTANT_BUFFER);
+	r600_set_constant_buffer(ctx, PIPE_SHADER_VERTEX, 1, cbuf);
+	pipe_resource_reference(&cbuf, NULL);
 }

 static void r600_set_polygon_stipple(struct pipe_context *ctx,
@@ -2069,7 +2083,7 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 	struct r600_shader *rshader = &shader->shader;
 	unsigned i, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z, spi_ps_in_control_1, db_shader_control;
 	int pos_index = -1, face_index = -1;
-	unsigned tmp, sid;
+	unsigned tmp, sid, ufi = 0;

 	rstate->nregs = 0;

@@ -2147,6 +2161,10 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 			S_0286D0_FRONT_FACE_ADDR(rshader->input[face_index].gpr);
 	}

+	/* HW bug in original R600 */
+	if (rctx->family == CHIP_R600)
+		ufi = 1;
+
 	r600_pipe_state_add_reg(rstate, R_0286CC_SPI_PS_IN_CONTROL_0, spi_ps_in_control_0, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286D0_SPI_PS_IN_CONTROL_1, spi_ps_in_control_1, 0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286D8_SPI_INPUT_Z, spi_input_z, 0xFFFFFFFF, NULL, 0);
@@ -2156,7 +2174,8 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 	r600_pipe_state_add_reg(rstate,
 				R_028850_SQ_PGM_RESOURCES_PS,
 				S_028850_NUM_GPRS(rshader->bc.ngpr) |
-				S_028850_STACK_SIZE(rshader->bc.nstack),
+				S_028850_STACK_SIZE(rshader->bc.nstack) |
+				S_028850_UNCACHED_FIRST_INST(ufi),
 				0xFFFFFFFF, NULL, 0);
 	r600_pipe_state_add_reg(rstate,
 				R_028854_SQ_PGM_EXPORTS_PS,
@@ -2234,6 +2253,16 @@ void r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader *shad
 	r600_pipe_state_add_reg(rstate,
 				R_03E200_SQ_LOOP_CONST_0 + (32 * 4), 0x01000FFF,
 				0xFFFFFFFF, NULL, 0);
+
+	r600_pipe_state_add_reg(rstate,
+				R_02881C_PA_CL_VS_OUT_CNTL,
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA((rshader->clip_dist_write & 0x0F) != 0) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA((rshader->clip_dist_write & 0xF0) != 0) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(rshader->vs_out_misc_write),
+				S_02881C_VS_OUT_CCDIST0_VEC_ENA(1) |
+				S_02881C_VS_OUT_CCDIST1_VEC_ENA(1) |
+				S_02881C_VS_OUT_MISC_VEC_ENA(1),
+				NULL, 0);
 }

 void r600_fetch_shader(struct pipe_context *ctx,
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -103,6 +103,7 @@ void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 	rctx->clamp_fragment_color = rs->clamp_fragment_color;

 	rctx->sprite_coord_enable = rs->sprite_coord_enable;
+	rctx->two_side = rs->two_side;

 	rctx->rasterizer = rs;

@@ -352,11 +353,11 @@ void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
 	case PIPE_SHADER_VERTEX:
 		rctx->vs_const_buffer.nregs = 0;
 		r600_pipe_state_add_reg(&rctx->vs_const_buffer,
-					R_028180_ALU_CONST_BUFFER_SIZE_VS_0,
+					R_028180_ALU_CONST_BUFFER_SIZE_VS_0 + index * 4,
 					ALIGN_DIVUP(buffer->width0 >> 4, 16),
 					0xFFFFFFFF, NULL, 0);
 		r600_pipe_state_add_reg(&rctx->vs_const_buffer,
-					R_028980_ALU_CONST_CACHE_VS_0,
+					R_028980_ALU_CONST_CACHE_VS_0 + index * 4,
 					offset >> 8, 0xFFFFFFFF, rbuffer, RADEON_USAGE_READ);
 		r600_context_pipe_state_set(&rctx->ctx, &rctx->vs_const_buffer);

@@ -549,6 +550,30 @@ static int r600_shader_rebuild(struct pipe_context * ctx, struct r600_pipe_shade
 static void r600_update_derived_state(struct r600_pipe_context *rctx)
 {
 	struct pipe_context * ctx = (struct pipe_context*)rctx;
+	struct r600_pipe_state rstate;
+	unsigned user_clip_plane_enable;
+	unsigned clip_dist_enable;
+
+	if (rctx->vs_shader->shader.clip_dist_write || rctx->vs_shader->shader.vs_prohibit_ucps)
+		user_clip_plane_enable = 0;
+	else
+		user_clip_plane_enable = rctx->rasterizer->clip_plane_enable & 0x3F;
+
+	clip_dist_enable = rctx->rasterizer->clip_plane_enable & rctx->vs_shader->shader.clip_dist_write;
+	rstate.nregs = 0;
+
+	if (user_clip_plane_enable != rctx->user_clip_plane_enable) {
+		r600_pipe_state_add_reg(&rstate, R_028810_PA_CL_CLIP_CNTL, user_clip_plane_enable , 0x3F, NULL, 0);
+		rctx->user_clip_plane_enable = user_clip_plane_enable;
+	}
+
+	if (clip_dist_enable != rctx->clip_dist_enable) {
+		r600_pipe_state_add_reg(&rstate, R_02881C_PA_CL_VS_OUT_CNTL, clip_dist_enable, 0xFF, NULL, 0);
+		rctx->clip_dist_enable = clip_dist_enable;
+	}
+
+	if (rstate.nregs)
+		r600_context_pipe_state_set(&rctx->ctx, &rstate);

 	if (!rctx->blitter->running) {
 		if (rctx->have_depth_fb || rctx->have_depth_texture)
@@ -564,6 +589,7 @@ static void r600_update_derived_state(struct r600_pipe_context *rctx)
 	}

 	if ((rctx->ps_shader->shader.clamp_color != rctx->clamp_fragment_color) ||
+	    (rctx->ps_shader->shader.two_side != rctx->two_side) ||
 	    ((rctx->chip_class >= EVERGREEN) && rctx->ps_shader->shader.fs_write_all &&
 	     (rctx->ps_shader->shader.nr_cbufs != rctx->nr_cbufs))) {
 		r600_shader_rebuild(&rctx->context, rctx->ps_shader);
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -869,6 +869,7 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 	const struct util_format_description *desc;
 	boolean uniform = TRUE;
 	static int r600_enable_s3tc = -1;
+	bool is_srgb_valid = FALSE;

 	int i;
 	const uint32_t sign_bit[4] = {
@@ -980,14 +981,17 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 		case PIPE_FORMAT_DXT1_SRGB:
 		case PIPE_FORMAT_DXT1_SRGBA:
 			result = FMT_BC1;
+			is_srgb_valid = TRUE;
 			goto out_word4;
 		case PIPE_FORMAT_DXT3_RGBA:
 		case PIPE_FORMAT_DXT3_SRGBA:
 			result = FMT_BC2;
+			is_srgb_valid = TRUE;
 			goto out_word4;
 		case PIPE_FORMAT_DXT5_RGBA:
 		case PIPE_FORMAT_DXT5_SRGBA:
 			result = FMT_BC3;
+			is_srgb_valid = TRUE;
 			goto out_word4;
 		default:
 			goto out_unknown;
@@ -1095,6 +1099,7 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 				goto out_word4;
 			case 4:
 				result = FMT_8_8_8_8;
+				is_srgb_valid = TRUE;
 				goto out_word4;
 			}
 			goto out_unknown;
@@ -1158,6 +1163,9 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
 	}

 out_word4:
+
+	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && !is_srgb_valid)
+		return ~0;
 	if (word4_p)
 		*word4_p = word4;
 	if (yuv_format_p)
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -3538,9 +3538,13 @@
 #define R_038018_RESOURCE0_WORD6                     0x038018

 #define R_028140_ALU_CONST_BUFFER_SIZE_PS_0          0x00028140
+#define R_028144_ALU_CONST_BUFFER_SIZE_PS_1          0x00028144
 #define R_028180_ALU_CONST_BUFFER_SIZE_VS_0          0x00028180
+#define R_028184_ALU_CONST_BUFFER_SIZE_VS_1          0x00028184
 #define R_028940_ALU_CONST_CACHE_PS_0                0x00028940
+#define R_028944_ALU_CONST_CACHE_PS_1                0x00028944
 #define R_028980_ALU_CONST_CACHE_VS_0                0x00028980
+#define R_028984_ALU_CONST_CACHE_VS_1                0x00028984

 #define R_03CFF0_SQ_VTX_BASE_VTX_LOC                 0x03CFF0
 #define R_03CFF4_SQ_VTX_START_INST_LOC               0x03CFF4
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -45,7 +45,7 @@
 #include "sp_fence.h"
 #include "sp_public.h"

-DEBUG_GET_ONCE_BOOL_OPTION(use_llvm, "SOFTPIPE_USE_LLVM", FALSE);
+DEBUG_GET_ONCE_BOOL_OPTION(use_llvm, "SOFTPIPE_USE_LLVM", FALSE)

 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
@@ -121,7 +121,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
      return 1;
   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-      return 64; /* matches core Mesa defaults */
+      return 256; /* for GL3 */
   case PIPE_CAP_MIN_TEXEL_OFFSET:
      return -8;
   case PIPE_CAP_MAX_TEXEL_OFFSET:
@@ -138,7 +138,9 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
 static int
 softpipe_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_shader_cap param)
 {
+#ifdef HAVE_LLVM
   struct softpipe_screen *sp_screen = softpipe_screen(screen);
+#endif
   switch(shader)
   {
   case PIPE_SHADER_FRAGMENT:
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -88,7 +88,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
      vinfo->num_attribs = 0;
      for (i = 0; i < fsInfo->num_inputs; i++) {
         int src;
-         enum interp_mode interp;
+         enum interp_mode interp = INTERP_LINEAR;

         switch (fsInfo->input_interpolate[i]) {
         case TGSI_INTERPOLATE_CONSTANT:
@@ -105,7 +105,6 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
            break;
         default:
            assert(0);
-            interp = INTERP_LINEAR;
         }

         switch (fsInfo->input_semantic_name[i]) {
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -158,8 +158,10 @@ softpipe_bind_fs_state(struct pipe_context *pipe, void *fs)

   softpipe->fs = fs;

-   if (fs == NULL)
-      softpipe->fs_variant = NULL;
+   /* This depends on the current fragment shader and must always be
+    * re-validated before use.
+    */
+   softpipe->fs_variant = NULL;

   if (state)
      draw_bind_fragment_shader(softpipe->draw,
--- a/src/gallium/drivers/svga/include/svga_types.h
+++ b/src/gallium/drivers/svga/include/svga_types.h
@@ -28,6 +28,7 @@

 #include "pipe/p_compiler.h"

+#ifndef __HAIKU__
 typedef int64_t int64;
 typedef uint64_t uint64;

@@ -39,6 +40,9 @@ typedef uint16_t uint16;

 typedef int8_t int8;
 typedef uint8_t uint8;
+#else
+#include <OS.h>
+#endif /* HAIKU */

 typedef uint8_t Bool;

--- a/src/gallium/drivers/svga/svga_pipe_blit.c
+++ b/src/gallium/drivers/svga/svga_pipe_blit.c
@@ -44,8 +44,7 @@ static void svga_surface_copy(struct pipe_context *pipe,
                              const struct pipe_box *src_box)
 {
   struct svga_context *svga = svga_context(pipe);
-   struct svga_texture *stex = svga_texture(src_tex);
-   struct svga_texture *dtex = svga_texture(dst_tex);
+   struct svga_texture *stex, *dtex;
 /*   struct pipe_screen *screen = pipe->screen;
   SVGA3dCopyBox *box;
   enum pipe_error ret;
@@ -63,6 +62,9 @@ static void svga_surface_copy(struct pipe_context *pipe,
      return;
   }

+   stex = svga_texture(src_tex);
+   dtex = svga_texture(dst_tex);
+
 #if 0
   srcsurf = screen->get_tex_surface(screen, src_tex,
                                     src_level, src_box->z, src_box->z,
--- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -57,10 +57,10 @@ svga_translate_stencil_op(unsigned op)
   case PIPE_STENCIL_OP_KEEP:      return SVGA3D_STENCILOP_KEEP;
   case PIPE_STENCIL_OP_ZERO:      return SVGA3D_STENCILOP_ZERO;
   case PIPE_STENCIL_OP_REPLACE:   return SVGA3D_STENCILOP_REPLACE;
-   case PIPE_STENCIL_OP_INCR:      return SVGA3D_STENCILOP_INCR;
-   case PIPE_STENCIL_OP_DECR:      return SVGA3D_STENCILOP_DECR;
-   case PIPE_STENCIL_OP_INCR_WRAP: return SVGA3D_STENCILOP_INCRSAT; /* incorrect? */
-   case PIPE_STENCIL_OP_DECR_WRAP: return SVGA3D_STENCILOP_DECRSAT; /* incorrect? */
+   case PIPE_STENCIL_OP_INCR:      return SVGA3D_STENCILOP_INCRSAT;
+   case PIPE_STENCIL_OP_DECR:      return SVGA3D_STENCILOP_DECRSAT;
+   case PIPE_STENCIL_OP_INCR_WRAP: return SVGA3D_STENCILOP_INCR;
+   case PIPE_STENCIL_OP_DECR_WRAP: return SVGA3D_STENCILOP_DECR;
   case PIPE_STENCIL_OP_INVERT:    return SVGA3D_STENCILOP_INVERT;
   default:
      assert(0);
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -201,6 +201,17 @@ svga_release_user_upl_buffers(struct svga_context *svga)
      if (vb->buffer && svga_buffer_is_user_buffer(vb->buffer)) {
         struct svga_buffer *buffer = svga_buffer(vb->buffer);

+         /* The buffer_offset is relative to the uploaded buffer.
+          * Since we're discarding that buffer we need to reset this offset
+          * so it's not inadvertently applied to a subsequent draw.
+          *
+          * XXX a root problem here is that the svga->curr.vb[] information
+          * is getting set both by gallium API calls and by code in
+          * svga_upload_user_buffers().  We should instead have two copies
+          * of the vertex buffer information and choose between them as needed.
+          */
+         vb->buffer_offset = 0;
+
         buffer->uploaded.start = ~0;
         buffer->uploaded.end = 0;
         if (buffer->uploaded.buffer)
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -237,11 +237,11 @@ static void svga_bind_rasterizer_state( struct pipe_context *pipe,
   struct svga_context *svga = svga_context(pipe);
   struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;

-   svga->curr.rast = raster;

   draw_set_rasterizer_state(svga->swtnl.draw, raster ? &raster->templ : NULL,
                             state);
-   
+   svga->curr.rast = raster;
+
   svga->dirty |= SVGA_NEW_RAST;
 }

--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -125,8 +125,8 @@ svga_create_sampler_state(struct pipe_context *pipe,
    *    - min/max LOD clamping
    */
   cso->min_lod = 0;
-   cso->view_min_lod = MAX2(sampler->min_lod, 0);
-   cso->view_max_lod = MAX2(sampler->max_lod, 0);
+   cso->view_min_lod = MAX2((int) (sampler->min_lod + 0.5), 0);
+   cso->view_max_lod = MAX2((int) (sampler->max_lod + 0.5), 0);

   /* Use min_mipmap */
   if (svga->debug.use_min_mipmap) {
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -454,16 +454,19 @@ svga_texture_create(struct pipe_screen *screen,
   }

   /* 
-    * XXX: Never pass the SVGA3D_SURFACE_HINT_RENDERTARGET hint. Mesa cannot
+    * Note: Previously we never passed the
+    * SVGA3D_SURFACE_HINT_RENDERTARGET hint. Mesa cannot
    * know beforehand whether a texture will be used as a rendertarget or not
    * and it always requests PIPE_BIND_RENDER_TARGET, therefore
    * passing the SVGA3D_SURFACE_HINT_RENDERTARGET here defeats its purpose.
+    *
+    * However, this was changed since other state trackers
+    * (XA for example) use it accurately and certain device versions
+    * rely on it in certain situations to render correctly.
    */
-#if 0
   if((template->bind & PIPE_BIND_RENDER_TARGET) &&
      !util_format_is_s3tc(template->format))
      tex->key.flags |= SVGA3D_SURFACE_HINT_RENDERTARGET;
-#endif
   
   if(template->bind & PIPE_BIND_DEPTH_STENCIL)
      tex->key.flags |= SVGA3D_SURFACE_HINT_DEPTHSTENCIL;
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -120,7 +120,7 @@ svga_get_paramf(struct pipe_screen *screen, enum pipe_capf param)
      return result.u;

   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-      return 16.0;
+      return 15.0;

   default:
      return 0;
@@ -235,7 +235,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
      case PIPE_SHADER_CAP_MAX_TEMPS:
         if (!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_FRAGMENT_SHADER_TEMPS, &result))
            return 32;
-         return result.u;
+         return MIN2(result.u, SVGA3D_TEMPREG_MAX);
      case PIPE_SHADER_CAP_MAX_ADDRS:
      case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
 	 /* 
@@ -286,7 +286,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
      case PIPE_SHADER_CAP_MAX_TEMPS:
         if (!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_VERTEX_SHADER_TEMPS, &result))
            return 32;
-         return result.u;
+         return MIN2(result.u, SVGA3D_TEMPREG_MAX);
      case PIPE_SHADER_CAP_MAX_ADDRS:
         return 1;
      case PIPE_SHADER_CAP_MAX_PREDS:
--- a/src/gallium/drivers/svga/svga_state_framebuffer.c
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -477,7 +477,7 @@ emit_clip_planes( struct svga_context *svga,

   /* TODO: just emit directly from svga_set_clip_state()?
    */
-   for (i = 0; i < 6; i++) {
+   for (i = 0; i < SVGA3D_MAX_CLIP_PLANES; i++) {
      /* need to express the plane in D3D-style coordinate space.
       * GL coords get converted to D3D coords with the matrix:
       * [ 1  0  0  0 ]
--- a/src/gallium/drivers/svga/svga_state_need_swtnl.c
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -136,7 +136,7 @@ update_need_pipeline( struct svga_context *svga,

   /* EDGEFLAGS
    */
-    if (vs->base.info.writes_edgeflag) {
+    if (vs && vs->base.info.writes_edgeflag) {
      SVGA_DBG(DEBUG_SWTNL, "%s: edgeflags\n", __FUNCTION__);
      need_pipeline = TRUE;
   }
@@ -145,7 +145,8 @@ update_need_pipeline( struct svga_context *svga,
    */
   if (svga->curr.reduced_prim == PIPE_PRIM_POINTS) {
      unsigned sprite_coord_gen = svga->curr.rast->templ.sprite_coord_enable;
-      unsigned generic_inputs = svga->curr.fs->generic_inputs;
+      unsigned generic_inputs =
+         svga->curr.fs ? svga->curr.fs->generic_inputs : 0;

      if (sprite_coord_gen &&
          (generic_inputs & ~sprite_coord_gen)) {
--- a/Show More
+++ b/Show More