mesa: Bump version to 9.1.1

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
docs: 9.1.1 release notes
2013-03-19 17:14:38 -07:00 · 2013-03-19 17:11:41 -07:00 · 2013-03-18 09:39:28 -04:00 · 2013-03-18 09:38:59 -04:00 · 2013-03-15 19:55:26 +00:00 · 2013-03-15 19:55:26 +00:00
153 changed files with 3188 additions and 825 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -36,7 +36,7 @@ check-local:

 # Rules for making release tarballs

-PACKAGE_VERSION=9.1-devel
+PACKAGE_VERSION=9.1.1
 PACKAGE_DIR = Mesa-$(PACKAGE_VERSION)
 PACKAGE_NAME = MesaLib-$(PACKAGE_VERSION)

--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,4 @@
+d60da27273d2cdb68bc32cae2ca66718dab15f27 st/mesa: set ctx->Const.MaxSamples = 0, not 1
+5c86a728d4f688c0fe7fbf9f4b8f88060b65c4ee r600g: fix htile buffer leak
+496928a442cec980b534bc5da2523b3632b21b61 CopyTexImage: Don't check sRGB vs LINEAR for desktop GL
+3ee602314fc22054f69ee476f2e1037653d269bc mesa: Allow glGet* queries of MAX_VARYING_COMPONENTS in ES 3
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -8,7 +8,7 @@ git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked

 # Grep for commits that were marked as a candidate for the stable tree.
-git log --reverse --pretty=%H -i --grep='^[[:space:]]*NOTE: This is a candidate' HEAD..origin/master |\
+git log --reverse --pretty=%H -i --grep='^[[:space:]]*NOTE: .*[Cc]andidate' HEAD..origin/master |\
 while read sha
 do
 	# Check to see whether the patch is on the ignore list.
--- a/common.py
+++ b/common.py
@@ -100,4 +100,4 @@ def AddOptions(opts):
 	opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
 	opts.Add(BoolOption('texture_float', 'enable floating-point textures and renderbuffers', 'no'))
 	if host_platform == 'windows':
-		opts.Add(EnumOption('MSVS_VERSION', 'MS Visual C++ version', None, allowed_values=('7.1', '8.0', '9.0')))
+		opts.Add(EnumOption('MSVC_VERSION', 'MS Visual C++ version', None, allowed_values=('7.1', '8.0', '9.0', '10.0', '11.0')))
--- a/configure.ac
+++ b/configure.ac
@@ -6,7 +6,7 @@ dnl Tell the user about autoconf.html in the --help output
 m4_divert_once([HELP_END], [
 See docs/autoconf.html for more details on the options for Mesa.])

-AC_INIT([Mesa], [9.1.0],
+AC_INIT([Mesa], [9.1.1],
    [https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa])
 AC_CONFIG_AUX_DIR([bin])
 AC_CONFIG_MACRO_DIR([m4])
@@ -20,7 +20,8 @@ echo \#buildapi-variable-no-builddir >/dev/null
 # Support silent build rules, requires at least automake-1.11. Disable
 # by either passing --disable-silent-rules to configure or passing V=1
 # to make
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])],
+    [AC_SUBST([AM_DEFAULT_VERBOSITY], [1])])

 m4_ifdef([AM_PROG_AR], [AM_PROG_AR])

@@ -30,7 +31,7 @@ AC_SUBST([OSMESA_VERSION])

 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.24
-LIBDRM_RADEON_REQUIRED=2.4.40
+LIBDRM_RADEON_REQUIRED=2.4.42
 LIBDRM_INTEL_REQUIRED=2.4.38
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
@@ -57,10 +58,10 @@ LT_PREREQ([2.2])
 LT_INIT([disable-static])

 AX_PROG_BISON([],
-              AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.c"]
+              AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.c"],
                    [AC_MSG_ERROR([bison not found - unable to compile glcpp-parse.y])]))
 AX_PROG_FLEX([],
-             AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-lex.c"]
+             AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-lex.c"],
                   [AC_MSG_ERROR([flex not found - unable to compile glcpp-lex.l])]))

 AC_PATH_PROG([PERL], [perl])
@@ -611,7 +612,7 @@ AC_ARG_ENABLE([opencl],
         [enable OpenCL library NOTE: Enabling this option will also enable
          --with-llvm-shared-libs
          @<:@default=no@:>@])],
-   [enable_opencl="$enableval" with_llvm_shared_libs="$enableval"],
+   [],
   [enable_opencl=no])
 AC_ARG_ENABLE([xlib_glx],
    [AS_HELP_STRING([--enable-xlib-glx],
@@ -701,6 +702,16 @@ if test "x$enable_dri$enable_xlib_glx" = xyesyes; then
    AC_MSG_ERROR([DRI and Xlib-GLX cannot be built together])
 fi

+if test "x$enable_opengl$enable_xlib_glx" = xnoyes; then
+    AC_MSG_ERROR([Xlib-GLX cannot be built without OpenGL])
+fi
+
+# Disable GLX if OpenGL is not enabled
+if test "x$enable_glx$enable_opengl" = xyesno; then
+    AC_MSG_WARN([OpenGL not enabled, disabling GLX])
+    enable_glx=no
+fi
+
 # Disable GLX if DRI and Xlib-GLX are not enabled
 if test "x$enable_glx" = xyes -a \
        "x$enable_dri" = xno -a \
@@ -1619,8 +1630,13 @@ AC_ARG_ENABLE([gallium-llvm],
 AC_ARG_WITH([llvm-shared-libs],
    [AS_HELP_STRING([--with-llvm-shared-libs],
        [link with LLVM shared libraries @<:@default=disabled@:>@])],
-    [with_llvm_shared_libs=yes],
+    [],
    [with_llvm_shared_libs=no])
+AS_IF([test "x$enable_opencl" = xyes],
+    [
+        AC_MSG_WARN([OpenCL required, forcing LLVM shared libraries])
+        with_llvm_shared_libs=yes
+    ])

 AC_ARG_WITH([llvm-prefix],
    [AS_HELP_STRING([--with-llvm-prefix],
@@ -1662,16 +1678,17 @@ if test "x$enable_gallium_llvm" = xyes; then
    if test "x$LLVM_CONFIG" != xno; then
 	LLVM_VERSION=`$LLVM_CONFIG --version | sed 's/svn.*//g'`
 	LLVM_VERSION_INT=`echo $LLVM_VERSION | sed -e 's/\([[0-9]]\)\.\([[0-9]]\)/\10\2/g'`
-        if test "x$with_llvm_shared_libs" != xyes; then
-            LLVM_COMPONENTS="engine bitwriter"
-            if $LLVM_CONFIG --components | grep -q '\<mcjit\>'; then
-                LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
-            fi
+        LLVM_COMPONENTS="engine bitwriter"
+        if $LLVM_CONFIG --components | grep -q '\<mcjit\>'; then
+            LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
+        fi
+        if $LLVM_CONFIG --components | grep -q '\<oprofilejit\>'; then
+            LLVM_COMPONENTS="${LLVM_COMPONENTS} oprofilejit"
+        fi

-            if test "x$enable_opencl" = xyes; then
-                LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo linker instrumentation"
-            fi
-	fi
+        if test "x$enable_opencl" = xyes; then
+            LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo linker instrumentation"
+        fi
 	LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
 	LLVM_BINDIR=`$LLVM_CONFIG --bindir`
 	LLVM_CPPFLAGS=`strip_unwanted_llvm_flags "$LLVM_CONFIG --cppflags"`
@@ -1840,6 +1857,9 @@ if test "x$with_gallium_drivers" != x; then
            if test "x$enable_r600_llvm" = xyes; then
                USE_R600_LLVM_COMPILER=yes;
            fi
+            if test "x$enable_opencl" = xyes; then
+                LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
+            fi
            gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600"
            ;;
        xradeonsi)
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,23 @@

 <h1>News</h1>

+<h2>February 22, 2013</h2>
+
+<p>
+<a href="relnotes-9.1.html">Mesa 9.1</a> is released.
+This is a new development release.
+See the release notes for more information about the release.
+</p>
+
+
+<h2>February 21, 2013</h2>
+
+<p>
+<a href="relnotes-9.0.3.html">Mesa 9.0.3</a> is released.
+This is a bug fix release.
+</p>
+
+
 <h2>January 22, 2013</h2>

 <p>
--- a/docs/relnotes-9.1.1.html
+++ b/docs/relnotes-9.1.1.html
@@ -0,0 +1,232 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 9.1.1 Release Notes / March 19th, 2013</h1>
+
+<p>
+Mesa 9.1.1 is a bug fix release which fixes bugs found since the 9.1 release.
+</p>
+<p>
+Mesa 9.1 implements the OpenGL 3.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.1.  OpenGL
+3.1 is <strong>only</strong> available if requested at context creation
+because GL_ARB_compatibility is not supported.
+</p>
+
+<h2>MD5 checksums</h2>
+<pre>
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=30232">Bug 30232</a> - [GM45] mesa demos spriteblast render incorrectly</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=32429">Bug 32429</a> - [gles2] Ironlake: gl_PointCoord takes no effect for point sprites</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=38086">Bug 38086</a> - Mesa 7.11-devel implementation error: Unexpected program target in destroy_program_variants_cb()</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=57121">Bug 57121</a> - [snb] corrupted GLSL built-in function results when using Uniform Buffer contents as arguments</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=58042">Bug 58042</a> - [bisected] Garbled UI in Team Fortress 2 and Counter-Strike: Source</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=58960">Bug 58960</a> - Texture flicker with fragment shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59495">Bug 59495</a> - [i965 Bisected]Oglc fbblit(advanced.blitFb-3d-cube.mirror.both) fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59783">Bug 59783</a> - [IVB bisected] 3DMMES2.0 Taiji performance reduced by ~13% with gnome-session enable compositing</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60121">Bug 60121</a> - build - libvdpau_softpipe fails at runtime.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60143">Bug 60143</a> - gbm_dri_bo_create fails to initialize bo-&gt;base.base.format</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60802">Bug 60802</a> - Corruption with DMA ring on cayman</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60848">Bug 60848</a> - [bisected] r600g: add htile support cause gpu lockup in Dishonored wine.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60938">Bug 60938</a> - [softpipe] piglit interpolation-noperspective-gl_BackColor-flat-fixed regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61012">Bug 61012</a> - alloc_layout_array tx * ty assertion failure when making pbuffer current</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61026">Bug 61026</a> - Segfault in glBitmap when called with PBO source</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-9.1..mesa-9.1.1
+</pre>
+
+
+<p>Adam Sampson (1):</p>
+<ul>
+  <li>autotools: oprofilejit should be included in the list of LLVM components required</li>
+</ul>
+
+<p>Alex Deucher (2):</p>
+<ul>
+  <li>r600g: add Richland APU pci ids</li>
+  <li>r600g: Use blitter rather than DMA for 128bpp on cayman (v3)</li>
+</ul>
+
+<p>Andreas Boll (2):</p>
+<ul>
+  <li>docs: Add 9.1 release md5sums</li>
+  <li>docs: add news item for 9.1 release</li>
+</ul>
+
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>meta: Allocate texture before initializing texture coordinates</li>
+</ul>
+
+<p>Brian Paul (11):</p>
+<ul>
+  <li>docs: remove stray 'date' text</li>
+  <li>docs: insert links to the 9.0.3 release</li>
+  <li>draw: fix non-perspective interpolation in interp()</li>
+  <li>st/mesa: implement glBitmap unpacking from a PBO, for the cache path</li>
+  <li>st/xlib: initialize the drawable size in create_xmesa_buffer()</li>
+  <li>st/mesa: fix trimming of GL_QUAD_STRIP</li>
+  <li>st/mesa: check for dummy programs in destroy_program_variants()</li>
+  <li>st/mesa: fix polygon offset state translation logic</li>
+  <li>draw: fix broken polygon offset stage</li>
+  <li>llvmpipe: add missing checks for polygon offset point/line modes</li>
+  <li>svga: always link with C++</li>
+</ul>
+
+<p>Daniel van Vugt (1):</p>
+<ul>
+  <li>gbm: Remember to init format on gbm_dri_bo_create.</li>
+</ul>
+
+<p>Eric Anholt (7):</p>
+<ul>
+  <li>i965/fs: Do a general SEND dependency workaround for the original 965.</li>
+  <li>i965/fs: Fix copy propagation with smearing.</li>
+  <li>i965/fs: Delay setup of uniform loads until after pre-regalloc scheduling.</li>
+  <li>i965/fs: Only do CSE when the dst types match.</li>
+  <li>i965/fs: Fix broken math on values loaded from uniform buffers on gen6.</li>
+  <li>mesa: Fix setup of ctx-&gt;Point.PointSprite for GLES2.</li>
+  <li>i965: Fix the W value of deprecated pointcoords on pre-gen6.</li>
+</ul>
+
+<p>Frank Henigman (1):</p>
+<ul>
+  <li>i965: Link i965_dri.so with C++ linker.</li>
+</ul>
+
+<p>Ian Romanick (3):</p>
+<ul>
+  <li>mesa: Add previously picked commit to .cherry-ignore</li>
+  <li>mesa: Modify candidate search string</li>
+  <li>egl: Allow 24-bit visuals for 32-bit RGBA8888 configs</li>
+</ul>
+
+<p>Jakub Bogusz (1):</p>
+<ul>
+  <li>vdpau-softpipe: Build correct source file - vl_winsys_xsp.c</li>
+</ul>
+
+<p>Jerome Glisse (1):</p>
+<ul>
+  <li>r600g: workaround hyperz lockup on evergreen</li>
+</ul>
+
+<p>John Kåre Alsaker (1):</p>
+<ul>
+  <li>llvmpipe: Fix creation of shared and scanout textures.</li>
+</ul>
+
+<p>Jordan Justen (1):</p>
+<ul>
+  <li>attrib: push/pop FRAGMENT_PROGRAM_ARB state</li>
+</ul>
+
+<p>José Fonseca (3):</p>
+<ul>
+  <li>scons: Allows choosing VS 10 or 11.</li>
+  <li>scons: Define _ALLOW_KEYWORD_MACROS on MSVC builds.</li>
+  <li>scons: Warn when using MSVS versions prior to 2012.</li>
+</ul>
+
+<p>Keith Kriewall (1):</p>
+<ul>
+  <li>scons: Fix Windows build with LLVM 3.2</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>i965: Fix Crystal Well PCI IDs.</li>
+</ul>
+
+<p>Marek Olšák (5):</p>
+<ul>
+  <li>r600g: use async DMA with a non-zero src offset</li>
+  <li>r600g: flush and invalidate htile cache when appropriate</li>
+  <li>gallium/util: add helper code for 1D integer range</li>
+  <li>r600g: always map uninitialized buffer range as unsynchronized</li>
+  <li>r600g: pad the DMA CS to a multiple of 8 dwords</li>
+</ul>
+
+<p>Martin Andersson (1):</p>
+<ul>
+  <li>winsys/radeon: Only add bo to hash table when creating flink</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>mesa: Allow ETC2/EAC formats with ARB_ES3_compatibility.</li>
+</ul>
+
+<p>Michel Dänzer (3):</p>
+<ul>
+  <li>radeonsi: Fix up and enable flat shading.</li>
+  <li>r600g/Cayman: Fix blending using destination alpha factor but non-alpha dest</li>
+  <li>radeonsi: Fix off-by-one for maximum vertex element index in some cases</li>
+</ul>
+
+<p>Tapani Pälli (2):</p>
+<ul>
+  <li>mesa: add missing case in _mesa_GetTexParameterfv()</li>
+  <li>mesa/es: NULL check in EGLImageTargetTexture2DOES</li>
+</ul>
+
+<p>Vadim Girlin (1):</p>
+<ul>
+  <li>r600g: fix check_and_set_bank_swizzle for cayman</li>
+</ul>
+
+<p>Vincent Lejeune (2):</p>
+<ul>
+  <li>r600g/llvm: Add support for UBO</li>
+  <li>r600g: Check comp_mask before merging export instructions</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes-9.1.html
+++ b/docs/relnotes-9.1.html
@@ -14,7 +14,7 @@
 <iframe src="contents.html"></iframe>
 <div class="content">

-<h1>Mesa 9.1 Release Notes / date TBD</h1>
+<h1>Mesa 9.1 Release Notes / February 22, 2013</h1>

 <p>
 Mesa 9.1 is a new development release.
@@ -33,7 +33,9 @@ because GL_ARB_compatibility is not supported.

 <h2>MD5 checksums</h2>
 <pre>
-tbd
+86d40f3056f89949368764bf84aff55e  MesaLib-9.1.tar.gz
+d3891e02215422e120271d976ff1947e  MesaLib-9.1.tar.bz2
+01645f28f53351c23b0beb6c688911d8  MesaLib-9.1.zip
 </pre>


@@ -44,9 +46,19 @@ Note: some of the new features are only available with certain drivers.
 </p>

 <ul>
+<li>GL_ANGLE_texture_compression_dxt3</li>
+<li>GL_ANGLE_texture_compression_dxt5</li>
+<li>GL_ARB_ES3_compatibility</li>
+<li>GL_ARB_internalformat_query</li>
 <li>GL_ARB_map_buffer_alignment</li>
-<li>GL_ARB_texture_cube_map_array</li>
+<li>GL_ARB_shading_language_packing</li>
 <li>GL_ARB_texture_buffer_object_rgb32</li>
+<li>GL_ARB_texture_cube_map_array</li>
+<li>GL_EXT_color_buffer_float</li>
+<li>GL_OES_depth_texture_cube_map</li>
+<li>OpenGL 3.1 core profile support on Radeon HD2000 up to HD6000 series </li>
+<li>Multisample anti-aliasing support on Radeon X1000 series</li>
+<li>OpenGL ES 3.0 support on Intel HD Graphics 2000, 2500, 3000, and 4000</li>
 </ul>


@@ -63,6 +75,7 @@ Note: some of the new features are only available with certain drivers.
 <li>Removed swrast support for GL_NV_vertex_program</li>
 <li>Removed swrast support for GL_NV_fragment_program</li>
 <li>Removed OpenVMS support (unmaintained and broken)</li>
+<li>Removed makedepend build dependency</li>
 </ul>

 </div>
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -22,6 +22,7 @@ The release notes summarize what's new or changed in each Mesa release.

 <ul>
 <li><a href="relnotes-9.1.html">9.1 release notes</a>
+<li><a href="relnotes-9.0.3.html">9.0.3 release notes</a>
 <li><a href="relnotes-9.0.2.html">9.0.2 release notes</a>
 <li><a href="relnotes-9.0.1.html">9.0.1 release notes</a>
 <li><a href="relnotes-9.0.html">9.0 release notes</a>
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -53,12 +53,12 @@ CHIPSET(0x0A26, HASWELL_ULT_M_GT2_PLUS, hsw_gt2)
 CHIPSET(0x0A0A, HASWELL_ULT_S_GT1, hsw_gt1)
 CHIPSET(0x0A1A, HASWELL_ULT_S_GT2, hsw_gt2)
 CHIPSET(0x0A2A, HASWELL_ULT_S_GT2_PLUS, hsw_gt2)
-CHIPSET(0x0D12, HASWELL_CRW_GT1, hsw_gt1)
-CHIPSET(0x0D22, HASWELL_CRW_GT2, hsw_gt2)
-CHIPSET(0x0D32, HASWELL_CRW_GT2_PLUS, hsw_gt2)
-CHIPSET(0x0D16, HASWELL_CRW_M_GT1, hsw_gt1)
-CHIPSET(0x0D26, HASWELL_CRW_M_GT2, hsw_gt2)
-CHIPSET(0x0D36, HASWELL_CRW_M_GT2_PLUS, hsw_gt2)
-CHIPSET(0x0D1A, HASWELL_CRW_S_GT1, hsw_gt1)
-CHIPSET(0x0D2A, HASWELL_CRW_S_GT2, hsw_gt2)
-CHIPSET(0x0D3A, HASWELL_CRW_S_GT2_PLUS, hsw_gt2)
+CHIPSET(0x0D02, HASWELL_CRW_GT1, hsw_gt1)
+CHIPSET(0x0D12, HASWELL_CRW_GT2, hsw_gt2)
+CHIPSET(0x0D22, HASWELL_CRW_GT2_PLUS, hsw_gt2)
+CHIPSET(0x0D06, HASWELL_CRW_M_GT1, hsw_gt1)
+CHIPSET(0x0D16, HASWELL_CRW_M_GT2, hsw_gt2)
+CHIPSET(0x0D26, HASWELL_CRW_M_GT2_PLUS, hsw_gt2)
+CHIPSET(0x0D0A, HASWELL_CRW_S_GT1, hsw_gt1)
+CHIPSET(0x0D1A, HASWELL_CRW_S_GT2, hsw_gt2)
+CHIPSET(0x0D2A, HASWELL_CRW_S_GT2_PLUS, hsw_gt2)
--- a/include/pci_ids/r600_pci_ids.h
+++ b/include/pci_ids/r600_pci_ids.h
@@ -298,6 +298,10 @@ CHIPSET(0x9907, ARUBA_9907, ARUBA)
 CHIPSET(0x9908, ARUBA_9908, ARUBA)
 CHIPSET(0x9909, ARUBA_9909, ARUBA)
 CHIPSET(0x990A, ARUBA_990A, ARUBA)
+CHIPSET(0x990B, ARUBA_990B, ARUBA)
+CHIPSET(0x990C, ARUBA_990C, ARUBA)
+CHIPSET(0x990D, ARUBA_990D, ARUBA)
+CHIPSET(0x990E, ARUBA_990E, ARUBA)
 CHIPSET(0x990F, ARUBA_990F, ARUBA)
 CHIPSET(0x9910, ARUBA_9910, ARUBA)
 CHIPSET(0x9913, ARUBA_9913, ARUBA)
@@ -309,6 +313,13 @@ CHIPSET(0x9991, ARUBA_9991, ARUBA)
 CHIPSET(0x9992, ARUBA_9992, ARUBA)
 CHIPSET(0x9993, ARUBA_9993, ARUBA)
 CHIPSET(0x9994, ARUBA_9994, ARUBA)
+CHIPSET(0x9995, ARUBA_9995, ARUBA)
+CHIPSET(0x9996, ARUBA_9996, ARUBA)
+CHIPSET(0x9997, ARUBA_9997, ARUBA)
+CHIPSET(0x9998, ARUBA_9998, ARUBA)
+CHIPSET(0x9999, ARUBA_9999, ARUBA)
+CHIPSET(0x999A, ARUBA_999A, ARUBA)
+CHIPSET(0x999B, ARUBA_999B, ARUBA)
 CHIPSET(0x99A0, ARUBA_99A0, ARUBA)
 CHIPSET(0x99A2, ARUBA_99A2, ARUBA)
 CHIPSET(0x99A4, ARUBA_99A4, ARUBA)
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -46,3 +46,17 @@ CHIPSET(0x6839, VERDE_6839, VERDE)
 CHIPSET(0x683B, VERDE_683B, VERDE)
 CHIPSET(0x683D, VERDE_683D, VERDE)
 CHIPSET(0x683F, VERDE_683F, VERDE)
+
+CHIPSET(0x6600, OLAND_6600, OLAND)
+CHIPSET(0x6601, OLAND_6601, OLAND)
+CHIPSET(0x6602, OLAND_6602, OLAND)
+CHIPSET(0x6603, OLAND_6603, OLAND)
+CHIPSET(0x6606, OLAND_6606, OLAND)
+CHIPSET(0x6607, OLAND_6607, OLAND)
+CHIPSET(0x6610, OLAND_6610, OLAND)
+CHIPSET(0x6611, OLAND_6611, OLAND)
+CHIPSET(0x6613, OLAND_6613, OLAND)
+CHIPSET(0x6620, OLAND_6620, OLAND)
+CHIPSET(0x6621, OLAND_6621, OLAND)
+CHIPSET(0x6623, OLAND_6623, OLAND)
+CHIPSET(0x6631, OLAND_6631, OLAND)
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -289,6 +289,7 @@ def generate(env):
                '_CRT_SECURE_NO_DEPRECATE',
                '_SCL_SECURE_NO_WARNINGS',
                '_SCL_SECURE_NO_DEPRECATE',
+                '_ALLOW_KEYWORD_MACROS',
            ]
        if env['build'] in ('debug', 'checked'):
            cppdefines += ['_DEBUG']
@@ -401,6 +402,8 @@ def generate(env):
              '/Oi', # enable intrinsic functions
            ]
        else:
+            if distutils.version.LooseVersion(env['MSVC_VERSION']) < distutils.version.LooseVersion('11.0'):
+                print 'scons: warning: Visual Studio versions prior to 2012 are known to produce incorrect code when optimizations are enabled ( https://bugs.freedesktop.org/show_bug.cgi?id=58718 )'
            ccflags += [
                '/O2', # optimize for speed
            ]
@@ -530,7 +533,7 @@ def generate(env):
    env.PkgCheckModules('XF86VIDMODE', ['xxf86vm'])
    env.PkgCheckModules('DRM', ['libdrm >= 2.4.24'])
    env.PkgCheckModules('DRM_INTEL', ['libdrm_intel >= 2.4.30'])
-    env.PkgCheckModules('DRM_RADEON', ['libdrm_radeon >= 2.4.40'])
+    env.PkgCheckModules('DRM_RADEON', ['libdrm_radeon >= 2.4.42'])
    env.PkgCheckModules('XORG', ['xorg-server >= 1.6.0'])
    env.PkgCheckModules('KMS', ['libkms >= 2.4.24'])
    env.PkgCheckModules('UDEV', ['libudev > 150'])
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -92,7 +92,19 @@ def generate(env):
            'HAVE_STDINT_H',
        ])
        env.Prepend(LIBPATH = [os.path.join(llvm_dir, 'lib')])
-        if llvm_version >= distutils.version.LooseVersion('3.0'):
+        if llvm_version >= distutils.version.LooseVersion('3.2'):
+            # 3.2
+            env.Prepend(LIBS = [
+                'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser',
+                'LLVMX86CodeGen', 'LLVMX86Desc', 'LLVMSelectionDAG',
+                'LLVMAsmPrinter', 'LLVMMCParser', 'LLVMX86AsmPrinter',
+                'LLVMX86Utils', 'LLVMX86Info', 'LLVMJIT',
+                'LLVMExecutionEngine', 'LLVMCodeGen', 'LLVMScalarOpts',
+                'LLVMInstCombine', 'LLVMTransformUtils', 'LLVMipa',
+                'LLVMAnalysis', 'LLVMTarget', 'LLVMMC', 'LLVMCore',
+                'LLVMSupport', 'LLVMRuntimeDyld', 'LLVMObject'
+            ])
+        elif llvm_version >= distutils.version.LooseVersion('3.0'):
            # 3.0
            env.Prepend(LIBS = [
                'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser',
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -195,7 +195,14 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
      for (i = 0; attr_list[i] != EGL_NONE; i += 2)
         _eglSetConfigKey(&base, attr_list[i], attr_list[i+1]);

-   if (depth > 0 && depth != base.BufferSize)
+   /* Allow a 24-bit RGB visual to match a 32-bit RGBA EGLConfig.  Otherwise
+    * it will only match a 32-bit RGBA visual.  On a composited window manager
+    * on X11, this will make all of the EGLConfigs with destination alpha get
+    * blended by the compositor.  This is probably not what the application
+    * wants... especially on drivers that only have 32-bit RGBA EGLConfigs!
+    */
+   if (depth > 0 && depth != base.BufferSize
+       && !(depth == 24 && base.BufferSize == 32))
      return NULL;

   if (rgba_masks && memcmp(rgba_masks, dri_masks, sizeof(dri_masks)))
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -446,6 +446,7 @@ dri2_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
 {
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
+   __DRIbuffer buffer;
   int i, ret = 0;

   while (dri2_surf->frame_callback && ret != -1)
@@ -463,6 +464,13 @@ dri2_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
      if (dri2_surf->color_buffers[i].age > 0)
         dri2_surf->color_buffers[i].age++;

+   /* Make sure we have a back buffer in case we're swapping without ever
+    * rendering. */
+   if (get_back_bo(dri2_surf, &buffer) < 0) {
+      _eglError(EGL_BAD_ALLOC, "dri2_swap_buffers");
+      return EGL_FALSE;
+   }
+
   dri2_surf->back->age = 1;
   dri2_surf->current = dri2_surf->back;
   dri2_surf->back = NULL;
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -167,12 +167,17 @@ static void interp( const struct clip_stage *clip,
   {
      int k;
      t_nopersp = t;
-      for (k = 0; k < 2; k++)
+      /* find either in.x != out.x or in.y != out.y */
+      for (k = 0; k < 2; k++) {
         if (in->clip[k] != out->clip[k]) {
-            t_nopersp = (dst->clip[k] - out->clip[k]) /
-               (in->clip[k] - out->clip[k]);
+            /* do divide by W, then compute linear interpolation factor */
+            float in_coord = in->clip[k] / in->clip[3];
+            float out_coord = out->clip[k] / out->clip[3];
+            float dst_coord = dst->clip[k] / dst->clip[3];
+            t_nopersp = (dst_coord - out_coord) / (in_coord - out_coord);
            break;
         }
+      }
   }

   /* Other attributes
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -127,10 +127,44 @@ static void offset_first_tri( struct draw_stage *stage,
 			      struct prim_header *header )
 {
   struct offset_stage *offset = offset_stage(stage);
+   const struct pipe_rasterizer_state *rast = stage->draw->rasterizer;
+   unsigned fill_mode = rast->fill_front;
+   boolean do_offset;
+
+   if (rast->fill_back != rast->fill_front) {
+      /* Need to check for back-facing triangle */
+      boolean ccw = header->det < 0.0f;
+      if (ccw != rast->front_ccw)
+         fill_mode = rast->fill_back;
+   }
+
+   /* Now determine if we need to do offsetting for the point/line/fill mode */
+   switch (fill_mode) {
+   case PIPE_POLYGON_MODE_FILL:
+      do_offset = rast->offset_tri;
+      break;
+   case PIPE_POLYGON_MODE_LINE:
+      do_offset = rast->offset_line;
+      break;
+   case PIPE_POLYGON_MODE_POINT:
+      do_offset = rast->offset_point;
+      break;
+   default:
+      assert(!"invalid fill_mode in offset_first_tri()");
+      do_offset = rast->offset_tri;
+   }
+
+   if (do_offset) {
+      offset->scale = rast->offset_scale;
+      offset->clamp = rast->offset_clamp;
+      offset->units = (float) (rast->offset_units * stage->draw->mrd);
+   }
+   else {
+      offset->scale = 0.0f;
+      offset->clamp = 0.0f;
+      offset->units = 0.0f;
+   }

-   offset->units = (float) (stage->draw->rasterizer->offset_units * stage->draw->mrd);
-   offset->scale = stage->draw->rasterizer->offset_scale;
-   offset->clamp = stage->draw->rasterizer->offset_clamp;

   stage->tri = offset_tri;
   stage->tri( stage, header );
--- a/src/gallium/auxiliary/util/u_range.h
+++ b/src/gallium/auxiliary/util/u_range.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2013 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * @file
+ * 1D integer range, capable of the union and intersection operations.
+ *
+ * It only maintains a single interval which is extended when the union is
+ * done. This implementation is partially thread-safe (readers are not
+ * protected by a lock).
+ *
+ * @author Marek Olšák
+ */
+
+#ifndef U_RANGE_H
+#define U_RANGE_H
+
+#include "os/os_thread.h"
+
+struct util_range {
+   unsigned start; /* first value covered by the range (inclusive) */
+   unsigned end; /* one past the last value covered (exclusive) */
+
+   /* held by util_range_add() while it updates start/end (multiple contexts): */
+   pipe_mutex write_mutex;
+};
+
+
+static INLINE void
+util_range_set_empty(struct util_range *range)
+{
+   range->start = ~0; /* all bits set: no start passed to util_range_add() exceeds it */
+   range->end = 0; /* start > end, so util_ranges_intersect() is always false */
+}
+
+/* Like a set union, but the result is the single interval spanning both. */
+static INLINE void
+util_range_add(struct util_range *range, unsigned start, unsigned end)
+{
+   if (start < range->start || end > range->end) { /* checked without the lock */
+      pipe_mutex_lock(range->write_mutex);
+      range->start = MIN2(start, range->start); /* only ever grows downwards */
+      range->end = MAX2(end, range->end); /* only ever grows upwards */
+      pipe_mutex_unlock(range->write_mutex);
+   }
+}
+
+static INLINE boolean
+util_ranges_intersect(struct util_range *range, unsigned start, unsigned end)
+{ /* true iff [start, end) and the stored range share a value; takes no lock */
+   return MAX2(start, range->start) < MIN2(end, range->end);
+}
+
+
+/* Init/deinit */
+
+static INLINE void
+util_range_init(struct util_range *range)
+{
+   pipe_mutex_init(range->write_mutex);
+   util_range_set_empty(range); /* a freshly initialized range covers nothing */
+}
+
+static INLINE void
+util_range_destroy(struct util_range *range)
+{
+   pipe_mutex_destroy(range->write_mutex); /* the mutex is the only member torn down */
+}
+
+#endif
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -421,10 +421,10 @@ util_clear_depth_stencil(struct pipe_context *pipe,
         else {
            uint32_t dst_mask;
            if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT)
-               dst_mask = 0xffffff00;
+               dst_mask = 0x00ffffff;
            else {
               assert(format == PIPE_FORMAT_S8_UINT_Z24_UNORM);
-               dst_mask = 0xffffff;
+               dst_mask = 0xffffff00;
            }
            if (clear_flags & PIPE_CLEAR_DEPTH)
               dst_mask = ~dst_mask;
--- a/src/gallium/drivers/llvmpipe/Makefile.am
+++ b/src/gallium/drivers/llvmpipe/Makefile.am
@@ -85,23 +85,30 @@ check_PROGRAMS = \
 	lp_test_printf
 TESTS = $(check_PROGRAMS)

+TEST_LIBS = \
+	    libllvmpipe.la \
+	    $(top_builddir)/src/gallium/auxiliary/libgallium.la \
+	    $(LLVM_LIBS) \
+	    $(DLOPEN_LIBS) \
+	    $(PTHREAD_LIBS)
+
 lp_test_format_SOURCES = lp_test_format.c lp_test_main.c
-lp_test_format_LDADD = libllvmpipe.la ../../auxiliary/libgallium.la $(LLVM_LIBS)
+lp_test_format_LDADD = $(TEST_LIBS)
 nodist_EXTRA_lp_test_format_SOURCES = dummy.cpp

 lp_test_arit_SOURCES = lp_test_arit.c lp_test_main.c
-lp_test_arit_LDADD = libllvmpipe.la ../../auxiliary/libgallium.la $(LLVM_LIBS)
+lp_test_arit_LDADD = $(TEST_LIBS)
 nodist_EXTRA_lp_test_arit_SOURCES = dummy.cpp

 lp_test_blend_SOURCES = lp_test_blend.c lp_test_main.c
-lp_test_blend_LDADD = libllvmpipe.la ../../auxiliary/libgallium.la $(LLVM_LIBS)
+lp_test_blend_LDADD = $(TEST_LIBS)
 nodist_EXTRA_lp_test_blend_SOURCES = dummy.cpp

 lp_test_conv_SOURCES = lp_test_conv.c lp_test_main.c
-lp_test_conv_LDADD = libllvmpipe.la ../../auxiliary/libgallium.la $(LLVM_LIBS)
+lp_test_conv_LDADD = $(TEST_LIBS)
 nodist_EXTRA_lp_test_conv_SOURCES = dummy.cpp

 lp_test_printf_SOURCES = lp_test_printf.c lp_test_main.c
-lp_test_printf_LDADD = libllvmpipe.la ../../auxiliary/libgallium.la $(LLVM_LIBS)
+lp_test_printf_LDADD = $(TEST_LIBS)
 nodist_EXTRA_lp_test_printf_SOURCES = dummy.cpp

--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -46,6 +46,10 @@ clear_flags(struct pipe_rasterizer_state *rast)
 {
   rast->light_twoside = 0;
   rast->offset_tri = 0;
+   rast->offset_line = 0;
+   rast->offset_point = 0;
+   rast->offset_units = 0.0f;
+   rast->offset_scale = 0.0f;
 }


@@ -74,6 +78,8 @@ llvmpipe_create_rasterizer_state(struct pipe_context *pipe,
    */
   need_pipeline = (rast->fill_front != PIPE_POLYGON_MODE_FILL ||
 		    rast->fill_back != PIPE_POLYGON_MODE_FILL ||
+                    rast->offset_point ||
+                    rast->offset_line ||
 		    rast->point_smooth ||
 		    rast->line_smooth ||
 		    rast->line_stipple_enable ||
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -295,7 +295,9 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
   /* assert(lpr->base.bind); */

   if (resource_is_texture(&lpr->base)) {
-      if (lpr->base.bind & PIPE_BIND_DISPLAY_TARGET) {
+      if (lpr->base.bind & (PIPE_BIND_DISPLAY_TARGET |
+                            PIPE_BIND_SCANOUT |
+                            PIPE_BIND_SHARED)) {
         /* displayable surface */
         if (!llvmpipe_displaytarget_layout(screen, lpr))
            goto fail;
--- a/src/gallium/drivers/r300/Makefile.am
+++ b/src/gallium/drivers/r300/Makefile.am
@@ -22,6 +22,7 @@ r300_compiler_tests_CPPFLAGS = \
 	-I$(top_srcdir)/src/gallium/drivers/r300/compiler
 r300_compiler_tests_SOURCES = \
 	$(testdir)/r300_compiler_tests.c \
+	$(testdir)/radeon_compiler_optimize_tests.c \
 	$(testdir)/radeon_compiler_util_tests.c \
 	$(testdir)/rc_test_helpers.c \
 	$(testdir)/unit_test.c
--- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
+++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
@@ -854,7 +854,7 @@ static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, voi
 		transform_negative_addressing(c, lastARL, inst, min_offset);
 }

-static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
+struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 	.IsNative = &swizzle_is_native,
 	.Split = 0 /* should never be called */
 };
--- a/src/gallium/drivers/r300/compiler/radeon_compiler_util.h
+++ b/src/gallium/drivers/r300/compiler/radeon_compiler_util.h
@@ -1,3 +1,30 @@
+/*
+ * Copyright 2010 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
 #include "radeon_program_constants.h"

 #ifndef RADEON_PROGRAM_UTIL_H
--- a/src/gallium/drivers/r300/compiler/radeon_emulate_loops.h
+++ b/src/gallium/drivers/r300/compiler/radeon_emulate_loops.h
@@ -1,4 +1,29 @@
-
+/*
+ * Copyright 2010 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */

 #ifndef RADEON_EMULATE_LOOPS_H
 #define RADEON_EMULATE_LOOPS_H
--- a/src/gallium/drivers/r300/compiler/radeon_inline_literals.c
+++ b/src/gallium/drivers/r300/compiler/radeon_inline_literals.c
@@ -1,3 +1,27 @@
+/*
+ * Copyright 2012 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Tom Stellard <thomas.stellard@amd.com>
+ */

 #include "radeon_compiler.h"
 #include "radeon_compiler_util.h"
--- a/src/gallium/drivers/r300/compiler/radeon_optimize.c
+++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c
@@ -816,7 +816,7 @@ static int peephole_mul_omod(

 	/* Rewrite the instructions */
 	for (var = writer_list->Item; var; var = var->Friend) {
-		struct rc_variable * writer = writer_list->Item;
+		struct rc_variable * writer = var;
 		unsigned conversion_swizzle = rc_make_conversion_swizzle(
 					writer->Inst->U.I.DstReg.WriteMask,
 					inst_mul->U.I.DstReg.WriteMask);
--- a/src/gallium/drivers/r300/compiler/radeon_pair_dead_sources.c
+++ b/src/gallium/drivers/r300/compiler/radeon_pair_dead_sources.c
@@ -1,3 +1,29 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */

 #include "radeon_compiler.h"
 #include "radeon_compiler_util.h"
--- a/src/gallium/drivers/r300/compiler/radeon_rename_regs.h
+++ b/src/gallium/drivers/r300/compiler/radeon_rename_regs.h
@@ -1,3 +1,29 @@
+/*
+ * Copyright 2010 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */

 #ifndef RADEON_RENAME_REGS_H
 #define RADEON_RENAME_REGS_H
--- a/src/gallium/drivers/r300/compiler/radeon_swizzle.h
+++ b/src/gallium/drivers/r300/compiler/radeon_swizzle.h
@@ -54,4 +54,6 @@ struct rc_swizzle_caps {
 	void (*Split)(struct rc_src_register reg, unsigned int mask, struct rc_swizzle_split * split);
 };

+extern struct rc_swizzle_caps r300_vertprog_swizzle_caps;
+
 #endif /* RADEON_SWIZZLE_H */
--- a/src/gallium/drivers/r300/compiler/radeon_vert_fc.c
+++ b/src/gallium/drivers/r300/compiler/radeon_vert_fc.c
@@ -1,3 +1,27 @@
+/*
+ * Copyright 2012 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Tom Stellard <thomas.stellard@amd.com>
+ */

 #include "radeon_compiler.h"
 #include "radeon_compiler_util.h"
--- a/src/gallium/drivers/r300/compiler/tests/r300_compiler_tests.c
+++ b/src/gallium/drivers/r300/compiler/tests/r300_compiler_tests.c
@@ -1,6 +1,43 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
 #include "r300_compiler_tests.h"

+#include <stdlib.h>
+
 int main(int argc, char ** argv)
 {
-	radeon_compiler_util_run_tests();
+	unsigned pass = 1;
+	pass &= radeon_compiler_optimize_run_tests();
+	pass &= radeon_compiler_util_run_tests();
+
+	if (pass) {
+		return EXIT_SUCCESS;
+	} else {
+		return EXIT_FAILURE;
+	}
 }
--- a/src/gallium/drivers/r300/compiler/tests/r300_compiler_tests.h
+++ b/src/gallium/drivers/r300/compiler/tests/r300_compiler_tests.h
@@ -1,2 +1,29 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */

-void radeon_compiler_util_run_tests(void);
+unsigned radeon_compiler_optimize_run_tests(void);
+unsigned radeon_compiler_util_run_tests(void);
--- a/src/gallium/drivers/r300/compiler/tests/radeon_compiler_optimize_tests.c
+++ b/src/gallium/drivers/r300/compiler/tests/radeon_compiler_optimize_tests.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Tom Stellard <thomas.stellard@amd.com>
+ */
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+#include "r300_compiler_tests.h"
+#include "rc_test_helpers.h"
+#include "unit_test.h"
+
+/**
+ * Regression test for peephole_mul_omod() when the MUL has several writers.
+ *
+ * Two RCP instructions each write one channel of temp[0] and a single MUL
+ * scales both channels by const[0].x (presumably the 2.0 immediate that is
+ * registered first).  After rc_optimize() the test expects exactly three
+ * instructions: both RCPs carrying RC_OMOD_MUL_2, followed by the MOV.
+ *
+ * @param result Outcome record handed to test_begin() and test_check().
+ */
+static void test_runner_rc_optimize(struct test_result * result)
+{
+	struct radeon_compiler c;
+	struct rc_instruction *inst;
+	struct rc_instruction *inst_list[3];
+	const unsigned max_insts = sizeof(inst_list) / sizeof(inst_list[0]);
+	unsigned inst_count = 0;
+	float const0[4] = {2.0f, 0.0f, 0.0f, 0.0f};
+	unsigned pass = 1;
+
+	test_begin(result);
+	init_compiler(&c, RC_FRAGMENT_PROGRAM, 1, 0);
+
+	rc_constants_add_immediate_vec4(&c.Program.Constants, const0);
+
+	add_instruction(&c, "RCP temp[0].x, const[1].x___;");
+	add_instruction(&c, "RCP temp[0].y, const[1]._y__;");
+	add_instruction(&c, "MUL temp[1].xy, const[0].xx__, temp[0].xy__;");
+	add_instruction(&c, "MOV output[0].xy, temp[1].xy;" );
+
+	rc_optimize(&c, NULL);
+
+	/* Count every remaining instruction but only record those that fit.
+	 * If the MUL is not folded away, four instructions are still present
+	 * and the fourth must not be stored past the end of inst_list. */
+	for(inst = c.Program.Instructions.Next;
+					inst != &c.Program.Instructions;
+					inst = inst->Next, inst_count++) {
+		if (inst_count < max_insts) {
+			inst_list[inst_count] = inst;
+		}
+	}
+
+	/* The count is tested first so that entries of inst_list that were
+	 * never assigned are not dereferenced. */
+	if (inst_count != max_insts ||
+			inst_list[0]->U.I.Omod != RC_OMOD_MUL_2 ||
+			inst_list[1]->U.I.Omod != RC_OMOD_MUL_2 ||
+			inst_list[2]->U.I.Opcode != RC_OPCODE_MOV) {
+		pass = 0;
+	}
+	test_check(result, pass);
+}
+
+/**
+ * Runs the rc_optimize() unit tests defined in this file.
+ *
+ * @return Whatever run_tests() returns for this file's test table.
+ */
+unsigned radeon_compiler_optimize_run_tests()
+{
+	struct test tests[] = {
+		{"rc_optimize() => peephole_mul_omod()", test_runner_rc_optimize},
+		{NULL, NULL}
+	};
+	return run_tests(tests);
+}
--- a/src/gallium/drivers/r300/compiler/tests/radeon_compiler_util_tests.c
+++ b/src/gallium/drivers/r300/compiler/tests/radeon_compiler_util_tests.c
@@ -1,3 +1,30 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
@@ -67,11 +94,11 @@ static void test_runner_rc_inst_can_use_presub(struct test_result * result)
 		"MAD temp[0].xyz, temp[2].xyz_, -temp[3].xxx_, input[5].xyz_;");
 }

-void radeon_compiler_util_run_tests()
+unsigned radeon_compiler_util_run_tests()
 {
 	struct test tests[] = {
 		{"rc_inst_can_use_presub()", test_runner_rc_inst_can_use_presub},
 		{NULL, NULL}
 	};
-	run_tests(tests);
+	return run_tests(tests);
 }
--- a/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c
+++ b/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.c
@@ -1,3 +1,32 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Tom Stellard <thomas.stellard@amd.com>
+ */
+
 #include <errno.h>
 #include <regex.h>
 #include <stdlib.h>
@@ -5,9 +34,14 @@
 #include <string.h>
 #include <sys/types.h>

-#include "../radeon_compiler_util.h"
-#include "../radeon_opcodes.h"
-#include "../radeon_program.h"
+#include "r500_fragprog.h"
+#include "r300_fragprog_swizzle.h"
+#include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
+#include "radeon_opcodes.h"
+#include "radeon_program.h"
+#include "radeon_regalloc.h"
+#include "radeon_swizzle.h"

 #include "rc_test_helpers.h"

@@ -259,6 +293,7 @@ int init_rc_normal_dst(
 	if (tokens.WriteMask.Length == 0) {
 		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
 	} else {
+		inst->U.I.DstReg.WriteMask = 0;
 		/* The first character should be '.' */
 		if (tokens.WriteMask.String[0] != '.') {
 			fprintf(stderr, "1st char of writemask is not valid.\n");
@@ -311,7 +346,8 @@ struct inst_tokens {
 * this string is the same that is output by rc_program_print.
 * @return 1 On success, 0 on failure
 */
-int init_rc_normal_instruction(
+
+int parse_rc_normal_instruction(
 	struct rc_instruction * inst,
 	const char * inst_str)
 {
@@ -320,10 +356,6 @@ int init_rc_normal_instruction(
 	regmatch_t matches[REGEX_INST_MATCHES];
 	struct inst_tokens tokens;

-	/* Initialize inst */
-	memset(inst, 0, sizeof(struct rc_instruction));
-	inst->Type = RC_INSTRUCTION_NORMAL;
-
 	/* Execute the regex */
 	if (!regex_helper(regex_str, inst_str, matches, REGEX_INST_MATCHES)) {
 		return 0;
@@ -340,6 +372,8 @@ int init_rc_normal_instruction(


 	/* Fill out the rest of the instruction. */
+	inst->Type = RC_INSTRUCTION_NORMAL;
+
 	for (i = 0; i < MAX_RC_OPCODE; i++) {
 		const struct rc_opcode_info * info = rc_get_opcode_info(i);
 		unsigned int first_src = 3;
@@ -378,3 +412,81 @@
 	}
 	return 1;
 }
+
+/**
+ * Zeroes inst and then fills it in from its textual form.
+ *
+ * @param inst Instruction to overwrite.
+ * @param inst_str Text handed unchanged to parse_rc_normal_instruction().
+ * @return The value returned by parse_rc_normal_instruction().
+ */
+int init_rc_normal_instruction(
+	struct rc_instruction * inst,
+	const char * inst_str)
+{
+	/* Initialize inst */
+	memset(inst, 0, sizeof(struct rc_instruction));
+
+	return parse_rc_normal_instruction(inst, inst_str);
+}
+
+/**
+ * Inserts a new instruction after c->Program.Instructions.Prev and fills it
+ * in from inst_string.
+ *
+ * This function has no return value, so a string that fails to parse is
+ * reported on stderr instead of being dropped silently; the new instruction
+ * stays in the program either way.
+ *
+ * @param c Compiler whose program receives the instruction.
+ * @param inst_string Text handed to parse_rc_normal_instruction().
+ */
+void add_instruction(struct radeon_compiler *c, const char * inst_string)
+{
+	struct rc_instruction * new_inst =
+		rc_insert_new_instruction(c, c->Program.Instructions.Prev);
+
+	if (!parse_rc_normal_instruction(new_inst, inst_string)) {
+		fprintf(stderr, "Failed to parse instruction: %s\n",
+			inst_string);
+	}
+}
+
+/**
+ * Initializes c with rc_init() and fills in per-chip limits for a test.
+ *
+ * NOTE(review): the rc_regalloc_state is malloc'd without a NULL check, is
+ * not initialized here and is never freed by this helper - confirm that
+ * rc_init() and the tests tolerate that.
+ *
+ * @param c Compiler to set up.
+ * @param program_type RC_FRAGMENT_PROGRAM enables half swizzles, presub and
+ *        omod and picks the r500/r300 fragment swizzle caps; any other
+ *        value selects r300_vertprog_swizzle_caps.
+ * @param is_r500 Chip-family flag used to pick the max_* limits.
+ * @param is_r400 Chip-family flag used to pick the max_* limits.
+ */
+void init_compiler(
+	struct radeon_compiler *c,
+	enum rc_program_type program_type,
+	unsigned is_r500,
+	unsigned is_r400)
+{
+	struct rc_regalloc_state *rs = malloc(sizeof(struct rc_regalloc_state));
+	rc_init(c, rs);
+
+	c->is_r500 = is_r500;
+	c->max_temp_regs = is_r500 ? 128 : (is_r400 ? 64 : 32);
+	c->max_constants = is_r500 ? 256 : 32;
+	c->max_alu_insts = (is_r500 || is_r400) ? 512 : 64;
+	c->max_tex_insts = (is_r500 || is_r400) ? 512 : 32;
+	if (program_type == RC_FRAGMENT_PROGRAM) {
+		c->has_half_swizzles = 1;
+		c->has_presub = 1;
+		c->has_omod = 1;
+		c->SwizzleCaps =
+			is_r500 ? &r500_swizzle_caps : &r300_swizzle_caps;
+	} else {
+		c->SwizzleCaps = &r300_vertprog_swizzle_caps;
+	}
+}
--- a/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.h
+++ b/src/gallium/drivers/r300/compiler/tests/rc_test_helpers.h
@@ -1,3 +1,33 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Tom Stellard <thomas.stellard@amd.com>
+ */
+
+#include "radeon_compiler.h"

 int init_rc_normal_src(
 	struct rc_instruction * inst,
@@ -8,6 +38,18 @@ int init_rc_normal_dst(
 	struct rc_instruction * inst,
 	const char * dst_str);

+int parse_rc_normal_instruction(
+	struct rc_instruction * inst,
+	const char * inst_str);
+
 int init_rc_normal_instruction(
 	struct rc_instruction * inst,
 	const char * inst_str);
+
+void add_instruction(struct radeon_compiler *c, const char * inst_string);
+
+void init_compiler(
+	struct radeon_compiler *c,
+	enum rc_program_type program_type,
+	unsigned is_r500,
+	unsigned is_r400);
--- a/src/gallium/drivers/r300/compiler/tests/unit_test.c
+++ b/src/gallium/drivers/r300/compiler/tests/unit_test.c
@@ -1,19 +1,51 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>

 #include "unit_test.h"

-void run_tests(struct test tests[])
+unsigned run_tests(struct test tests[])
 {
 	int i;
+	unsigned pass = 1;
 	for (i = 0; tests[i].name; i++) {
 		printf("Test %s\n", tests[i].name);
 		memset(&tests[i].result, 0, sizeof(tests[i].result));
 		tests[i].test_func(&tests[i].result);
 		printf("Test %s (%d/%d) pass\n", tests[i].name,
 			tests[i].result.pass, tests[i].result.test_count);
+		if (tests[i].result.pass != tests[i].result.test_count) {
+			pass = 0;
+		}
 	}
+	return pass;
 }

 void test_begin(struct test_result * result)
--- a/src/gallium/drivers/r300/compiler/tests/unit_test.h
+++ b/src/gallium/drivers/r300/compiler/tests/unit_test.h
@@ -1,3 +1,29 @@
+/*
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */

 struct test_result {
 	unsigned int test_count;
@@ -11,7 +37,7 @@ struct test {
 	struct test_result result;
 };

-void run_tests(struct test tests[]);
+unsigned run_tests(struct test tests[]);

 void test_begin(struct test_result * result);
 void test_check(struct test_result * result, int cond);
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -487,6 +487,7 @@ static void r300_set_blend_color(struct pipe_context* pipe,
        (struct r300_blend_color_state*)r300->blend_color_state.state;
    struct pipe_blend_color c;
    enum pipe_format format = fb->nr_cbufs ? fb->cbufs[0]->format : 0;
+    float tmp;
    CB_LOCALS;

    state->state = *color; /* Save it, so that we can reuse it in set_fb_state */
@@ -513,6 +514,13 @@ static void r300_set_blend_color(struct pipe_context* pipe,
            c.color[2] = c.color[3];
            break;

+        case PIPE_FORMAT_R8G8B8A8_UNORM:
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
+            tmp = c.color[0];
+            c.color[0] = c.color[2];
+            c.color[2] = tmp;
+            break;
+
        default:;
        }
    }
@@ -919,6 +927,9 @@ r300_set_framebuffer_state(struct pipe_context* pipe,
    /* Need to reset clamping or colormask. */
    r300_mark_atom_dirty(r300, &r300->blend_state);

+    /* Re-swizzle the blend color. */
+    r300_set_blend_color(pipe, &((struct r300_blend_color_state*)r300->blend_color_state.state)->state);
+
    /* If zsbuf is set from NULL to non-NULL or vice versa.. */
    if (!!old_state->zsbuf != !!state->zsbuf) {
        r300_mark_atom_dirty(r300, &r300->dsa_state);
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -978,9 +978,9 @@ r300_texture_create_object(struct r300_screen *rscreen,
    tex->tex.microtile = microtile;
    tex->tex.macrotile[0] = macrotile;
    tex->tex.stride_in_bytes_override = stride_in_bytes_override;
-    tex->domain = base->flags & R300_RESOURCE_FLAG_TRANSFER ?
-                  RADEON_DOMAIN_GTT :
-                  RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT;
+    tex->domain = base->flags & R300_RESOURCE_FLAG_TRANSFER ? RADEON_DOMAIN_GTT :
+                  base->nr_samples > 1 ? RADEON_DOMAIN_VRAM :
+                                         RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT;
    tex->buf = buffer;

    r300_texture_desc_init(rscreen, tex, base);
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -243,9 +243,9 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
 void evergreen_dma_copy(struct r600_context *rctx,
 		struct pipe_resource *dst,
 		struct pipe_resource *src,
-		unsigned long dst_offset,
-		unsigned long src_offset,
-		unsigned long size)
+		uint64_t dst_offset,
+		uint64_t src_offset,
+		uint64_t size)
 {
 	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
 	unsigned i, ncopy, csize, sub_cmd, shift;
@@ -283,4 +283,7 @@ void evergreen_dma_copy(struct r600_context *rctx,
 		src_offset += csize << shift;
 		size -= csize;
 	}
+
+	util_range_add(&rdst->valid_buffer_range, dst_offset,
+		       dst_offset + size);
 }
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -808,6 +808,7 @@ static void *evergreen_create_dsa_state(struct pipe_context *ctx,
 	dsa->valuemask[1] = state->stencil[1].valuemask;
 	dsa->writemask[0] = state->stencil[0].writemask;
 	dsa->writemask[1] = state->stencil[1].writemask;
+	dsa->zwritemask = state->depth.writemask;

 	db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
 		S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
@@ -1321,6 +1322,10 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx,
 	 * elements. */
 	surf->cb_color_dim = pipe_buffer->width0;

+	/* Set the buffer range the GPU will have access to: */
+	util_range_add(&r600_resource(pipe_buffer)->valid_buffer_range,
+		       0, pipe_buffer->width0);
+
 	surf->cb_color_cmask = surf->cb_color_base;
 	surf->cb_color_cmask_slice = 0;
 	surf->cb_color_fmask = surf->cb_color_base;
@@ -1405,10 +1410,15 @@ void evergreen_init_color_surface(struct r600_context *rctx,
 			S_028C74_NON_DISP_TILING_ORDER(non_disp_tiling) |
 		        S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);

-	if (rctx->chip_class == CAYMAN && rtex->resource.b.b.nr_samples > 1) {
-		unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
-		color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
-				S_028C74_NUM_FRAGMENTS(log_samples);
+	if (rctx->chip_class == CAYMAN) {
+		color_attrib |=	S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] ==
+							   UTIL_FORMAT_SWIZZLE_1);
+
+		if (rtex->resource.b.b.nr_samples > 1) {
+			unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
+			color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
+					S_028C74_NUM_FRAGMENTS(log_samples);
+		}
 	}

 	ntype = V_028C70_NUMBER_UNORM;
@@ -1647,6 +1657,11 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 	}
 	if (rctx->framebuffer.state.zsbuf) {
 		rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
+
+		rtex = (struct r600_texture*)rctx->framebuffer.state.zsbuf->texture;
+		if (rtex->htile) {
+			rctx->flags |= R600_CONTEXT_FLUSH_AND_INV_DB_META;
+		}
 	}

 	util_copy_framebuffer_state(&rctx->framebuffer.state, state);
@@ -1668,6 +1683,8 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 		surf = (struct r600_surface*)state->cbufs[i];
 		rtex = (struct r600_texture*)surf->base.texture;

+		r600_context_add_resource_size(ctx, state->cbufs[i]->texture);
+
 		if (!surf->color_initialized) {
 			evergreen_init_color_surface(rctx, surf);
 		}
@@ -1699,6 +1716,8 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 	if (state->zsbuf) {
 		surf = (struct r600_surface*)state->zsbuf;

+		r600_context_add_resource_size(ctx, state->zsbuf->texture);
+
 		if (!surf->depth_initialized) {
 			evergreen_init_depth_surface(rctx, surf);
 		}
@@ -2218,9 +2237,23 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
 		}
 		db_render_override |= S_02800C_NOOP_CULL_DISABLE(1);
 	}
-	if (rctx->db_state.rsurf && rctx->db_state.rsurf->htile_enabled) {
+	/* FIXME: we should be able to use hyperz even when we are not writing
+	 * to the zbuffer, but somehow this triggers a GPU lockup. See:
+	 *
+	 * https://bugs.freedesktop.org/show_bug.cgi?id=60848
+	 *
+	 * Disable hyperz for now if not writing to the zbuffer.
+	 */
+	if (rctx->db_state.rsurf && rctx->db_state.rsurf->htile_enabled && rctx->zwritemask) {
 		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
 		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_OFF);
+		/* This fixes a lockup when hyperz and alpha test are enabled at
+		 * the same time: somehow the GPU gets confused about which order
+		 * to pick for the z test.
+		 */
+		if (rctx->alphatest_state.sx_alpha_test_control) {
+			db_render_override |= S_02800C_FORCE_SHADER_Z_ORDER(1);
+		}
 	} else {
 		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE);
 	}
@@ -3210,7 +3243,7 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_state *rstate = &shader->rstate;
 	struct r600_shader *rshader = &shader->shader;
-	unsigned i, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z, spi_ps_in_control_1, db_shader_control;
+	unsigned i, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z, spi_ps_in_control_1, db_shader_control = 0;
 	int pos_index = -1, face_index = -1;
 	int ninterp = 0;
 	boolean have_linear = FALSE, have_centroid = FALSE, have_perspective = FALSE;
@@ -3220,7 +3253,6 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader

 	rstate->nregs = 0;

-	db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
 	for (i = 0; i < rshader->ninput; i++) {
 		/* evergreen NUM_INTERP only contains values interpolated into the LDS,
 		   POSITION goes via GPRs from the SC so isn't counted */
@@ -3454,6 +3486,24 @@ void evergreen_update_db_shader_control(struct r600_context * rctx)
 								V_02880C_EXPORT_DB_FULL) |
 			S_02880C_ALPHA_TO_MASK_DISABLE(rctx->framebuffer.cb0_is_integer);

+	/* When alpha test is enabled we can't trust the hw to make the proper
+	 * decision on the order in which ztest should be run relative to
+	 * fragment shader execution.
+	 *
+	 * With alpha test enabled, RE_Z would perform early z rejection without
+	 * the early write to the zbuffer: the write is delayed until after
+	 * fragment shader execution, and thus after alpha test, so a fragment
+	 * discarded by the alpha test does not write its z value.
+	 * However, if ReZ is enabled and the zfunc/zenable/zwrite values change,
+	 * you can get a hang unless you flush the DB in between. For now just
+	 * use LATE_Z.
+	 */
+	if (rctx->alphatest_state.sx_alpha_test_control) {
+		db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
+	} else {
+		db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+	}
+
 	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
 		rctx->db_misc_state.atom.dirty = true;
@@ -3481,7 +3531,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
 	unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
 	unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split;
-	unsigned long base, addr;
+	uint64_t base, addr;

 	/* make sure that the dma ring is only one active */
 	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
@@ -3502,7 +3552,8 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 	if (dst_mode == RADEON_SURF_MODE_LINEAR) {
 		/* T2L */
 		array_mode = evergreen_array_mode(src_mode);
-		slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+		slice_tile_max = (rsrc->surface.level[src_level].nblk_x * rsrc->surface.level[src_level].nblk_y) >> 6;
+		slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
 		/* linear height must be the same as the slice tile max height, it's ok even
 		 * if the linear destination/source have smaller heigh as the size of the
 		 * dma packet will be using the copy_height which is always smaller or equal
@@ -3526,7 +3577,8 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 	} else {
 		/* L2T */
 		array_mode = evergreen_array_mode(dst_mode);
-		slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+		slice_tile_max = (rdst->surface.level[dst_level].nblk_x * rdst->surface.level[dst_level].nblk_y) >> 6;
+		slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
 		/* linear height must be the same as the slice tile max height, it's ok even
 		 * if the linear destination/source have smaller heigh as the size of the
 		 * dma packet will be using the copy_height which is always smaller or equal
@@ -3624,8 +3676,19 @@ boolean evergreen_dma_blit(struct pipe_context *ctx,
 		return FALSE;
 	}

+	/* 128 bpp surfaces require non_disp_tiling for both
+	 * tiled and linear buffers on cayman.  However, async
+	 * DMA only supports it on the tiled side.  As such
+	 * the tile order is backwards after a L2T/T2L packet.
+	 */
+	if ((rctx->chip_class == CAYMAN) &&
+	    (src_mode != dst_mode) &&
+	    (util_format_get_blocksize(src->format) >= 16)) {
+		return FALSE;
+	}
+
 	if (src_mode == dst_mode) {
-		unsigned long dst_offset, src_offset;
+		uint64_t dst_offset, src_offset;
 		/* simple dma blit would do NOTE code here assume :
 		 *   src_box.x/y == 0
 		 *   dst_x/y == 0
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -28,6 +28,7 @@

 #include "../../winsys/radeon/drm/radeon_winsys.h"
 #include "util/u_double_list.h"
+#include "util/u_range.h"
 #include "util/u_transfer.h"

 #define R600_ERR(fmt, args...) \
@@ -50,6 +51,16 @@ struct r600_resource {

 	/* Resource state. */
 	unsigned			domains;
+
+	/* The buffer range which is initialized (with a write transfer,
+	 * streamout, DMA, or as a random access target). The rest of
+	 * the buffer is considered invalid and can be mapped unsynchronized.
+	 *
+	 * This allows unsynchronized mapping of a buffer range which hasn't
+	 * been used yet. It's for applications which forget to use
+	 * the unsynchronized map flag and expect the driver to figure it out.
+	 */
+	struct util_range		valid_buffer_range;
 };

 #define R600_BLOCK_MAX_BO		32
@@ -151,6 +162,8 @@ struct r600_so_target {
 #define R600_CONTEXT_WAIT_CP_DMA_IDLE		(1 << 3)
 #define R600_CONTEXT_FLUSH_AND_INV		(1 << 4)
 #define R600_CONTEXT_FLUSH_AND_INV_CB_META	(1 << 5)
+#define R600_CONTEXT_PS_PARTIAL_FLUSH		(1 << 6)
+#define R600_CONTEXT_FLUSH_AND_INV_DB_META      (1 << 7)

 struct r600_context;
 struct r600_screen;
@@ -174,9 +187,9 @@ void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw);
 void r600_dma_copy(struct r600_context *rctx,
 		struct pipe_resource *dst,
 		struct pipe_resource *src,
-		unsigned long dst_offset,
-		unsigned long src_offset,
-		unsigned long size);
+		uint64_t dst_offset,
+		uint64_t src_offset,
+		uint64_t size);
 boolean r600_dma_blit(struct pipe_context *ctx,
 			struct pipe_resource *dst,
 			unsigned dst_level,
@@ -187,9 +200,9 @@ boolean r600_dma_blit(struct pipe_context *ctx,
 void evergreen_dma_copy(struct r600_context *rctx,
 		struct pipe_resource *dst,
 		struct pipe_resource *src,
-		unsigned long dst_offset,
-		unsigned long src_offset,
-		unsigned long size);
+		uint64_t dst_offset,
+		uint64_t src_offset,
+		uint64_t size);
 boolean evergreen_dma_blit(struct pipe_context *ctx,
 			struct pipe_resource *dst,
 			unsigned dst_level,
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -68,13 +68,17 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_DX10:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_DX10:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_DX10:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_DX10:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
@@ -150,13 +154,17 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_DX10:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_DX10:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_DX10:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_DX10:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
@@ -314,6 +322,7 @@ int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecod
 		output->swizzle_y == bc->cf_last->output.swizzle_y &&
 		output->swizzle_z == bc->cf_last->output.swizzle_z &&
 		output->swizzle_w == bc->cf_last->output.swizzle_w &&
+		output->comp_mask == bc->cf_last->output.comp_mask &&
 		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {

 		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
@@ -865,12 +874,6 @@ static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
 	bank_swizzle[4] = SQ_ALU_SCL_210;
 	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {

-		if (max_slots == 4) {
-			for (i = 0; i < max_slots; i++) {
-				if (bank_swizzle[i] == SQ_ALU_VEC_210)
-				  return -1;
-			}
-		}
 		init_bank_swizzle(&bs);
 		if (scalar_only == false) {
 			for (i = 0; i < 4; i++) {
@@ -902,8 +905,10 @@ static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
 					bank_swizzle[i]++;
 					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
 						break;
-					else
+					else if (i < max_slots - 1)
 						bank_swizzle[i] = SQ_ALU_VEC_012;
+					else
+						return -1;
 				}
 			}
 		}
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -34,6 +34,7 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
 {
 	struct r600_resource *rbuffer = r600_resource(buf);

+	util_range_destroy(&rbuffer->valid_buffer_range);
 	pb_reference(&rbuffer->buf, NULL);
 	FREE(rbuffer);
 }
@@ -98,6 +99,14 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,

 	assert(box->x + box->width <= resource->width0);

+	/* See if the buffer range being mapped has never been initialized,
+	 * in which case it can be mapped unsynchronized. */
+	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+	    usage & PIPE_TRANSFER_WRITE &&
+	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
+		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+	}
+
 	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
 	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
 		assert(usage & PIPE_TRANSFER_WRITE);
@@ -178,6 +187,7 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
 {
 	struct r600_context *rctx = (struct r600_context*)pipe;
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+	struct r600_resource *rbuffer = r600_resource(transfer->resource);

 	if (rtransfer->staging) {
 		struct pipe_resource *dst, *src;
@@ -189,7 +199,7 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
 		doffset = transfer->box.x;
 		soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT;
 		/* Copy the staging buffer into the original one. */
-		if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset)) {
+		if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset % 4)) {
 			if (rctx->screen->chip_class >= EVERGREEN) {
 				evergreen_dma_copy(rctx, dst, src, doffset, soffset, size);
 			} else {
@@ -203,6 +213,11 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
 		}
 		pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
 	}
+
+	if (transfer->usage & PIPE_TRANSFER_WRITE) {
+		util_range_add(&rbuffer->valid_buffer_range, transfer->box.x,
+			       transfer->box.x + transfer->box.width);
+	}
 	util_slab_free(&rctx->pool_transfers, transfer);
 }

@@ -259,6 +274,7 @@ bool r600_init_resource(struct r600_screen *rscreen,

 	res->cs_buf = rscreen->ws->buffer_get_cs_handle(res->buf);
 	res->domains = domains;
+	util_range_set_empty(&res->valid_buffer_range);
 	return true;
 }

@@ -275,6 +291,7 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
 	pipe_reference_init(&rbuffer->b.b.reference, 1);
 	rbuffer->b.b.screen = screen;
 	rbuffer->b.vtbl = &r600_buffer_vtbl;
+	util_range_init(&rbuffer->valid_buffer_range);

 	if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, TRUE, templ->usage)) {
 		FREE(rbuffer);
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -359,6 +359,16 @@ out_err:
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 			boolean count_draw_in)
 {
+	if (!ctx->ws->cs_memory_below_limit(ctx->rings.gfx.cs, ctx->vram, ctx->gtt)) {
+		ctx->gtt = 0;
+		ctx->vram = 0;
+		ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC);
+		return;
+	}
+	/* everything will be accounted for once the relocations are emitted */
+	ctx->gtt = 0;
+	ctx->vram = 0;
+
 	/* The number of dwords we already used in the CS so far. */
 	num_dw += ctx->rings.gfx.cs->cdw;

@@ -623,17 +633,27 @@ void r600_flush_emit(struct r600_context *rctx)
 		/* Use of WAIT_UNTIL is deprecated on Cayman+ */
 		if (rctx->family >= CHIP_CAYMAN) {
 			/* emit a PS partial flush on Cayman/TN */
-			cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
-			cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+			rctx->flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
 		}
 	}

+	if (rctx->flags & R600_CONTEXT_PS_PARTIAL_FLUSH) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+	}
+
 	if (rctx->chip_class >= R700 &&
 	    (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB_META)) {
 		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0);
 	}

+	if (rctx->chip_class >= R700 &&
+	    (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_DB_META)) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0);
+	}
+
 	if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV) {
 		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
@@ -728,6 +748,7 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
 	 */
 	ctx->flags |= R600_CONTEXT_FLUSH_AND_INV |
 		      R600_CONTEXT_FLUSH_AND_INV_CB_META |
+		      R600_CONTEXT_FLUSH_AND_INV_DB_META |
 		      R600_CONTEXT_WAIT_3D_IDLE |
 		      R600_CONTEXT_WAIT_CP_DMA_IDLE;

@@ -784,6 +805,8 @@ void r600_begin_new_cs(struct r600_context *ctx)

 	ctx->pm4_dirty_cdwords = 0;
 	ctx->flags = 0;
+	ctx->gtt = 0;
+	ctx->vram = 0;

 	/* Begin a new CS. */
 	r600_emit_command_buffer(ctx->rings.gfx.cs, &ctx->start_cs_cmd);
@@ -1103,6 +1126,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES |
 		       R600_CONTEXT_FLUSH_AND_INV |
 		       R600_CONTEXT_FLUSH_AND_INV_CB_META |
+		       R600_CONTEXT_FLUSH_AND_INV_DB_META |
 		       R600_CONTEXT_STREAMOUT_FLUSH |
 		       R600_CONTEXT_WAIT_3D_IDLE;

@@ -1145,6 +1169,12 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 		src_offset += byte_count;
 		dst_offset += byte_count;
 	}
+
+	/* Invalidate the read caches. */
+	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
+
+	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
+		       dst_offset + size);
 }

 void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
@@ -1160,9 +1190,9 @@ void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
 void r600_dma_copy(struct r600_context *rctx,
 		struct pipe_resource *dst,
 		struct pipe_resource *src,
-		unsigned long dst_offset,
-		unsigned long src_offset,
-		unsigned long size)
+		uint64_t dst_offset,
+		uint64_t src_offset,
+		uint64_t size)
 {
 	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
 	unsigned i, ncopy, csize, shift;
@@ -1191,4 +1221,7 @@ void r600_dma_copy(struct r600_context *rctx,
 		src_offset += csize << shift;
 		size -= csize;
 	}
+
+	util_range_add(&rdst->valid_buffer_range, dst_offset,
+		       dst_offset + size);
 }
--- a/src/gallium/drivers/r600/r600_hw_context_priv.h
+++ b/src/gallium/drivers/r600/r600_hw_context_priv.h
@@ -29,7 +29,7 @@
 #include "r600_pipe.h"

 /* the number of CS dwords for flushing and drawing */
-#define R600_MAX_FLUSH_CS_DWORDS	12
+#define R600_MAX_FLUSH_CS_DWORDS	16
 #define R600_MAX_DRAW_CS_DWORDS		34
 #define R600_TRACE_CS_DWORDS		7

--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -38,8 +38,12 @@ static LLVMValueRef llvm_fetch_const(
 		LLVMValueRef index = LLVMBuildLoad(bld_base->base.gallivm->builder, bld->addr[reg->Indirect.Index][reg->Indirect.SwizzleX], "");
 		offset[1] = LLVMBuildAdd(bld_base->base.gallivm->builder, offset[1], index, "");
 	}
+	unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ;
+	if (reg->Register.Dimension) {
+		ConstantAddressSpace += reg->Dimension.Index;
+	}
 	LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base->base.elem_type, 4), 1024),
-							CONSTANT_BUFFER_0_ADDR_SPACE);
+							ConstantAddressSpace);
 	LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base->base.gallivm->builder, lp_build_const_int32(bld_base->base.gallivm, 0), const_ptr_type, "");
 	LLVMValueRef ptr = LLVMBuildGEP(bld_base->base.gallivm->builder, const_ptr, offset, 2, "");
 	LLVMValueRef cvecval = LLVMBuildLoad(bld_base->base.gallivm->builder, ptr, "");
@@ -537,6 +541,7 @@ const char * r600_llvm_gpu_string(enum radeon_family family)
 	case CHIP_RV630:
 	case CHIP_RV620:
 	case CHIP_RV635:
+	case CHIP_RV670:
 	case CHIP_RS780:
 	case CHIP_RS880:
 		gpu_family = "r600";
@@ -547,7 +552,6 @@ const char * r600_llvm_gpu_string(enum radeon_family family)
 	case CHIP_RV730:
 		gpu_family = "rv730";
 		break;
-	case CHIP_RV670:
 	case CHIP_RV740:
 	case CHIP_RV770:
 		gpu_family = "rv770";
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -22,6 +22,7 @@
 */
 #include "r600_pipe.h"
 #include "r600_public.h"
+#include "r600d.h"

 #include <errno.h>
 #include "pipe/p_shader_tokens.h"
@@ -165,12 +166,23 @@ static void r600_flush_gfx_ring(void *ctx, unsigned flags)
 static void r600_flush_dma_ring(void *ctx, unsigned flags)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
+	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	unsigned padding_dw, i;

-	if (!rctx->rings.dma.cs->cdw) {
+	if (!cs->cdw) {
 		return;
 	}
+
+	/* Pad the DMA CS to a multiple of 8 dwords. */
+	padding_dw = 8 - cs->cdw % 8;
+	if (padding_dw < 8) {
+		for (i = 0; i < padding_dw; i++) {
+			cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
+		}
+	}
+
 	rctx->rings.dma.flushing = true;
-	rctx->ws->cs_flush(rctx->rings.dma.cs, flags);
+	rctx->ws->cs_flush(cs, flags);
 	rctx->rings.dma.flushing = false;
 }

--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -298,7 +298,8 @@ struct r600_dsa_state {
 	unsigned			alpha_ref;
 	ubyte				valuemask[2];
 	ubyte				writemask[2];
-	unsigned                        sx_alpha_test_control;
+	unsigned			zwritemask;
+	unsigned			sx_alpha_test_control;
 };

 struct r600_pipe_shader;
@@ -447,6 +448,10 @@ struct r600_context {
 	unsigned			backend_mask;
 	unsigned			max_db; /* for OQ */

+	/* current unaccounted memory usage */
+	uint64_t			vram;
+	uint64_t			gtt;
+
 	/* Miscellaneous state objects. */
 	void				*custom_dsa_flush;
 	void				*custom_blend_resolve;
@@ -509,6 +514,7 @@ struct r600_context {
 	bool				alpha_to_one;
 	bool				force_blend_disable;
 	boolean				dual_src_blend;
+	unsigned			zwritemask;

 	/* Index buffer. */
 	struct pipe_index_buffer	index_buffer;
@@ -869,9 +875,11 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx,
 	 * look serialized from driver pov
 	 */
 	if (!ring->flushing) {
-		if (ring == &ctx->rings.gfx && ctx->rings.dma.cs) {
-			/* flush dma ring */
-			ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
+		if (ring == &ctx->rings.gfx) {
+			if (ctx->rings.dma.cs) {
+				/* flush dma ring */
+				ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
+			}
 		} else {
 			/* flush gfx ring */
 			ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC);
@@ -996,4 +1004,28 @@ static INLINE unsigned u_max_layer(struct pipe_resource *r, unsigned level)
 	}
 }

+static INLINE void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
+{
+	struct r600_context *rctx = (struct r600_context *)ctx;
+	struct r600_resource *rr = (struct r600_resource *)r;
+
+	if (r == NULL) {
+		return;
+	}
+
+	/*
+	 * Add r's buffer size to the unaccounted gtt/vram totals of the context
+	 * for each domain flag set. This is only a rough estimate of the memory
+	 * requirement of the current draw call; after each draw call, memory is
+	 * precisely accounted. In practice this gives a very good estimate
+	 * (+/- 10% of the target memory limit).
+	 */
+	if (rr->domains & RADEON_DOMAIN_GTT) {
+		rctx->gtt += rr->buf->size;
+	}
+	if (rr->domains & RADEON_DOMAIN_VRAM) {
+		rctx->vram += rr->buf->size;
+	}
+}
+
 #endif
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -802,6 +802,7 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
 	dsa->valuemask[1] = state->stencil[1].valuemask;
 	dsa->writemask[0] = state->stencil[0].writemask;
 	dsa->writemask[1] = state->stencil[1].writemask;
+	dsa->zwritemask = state->depth.writemask;

 	db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
 		S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
@@ -1515,6 +1516,11 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 	}
 	if (rctx->framebuffer.state.zsbuf) {
 		rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
+
+		rtex = (struct r600_texture*)rctx->framebuffer.state.zsbuf->texture;
+		if (rctx->chip_class >= R700 && rtex->htile) {
+			rctx->flags |= R600_CONTEXT_FLUSH_AND_INV_DB_META;
+		}
 	}

 	/* Set the new state. */
@@ -1544,6 +1550,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,

 		surf = (struct r600_surface*)state->cbufs[i];
 		rtex = (struct r600_texture*)surf->base.texture;
+		r600_context_add_resource_size(ctx, state->cbufs[i]->texture);

 		if (!surf->color_initialized || force_cmask_fmask) {
 			r600_init_color_surface(rctx, surf, force_cmask_fmask);
@@ -1576,6 +1583,8 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 	if (state->zsbuf) {
 		surf = (struct r600_surface*)state->zsbuf;

+		r600_context_add_resource_size(ctx, state->zsbuf->texture);
+
 		if (!surf->depth_initialized) {
 			r600_init_depth_surface(rctx, surf);
 		}
@@ -1937,6 +1946,13 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom
 	if (rctx->db_state.rsurf && rctx->db_state.rsurf->htile_enabled) {
 		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
 		db_render_override |= S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_OFF);
+		/* This fixes a lockup when hyperz and alpha test are enabled at
+		 * the same time: somehow the GPU gets confused about which order
+		 * to pick for the z test.
+		 */
+		if (rctx->alphatest_state.sx_alpha_test_control) {
+			db_render_override |= S_028D10_FORCE_SHADER_Z_ORDER(1);
+		}
 	} else {
 		db_render_override |= S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_DISABLE);
 	}
@@ -2745,7 +2761,7 @@ void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *shad
 				tmp);
 	}

-	db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+	db_shader_control = 0;
 	for (i = 0; i < rshader->noutput; i++) {
 		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
 			z_export = 1;
@@ -2940,6 +2956,19 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 	unsigned db_shader_control = rctx->ps_shader->current->db_shader_control |
 				     S_02880C_DUAL_EXPORT_ENABLE(dual_export);

+	/* When alpha test is enabled we can't trust the hw to make the proper
+	 * decision on the order in which ztest should be run relative to
+	 * fragment shader execution.
+	 *
+	 * If alpha test is enabled, perform the z test after the fragment shader;
+	 * RE_Z (early z test, no zbuffer write) seems to cause lockups on r6xx/r7xx.
+	 */
+	if (rctx->alphatest_state.sx_alpha_test_control) {
+		db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
+	} else {
+		db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+	}
+
 	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
 		rctx->db_misc_state.atom.dirty = true;
@@ -2979,7 +3008,7 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
 	unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
-	unsigned long base, addr;
+	uint64_t base, addr;

 	/* make sure that the dma ring is only one active */
 	rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
@@ -2998,7 +3027,8 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 	if (dst_mode == RADEON_SURF_MODE_LINEAR) {
 		/* T2L */
 		array_mode = r600_array_mode(src_mode);
-		slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1;
+		slice_tile_max = (rsrc->surface.level[src_level].nblk_x * rsrc->surface.level[src_level].nblk_y) >> 6;
+		slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
 		/* linear height must be the same as the slice tile max height, it's ok even
 		 * if the linear destination/source have smaller heigh as the size of the
 		 * dma packet will be using the copy_height which is always smaller or equal
@@ -3016,7 +3046,8 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 	} else {
 		/* L2T */
 		array_mode = r600_array_mode(dst_mode);
-		slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1;
+		slice_tile_max = (rdst->surface.level[dst_level].nblk_x * rdst->surface.level[dst_level].nblk_y) >> 6;
+		slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
 		/* linear height must be the same as the slice tile max height, it's ok even
 		 * if the linear destination/source have smaller heigh as the size of the
 		 * dma packet will be using the copy_height which is always smaller or equal
@@ -3037,14 +3068,15 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 		return FALSE;
 	}

-	size = (copy_height * pitch) >> 2;
-	ncopy = (size / 0x0000ffff) + !!(size % 0x0000ffff);
+	/* r6xx/r7xx limitation: the number of lines in a blit must be a multiple
+	 * of 8. Compute the largest multiple of 8 lines that fits the size limit.
+	 */
+	cheight = ((0x0000ffff << 2) / pitch) & 0xfffffff8;
+	ncopy = (copy_height / cheight) + !!(copy_height % cheight);
 	r600_need_dma_space(rctx, ncopy * 7);
+
 	for (i = 0; i < ncopy; i++) {
-		cheight = copy_height;
-		if (((cheight * pitch) >> 2) > 0x0000ffff) {
-			cheight = (0x0000ffff << 2) / pitch;
-		}
+		cheight = cheight > copy_height ? copy_height : cheight;
 		size = (cheight * pitch) >> 2;
 		/* emit reloc before writting cs so that cs is always in consistent state */
 		r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ);
@@ -3109,7 +3141,7 @@ boolean r600_dma_blit(struct pipe_context *ctx,
 	}

 	if (src_mode == dst_mode) {
-		unsigned long dst_offset, src_offset, size;
+		uint64_t dst_offset, src_offset, size;

 		/* simple dma blit would do NOTE code here assume :
 		 *   src_box.x/y == 0
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -284,6 +284,16 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 	ref.valuemask[1] = dsa->valuemask[1];
 	ref.writemask[0] = dsa->writemask[0];
 	ref.writemask[1] = dsa->writemask[1];
+	if (rctx->zwritemask != dsa->zwritemask) {
+		rctx->zwritemask = dsa->zwritemask;
+		if (rctx->chip_class >= EVERGREEN) {
+			/* work around some issue when not writing to zbuffer
+			 * we are having lockup on evergreen so do not enable
+			 * hyperz when not writing zbuffer
+			 */
+			rctx->db_misc_state.atom.dirty = true;
+		}
+	}

 	r600_set_stencil_ref(ctx, &ref);

@@ -293,6 +303,11 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 		rctx->alphatest_state.sx_alpha_test_control = dsa->sx_alpha_test_control;
 		rctx->alphatest_state.sx_alpha_ref = dsa->alpha_ref;
 		rctx->alphatest_state.atom.dirty = true;
+		if (rctx->chip_class >= EVERGREEN) {
+			evergreen_update_db_shader_control(rctx);
+		} else {
+			r600_update_db_shader_control(rctx);
+		}
 	}
 }

@@ -479,7 +494,8 @@ static void r600_set_index_buffer(struct pipe_context *ctx,

 	if (ib) {
 		pipe_resource_reference(&rctx->index_buffer.buffer, ib->buffer);
-	        memcpy(&rctx->index_buffer, ib, sizeof(*ib));
+		memcpy(&rctx->index_buffer, ib, sizeof(*ib));
+		r600_context_add_resource_size(ctx, ib->buffer);
 	} else {
 		pipe_resource_reference(&rctx->index_buffer.buffer, NULL);
 	}
@@ -516,6 +532,7 @@ static void r600_set_vertex_buffers(struct pipe_context *ctx,
 					vb[i].buffer_offset = input[i].buffer_offset;
 					pipe_resource_reference(&vb[i].buffer, input[i].buffer);
 					new_buffer_mask |= 1 << i;
+					r600_context_add_resource_size(ctx, input[i].buffer);
 				} else {
 					pipe_resource_reference(&vb[i].buffer, NULL);
 					disable_mask |= 1 << i;
@@ -613,6 +630,7 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader,

 			pipe_sampler_view_reference((struct pipe_sampler_view **)&dst->views.views[i], views[i]);
 			new_mask |= 1 << i;
+			r600_context_add_resource_size(pipe, views[i]->texture);
 		} else {
 			pipe_sampler_view_reference((struct pipe_sampler_view **)&dst->views.views[i], NULL);
 			disable_mask |= 1 << i;
@@ -806,6 +824,8 @@ static void r600_bind_ps_state(struct pipe_context *ctx, void *state)
 	rctx->ps_shader = (struct r600_pipe_shader_selector *)state;
 	r600_context_pipe_state_set(rctx, &rctx->ps_shader->current->rstate);

+	r600_context_add_resource_size(ctx, (struct pipe_resource *)rctx->ps_shader->current->bo);
+
 	if (rctx->chip_class <= R700) {
 		bool multiwrite = rctx->ps_shader->current->shader.fs_write_all;

@@ -835,6 +855,8 @@ static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
 	if (state) {
 		r600_context_pipe_state_set(rctx, &rctx->vs_shader->current->rstate);

+		r600_context_add_resource_size(ctx, (struct pipe_resource *)rctx->vs_shader->current->bo);
+
 		/* Update clip misc state. */
 		if (rctx->vs_shader->current->pa_cl_vs_out_cntl != rctx->clip_misc_state.pa_cl_vs_out_cntl ||
 		    rctx->vs_shader->current->shader.clip_dist_write != rctx->clip_misc_state.clip_dist_write) {
@@ -938,10 +960,13 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
 		} else {
 			u_upload_data(rctx->uploader, 0, input->buffer_size, ptr, &cb->buffer_offset, &cb->buffer);
 		}
+		/* account it in gtt */
+		rctx->gtt += input->buffer_size;
 	} else {
 		/* Setup the hw buffer. */
 		cb->buffer_offset = input->buffer_offset;
 		pipe_resource_reference(&cb->buffer, input->buffer);
+		r600_context_add_resource_size(ctx, input->buffer);
 	}

 	state->enabled_mask |= 1 << index;
@@ -957,6 +982,7 @@ r600_create_so_target(struct pipe_context *ctx,
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_so_target *t;
+	struct r600_resource *rbuffer = (struct r600_resource*)buffer;

 	t = CALLOC_STRUCT(r600_so_target);
 	if (!t) {
@@ -976,6 +1002,9 @@ r600_create_so_target(struct pipe_context *ctx,
 	pipe_resource_reference(&t->b.buffer, buffer);
 	t->b.buffer_offset = buffer_offset;
 	t->b.buffer_size = buffer_size;
+
+	util_range_add(&rbuffer->valid_buffer_range, buffer_offset,
+		       buffer_offset + buffer_size);
 	return &t->b;
 }

@@ -1004,6 +1033,7 @@ static void r600_set_so_targets(struct pipe_context *ctx,
 	/* Set the new targets. */
 	for (i = 0; i < num_targets; i++) {
 		pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->so_targets[i], targets[i]);
+		r600_context_add_resource_size(ctx, targets[i]->buffer);
 	}
 	for (; i < rctx->num_so_targets; i++) {
 		pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->so_targets[i], NULL);
@@ -1343,6 +1373,12 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		rctx->vgt_state.atom.dirty = true;
 	}

+	/* Workaround for hardware deadlock on certain R600 ASICs: write into a CB register. */
+	if (rctx->chip_class == R600) {
+		rctx->flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
+		rctx->cb_misc_state.atom.dirty = true;
+	}
+
 	/* Emit states. */
 	r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE);
 	r600_flush_emit(rctx);
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -270,6 +270,7 @@ static void r600_texture_destroy(struct pipe_screen *screen,
 	if (rtex->flushed_depth_texture)
 		pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL);

+        pipe_resource_reference((struct pipe_resource**)&rtex->htile, NULL);
 	pb_reference(&resource->buf, NULL);
 	FREE(rtex);
 }
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -119,6 +119,7 @@
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
 #define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH	0x1f
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
+#define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
 #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -155,7 +155,7 @@ static inline LLVMValueRef bitcast(

 void radeon_llvm_emit_prepare_cube_coords(struct lp_build_tgsi_context * bld_base,
                                          struct lp_build_emit_data * emit_data,
-                                          unsigned coord_arg);
+                                          LLVMValueRef *coords_arg);

 void radeon_llvm_context_init(struct radeon_llvm_context * ctx);

--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -531,7 +531,7 @@ static void kil_emit(
 void radeon_llvm_emit_prepare_cube_coords(
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data,
-		unsigned coord_arg)
+		LLVMValueRef *coords_arg)
 {

 	unsigned target = emit_data->inst->Texture.Texture;
@@ -542,11 +542,13 @@ void radeon_llvm_emit_prepare_cube_coords(
 	LLVMValueRef coords[4];
 	LLVMValueRef mad_args[3];
 	LLVMValueRef idx;
+	struct LLVMOpaqueValue *cube_vec;
+	LLVMValueRef v;
 	unsigned i;

-	LLVMValueRef v = build_intrinsic(builder, "llvm.AMDGPU.cube",
-			LLVMVectorType(type, 4),
-			&emit_data->args[coord_arg], 1, LLVMReadNoneAttribute);
+	cube_vec = lp_build_gather_values(bld_base->base.gallivm, coords_arg, 4);
+	v = build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4),
+                            &cube_vec, 1, LLVMReadNoneAttribute);

 	for (i = 0; i < 4; ++i) {
 		idx = lp_build_const_int32(gallivm, i);
@@ -579,18 +581,14 @@ void radeon_llvm_emit_prepare_cube_coords(
 	if (target != TGSI_TEXTURE_CUBE ||
 		opcode != TGSI_OPCODE_TEX) {

-		/* load source coord.w component - array_index for cube arrays or
-		 * compare value for SHADOWCUBE */
-		idx = lp_build_const_int32(gallivm, 3);
-		coords[3] = LLVMBuildExtractElement(builder,
-				emit_data->args[coord_arg], idx, "");
-
 		/* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 		if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 			target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {

+			/* coords_arg.w component - array_index for cube arrays or
+			 * compare value for SHADOWCUBE */
 			coords[2] = lp_build_emit_llvm_ternary(bld_base, TGSI_OPCODE_MAD,
-					coords[3], lp_build_const_float(gallivm, 8.0), coords[2]);
+					coords_arg[3], lp_build_const_float(gallivm, 8.0), coords[2]);
 		}

 		/* for instructions that need additional src (compare/lod/bias),
@@ -598,12 +596,11 @@ void radeon_llvm_emit_prepare_cube_coords(
 		if (opcode == TGSI_OPCODE_TEX2 ||
 			opcode == TGSI_OPCODE_TXB2 ||
 			opcode == TGSI_OPCODE_TXL2) {
-			coords[3] = emit_data->args[coord_arg + 1];
+			coords[3] = coords_arg[4];
 		}
 	}

-	emit_data->args[coord_arg] =
-			lp_build_gather_values(bld_base->base.gallivm, coords, 4);
+	memcpy(coords_arg, coords, sizeof(coords));
 }

 static void txd_fetch_args(
@@ -645,9 +642,6 @@ static void txp_fetch_args(
 					TGSI_OPCODE_DIV, arg, src_w);
 	}
 	coords[3] = bld_base->base.one;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;

 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
 	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
@@ -655,8 +649,12 @@ static void txp_fetch_args(
 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, 0);
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
 	}
+
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+	emit_data->arg_count = 1;
 }

 static void tex_fetch_args(
@@ -673,17 +671,12 @@ static void tex_fetch_args(

 	const struct tgsi_full_instruction * inst = emit_data->inst;

-	LLVMValueRef coords[4];
+	LLVMValueRef coords[5];
 	unsigned chan;
 	for (chan = 0; chan < 4; chan++) {
 		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
 	}

-	emit_data->arg_count = 1;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-
 	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
 		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
 		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
@@ -692,7 +685,7 @@ static void tex_fetch_args(
 		 * That operand should be passed as a float value in the args array
 		 * right after the coord vector. After packing it's not used anymore,
 		 * that's why arg_count is not increased */
-		emit_data->args[1] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
 	}

 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
@@ -701,8 +694,13 @@ static void tex_fetch_args(
 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, 0);
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
 	}
+
+	emit_data->arg_count = 1;
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 }

 static void txf_fetch_args(
@@ -768,6 +766,22 @@ static void emit_icmp(
 	emit_data->output[emit_data->chan] = v;
 }

+/* TGSI UCMP: emits an fcmp-uge of args[0] against 0.0 and selects args[2]
+ * when it holds, args[1] otherwise.
+ * NOTE(review): UGE is also true for unordered (NaN) operands - confirm. */
+static void emit_ucmp(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+	LLVMValueRef v = LLVMBuildFCmp(builder, LLVMRealUGE,
+			emit_data->args[0], lp_build_const_float(bld_base->base.gallivm, 0.), "");
+
+	emit_data->output[emit_data->chan] = LLVMBuildSelect(builder, v, emit_data->args[2], emit_data->args[1], "");
+}
+
 static void emit_cmp(
 		const struct lp_build_tgsi_action *action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1243,6 +1257,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_USNE].emit = emit_icmp;
 	bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
 	bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
+	bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp;

 	bld_base->rsq_action.emit = build_tgsi_intrinsic_nomem;
 	bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq";
--- a/src/gallium/drivers/radeonsi/r600_blit.c
+++ b/src/gallium/drivers/radeonsi/r600_blit.c
@@ -98,21 +98,6 @@ static void r600_blitter_end(struct pipe_context *ctx)
 	r600_context_queries_resume(rctx);
 }

-static unsigned u_max_layer(struct pipe_resource *r, unsigned level)
-{
-	switch (r->target) {
-	case PIPE_TEXTURE_CUBE:
-		return 6 - 1;
-	case PIPE_TEXTURE_3D:
-		return u_minify(r->depth0, level) - 1;
-	case PIPE_TEXTURE_1D_ARRAY:
-	case PIPE_TEXTURE_2D_ARRAY:
-		return r->array_size - 1;
-	default:
-		return 0;
-	}
-}
-
 void si_blit_uncompress_depth(struct pipe_context *ctx,
 		struct r600_resource_texture *texture,
 		struct r600_resource_texture *staging,
--- a/src/gallium/drivers/radeonsi/r600_texture.c
+++ b/src/gallium/drivers/radeonsi/r600_texture.c
@@ -55,11 +55,8 @@ static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600
 	struct pipe_resource *texture = transfer->resource;
 	struct pipe_box sbox;

-	sbox.x = sbox.y = sbox.z = 0;
-	sbox.width = transfer->box.width;
-	sbox.height = transfer->box.height;
-	/* XXX that might be wrong */
-	sbox.depth = 1;
+	u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
+
 	ctx->resource_copy_region(ctx, texture, transfer->level,
 				  transfer->box.x, transfer->box.y, transfer->box.z,
 				  rtransfer->staging,
@@ -153,8 +150,7 @@ static int r600_init_surface(struct r600_screen *rscreen,
 		surface->flags |= RADEON_SURF_SCANOUT;
 	}

-	if ((ptex->bind & PIPE_BIND_DEPTH_STENCIL) &&
-	    !is_flushed_depth && is_depth) {
+	if (!is_flushed_depth && is_depth) {
 		surface->flags |= RADEON_SURF_ZBUFFER;

 		if (is_stencil) {
@@ -239,7 +235,6 @@ static void *si_texture_transfer_map(struct pipe_context *ctx,
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
-	struct pipe_resource resource;
 	struct r600_transfer *trans;
 	boolean use_staging_texture = FALSE;
 	struct radeon_winsys_cs_handle *buf;
@@ -299,42 +294,52 @@ static void *si_texture_transfer_map(struct pipe_context *ctx,
 					 level, level,
 					 box->z, box->z + box->depth - 1);
 		trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes;
+		trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size;
 		trans->offset = r600_texture_get_offset(staging_depth, level, box->z);

 		trans->staging = &staging_depth->resource.b.b;
 	} else if (use_staging_texture) {
-		resource.target = PIPE_TEXTURE_2D;
+		struct pipe_resource resource;
+		struct r600_resource_texture *staging;
+
+		memset(&resource, 0, sizeof(resource));
 		resource.format = texture->format;
 		resource.width0 = box->width;
 		resource.height0 = box->height;
 		resource.depth0 = 1;
 		resource.array_size = 1;
-		resource.last_level = 0;
-		resource.nr_samples = 0;
 		resource.usage = PIPE_USAGE_STAGING;
-		resource.bind = 0;
 		resource.flags = R600_RESOURCE_FLAG_TRANSFER;
-		/* For texture reading, the temporary (detiled) texture is used as
-		 * a render target when blitting from a tiled texture. */
-		if (usage & PIPE_TRANSFER_READ) {
-			resource.bind |= PIPE_BIND_RENDER_TARGET;
-		}
-		/* For texture writing, the temporary texture is used as a sampler
-		 * when blitting into a tiled texture. */
-		if (usage & PIPE_TRANSFER_WRITE) {
-			resource.bind |= PIPE_BIND_SAMPLER_VIEW;
+
+		/* We must set the correct texture target and dimensions if needed for a 3D transfer. */
+		if (box->depth > 1 && u_max_layer(texture, level) > 0)
+			resource.target = texture->target;
+		else
+			resource.target = PIPE_TEXTURE_2D;
+
+		switch (resource.target) {
+		case PIPE_TEXTURE_1D_ARRAY:
+		case PIPE_TEXTURE_2D_ARRAY:
+		case PIPE_TEXTURE_CUBE_ARRAY:
+			resource.array_size = box->depth;
+			break;
+		case PIPE_TEXTURE_3D:
+			resource.depth0 = box->depth;
+			break;
+		default:;
 		}
 		/* Create the temporary texture. */
-		trans->staging = ctx->screen->resource_create(ctx->screen, &resource);
-		if (trans->staging == NULL) {
+		staging = (struct r600_resource_texture*)ctx->screen->resource_create(ctx->screen, &resource);
+		if (staging == NULL) {
 			R600_ERR("failed to create temporary texture to hold untiled copy\n");
 			pipe_resource_reference(&trans->transfer.resource, NULL);
 			FREE(trans);
 			return NULL;
 		}

-		trans->transfer.stride = ((struct r600_resource_texture *)trans->staging)
-					->surface.level[0].pitch_bytes;
+		trans->staging = &staging->resource.b.b;
+		trans->transfer.stride = staging->surface.level[0].pitch_bytes;
+		trans->transfer.layer_stride = staging->surface.level[0].slice_size;
 		if (usage & PIPE_TRANSFER_READ) {
 			r600_copy_to_staging_texture(ctx, trans);
 			/* Always referenced in the blit. */
@@ -349,7 +354,7 @@ static void *si_texture_transfer_map(struct pipe_context *ctx,
 	if (trans->staging) {
 		buf = si_resource(trans->staging)->cs_buf;
 	} else {
-		buf = si_resource(trans->transfer.resource)->cs_buf;
+		buf = rtex->resource.cs_buf;
 	}

 	if (rtex->is_depth || !trans->staging)
@@ -549,6 +554,8 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
 	struct r600_surface *surface = CALLOC_STRUCT(r600_surface);
 	unsigned level = surf_tmpl->u.tex.level;

+	assert(surf_tmpl->u.tex.first_layer <= u_max_layer(texture, surf_tmpl->u.tex.level));
+	assert(surf_tmpl->u.tex.last_layer <= u_max_layer(texture, surf_tmpl->u.tex.level));
 	assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
 	if (surface == NULL)
 		return NULL;
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
@@ -280,6 +280,7 @@ static const char *r600_get_family_name(enum radeon_family family)
 	case CHIP_TAHITI: return "AMD TAHITI";
 	case CHIP_PITCAIRN: return "AMD PITCAIRN";
 	case CHIP_VERDE: return "AMD CAPE VERDE";
+	case CHIP_OLAND: return "AMD OLAND";
 	default: return "AMD unknown";
 	}
 }
@@ -379,7 +380,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
 			return 15;
 	case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-		return /*rscreen->info.drm_minor >= 9 ? 16384 :*/ 0;
+		return 16384;
 	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
 		return 32;

@@ -458,7 +459,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 		/* FIXME Isn't this equal to TEMPS? */
 		return 1; /* Max native address registers */
 	case PIPE_SHADER_CAP_MAX_CONSTS:
-		return 64;
+		return 4096; /* actually only memory limits this */
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
 		return 1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
@@ -277,4 +277,20 @@ static INLINE uint64_t r600_resource_va(struct pipe_screen *screen, struct pipe_
 	return rscreen->ws->buffer_get_virtual_address(rresource->cs_buf);
 }

+/* Last valid layer index of r at mip `level`: 5 for cubes, u_minify(depth0)-1
+ * for 3D, array_size-1 for 1D/2D/cube arrays, and 0 for any other target. */
+static INLINE unsigned u_max_layer(struct pipe_resource *r, unsigned level)
+{
+	const unsigned target = r->target;
+
+	if (target == PIPE_TEXTURE_CUBE)
+		return 5;
+	if (target == PIPE_TEXTURE_3D)
+		return u_minify(r->depth0, level) - 1;
+	if (target == PIPE_TEXTURE_1D_ARRAY || target == PIPE_TEXTURE_2D_ARRAY ||
+	    target == PIPE_TEXTURE_CUBE_ARRAY)
+		return r->array_size - 1;
+	return 0;
+}
+
 #endif
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -263,6 +263,14 @@ static void declare_input_fs(
 				build_intrinsic(base->gallivm->builder,
 					"llvm.SI.fs.read.pos", input_type,
 					args, 1, LLVMReadNoneAttribute);
+
+			if (chan == 3)
+				/* RCP for fragcoord.w */
+				si_shader_ctx->radeon_bld.inputs[soa_index] =
+					LLVMBuildFDiv(gallivm->builder,
+						      lp_build_const_float(gallivm, 1.0f),
+						      si_shader_ctx->radeon_bld.inputs[soa_index],
+						      "");
 		}
 		return;
 	}
@@ -301,14 +309,8 @@ static void declare_input_fs(
 	/* XXX: Handle all possible interpolation modes */
 	switch (decl->Interp.Interpolate) {
 	case TGSI_INTERPOLATE_COLOR:
-		/* XXX: Flat shading hangs the GPU */
-		if (si_shader_ctx->rctx->queued.named.rasterizer &&
-		    si_shader_ctx->rctx->queued.named.rasterizer->flatshade) {
-#if 0
+		if (si_shader_ctx->key.flatshade) {
 			intr_name = "llvm.SI.fs.interp.constant";
-#else
-			intr_name = "llvm.SI.fs.interp.linear.center";
-#endif
 		} else {
 			if (decl->Interp.Centroid)
 				intr_name = "llvm.SI.fs.interp.persp.centroid";
@@ -317,11 +319,8 @@ static void declare_input_fs(
 		}
 		break;
 	case TGSI_INTERPOLATE_CONSTANT:
-		/* XXX: Flat shading hangs the GPU */
-#if 0
 		intr_name = "llvm.SI.fs.interp.constant";
 		break;
-#endif
 	case TGSI_INTERPOLATE_LINEAR:
 		if (decl->Interp.Centroid)
 			intr_name = "llvm.SI.fs.interp.linear.centroid";
@@ -433,6 +432,15 @@ static LLVMValueRef fetch_constant(
 	LLVMValueRef offset;
 	LLVMValueRef load;

+	if (swizzle == LP_CHAN_ALL) {
+		unsigned chan;
+		LLVMValueRef values[4];
+		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
+			values[chan] = fetch_constant(bld_base, reg, type, chan);
+
+		return lp_build_gather_values(bld_base->base.gallivm, values, 4);
+	}
+
 	/* currently not supported */
 	if (reg->Register.Indirect) {
 		assert(0);
@@ -446,12 +454,6 @@ static LLVMValueRef fetch_constant(
 	 * CONST[0].x will have an offset of 0 and CONST[1].x will have an
 	 * offset of 4. */
 	idx = (reg->Register.Index * 4) + swizzle;
-
-	/* index loads above 255 are currently not supported */
-	if (idx > 255) {
-		assert(0);
-		idx = 0;
-	}
 	offset = lp_build_const_int32(base->gallivm, idx);

 	load = build_indexed_load(base->gallivm, const_ptr, offset);
@@ -612,6 +614,12 @@ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 		int i;

 		tgsi_parse_token(parse);
+
+		if (parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_PROPERTY &&
+		    parse->FullToken.FullProperty.Property.PropertyName ==
+		    TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS)
+			shader->fs_write_all = TRUE;
+
 		if (parse->FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
 			continue;

@@ -775,6 +783,29 @@ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 	last_args[1] = lp_build_const_int32(base->gallivm,
 					    si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT);

+	if (shader->fs_write_all && shader->nr_cbufs > 1) {
+		int i;
+
+		/* Specify that this is not yet the last export */
+		last_args[2] = lp_build_const_int32(base->gallivm, 0);
+
+		for (i = 1; i < shader->nr_cbufs; i++) {
+			/* Specify the target we are exporting */
+			last_args[3] = lp_build_const_int32(base->gallivm,
+							    V_008DFC_SQ_EXP_MRT + i);
+
+			lp_build_intrinsic(base->gallivm->builder,
+					   "llvm.SI.export",
+					   LLVMVoidTypeInContext(base->gallivm->context),
+					   last_args, 9);
+
+			si_shader_ctx->shader->spi_shader_col_format |=
+				si_shader_ctx->shader->spi_shader_col_format << 4;
+		}
+
+		last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
+	}
+
 	/* Specify that this is the last export */
 	last_args[2] = lp_build_const_int32(base->gallivm, 1);

@@ -791,54 +822,127 @@ static void tex_fetch_args(
 	struct lp_build_tgsi_context * bld_base,
 	struct lp_build_emit_data * emit_data)
 {
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned opcode = inst->Instruction.Opcode;
+	unsigned target = inst->Texture.Texture;
 	LLVMValueRef ptr;
 	LLVMValueRef offset;
+	LLVMValueRef coords[4];
+	LLVMValueRef address[16];
+	unsigned count = 0;
+	unsigned chan;

 	/* WriteMask */
 	/* XXX: should be optimized using emit_data->inst->Dst[0].Register.WriteMask*/
 	emit_data->args[0] = lp_build_const_int32(bld_base->base.gallivm, 0xf);

-	/* Coordinates */
-	/* XXX: Not all sample instructions need 4 address arguments. */
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
-		LLVMValueRef src_w;
-		unsigned chan;
-		LLVMValueRef coords[4];
-
-		emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-		src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-
-		for (chan = 0; chan < 3; chan++ ) {
-			LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-							       emit_data->inst, 0, chan);
+	/* Fetch and project texture coordinates */
+	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
+	for (chan = 0; chan < 3; chan++ ) {
+		coords[chan] = lp_build_emit_fetch(bld_base,
+						   emit_data->inst, 0,
+						   chan);
+		if (opcode == TGSI_OPCODE_TXP)
 			coords[chan] = lp_build_emit_llvm_binary(bld_base,
 								 TGSI_OPCODE_DIV,
-								 arg, src_w);
-		}
+								 coords[chan],
+								 coords[3]);
+	}
+
+	if (opcode == TGSI_OPCODE_TXP)
 		coords[3] = bld_base->base.one;
-		emit_data->args[1] = lp_build_gather_values(bld_base->base.gallivm,
-							    coords, 4);
-	} else
-		emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
-							 0, LP_CHAN_ALL);

-	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
-		/* These instructions have additional operand that should be packed
-		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
-		 * That operand should be passed as a float value in the args array
-		 * right after the coord vector. After packing it's not used anymore,
-		 * that's why arg_count is not increased */
-		emit_data->args[2] = lp_build_emit_fetch(bld_base, inst, 1, 0);
+	/* Pack LOD bias value */
+	if (opcode == TGSI_OPCODE_TXB)
+		address[count++] = coords[3];
+
+	if ((target == TGSI_TEXTURE_CUBE || target == TGSI_TEXTURE_SHADOWCUBE) &&
+	    opcode != TGSI_OPCODE_TXQ)
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
+
+	/* Pack depth comparison value */
+	switch (target) {
+	case TGSI_TEXTURE_SHADOW1D:
+	case TGSI_TEXTURE_SHADOW1D_ARRAY:
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOWRECT:
+		address[count++] = coords[2];
+		break;
+	case TGSI_TEXTURE_SHADOWCUBE:
+	case TGSI_TEXTURE_SHADOW2D_ARRAY:
+		address[count++] = coords[3];
+		break;
+	case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
 	}

-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, 1);
+	/* Pack texture coordinates */
+	address[count++] = coords[0];
+	switch (target) {
+	case TGSI_TEXTURE_2D:
+	case TGSI_TEXTURE_2D_ARRAY:
+	case TGSI_TEXTURE_3D:
+	case TGSI_TEXTURE_CUBE:
+	case TGSI_TEXTURE_RECT:
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOWRECT:
+	case TGSI_TEXTURE_SHADOW2D_ARRAY:
+	case TGSI_TEXTURE_SHADOWCUBE:
+	case TGSI_TEXTURE_2D_MSAA:
+	case TGSI_TEXTURE_2D_ARRAY_MSAA:
+	case TGSI_TEXTURE_CUBE_ARRAY:
+	case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+		address[count++] = coords[1];
 	}
+	switch (target) {
+	case TGSI_TEXTURE_3D:
+	case TGSI_TEXTURE_CUBE:
+	case TGSI_TEXTURE_SHADOWCUBE:
+	case TGSI_TEXTURE_CUBE_ARRAY:
+	case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+		address[count++] = coords[2];
+	}
+
+	/* Pack array slice */
+	switch (target) {
+	case TGSI_TEXTURE_1D_ARRAY:
+		address[count++] = coords[1];
+	}
+	switch (target) {
+	case TGSI_TEXTURE_2D_ARRAY:
+	case TGSI_TEXTURE_2D_ARRAY_MSAA:
+	case TGSI_TEXTURE_SHADOW2D_ARRAY:
+		address[count++] = coords[2];
+	}
+	switch (target) {
+	case TGSI_TEXTURE_CUBE_ARRAY:
+	case TGSI_TEXTURE_SHADOW1D_ARRAY:
+	case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+		address[count++] = coords[3];
+	}
+
+	/* Pack LOD */
+	if (opcode == TGSI_OPCODE_TXL)
+		address[count++] = coords[3];
+
+	if (count > 16) {
+		assert(!"Cannot handle more than 16 texture address parameters");
+		count = 16;
+	}
+
+	for (chan = 0; chan < count; chan++ ) {
+		address[chan] = LLVMBuildBitCast(gallivm->builder,
+						 address[chan],
+						 LLVMInt32TypeInContext(gallivm->context),
+						 "");
+	}
+
+	/* Pad to power of two vector */
+	while (count < util_next_power_of_two(count))
+		address[count++] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+
+	emit_data->args[1] = lp_build_gather_values(gallivm, address, count);

 	/* Resource */
 	ptr = use_sgpr(bld_base->base.gallivm, SGPR_CONST_PTR_V8I32, SI_SGPR_RESOURCE);
@@ -855,8 +959,7 @@ static void tex_fetch_args(
 						ptr, offset);

 	/* Dimensions */
-	emit_data->args[4] = lp_build_const_int32(bld_base->base.gallivm,
-					emit_data->inst->Texture.Texture);
+	emit_data->args[4] = lp_build_const_int32(bld_base->base.gallivm, target);

 	emit_data->arg_count = 5;
 	/* XXX: To optimize, we could use a float or v2f32, if the last bits of
@@ -866,22 +969,37 @@ static void tex_fetch_args(
 			4);
 }

+/* Emit the sample intrinsic named action->intr_name + "v<N>i32", where N is
+ * the vector size of args[1]; the name is written with a bounded snprintf. */
+static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
+				struct lp_build_tgsi_context * bld_base,
+				struct lp_build_emit_data * emit_data)
+{
+	char intr_name[23];
+
+	snprintf(intr_name, sizeof(intr_name), "%sv%ui32", action->intr_name,
+		 LLVMGetVectorSize(LLVMTypeOf(emit_data->args[1])));
+	emit_data->output[emit_data->chan] = lp_build_intrinsic(
+		bld_base->base.gallivm->builder, intr_name, emit_data->dst_type,
+		emit_data->args, emit_data->arg_count);
+}
+
 static const struct lp_build_tgsi_action tex_action = {
 	.fetch_args = tex_fetch_args,
-	.emit = lp_build_tgsi_intrinsic,
-	.intr_name = "llvm.SI.sample"
+	.emit = build_tex_intrinsic,
+	.intr_name = "llvm.SI.sample."
 };

 static const struct lp_build_tgsi_action txb_action = {
 	.fetch_args = tex_fetch_args,
-	.emit = lp_build_tgsi_intrinsic,
-	.intr_name = "llvm.SI.sample.bias"
+	.emit = build_tex_intrinsic,
+	.intr_name = "llvm.SI.sampleb."
 };

 static const struct lp_build_tgsi_action txl_action = {
 	.fetch_args = tex_fetch_args,
-	.emit = lp_build_tgsi_intrinsic,
-	.intr_name = "llvm.SI.sample.lod"
+	.emit = build_tex_intrinsic,
+	.intr_name = "llvm.SI.samplel."
 };


--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
@@ -82,6 +82,7 @@ struct si_shader_key {
 	unsigned		nr_cbufs:4;
 	unsigned		color_two_side:1;
 	unsigned		alpha_func:3;
+	unsigned		flatshade:1;
 	float			alpha_ref;
 };

--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -314,6 +314,8 @@ static void si_update_fb_rs_state(struct r600_context *rctx)

 	offset_units = rctx->queued.named.rasterizer->offset_units;
 	switch (rctx->framebuffer.zsbuf->texture->format) {
+	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_Z24X8_UNORM:
 	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 		depth = -24;
@@ -419,8 +421,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 	rs->offset_units = state->offset_units;
 	rs->offset_scale = state->offset_scale * 12.0f;

-	/* XXX: Flat shading hangs the GPU */
-	tmp = S_0286D4_FLAT_SHADE_ENA(0);
+	tmp = S_0286D4_FLAT_SHADE_ENA(1);
 	if (state->sprite_coord_enable) {
 		tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
 			S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
@@ -720,7 +721,6 @@ static uint32_t si_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_L8A8_SNORM:
 	case PIPE_FORMAT_L8A8_UINT:
 	case PIPE_FORMAT_L8A8_SINT:
-	case PIPE_FORMAT_L8A8_SRGB:
 	case PIPE_FORMAT_R8G8_SNORM:
 	case PIPE_FORMAT_R8G8_UNORM:
 	case PIPE_FORMAT_R8G8_UINT:
@@ -775,6 +775,7 @@ static uint32_t si_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 		return V_028C70_COLOR_8_24;

+	case PIPE_FORMAT_S8X24_UINT:
 	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 		return V_028C70_COLOR_24_8;
@@ -804,15 +805,12 @@ static uint32_t si_translate_colorformat(enum pipe_format format)
 		return V_028C70_COLOR_10_11_11;

 	/* 64-bit buffers. */
-	case PIPE_FORMAT_R16G16B16_USCALED:
-	case PIPE_FORMAT_R16G16B16_SSCALED:
 	case PIPE_FORMAT_R16G16B16A16_UINT:
 	case PIPE_FORMAT_R16G16B16A16_SINT:
 	case PIPE_FORMAT_R16G16B16A16_USCALED:
 	case PIPE_FORMAT_R16G16B16A16_SSCALED:
 	case PIPE_FORMAT_R16G16B16A16_UNORM:
 	case PIPE_FORMAT_R16G16B16A16_SNORM:
-	case PIPE_FORMAT_R16G16B16_FLOAT:
 	case PIPE_FORMAT_R16G16B16A16_FLOAT:
 		return V_028C70_COLOR_16_16_16_16;

@@ -898,7 +896,6 @@ static uint32_t si_translate_colorswap(enum pipe_format format)
 	case PIPE_FORMAT_L8A8_SNORM:
 	case PIPE_FORMAT_L8A8_UINT:
 	case PIPE_FORMAT_L8A8_SINT:
-	case PIPE_FORMAT_L8A8_SRGB:
 		return V_028C70_SWAP_ALT;
 	case PIPE_FORMAT_R8G8_SNORM:
 	case PIPE_FORMAT_R8G8_UNORM:
@@ -955,9 +952,10 @@ static uint32_t si_translate_colorswap(enum pipe_format format)
 	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 		return V_028C70_SWAP_STD;

+	case PIPE_FORMAT_S8X24_UINT:
 	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-		return V_028C70_SWAP_STD;
+		return V_028C70_SWAP_STD_REV;

 	case PIPE_FORMAT_R10G10B10A2_UNORM:
 	case PIPE_FORMAT_R10G10B10X2_SNORM:
@@ -1119,9 +1117,11 @@ static uint32_t si_translate_dbformat(enum pipe_format format)
 	switch (format) {
 	case PIPE_FORMAT_Z16_UNORM:
 		return V_028040_Z_16;
+	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_Z24X8_UNORM:
 	case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-		return V_028040_Z_24; /* XXX no longer supported on SI */
+		return V_028040_Z_24; /* deprecated on SI */
 	case PIPE_FORMAT_Z32_FLOAT:
 	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 		return V_028040_Z_32_FLOAT;
@@ -1154,14 +1154,14 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 		case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 			return V_008F14_IMG_DATA_FORMAT_8_24;
 		case PIPE_FORMAT_X8Z24_UNORM:
+		case PIPE_FORMAT_S8X24_UINT:
 		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 			return V_008F14_IMG_DATA_FORMAT_24_8;
-		case PIPE_FORMAT_X32_S8X24_UINT:
-		case PIPE_FORMAT_S8X24_UINT:
 		case PIPE_FORMAT_S8_UINT:
 			return V_008F14_IMG_DATA_FORMAT_8;
 		case PIPE_FORMAT_Z32_FLOAT:
 			return V_008F14_IMG_DATA_FORMAT_32;
+		case PIPE_FORMAT_X32_S8X24_UINT:
 		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 			return V_008F14_IMG_DATA_FORMAT_X24_8_32;
 		default:
@@ -1172,6 +1172,8 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 		goto out_unknown; /* TODO */

 	case UTIL_FORMAT_COLORSPACE_SRGB:
+		if (desc->nr_channels != 4 && desc->nr_channels != 1)
+			goto out_unknown;
 		break;

 	default:
@@ -1523,6 +1525,8 @@ static unsigned si_tile_mode_index(struct r600_resource_texture *rtex, unsigned
 			switch (rtex->real_format) {
 			case PIPE_FORMAT_Z16_UNORM:
 				return 5;
+			case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+			case PIPE_FORMAT_X8Z24_UNORM:
 			case PIPE_FORMAT_Z24X8_UNORM:
 			case PIPE_FORMAT_Z24_UNORM_S8_UINT:
 			case PIPE_FORMAT_Z32_FLOAT:
@@ -1586,7 +1590,7 @@ static void si_cb(struct r600_context *rctx, struct si_pm4_state *pm4,
 	struct r600_surface *surf;
 	unsigned level = state->cbufs[cb]->u.tex.level;
 	unsigned pitch, slice;
-	unsigned color_info;
+	unsigned color_info, color_attrib;
 	unsigned tile_mode_index;
 	unsigned format, swap, ntype, endian;
 	uint64_t offset;
@@ -1624,15 +1628,19 @@ static void si_cb(struct r600_context *rctx, struct si_pm4_state *pm4,
 		if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
 			ntype = V_028C70_NUMBER_SRGB;
 		else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
-			if (desc->channel[i].normalized)
-				ntype = V_028C70_NUMBER_SNORM;
-			else if (desc->channel[i].pure_integer)
+			if (desc->channel[i].pure_integer) {
 				ntype = V_028C70_NUMBER_SINT;
+			} else {
+				assert(desc->channel[i].normalized);
+				ntype = V_028C70_NUMBER_SNORM;
+			}
 		} else if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-			if (desc->channel[i].normalized)
-				ntype = V_028C70_NUMBER_UNORM;
-			else if (desc->channel[i].pure_integer)
+			if (desc->channel[i].pure_integer) {
 				ntype = V_028C70_NUMBER_UINT;
+			} else {
+				assert(desc->channel[i].normalized);
+				ntype = V_028C70_NUMBER_UNORM;
+			}
 		}
 	}

@@ -1670,6 +1678,9 @@ static void si_cb(struct r600_context *rctx, struct si_pm4_state *pm4,
 		S_028C70_NUMBER_TYPE(ntype) |
 		S_028C70_ENDIAN(endian);

+	color_attrib = S_028C74_TILE_MODE_INDEX(tile_mode_index) |
+		S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_1);
+
 	offset += r600_resource_va(rctx->context.screen, state->cbufs[cb]->texture);
 	offset >>= 8;

@@ -1687,8 +1698,7 @@ static void si_cb(struct r600_context *rctx, struct si_pm4_state *pm4,
 			       S_028C6C_SLICE_MAX(state->cbufs[cb]->u.tex.last_layer));
 	}
 	si_pm4_set_reg(pm4, R_028C70_CB_COLOR0_INFO + cb * 0x3C, color_info);
-	si_pm4_set_reg(pm4, R_028C74_CB_COLOR0_ATTRIB + cb * 0x3C,
-		       S_028C74_TILE_MODE_INDEX(tile_mode_index));
+	si_pm4_set_reg(pm4, R_028C74_CB_COLOR0_ATTRIB + cb * 0x3C, color_attrib);

 	/* Determine pixel shader export format */
 	max_comp_size = si_colorformat_max_comp_size(format);
@@ -1848,7 +1858,7 @@ static INLINE struct si_shader_key si_shader_selector_key(struct pipe_context *c
 		key.export_16bpc = rctx->export_16bpc;
 		if (rctx->queued.named.rasterizer) {
 			key.color_two_side = rctx->queued.named.rasterizer->two_side;
-			/*key.flatshade = rctx->queued.named.rasterizer->flatshade;*/
+			key.flatshade = rctx->queued.named.rasterizer->flatshade;
 		}
 		if (rctx->queued.named.dsa) {
 			key.alpha_func = rctx->queued.named.dsa->alpha_func;
@@ -2053,6 +2063,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	unsigned char state_swizzle[4], swizzle[4];
 	unsigned height, depth, width;
 	enum pipe_format pipe_format = state->format;
+	struct radeon_surface_level *surflevel;
 	int first_non_void;
 	uint64_t va;

@@ -2072,37 +2083,84 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	state_swizzle[2] = state->swizzle_b;
 	state_swizzle[3] = state->swizzle_a;

+	surflevel = tmp->surface.level;
+
 	/* Texturing with separate depth and stencil. */
 	if (tmp->is_depth && !tmp->is_flushing_texture) {
 		switch (pipe_format) {
 		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 			pipe_format = PIPE_FORMAT_Z32_FLOAT;
 			break;
+		case PIPE_FORMAT_X8Z24_UNORM:
+		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+			/* Z24 is always stored like this. */
+			pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+			break;
 		case PIPE_FORMAT_X24S8_UINT:
 		case PIPE_FORMAT_S8X24_UINT:
 		case PIPE_FORMAT_X32_S8X24_UINT:
 			pipe_format = PIPE_FORMAT_S8_UINT;
+			surflevel = tmp->surface.stencil_level;
 			break;
 		default:;
 		}
 	}

 	desc = util_format_description(pipe_format);
-	util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+
+	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+		const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
+		const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+
+		switch (pipe_format) {
+		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+		case PIPE_FORMAT_X24S8_UINT:
+		case PIPE_FORMAT_X32_S8X24_UINT:
+		case PIPE_FORMAT_X8Z24_UNORM:
+			util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+			break;
+		default:
+			util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
+		}
+	} else {
+		util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+	}

 	first_non_void = util_format_get_first_non_void_channel(pipe_format);
-	if (first_non_void < 0) {
-		num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
-	} else switch (desc->channel[first_non_void].type) {
-	case UTIL_FORMAT_TYPE_FLOAT:
-		num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
-		break;
-	case UTIL_FORMAT_TYPE_SIGNED:
-		num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
-		break;
-	case UTIL_FORMAT_TYPE_UNSIGNED:
-	default:
+
+	switch (pipe_format) {
+	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 		num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+		break;
+	default:
+		if (first_non_void < 0) {
+			num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+		} else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+			num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
+		} else {
+			num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+
+			switch (desc->channel[first_non_void].type) {
+			case UTIL_FORMAT_TYPE_FLOAT:
+				num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+				break;
+			case UTIL_FORMAT_TYPE_SIGNED:
+				if (desc->channel[first_non_void].normalized)
+					num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
+				else if (desc->channel[first_non_void].pure_integer)
+					num_format = V_008F14_IMG_NUM_FORMAT_SINT;
+				else
+					num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
+				break;
+			case UTIL_FORMAT_TYPE_UNSIGNED:
+				if (desc->channel[first_non_void].normalized)
+					num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+				else if (desc->channel[first_non_void].pure_integer)
+					num_format = V_008F14_IMG_NUM_FORMAT_UINT;
+				else
+					num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
+			}
+		}
 	}

 	format = si_translate_texformat(ctx->screen, pipe_format, desc, first_non_void);
@@ -2115,10 +2173,10 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	/* not supported any more */
 	//endian = si_colorformat_endian_swap(format);

-	width = tmp->surface.level[0].npix_x;
-	height = tmp->surface.level[0].npix_y;
-	depth = tmp->surface.level[0].npix_z;
-	pitch = tmp->surface.level[0].nblk_x * util_format_get_blockwidth(pipe_format);
+	width = surflevel[0].npix_x;
+	height = surflevel[0].npix_y;
+	depth = surflevel[0].npix_z;
+	pitch = surflevel[0].nblk_x * util_format_get_blockwidth(pipe_format);

 	if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
 	        height = 1;
@@ -2128,7 +2186,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	}

 	va = r600_resource_va(ctx->screen, texture);
-	va += tmp->surface.level[0].offset;
+	va += surflevel[0].offset;
 	view->state[0] = va >> 8;
 	view->state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) |
 			  S_008F14_DATA_FORMAT(format) |
@@ -2476,10 +2534,20 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 			num_format = V_008F0C_BUF_NUM_FORMAT_USCALED; /* XXX */
 			break;
 		case UTIL_FORMAT_TYPE_SIGNED:
-			num_format = V_008F0C_BUF_NUM_FORMAT_SNORM;
+			if (desc->channel[first_non_void].normalized)
+				num_format = V_008F0C_BUF_NUM_FORMAT_SNORM;
+			else if (desc->channel[first_non_void].pure_integer)
+				num_format = V_008F0C_BUF_NUM_FORMAT_SINT;
+			else
+				num_format = V_008F0C_BUF_NUM_FORMAT_SSCALED;
 			break;
 		case UTIL_FORMAT_TYPE_UNSIGNED:
-			num_format = V_008F0C_BUF_NUM_FORMAT_UNORM;
+			if (desc->channel[first_non_void].normalized)
+				num_format = V_008F0C_BUF_NUM_FORMAT_UNORM;
+			else if (desc->channel[first_non_void].pure_integer)
+				num_format = V_008F0C_BUF_NUM_FORMAT_UINT;
+			else
+				num_format = V_008F0C_BUF_NUM_FORMAT_USCALED;
 			break;
 		case UTIL_FORMAT_TYPE_FLOAT:
 		default:
@@ -2665,9 +2733,14 @@ void si_init_config(struct r600_context *rctx)
 		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x2a00126a);
 		break;
 	case CHIP_VERDE:
-	default:
 		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x0000124a);
 		break;
+	case CHIP_OLAND:
+		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000082);
+		break;
+	default:
+		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x00000000);
+		break;
 	}

 	si_pm4_set_state(rctx, init, pm4);
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -128,11 +128,6 @@ static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *s
 			continue;
 		}

-		/* XXX: Flat shading hangs the GPU */
-		if (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_CONSTANT ||
-		    (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_COLOR &&
-		     rctx->queued.named.rasterizer->flatshade))
-			have_linear = TRUE;
 		if (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
 			have_linear = TRUE;
 		if (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
@@ -327,15 +322,12 @@ static void si_update_spi_map(struct r600_context *rctx)
 bcolor:
 		tmp = 0;

-#if 0
-		/* XXX: Flat shading hangs the GPU */
 		if (name == TGSI_SEMANTIC_POSITION ||
 		    ps->input[i].interpolate == TGSI_INTERPOLATE_CONSTANT ||
 		    (ps->input[i].interpolate == TGSI_INTERPOLATE_COLOR &&
-		     rctx->rasterizer && rctx->rasterizer->flatshade)) {
+		     rctx->ps_shader->current->key.flatshade)) {
 			tmp |= S_028644_FLAT_SHADE(1);
 		}
-#endif

 		if (name == TGSI_SEMANTIC_GENERIC &&
 		    rctx->sprite_coord_enable & (1 << ps->input[i].sid)) {
@@ -453,8 +445,14 @@ static void si_vertex_buffer_update(struct r600_context *rctx)
 		si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF);
 		si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) |
 					 S_008F04_STRIDE(vb->stride)));
-		si_pm4_sh_data_add(pm4, (vb->buffer->width0 - vb->buffer_offset) /
-					 MAX2(vb->stride, 1));
+		if (vb->stride)
+			/* Round up by rounding down and adding 1 */
+			si_pm4_sh_data_add(pm4,
+					   (vb->buffer->width0 - offset -
+					    util_format_get_blocksize(ve->src_format)) /
+					   vb->stride + 1);
+		else
+			si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset);
 		si_pm4_sh_data_add(pm4, rctx->vertex_elements->rsrc_word3[i]);

 		if (!bound[ve->vertex_buffer_index]) {
@@ -524,10 +522,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	struct pipe_index_buffer ib = {};
 	uint32_t cp_coher_cntl;

-	if ((!info->count && (info->indexed || !info->count_from_stream_output)) ||
-	    (info->indexed && !rctx->index_buffer.buffer)) {
+	if (!info->count && (info->indexed || !info->count_from_stream_output))
 		return;
-	}

 	if (!rctx->ps_shader || !rctx->vs_shader)
 		return;
@@ -538,13 +534,14 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
 		pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
+		ib.user_buffer = rctx->index_buffer.user_buffer;
 		ib.index_size = rctx->index_buffer.index_size;
 		ib.offset = rctx->index_buffer.offset + info->start * ib.index_size;

 		/* Translate or upload, if needed. */
 		r600_translate_index_buffer(rctx, &ib, info->count);

-		if (ib.user_buffer) {
+		if (ib.user_buffer && !ib.buffer) {
 			r600_upload_index_buffer(rctx, &ib, info->count);
 		}

--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -2963,6 +2963,7 @@ sp_create_sampler_variant( const struct pipe_sampler_state *sampler,

   case PIPE_TEX_MIPFILTER_LINEAR:
      if (key.bits.is_pot &&
+          key.bits.target == PIPE_TEXTURE_2D &&
          sampler->min_img_filter == sampler->mag_img_filter &&
          sampler->normalized_coords &&
          sampler->wrap_s == PIPE_TEX_WRAP_REPEAT &&
--- a/src/gallium/drivers/svga/svga_state_rss.c
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -23,6 +23,7 @@
 *
 **********************************************************/

+#include "util/u_format.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "pipe/p_defines.h"
@@ -248,6 +249,16 @@ emit_rss(struct svga_context *svga, unsigned dirty)
      EMIT_RS_FLOAT( svga, bias, DEPTHBIAS, fail );
   }

+   if (dirty & SVGA_NEW_FRAME_BUFFER) {
+      /* XXX: we only look at the first color buffer's sRGB state */
+      float gamma = 1.0f;
+      if (svga->curr.framebuffer.cbufs[0] &&
+          util_format_is_srgb(svga->curr.framebuffer.cbufs[0]->format)) {
+         gamma = 2.2f;
+      }
+      EMIT_RS_FLOAT(svga, gamma, OUTPUTGAMMA, fail);
+   }
+
   if (dirty & SVGA_NEW_RAST) {
      /* bitmask of the enabled clip planes */
      unsigned enabled = svga->curr.rast->templ.clip_plane_enable;
--- a/src/gallium/state_trackers/egl/Makefile.am
+++ b/src/gallium/state_trackers/egl/Makefile.am
@@ -72,6 +72,7 @@ AM_CPPFLAGS += \
 	-I$(top_srcdir)/src/gallium/winsys \
 	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
 	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
+	-I$(top_builddir)/src/egl/wayland/wayland-drm \
 	-DHAVE_WAYLAND_BACKEND
 endif

--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -438,7 +438,6 @@ create_xmesa_buffer(Drawable d, BufferType type,
 {
   XMesaDisplay xmdpy = xmesa_init_display(vis->display);
   XMesaBuffer b;
-   uint width, height;

   ASSERT(type == WINDOW || type == PIXMAP || type == PBUFFER);

@@ -457,7 +456,7 @@ create_xmesa_buffer(Drawable d, BufferType type,
   b->type = type;
   b->cmap = cmap;

-   get_drawable_size(vis->display, d, &width, &height);
+   get_drawable_size(vis->display, d, &b->width, &b->height);

   /*
    * Create framebuffer, but we'll plug in our own renderbuffers below.
--- a/src/gallium/state_trackers/xorg/xorg_exa.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa.c
@@ -318,7 +318,7 @@ ExaFinishAccess(PixmapPtr pPix, int index)
    if (!priv)
 	return;

-    if (!priv->map_transfer || pPix->devPrivate.ptr == NULL)
+    if (!priv->map_transfer)
 	return;

    exa_debug_printf("ExaFinishAccess %d\n", index);
--- a/src/gallium/targets/dri-vmwgfx/Makefile.am
+++ b/src/gallium/targets/dri-vmwgfx/Makefile.am
@@ -58,17 +58,13 @@ vmwgfx_dri_la_LIBADD = \
 	$(top_builddir)/src/gallium/drivers/svga/libsvga.la \
 	$(GALLIUM_DRI_LIB_DEPS)

-if HAVE_MESA_LLVM
 vmwgfx_dri_la_LINK = $(CXXLINK) $(vmwgfx_dri_la_LDFLAGS)
 # Mention a dummy pure C++ file to trigger generation of the $(LINK) variable
 nodist_EXTRA_vmwgfx_dri_la_SOURCES = dummy-cpp.cpp

+if HAVE_MESA_LLVM
 vmwgfx_dri_la_LDFLAGS += $(LLVM_LDFLAGS)
 vmwgfx_dri_la_LIBADD += $(LLVM_LIBS)
-else
-vmwgfx_dri_la_LINK = $(LINK) $(vmwgfx_dri_la_LDFLAGS)
-# Mention a dummy pure C file to trigger generation of the $(LINK) variable
-nodist_EXTRA_vmwgfx_dri_la_SOURCES = dummy-c.c
 endif

 # Provide compatibility with scripts for the old Mesa build system for
--- a/src/gallium/targets/vdpau-softpipe/Makefile.am
+++ b/src/gallium/targets/vdpau-softpipe/Makefile.am
@@ -35,7 +35,7 @@ vdpaudir = $(VDPAU_LIB_INSTALL_DIR)
 vdpau_LTLIBRARIES = libvdpau_softpipe.la

 libvdpau_softpipe_la_SOURCES = \
-	$(top_srcdir)/src/gallium/auxiliary/vl/vl_winsys_dri.c
+	$(top_srcdir)/src/gallium/auxiliary/vl/vl_winsys_xsp.c

 libvdpau_softpipe_la_LDFLAGS = \
 	-module \
--- a/src/gallium/targets/xa-vmwgfx/Makefile.am
+++ b/src/gallium/targets/xa-vmwgfx/Makefile.am
@@ -47,6 +47,8 @@ libxatracker_la_LIBADD = \
 	$(top_builddir)/src/gallium/drivers/trace/libtrace.la \
 	$(top_builddir)/src/gallium/drivers/rbug/librbug.la

+nodist_EXTRA_libxatracker_la_SOURCES = dummy.cpp
+
 if HAVE_MESA_LLVM
 libxatracker_la_LDFLAGS += $(LLVM_LDFLAGS)
 libxatracker_la_LIBADD += $(LLVM_LIBS)
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -593,10 +593,11 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr,
        va.offset = bo->va;
        r = drmCommandWriteRead(rws->fd, DRM_RADEON_GEM_VA, &va, sizeof(va));
        if (r && va.operation == RADEON_VA_RESULT_ERROR) {
-            fprintf(stderr, "radeon: Failed to allocate a buffer:\n");
+            fprintf(stderr, "radeon: Failed to allocate virtual address for buffer:\n");
            fprintf(stderr, "radeon:    size      : %d bytes\n", size);
            fprintf(stderr, "radeon:    alignment : %d bytes\n", desc->alignment);
            fprintf(stderr, "radeon:    domains   : %d\n", args.initial_domain);
+            fprintf(stderr, "radeon:    va        : 0x%016llx\n", (unsigned long long)bo->va);
            radeon_bo_destroy(&bo->base);
            return NULL;
        }
@@ -956,6 +957,10 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,

            bo->flinked = TRUE;
            bo->flink = flink.name;
+
+            pipe_mutex_lock(bo->mgr->bo_handles_mutex);
+            util_hash_table_set(bo->mgr->bo_handles, (void*)(uintptr_t)bo->flink, bo);
+            pipe_mutex_unlock(bo->mgr->bo_handles_mutex);
        }
        whandle->handle = bo->flink;
    } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -383,6 +383,16 @@ static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
    return status;
 }

+/* Both used+pending totals must stay below 70% of the GART and VRAM sizes. */
+static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
+{
+    struct radeon_drm_cs *drm_cs = radeon_drm_cs(rcs);
+    const boolean gtt_fits =
+        (drm_cs->csc->used_gart + gtt) < drm_cs->ws->info.gart_size * 0.7;
+
+    return gtt_fits && (drm_cs->csc->used_vram + vram) < drm_cs->ws->info.vram_size * 0.7;
+}
+
 static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
 {
@@ -575,6 +585,7 @@ void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
+    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -312,6 +312,7 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
    case CHIP_TAHITI:
    case CHIP_PITCAIRN:
    case CHIP_VERDE:
+    case CHIP_OLAND:
        ws->info.chip_class = TAHITI;
        break;
    }
--- a/src/gallium/winsys/radeon/drm/radeon_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_winsys.h
@@ -123,6 +123,7 @@ enum radeon_family {
    CHIP_TAHITI,
    CHIP_PITCAIRN,
    CHIP_VERDE,
+    CHIP_OLAND,
    CHIP_LAST,
 };

@@ -392,6 +393,16 @@ struct radeon_winsys {
     */
    boolean (*cs_validate)(struct radeon_winsys_cs *cs);

+    /**
+     * Return TRUE if there is enough memory in VRAM and GTT for the relocs
+     * added so far.
+     *
+     * \param cs        A command stream to validate.
+     * \param vram      VRAM memory size pending to be used
+     * \param gtt       GTT memory size pending to be used
+     */
+    boolean (*cs_memory_below_limit)(struct radeon_winsys_cs *cs, uint64_t vram, uint64_t gtt);
+
    /**
     * Write a relocated dword to a command buffer.
     *
--- a/src/gbm/backends/dri/gbm_dri.c
+++ b/src/gbm/backends/dri/gbm_dri.c
@@ -481,6 +481,7 @@ create_dumb(struct gbm_device *gbm,
   bo->base.base.width = width;
   bo->base.base.height = height;
   bo->base.base.stride = create_arg.pitch;
+   bo->base.base.format = format;
   bo->base.base.handle.u32 = create_arg.handle;
   bo->handle = create_arg.handle;
   bo->size = create_arg.size;
@@ -529,6 +530,7 @@ gbm_dri_bo_create(struct gbm_device *gbm,
   bo->base.base.gbm = gbm;
   bo->base.base.width = width;
   bo->base.base.height = height;
+   bo->base.base.format = format;

   switch (format) {
   case GBM_FORMAT_RGB565:
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2829,9 +2829,9 @@ ast_declarator_list::hir(exec_list *instructions,
       *    flat."
       *
       * From section 4.3.4 of the GLSL 3.00 ES spec:
-       *    "Fragment shader inputs that are signed or unsigned integers or
-       *    integer vectors must be qualified with the interpolation qualifier
-       *    flat."
+       *    "Fragment shader inputs that are, or contain, signed or unsigned
+       *    integers or integer vectors must be qualified with the
+       *    interpolation qualifier flat."
       *
       * Since vertex outputs and fragment inputs must have matching
       * qualifiers, these two requirements are equivalent.
@@ -2839,12 +2839,12 @@ ast_declarator_list::hir(exec_list *instructions,
      if (state->is_version(130, 300)
          && state->target == vertex_shader
          && state->current_function == NULL
-          && var->type->is_integer()
+          && var->type->contains_integer()
          && var->mode == ir_var_shader_out
          && var->interpolation != INTERP_QUALIFIER_FLAT) {

-         _mesa_glsl_error(&loc, state, "If a vertex output is an integer, "
-                          "then it must be qualified with 'flat'");
+         _mesa_glsl_error(&loc, state, "If a vertex output is (or contains) "
+                          "an integer, then it must be qualified with 'flat'");
      }


@@ -3967,6 +3967,47 @@ ast_iteration_statement::hir(exec_list *instructions,
 }


+/**
+ * Determine if the given type is valid for establishing a default precision
+ * qualifier.
+ *
+ * From GLSL ES 3.00 section 4.5.4 ("Default Precision Qualifiers"):
+ *
+ *     "The precision statement
+ *
+ *         precision precision-qualifier type;
+ *
+ *     can be used to establish a default precision qualifier. The type field
+ *     can be either int or float or any of the sampler types, and the
+ *     precision-qualifier can be lowp, mediump, or highp."
+ *
+ * GLSL ES 1.00 has similar language.  GLSL 1.30 doesn't allow precision
+ * qualifiers on sampler types, but this seems like an oversight (since the
+ * intention of including these in GLSL 1.30 is to allow compatibility with ES
+ * shaders).  So we allow int, float, and all sampler types regardless of GLSL
+ * version.
+ */
+static bool
+is_valid_default_precision_type(const struct _mesa_glsl_parse_state *state,
+                                const char *type_name)
+{
+   /* A name the symbol table cannot resolve to a type is never valid. */
+   const struct glsl_type *const t = state->symbols->get_type(type_name);
+   if (t == NULL)
+      return false;
+
+   /* Every sampler type is accepted. */
+   if (t->base_type == GLSL_TYPE_SAMPLER)
+      return true;
+
+   /* Otherwise only scalar "int" and "float" qualify; vectors and matrices
+    * built from those base types are rejected.
+    */
+   const bool int_or_float = t->base_type == GLSL_TYPE_INT || t->base_type == GLSL_TYPE_FLOAT;
+   return int_or_float && t->vector_elements == 1 && t->matrix_columns == 1;
+}
+
+
 ir_rvalue *
 ast_type_specifier::hir(exec_list *instructions,
 			  struct _mesa_glsl_parse_state *state)
@@ -4007,11 +4048,10 @@ ast_type_specifier::hir(exec_list *instructions,
                          "arrays");
         return NULL;
      }
-      if (strcmp(this->type_name, "float") != 0 &&
-	  strcmp(this->type_name, "int") != 0) {
+      if (!is_valid_default_precision_type(state, this->type_name)) {
         _mesa_glsl_error(&loc, state,
                          "default precision statements apply only to types "
-                          "float and int");
+                          "float, int, and sampler types");
         return NULL;
      }

--- a/src/glsl/builtin_compiler/Makefile.am
+++ b/src/glsl/builtin_compiler/Makefile.am
@@ -20,23 +20,44 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.

-CC = @CC_FOR_BUILD@
-CFLAGS = @CFLAGS_FOR_BUILD@
-CPP = @CPP_FOR_BUILD@
-CPPFLAGS = @CPPFLAGS_FOR_BUILD@
-CXX = @CXX_FOR_BUILD@
-CXXFLAGS = @CXXFLAGS_FOR_BUILD@
-LD = @LD_FOR_BUILD@
-LDFLAGS = @LDFLAGS_FOR_BUILD@
-
 AM_CFLAGS = \
 	-I $(top_srcdir)/include \
 	-I $(top_srcdir)/src/mapi \
 	-I $(top_srcdir)/src/mesa \
 	-I $(GLSL_SRCDIR) \
 	-I $(GLSL_SRCDIR)/glcpp \
-	-I $(GLSL_BUILDDIR) \
-	$(DEFINES_FOR_BUILD)
+	-I $(GLSL_BUILDDIR)
+
+if CROSS_COMPILING
+proxyCC = @CC_FOR_BUILD@
+proxyCFLAGS = @CFLAGS_FOR_BUILD@
+proxyCPP = @CPP_FOR_BUILD@
+proxyCPPFLAGS = @CPPFLAGS_FOR_BUILD@
+proxyCXX = @CXX_FOR_BUILD@
+proxyCXXFLAGS = @CXXFLAGS_FOR_BUILD@
+proxyLD = @LD_FOR_BUILD@
+proxyLDFLAGS = @LDFLAGS_FOR_BUILD@
+AM_CFLAGS += $(DEFINES_FOR_BUILD)
+else
+proxyCC = @CC@
+proxyCFLAGS = @CFLAGS@
+proxyCPP = @CPP@
+proxyCPPFLAGS = @CPPFLAGS@
+proxyCXX = @CXX@
+proxyCXXFLAGS = @CXXFLAGS@
+proxyLD = @LD@
+proxyLDFLAGS = @LDFLAGS@
+AM_CFLAGS += $(DEFINES)
+endif
+
+CC = $(proxyCC)
+CFLAGS = $(proxyCFLAGS)
+CPP = $(proxyCPP)
+CPPFLAGS = $(proxyCPPFLAGS)
+CXX = $(proxyCXX)
+CXXFLAGS = $(proxyCXXFLAGS)
+LD = $(proxyLD)
+LDFLAGS = $(proxyLDFLAGS)

 AM_CXXFLAGS = $(AM_CFLAGS)

--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -156,6 +156,24 @@ glsl_type::contains_sampler() const
   }
 }

+
+bool
+glsl_type::contains_integer() const
+{
+   /* Arrays defer to their element type. */
+   if (this->is_array())
+      return this->fields.array->contains_integer();
+   /* Anything that is neither array nor record is judged by is_integer(). */
+   if (!this->is_record())
+      return this->is_integer();
+
+   /* Records contain an integer as soon as one member does. */
+   for (unsigned int idx = 0; idx < this->length; idx++)
+      if (this->fields.structure[idx].type->contains_integer())
+         return true;
+   return false;
+}
+
+
 gl_texture_index
 glsl_type::sampler_index() const
 {
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -359,6 +359,12 @@ struct glsl_type {
      return (base_type == GLSL_TYPE_UINT) || (base_type == GLSL_TYPE_INT);
   }

+   /**
+    * Query whether or not type is an integral type, or for struct and array
+    * types, contains an integral type.
+    */
+   bool contains_integer() const;
+
   /**
    * Query whether or not a type is a float type
    */
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -29,7 +29,7 @@
 #include "main/hash_table.h"
 #include "program.h"

-class ubo_visitor : public uniform_field_visitor {
+class ubo_visitor : public program_resource_visitor {
 public:
   ubo_visitor(void *mem_ctx, gl_uniform_buffer_variable *variables,
               unsigned num_variables)
@@ -44,7 +44,7 @@ public:
      this->offset = 0;
      this->buffer_size = 0;
      this->is_array_instance = strchr(name, ']') != NULL;
-      this->uniform_field_visitor::process(type, name);
+      this->program_resource_visitor::process(type, name);
   }

   unsigned index;
@@ -112,7 +112,7 @@ private:
   }
 };

-class count_block_size : public uniform_field_visitor {
+class count_block_size : public program_resource_visitor {
 public:
   count_block_size() : num_active_uniforms(0)
   {
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -52,7 +52,7 @@ values_for_type(const glsl_type *type)
 }

 void
-uniform_field_visitor::process(const glsl_type *type, const char *name)
+program_resource_visitor::process(const glsl_type *type, const char *name)
 {
   assert(type->is_record()
          || (type->is_array() && type->fields.array->is_record())
@@ -65,7 +65,7 @@ uniform_field_visitor::process(const glsl_type *type, const char *name)
 }

 void
-uniform_field_visitor::process(ir_variable *var)
+program_resource_visitor::process(ir_variable *var)
 {
   const glsl_type *t = var->type;

@@ -93,8 +93,8 @@ uniform_field_visitor::process(ir_variable *var)
 }

 void
-uniform_field_visitor::recursion(const glsl_type *t, char **name,
-                                 size_t name_length, bool row_major)
+program_resource_visitor::recursion(const glsl_type *t, char **name,
+                                    size_t name_length, bool row_major)
 {
   /* Records need to have each field processed individually.
    *
@@ -110,7 +110,7 @@ uniform_field_visitor::recursion(const glsl_type *t, char **name,
         if (t->fields.structure[i].type->is_record())
            this->visit_field(&t->fields.structure[i]);

-         /* Append '.field' to the current uniform name. */
+         /* Append '.field' to the current variable name. */
         if (name_length == 0) {
            ralloc_asprintf_rewrite_tail(name, &new_length, "%s", field);
         } else {
@@ -125,7 +125,7 @@ uniform_field_visitor::recursion(const glsl_type *t, char **name,
      for (unsigned i = 0; i < t->length; i++) {
 	 size_t new_length = name_length;

-	 /* Append the subscript to the current uniform name */
+	 /* Append the subscript to the current variable name */
 	 ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);

         recursion(t->fields.array, name, new_length,
@@ -137,7 +137,7 @@ uniform_field_visitor::recursion(const glsl_type *t, char **name,
 }

 void
-uniform_field_visitor::visit_field(const glsl_struct_field *field)
+program_resource_visitor::visit_field(const glsl_struct_field *field)
 {
   (void) field;
   /* empty */
@@ -153,7 +153,7 @@ uniform_field_visitor::visit_field(const glsl_struct_field *field)
 * If the same uniform is added multiple times (i.e., once for each shader
 * target), it will only be accounted once.
 */
-class count_uniform_size : public uniform_field_visitor {
+class count_uniform_size : public program_resource_visitor {
 public:
   count_uniform_size(struct string_to_uint_map *map)
      : num_active_uniforms(0), num_values(0), num_shader_samplers(0),
@@ -171,10 +171,10 @@ public:
   void process(ir_variable *var)
   {
      if (var->is_interface_instance())
-         uniform_field_visitor::process(var->interface_type,
-                                        var->interface_type->name);
+         program_resource_visitor::process(var->interface_type,
+                                           var->interface_type->name);
      else
-         uniform_field_visitor::process(var);
+         program_resource_visitor::process(var);
   }

   /**
@@ -258,7 +258,7 @@ private:
 * the \c gl_uniform_storage and \c gl_constant_value arrays are "big
 * enough."
 */
-class parcel_out_uniform_storage : public uniform_field_visitor {
+class parcel_out_uniform_storage : public program_resource_visitor {
 public:
   parcel_out_uniform_storage(struct string_to_uint_map *map,
 			      struct gl_uniform_storage *uniforms,
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -35,6 +35,8 @@
 #include "linker.h"
 #include "link_varyings.h"
 #include "main/macros.h"
+#include "program/hash_table.h"
+#include "program.h"


 /**
@@ -154,10 +156,13 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog,

 /**
 * Initialize this object based on a string that was passed to
- * glTransformFeedbackVaryings.  If there is a parse error, the error is
- * reported using linker_error(), and false is returned.
+ * glTransformFeedbackVaryings.
+ *
+ * If the input is malformed, this call still succeeds, but it sets
+ * this->var_name to the malformed input, so tfeedback_decl::find_candidate()
+ * will fail to find any matching candidate.
 */
-bool
+void
 tfeedback_decl::init(struct gl_context *ctx, struct gl_shader_program *prog,
                     const void *mem_ctx, const char *input)
 {
@@ -170,12 +175,13 @@ tfeedback_decl::init(struct gl_context *ctx, struct gl_shader_program *prog,
   this->is_clip_distance_mesa = false;
   this->skip_components = 0;
   this->next_buffer_separator = false;
+   this->matched_candidate = NULL;

   if (ctx->Extensions.ARB_transform_feedback3) {
      /* Parse gl_NextBuffer. */
      if (strcmp(input, "gl_NextBuffer") == 0) {
         this->next_buffer_separator = true;
-         return true;
+         return;
      }

      /* Parse gl_SkipComponents. */
@@ -189,21 +195,17 @@ tfeedback_decl::init(struct gl_context *ctx, struct gl_shader_program *prog,
         this->skip_components = 4;

      if (this->skip_components)
-         return true;
+         return;
   }

   /* Parse a declaration. */
-   const char *bracket = strrchr(input, '[');
-
-   if (bracket) {
-      this->var_name = ralloc_strndup(mem_ctx, input, bracket - input);
-      if (sscanf(bracket, "[%u]", &this->array_subscript) != 1) {
-         linker_error(prog, "Cannot parse transform feedback varying %s", input);
-         return false;
-      }
+   const char *base_name_end;
+   long subscript = parse_program_resource_name(input, &base_name_end);
+   this->var_name = ralloc_strndup(mem_ctx, input, base_name_end - input);
+   if (subscript >= 0) {
+      this->array_subscript = subscript;
      this->is_subscripted = true;
   } else {
-      this->var_name = ralloc_strdup(mem_ctx, input);
      this->is_subscripted = false;
   }

@@ -215,8 +217,6 @@ tfeedback_decl::init(struct gl_context *ctx, struct gl_shader_program *prog,
       strcmp(this->var_name, "gl_ClipDistance") == 0) {
      this->is_clip_distance_mesa = true;
   }
-
-   return true;
 }


@@ -240,27 +240,32 @@ tfeedback_decl::is_same(const tfeedback_decl &x, const tfeedback_decl &y)


 /**
- * Assign a location for this tfeedback_decl object based on the location
- * assignment in output_var.
+ * Assign a location for this tfeedback_decl object based on the transform
+ * feedback candidate found by find_candidate.
 *
 * If an error occurs, the error is reported through linker_error() and false
 * is returned.
 */
 bool
 tfeedback_decl::assign_location(struct gl_context *ctx,
-                                struct gl_shader_program *prog,
-                                ir_variable *output_var)
+                                struct gl_shader_program *prog)
 {
   assert(this->is_varying());

-   if (output_var->type->is_array()) {
+   unsigned fine_location
+      = this->matched_candidate->toplevel_var->location * 4
+      + this->matched_candidate->toplevel_var->location_frac
+      + this->matched_candidate->offset;
+
+   if (this->matched_candidate->type->is_array()) {
      /* Array variable */
      const unsigned matrix_cols =
-         output_var->type->fields.array->matrix_columns;
+         this->matched_candidate->type->fields.array->matrix_columns;
      const unsigned vector_elements =
-         output_var->type->fields.array->vector_elements;
+         this->matched_candidate->type->fields.array->vector_elements;
      unsigned actual_array_size = this->is_clip_distance_mesa ?
-         prog->Vert.ClipDistanceArraySize : output_var->type->array_size();
+         prog->Vert.ClipDistanceArraySize :
+         this->matched_candidate->type->array_size();

      if (this->is_subscripted) {
         /* Check array bounds. */
@@ -271,22 +276,11 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
                         actual_array_size);
            return false;
         }
-         if (this->is_clip_distance_mesa) {
-            this->location =
-               output_var->location + this->array_subscript / 4;
-            this->location_frac = this->array_subscript % 4;
-         } else {
-            unsigned fine_location
-               = output_var->location * 4 + output_var->location_frac;
-            unsigned array_elem_size = vector_elements * matrix_cols;
-            fine_location += array_elem_size * this->array_subscript;
-            this->location = fine_location / 4;
-            this->location_frac = fine_location % 4;
-         }
+         unsigned array_elem_size = this->is_clip_distance_mesa ?
+            1 : vector_elements * matrix_cols;
+         fine_location += array_elem_size * this->array_subscript;
         this->size = 1;
      } else {
-         this->location = output_var->location;
-         this->location_frac = output_var->location_frac;
         this->size = actual_array_size;
      }
      this->vector_elements = vector_elements;
@@ -294,7 +288,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
      if (this->is_clip_distance_mesa)
         this->type = GL_FLOAT;
      else
-         this->type = output_var->type->fields.array->gl_type;
+         this->type = this->matched_candidate->type->fields.array->gl_type;
   } else {
      /* Regular variable (scalar, vector, or matrix) */
      if (this->is_subscripted) {
@@ -303,13 +297,13 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
                      this->orig_name, this->var_name);
         return false;
      }
-      this->location = output_var->location;
-      this->location_frac = output_var->location_frac;
      this->size = 1;
-      this->vector_elements = output_var->type->vector_elements;
-      this->matrix_columns = output_var->type->matrix_columns;
-      this->type = output_var->type->gl_type;
+      this->vector_elements = this->matched_candidate->type->vector_elements;
+      this->matrix_columns = this->matched_candidate->type->matrix_columns;
+      this->type = this->matched_candidate->type->gl_type;
   }
+   this->location = fine_location / 4;
+   this->location_frac = fine_location % 4;

   /* From GL_EXT_transform_feedback:
    *   A program will fail to link if:
@@ -404,35 +398,26 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
 }


-ir_variable *
-tfeedback_decl::find_output_var(gl_shader_program *prog,
-                                gl_shader *producer) const
+const tfeedback_candidate *
+tfeedback_decl::find_candidate(gl_shader_program *prog,
+                               hash_table *tfeedback_candidates)
 {
   const char *name = this->is_clip_distance_mesa
      ? "gl_ClipDistanceMESA" : this->var_name;
-   ir_variable *var = producer->symbols->get_variable(name);
-   if (var && var->mode == ir_var_shader_out) {
-      const glsl_type *type = var->type;
-      while (type->base_type == GLSL_TYPE_ARRAY)
-         type = type->fields.array;
-      if (type->base_type == GLSL_TYPE_STRUCT) {
-         linker_error(prog, "Transform feedback of varying structs not "
-                      "implemented yet.");
-         return NULL;
-      }
-      return var;
+   this->matched_candidate = (const tfeedback_candidate *)
+      hash_table_find(tfeedback_candidates, name);
+   if (!this->matched_candidate) {
+      /* From GL_EXT_transform_feedback:
+       *   A program will fail to link if:
+       *
+       *   * any variable name specified in the <varyings> array is not
+       *     declared as an output in the geometry shader (if present) or
+       *     the vertex shader (if no geometry shader is present);
+       */
+      linker_error(prog, "Transform feedback varying %s undeclared.",
+                   this->orig_name);
   }
-
-   /* From GL_EXT_transform_feedback:
-    *   A program will fail to link if:
-    *
-    *   * any variable name specified in the <varyings> array is not
-    *     declared as an output in the geometry shader (if present) or
-    *     the vertex shader (if no geometry shader is present);
-    */
-   linker_error(prog, "Transform feedback varying %s undeclared.",
-                this->orig_name);
-   return NULL;
+   return this->matched_candidate;
 }


@@ -449,8 +434,7 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                      char **varying_names, tfeedback_decl *decls)
 {
   for (unsigned i = 0; i < num_names; ++i) {
-      if (!decls[i].init(ctx, prog, mem_ctx, varying_names[i]))
-         return false;
+      decls[i].init(ctx, prog, mem_ctx, varying_names[i]);

      if (!decls[i].is_varying())
         continue;
@@ -870,6 +854,80 @@ is_varying_var(GLenum shaderType, const ir_variable *var)
 }


+/**
+ * Visitor class that generates tfeedback_candidate structs describing all
+ * possible targets of transform feedback.
+ *
+ * tfeedback_candidate structs are stored in the hash table
+ * tfeedback_candidates, which is passed to the constructor.  This hash table
+ * maps varying names to instances of the tfeedback_candidate struct.
+ */
+class tfeedback_candidate_generator : public program_resource_visitor
+{
+public:
+   tfeedback_candidate_generator(void *mem_ctx,
+                                 hash_table *tfeedback_candidates)
+      : mem_ctx(mem_ctx),
+        tfeedback_candidates(tfeedback_candidates)
+   {
+   }
+
+   void process(ir_variable *var)
+   {
+      this->toplevel_var = var;
+      this->varying_floats = 0;
+      if (var->is_interface_instance())
+         program_resource_visitor::process(var->interface_type,
+                                           var->interface_type->name);
+      else
+         program_resource_visitor::process(var);
+   }
+
+private:
+   virtual void visit_field(const glsl_type *type, const char *name,
+                            bool row_major)
+   {
+      assert(!type->is_record());
+      assert(!(type->is_array() && type->fields.array->is_record()));
+      assert(!type->is_interface());
+      assert(!(type->is_array() && type->fields.array->is_interface()));
+
+      (void) row_major;
+
+      tfeedback_candidate *candidate
+         = rzalloc(this->mem_ctx, tfeedback_candidate);
+      candidate->toplevel_var = this->toplevel_var;
+      candidate->type = type;
+      candidate->offset = this->varying_floats;
+      hash_table_insert(this->tfeedback_candidates, candidate,
+                        ralloc_strdup(this->mem_ctx, name));
+      this->varying_floats += type->component_slots();
+   }
+
+   /**
+    * Memory context used to allocate hash table keys and values.
+    */
+   void * const mem_ctx;
+
+   /**
+    * Hash table in which tfeedback_candidate objects should be stored.
+    */
+   hash_table * const tfeedback_candidates;
+
+   /**
+    * Pointer to the toplevel variable that is being traversed.
+    */
+   ir_variable *toplevel_var;
+
+   /**
+    * Total number of varying floats that have been visited so far.  This is
+    * used to determine the offset to each varying within the toplevel
+    * variable.
+    */
+   unsigned varying_floats;
+};
+
+
 /**
 * Assign locations for all variables that are produced in one pipeline stage
 * (the "producer") and consumed in the next stage (the "consumer").
@@ -902,6 +960,8 @@ assign_varying_locations(struct gl_context *ctx,
   const unsigned producer_base = VERT_RESULT_VAR0;
   const unsigned consumer_base = FRAG_ATTRIB_VAR0;
   varying_matches matches(ctx->Const.DisableVaryingPacking);
+   hash_table *tfeedback_candidates
+      = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);

   /* Operate in a total of three passes.
    *
@@ -920,6 +980,9 @@ assign_varying_locations(struct gl_context *ctx,
      if ((output_var == NULL) || (output_var->mode != ir_var_shader_out))
 	 continue;

+      tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates);
+      g.process(output_var);
+
      ir_variable *input_var =
 	 consumer ? consumer->symbols->get_variable(output_var->name) : NULL;

@@ -935,15 +998,16 @@ assign_varying_locations(struct gl_context *ctx,
      if (!tfeedback_decls[i].is_varying())
         continue;

-      ir_variable *output_var
-         = tfeedback_decls[i].find_output_var(prog, producer);
+      const tfeedback_candidate *matched_candidate
+         = tfeedback_decls[i].find_candidate(prog, tfeedback_candidates);

-      if (output_var == NULL)
+      if (matched_candidate == NULL) {
+         hash_table_dtor(tfeedback_candidates);
         return false;
-
-      if (output_var->is_unmatched_generic_inout) {
-         matches.record(output_var, NULL);
      }
+
+      if (matched_candidate->toplevel_var->is_unmatched_generic_inout)
+         matches.record(matched_candidate->toplevel_var, NULL);
   }

   const unsigned slots_used = matches.assign_locations();
@@ -953,13 +1017,14 @@ assign_varying_locations(struct gl_context *ctx,
      if (!tfeedback_decls[i].is_varying())
         continue;

-      ir_variable *output_var
-         = tfeedback_decls[i].find_output_var(prog, producer);
-
-      if (!tfeedback_decls[i].assign_location(ctx, prog, output_var))
+      if (!tfeedback_decls[i].assign_location(ctx, prog)) {
+         hash_table_dtor(tfeedback_candidates);
         return false;
+      }
   }

+   hash_table_dtor(tfeedback_candidates);
+
   if (ctx->Const.DisableVaryingPacking) {
      /* Transform feedback code assumes varyings are packed, so if the driver
       * has disabled varying packing, make sure it does not support transform
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -41,6 +41,49 @@ struct gl_shader;
 class ir_variable;


+/**
+ * Data structure describing a varying which is available for use in transform
+ * feedback.
+ *
+ * For example, if the vertex shader contains:
+ *
+ *     struct S {
+ *       vec4 foo;
+ *       float[3] bar;
+ *     };
+ *
+ *     varying S[2] v;
+ *
+ * Then there would be tfeedback_candidate objects corresponding to the
+ * following varyings:
+ *
+ *     v[0].foo
+ *     v[0].bar
+ *     v[1].foo
+ *     v[1].bar
+ */
+struct tfeedback_candidate
+{
+   /**
+    * Toplevel variable containing this varying.  In the above example, this
+    * would point to the declaration of the varying v.
+    */
+   ir_variable *toplevel_var;
+
+   /**
+    * Type of this varying.  In the above example, this would point to the
+    * glsl_type for "vec4" or "float[3]".
+    */
+   const glsl_type *type;
+
+   /**
+    * Offset within the toplevel variable where this varying occurs (counted
+    * in multiples of the size of a float).
+    */
+   unsigned offset;
+};
+
+
 /**
 * Data structure tracking information about a transform feedback declaration
 * during linking.
@@ -48,17 +91,17 @@ class ir_variable;
 class tfeedback_decl
 {
 public:
-   bool init(struct gl_context *ctx, struct gl_shader_program *prog,
+   void init(struct gl_context *ctx, struct gl_shader_program *prog,
             const void *mem_ctx, const char *input);
   static bool is_same(const tfeedback_decl &x, const tfeedback_decl &y);
-   bool assign_location(struct gl_context *ctx, struct gl_shader_program *prog,
-                        ir_variable *output_var);
+   bool assign_location(struct gl_context *ctx,
+                        struct gl_shader_program *prog);
   unsigned get_num_outputs() const;
   bool store(struct gl_context *ctx, struct gl_shader_program *prog,
              struct gl_transform_feedback_info *info, unsigned buffer,
              const unsigned max_outputs) const;
-   ir_variable *find_output_var(gl_shader_program *prog,
-                                gl_shader *producer) const;
+   const tfeedback_candidate *find_candidate(gl_shader_program *prog,
+                                             hash_table *tfeedback_candidates);

   bool is_next_buffer_separator() const
   {
@@ -158,6 +201,12 @@ private:
    * Whether this is gl_NextBuffer from ARB_transform_feedback3.
    */
   bool next_buffer_separator;
+
+   /**
+    * If find_candidate() has been called, pointer to the tfeedback_candidate
+    * data structure that was found.  Otherwise NULL.
+    */
+   const tfeedback_candidate *matched_candidate;
 };


--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -200,6 +200,65 @@ linker_warning(gl_shader_program *prog, const char *fmt, ...)
 }


+/**
+ * Given a string identifying a program resource, break it into a base name
+ * and an optional array index in square brackets.
+ *
+ * If an array index is present, \c out_base_name_end is set to point to the
+ * "[" that precedes the array index, and the array index itself is returned
+ * as a long.
+ *
+ * If no array index is present (or if the array index is negative or
+ * malformed), \c out_base_name_end is set to point to the null terminator
+ * at the end of the input string, and -1 is returned.
+ *
+ * Only the final array index is parsed; if the string contains other array
+ * indices (or structure field accesses), they are left in the base name.
+ *
+ * No attempt is made to check that the base name is properly formed;
+ * typically the caller will look up the base name in a hash table, so
+ * ill-formed base names simply turn into hash table lookup failures.
+ */
+long
+parse_program_resource_name(const GLchar *name,
+                            const GLchar **out_base_name_end)
+{
+   /* Section 7.3.1 ("Program Interfaces") of the OpenGL 4.3 spec says:
+    *
+    *     "When an integer array element or block instance number is part of
+    *     the name string, it will be specified in decimal form without a "+"
+    *     or "-" sign or any extra leading zeroes. Additionally, the name
+    *     string will not include white space anywhere in the string."
+    */
+
+   const size_t len = strlen(name);
+   *out_base_name_end = name + len;
+
+   if (len == 0 || name[len-1] != ']')
+      return -1;
+
+   /* Walk backwards over the string looking for a non-digit character.  This
+    * had better be the opening bracket for an array index.
+    *
+    * Initially, i specifies the location of the ']'.  Since the string may
+    * contain only the ']' character, walk backwards very carefully.
+    */
+   unsigned i;
+   for (i = len - 1; (i > 0) && isdigit(name[i-1]); --i)
+      /* empty */ ;
+
+   if ((i == 0) || name[i-1] != '[')
+      return -1;
+
+   long array_index = strtol(&name[i], NULL, 10);
+   if (array_index < 0)
+      return -1;
+
+   *out_base_name_end = name + (i - 1);
+   return array_index;
+}
+
+
 void
 link_invalidate_variable_locations(gl_shader *sh, int input_base,
                                   int output_base)
--- a/src/glsl/linker.h
+++ b/src/glsl/linker.h
@@ -61,38 +61,39 @@ link_uniform_blocks(void *mem_ctx,
                    struct gl_uniform_block **blocks_ret);

 /**
- * Class for processing all of the leaf fields of an uniform
+ * Class for processing all of the leaf fields of a variable that corresponds
+ * to a program resource.
 *
- * Leaves are, roughly speaking, the parts of the uniform that the application
- * could query with \c glGetUniformLocation (or that could be returned by
- * \c glGetActiveUniforms).
+ * The leaf fields are all the parts of the variable that the application
+ * could query using \c glGetProgramResourceIndex (or that could be returned
+ * by \c glGetProgramResourceName).
 *
 * Classes my derive from this class to implement specific functionality.
 * This class only provides the mechanism to iterate over the leaves.  Derived
 * classes must implement \c ::visit_field and may override \c ::process.
 */
-class uniform_field_visitor {
+class program_resource_visitor {
 public:
   /**
-    * Begin processing a uniform
+    * Begin processing a variable
    *
    * Classes that overload this function should call \c ::process from the
-    * base class to start the recursive processing of the uniform.
+    * base class to start the recursive processing of the variable.
    *
-    * \param var  The uniform variable that is to be processed
+    * \param var  The variable that is to be processed
    *
-    * Calls \c ::visit_field for each leaf of the uniform.
+    * Calls \c ::visit_field for each leaf of the variable.
    *
    * \warning
-    * This entry should only be used with uniform blocks in cases where the
-    * row / column ordering of matrices in the block does not matter.  For
-    * example, enumerating the names of members of the block, but not for
-    * determining the offsets of members.
+    * When processing a uniform block, this entry should only be used in cases
+    * where the row / column ordering of matrices in the block does not
+    * matter.  For example, enumerating the names of members of the block, but
+    * not for determining the offsets of members.
    */
   void process(ir_variable *var);

   /**
-    * Begin processing a uniform of a structured type.
+    * Begin processing a variable of a structured type.
    *
    * This flavor of \c process should be used to handle structured types
    * (i.e., structures, interfaces, or arrays there of) that need special
@@ -100,7 +101,7 @@ public:
    * (instead of the instance name) is used for an interface block.
    *
    * \param type  Type that is to be processed, associated with \c name
-    * \param name  Base name of the structured uniform being processed
+    * \param name  Base name of the structured variable being processed
    *
    * \note
    * \c type must be \c GLSL_TYPE_RECORD, \c GLSL_TYPE_INTERFACE, or an array
@@ -110,7 +111,7 @@ public:

 protected:
   /**
-    * Method invoked for each leaf of the uniform
+    * Method invoked for each leaf of the variable
    *
    * \param type  Type of the field.
    * \param name  Fully qualified name of the field.
--- a/src/glsl/program.h
+++ b/src/glsl/program.h
@@ -33,3 +33,7 @@ linker_error(gl_shader_program *prog, const char *fmt, ...)
 extern void
 linker_warning(gl_shader_program *prog, const char *fmt, ...)
   PRINTFLIKE(2, 3);
+
+extern long
+parse_program_resource_name(const GLchar *name,
+                            const GLchar **out_base_name_end);
--- a/src/glx/dri2_glx.c
+++ b/src/glx/dri2_glx.c
@@ -789,9 +789,11 @@ dri2XcbSwapBuffers(Display *dpy,

   swap_buffers_reply =
      xcb_dri2_swap_buffers_reply(c, swap_buffers_cookie, NULL);
-   ret = merge_counter(swap_buffers_reply->swap_hi,
-                       swap_buffers_reply->swap_lo);
-   free(swap_buffers_reply);
+   if (swap_buffers_reply) {
+      ret = merge_counter(swap_buffers_reply->swap_hi,
+                          swap_buffers_reply->swap_lo);
+      free(swap_buffers_reply);
+   }
   return ret;
 }

@@ -1053,7 +1055,8 @@ static const struct glx_context_vtable dri2_context_vtable = {
 };

 static void
-dri2BindExtensions(struct dri2_screen *psc, const __DRIextension **extensions)
+dri2BindExtensions(struct dri2_screen *psc, const __DRIextension **extensions,
+                   const char *driverName)
 {
   int i;

@@ -1062,7 +1065,15 @@ dri2BindExtensions(struct dri2_screen *psc, const __DRIextension **extensions)
   __glXEnableDirectExtension(&psc->base, "GLX_MESA_swap_control");
   __glXEnableDirectExtension(&psc->base, "GLX_SGI_make_current_read");

-   if (psc->dri2->base.version >= 4) {
+   /*
+    * GLX_INTEL_swap_event is broken on the server side, where it's
+    * currently unconditionally enabled. This completely breaks
+    * systems running on drivers which don't support that extension.
+    * There's no way to test for its presence on this side, so instead
+    * of disabling it unconditionally, just disable it for drivers
+    * which are known to not support it.
+    */
+   if (strcmp(driverName, "vmwgfx") != 0) {
      __glXEnableDirectExtension(&psc->base, "GLX_INTEL_swap_event");
   }

@@ -1206,7 +1217,7 @@ dri2CreateScreen(int screen, struct glx_display * priv)
   }

   extensions = psc->core->getExtensions(psc->driScreen);
-   dri2BindExtensions(psc, extensions);
+   dri2BindExtensions(psc, extensions, driverName);

   configs = driConvertConfigs(psc->core, psc->base.configs, driver_configs);
   visuals = driConvertConfigs(psc->core, psc->base.visuals, driver_configs);
--- a/src/mapi/glapi/gen/glX_proto_send.py
+++ b/src/mapi/glapi/gen/glX_proto_send.py
@@ -700,7 +700,9 @@ generic_%u_byte( GLint rop, const void * ptr )
                        if f.reply_always_array:
                            print '        (void)memcpy(%s, %s_data(reply), %s_data_length(reply) * sizeof(%s));' % (output.name, xcb_name, xcb_name, output.get_base_type_string())
                        else:
-                            print '        if (%s_data_length(reply) == 0)' % (xcb_name)
+                            print '        /* the XXX_data_length() xcb function name is misleading, it returns the number */'
+                            print '        /* of elements, not the length of the data part. A single element is embedded. */'
+                            print '        if (%s_data_length(reply) == 1)' % (xcb_name)
                            print '            (void)memcpy(%s, &reply->datum, sizeof(reply->datum));' % (output.name)
                            print '        else'
                            print '            (void)memcpy(%s, %s_data(reply), %s_data_length(reply) * sizeof(%s));' % (output.name, xcb_name, xcb_name, output.get_base_type_string())
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -1910,6 +1910,14 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
      GLuint *tmp = malloc(srcW * srcH * sizeof(GLuint));

      if (tmp) {
+
+         newTex = alloc_texture(depthTex, srcW, srcH, GL_DEPTH_COMPONENT);
+         _mesa_ReadPixels(srcX, srcY, srcW, srcH, GL_DEPTH_COMPONENT,
+                          GL_UNSIGNED_INT, tmp);
+         setup_drawpix_texture(ctx, depthTex, newTex, GL_DEPTH_COMPONENT,
+                               srcW, srcH, GL_DEPTH_COMPONENT,
+                               GL_UNSIGNED_INT, tmp);
+
         /* texcoords (after texture allocation!) */
         {
            verts[0].s = 0.0F;
@@ -1928,15 +1936,6 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
         if (!blit->DepthFP)
            init_blit_depth_pixels(ctx);

-         /* maybe change tex format here */
-         newTex = alloc_texture(depthTex, srcW, srcH, GL_DEPTH_COMPONENT);
-
-         _mesa_ReadPixels(srcX, srcY, srcW, srcH,
-                          GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, tmp);
-
-         setup_drawpix_texture(ctx, depthTex, newTex, GL_DEPTH_COMPONENT, srcW, srcH,
-                               GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, tmp);
-
         _mesa_BindProgramARB(GL_FRAGMENT_PROGRAM_ARB, blit->DepthFP);
         _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_TRUE);
         _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -62,6 +62,7 @@ TEST_LIBS = \
 	../common/libdri_test_stubs.la

 i965_dri_la_SOURCES =
+nodist_EXTRA_i965_dri_la_SOURCES = dummy2.cpp
 i965_dri_la_LIBADD = $(COMMON_LIBS)
 i965_dri_la_LDFLAGS = -module -avoid-version -shared

--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -23,6 +23,7 @@

 #include "main/teximage.h"
 #include "main/fbobject.h"
+#include "main/renderbuffer.h"

 #include "glsl/ralloc.h"

@@ -183,10 +184,19 @@ formats_match(GLbitfield buffer_bit, struct intel_renderbuffer *src_irb,
   gl_format src_format = find_miptree(buffer_bit, src_irb)->format;
   gl_format dst_format = find_miptree(buffer_bit, dst_irb)->format;

-   return _mesa_get_srgb_format_linear(src_format) ==
-          _mesa_get_srgb_format_linear(dst_format);
-}
+   gl_format linear_src_format = _mesa_get_srgb_format_linear(src_format);
+   gl_format linear_dst_format = _mesa_get_srgb_format_linear(dst_format);

+   /* Normally, we require the formats to be equal.  However, we also support
+    * blitting from ARGB to XRGB (discarding alpha), and from XRGB to ARGB
+    * (overriding alpha to 1.0 via blending).
+    */
+   return linear_src_format == linear_dst_format ||
+          (linear_src_format == MESA_FORMAT_XRGB8888 &&
+           linear_dst_format == MESA_FORMAT_ARGB8888) ||
+          (linear_src_format == MESA_FORMAT_ARGB8888 &&
+           linear_dst_format == MESA_FORMAT_XRGB8888);
+}

 static bool
 try_blorp_blit(struct intel_context *intel,
@@ -295,6 +305,93 @@ try_blorp_blit(struct intel_context *intel,
   return true;
 }

+bool
+brw_blorp_copytexsubimage(struct intel_context *intel,
+                          struct gl_renderbuffer *src_rb,
+                          struct gl_texture_image *dst_image,
+                          int srcX0, int srcY0,
+                          int dstX0, int dstY0,
+                          int width, int height)
+{
+   struct gl_context *ctx = &intel->ctx;
+   struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
+   struct intel_renderbuffer *dst_irb;
+
+   /* BLORP is not supported before Gen6. */
+   if (intel->gen < 6)
+      return false;
+
+   /* Create a fake/wrapper renderbuffer to allow us to use do_blorp_blit(). */
+   dst_irb = intel_create_fake_renderbuffer_wrapper(intel, dst_image);
+   if (!dst_irb)
+      return false;
+
+   struct gl_renderbuffer *dst_rb = &dst_irb->Base.Base;
+
+   /* Unlike BlitFramebuffer, CopyTexSubImage doesn't have a buffer bit.
+    * It's only used by find_miptree() to decide whether to dereference the
+    * separate stencil miptree.  In the case of packed depth/stencil, core
+    * Mesa hands us the depth attachment as src_rb (not stencil), so assume
+    * non-stencil for now.  A buffer bit of 0 works for both color and depth.
+    */
+   GLbitfield buffer_bit = 0;
+
+   if (!formats_match(buffer_bit, src_irb, dst_irb)) {
+      dst_rb->Delete(ctx, dst_rb);
+      return false;
+   }
+
+   /* Source clipping shouldn't be necessary, since copytexsubimage (in
+    * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
+    * takes care of it.
+    *
+    * Destination clipping shouldn't be necessary since the restrictions on
+    * glCopyTexSubImage prevent the user from specifying a destination rectangle
+    * that falls outside the bounds of the destination texture.
+    * See error_check_subtexture_dimensions().
+    */
+
+   int srcY1 = srcY0 + height;
+   int dstX1 = dstX0 + width;
+   int dstY1 = dstY0 + height;
+
+   /* Sync up the state of window system buffers.  We need to do this before
+    * we go looking for the buffers.
+    */
+   intel_prepare_render(intel);
+
+   /* Account for the fact that in the system framebuffer, the origin is at
+    * the lower left.
+    */
+   bool mirror_y = false;
+   if (_mesa_is_winsys_fbo(ctx->ReadBuffer)) {
+      GLint tmp = src_rb->Height - srcY0;
+      srcY0 = src_rb->Height - srcY1;
+      srcY1 = tmp;
+      mirror_y = true;
+   }
+
+   do_blorp_blit(intel, buffer_bit, src_irb, dst_irb,
+                 srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, false, mirror_y);
+
+   /* If we're copying a packed depth stencil texture, the above do_blorp_blit
+    * copied depth (since buffer_bit != GL_STENCIL_BIT).  Now copy stencil as
+    * well.  There's no need to do a formats_match() check because the separate
+    * stencil buffer is always S8.
+    */
+   src_rb = ctx->ReadBuffer->Attachment[BUFFER_STENCIL].Renderbuffer;
+   if (_mesa_get_format_bits(dst_image->TexFormat, GL_STENCIL_BITS) > 0 &&
+       src_rb != NULL) {
+      src_irb = intel_renderbuffer(src_rb);
+      do_blorp_blit(intel, GL_STENCIL_BUFFER_BIT, src_irb, dst_irb,
+                    srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, false, mirror_y);
+   }
+
+   dst_rb->Delete(ctx, dst_rb);
+   return true;
+}
+
+
 GLbitfield
 brw_blorp_framebuffer(struct intel_context *intel,
                      GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
@@ -1642,17 +1739,6 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
   src.set(brw, src_mt, src_level, src_layer);
   dst.set(brw, dst_mt, dst_level, dst_layer);

-   /* If we are blitting from sRGB to linear or vice versa, we still want the
-    * blit to be a direct copy, so we need source and destination to use the
-    * same format.  However, we want the destination sRGB/linear state to be
-    * correct (so that sRGB blending is used when doing an MSAA resolve to an
-    * sRGB surface, and linear blending is used when doing an MSAA resolve to
-    * a linear surface).  Since blorp blits don't support any format
-    * conversion (except between sRGB and linear), we can accomplish this by
-    * simply setting up the source to use the same format as the destination.
-    */
-   assert(_mesa_get_srgb_format_linear(src_mt->format) ==
-          _mesa_get_srgb_format_linear(dst_mt->format));
   src.brw_surfaceformat = dst.brw_surfaceformat;

   use_wm_prog = true;
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -278,7 +278,23 @@ brwCreateContext(int api,
   }

   /* WM maximum threads is number of EUs times number of threads per EU. */
-   if (intel->gen >= 7) {
+   assert(intel->gen <= 7);
+
+   if (intel->is_haswell) {
+      if (intel->gt == 1) {
+	 brw->max_wm_threads = 102;
+	 brw->max_vs_threads = 70;
+	 brw->urb.size = 128;
+	 brw->urb.max_vs_entries = 640;
+	 brw->urb.max_gs_entries = 256;
+      } else if (intel->gt == 2) {
+	 brw->max_wm_threads = 204;
+	 brw->max_vs_threads = 280;
+	 brw->urb.size = 256;
+	 brw->urb.max_vs_entries = 1664;
+	 brw->urb.max_gs_entries = 640;
+      }
+   } else if (intel->gen == 7) {
      if (intel->gt == 1) {
 	 brw->max_wm_threads = 48;
 	 brw->max_vs_threads = 36;
@@ -360,6 +376,7 @@ brwCreateContext(int api,

   ctx->Const.NativeIntegers = true;
   ctx->Const.UniformBooleanTrue = 1;
+   ctx->Const.UniformBufferOffsetAlignment = 16;

   ctx->Const.ForceGLSLExtensionsWarn = driQueryOptionb(&intel->optionCache, "force_glsl_extensions_warn");

--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1217,6 +1217,14 @@ brw_blorp_framebuffer(struct intel_context *intel,
                      GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                      GLbitfield mask, GLenum filter);

+bool
+brw_blorp_copytexsubimage(struct intel_context *intel,
+                          struct gl_renderbuffer *src_rb,
+                          struct gl_texture_image *dst_image,
+                          int srcX0, int srcY0,
+                          int dstX0, int dstY0,
+                          int width, int height);
+
 /* gen6_multisample_state.c */
 void
 gen6_emit_3dstate_multisample(struct brw_context *brw,
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -258,6 +258,26 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
   return instructions;
 }

+/**
+ * Builds a MOV from GRF number grf to the null register, used to work around
+ * broken hardware SEND dependency handling.  Returns the new instruction.
+ */
+fs_inst *
+fs_visitor::DEP_RESOLVE_MOV(int grf)
+{
+   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
+
+   inst->ir = NULL;
+   inst->annotation = "send dependency resolve";
+
+   /* The caller always wants uncompressed to emit the minimal extra
+    * dependencies, and to avoid having to deal with aligning its regs to 2.
+    */
+   inst->force_uncompressed = true;
+
+   return inst;
+}
+
 bool
 fs_inst::equals(fs_inst *inst)
 {
@@ -327,6 +347,23 @@ fs_inst::is_math()
           opcode == SHADER_OPCODE_POW);
 }

+bool
+fs_inst::is_control_flow()
+{
+   switch (opcode) {
+   case BRW_OPCODE_DO:
+   case BRW_OPCODE_WHILE:
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_ELSE:
+   case BRW_OPCODE_ENDIF:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+      return true;
+   default:
+      return false;
+   }
+}
+
 bool
 fs_inst::is_send_from_grf()
 {
@@ -1673,8 +1710,6 @@ fs_visitor::setup_pull_constants()
                                 dst, index, offset);
 	 pull->ir = inst->ir;
 	 pull->annotation = inst->annotation;
-	 pull->base_mrf = 14;
-	 pull->mlen = 1;

 	 inst->insert_before(pull);

@@ -1894,6 +1929,7 @@ fs_visitor::register_coalesce()

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
+                                   inst->src[0].smear != -1 ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
@@ -2070,16 +2106,12 @@ fs_visitor::compute_to_mrf()
 	    break;
 	 }

-	 /* We don't handle flow control here.  Most computation of
+	 /* We don't handle control flow here.  Most computation of
 	  * values that end up in MRFs are shortly before the MRF
 	  * write anyway.
 	  */
-	 if (scan_inst->opcode == BRW_OPCODE_DO ||
-	     scan_inst->opcode == BRW_OPCODE_WHILE ||
-	     scan_inst->opcode == BRW_OPCODE_ELSE ||
-	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
+	 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
 	    break;
-	 }

 	 /* You can't read from an MRF, so if someone else reads our
 	  * MRF's source GRF that we wanted to rewrite, that stops us.
@@ -2163,16 +2195,8 @@ fs_visitor::remove_duplicate_mrf_writes()
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

-      switch (inst->opcode) {
-      case BRW_OPCODE_DO:
-      case BRW_OPCODE_WHILE:
-      case BRW_OPCODE_IF:
-      case BRW_OPCODE_ELSE:
-      case BRW_OPCODE_ENDIF:
+      if (inst->is_control_flow()) {
 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
-	 continue;
-      default:
-	 break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
@@ -2223,6 +2247,265 @@ fs_visitor::remove_duplicate_mrf_writes()
   return progress;
 }

+static void
+clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
+                        int first_grf, int grf_len)
+{
+   bool inst_16wide = (dispatch_width > 8 &&
+                       !inst->force_uncompressed &&
+                       !inst->force_sechalf);
+
+   /* Clear the flag for registers that actually got read (as expected). */
+   for (int i = 0; i < 3; i++) {
+      int grf;
+      if (inst->src[i].file == GRF) {
+         grf = inst->src[i].reg;
+      } else if (inst->src[i].file == FIXED_HW_REG &&
+                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+         grf = inst->src[i].fixed_hw_reg.nr;
+      } else {
+         continue;
+      }
+
+      if (grf >= first_grf &&
+          grf < first_grf + grf_len) {
+         deps[grf - first_grf] = false;
+         if (inst_16wide)
+            deps[grf - first_grf + 1] = false;
+      }
+   }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
+ *      check for post destination dependencies on this instruction, software
+ *      must ensure that there is no destination hazard for the case of ‘write
+ *      followed by a posted write’ shown in the following example.
+ *
+ *      1. mov r3 0
+ *      2. send r3.xy <rest of send instruction>
+ *      3. mov r2 r3
+ *
+ *      Due to no post-destination dependency check on the ‘send’, the above
+ *      code sequence could have two instructions (1 and 2) in flight at the
+ *      same time that both consider ‘r3’ as the target of their final writes."
+ */
+void
+fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
+{
+   int write_len = inst->regs_written() * dispatch_width / 8;
+   int first_write_grf = inst->dst.reg;
+   bool needs_dep[BRW_MAX_MRF];
+   assert(write_len < (int)sizeof(needs_dep) - 1);
+
+   memset(needs_dep, false, sizeof(needs_dep));
+   memset(needs_dep, true, write_len);
+
+   clear_deps_for_inst_src(inst, dispatch_width,
+                           needs_dep, first_write_grf, write_len);
+
+   /* Walk backwards looking for writes to registers we're writing which
+    * aren't read since being written.  If we hit the start of the program
+    * (the list's head sentinel, which is not a real instruction), we assume
+    * that there are no outstanding dependencies on entry to the program.
+    */
+   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
+        !scan_inst->is_head_sentinel();
+        scan_inst = (fs_inst *)scan_inst->prev) {
+
+      /* If we hit control flow, assume that there *are* outstanding
+       * dependencies, force their cleanup before our instruction, and stop.
+       */
+      if (scan_inst->is_control_flow()) {
+         for (int i = 0; i < write_len; i++) {
+            if (needs_dep[i])
+               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+         }
+         return;
+      }
+
+      bool scan_inst_16wide = (dispatch_width > 8 &&
+                               !scan_inst->force_uncompressed &&
+                               !scan_inst->force_sechalf);
+
+      /* We insert our reads as late as possible on the assumption that any
+       * instruction but a MOV that might have left us an outstanding
+       * dependency has more latency than a MOV.
+       */
+      if (scan_inst->dst.file == GRF &&
+          scan_inst->dst.reg >= first_write_grf &&
+          scan_inst->dst.reg < first_write_grf + write_len &&
+          needs_dep[scan_inst->dst.reg - first_write_grf]) {
+         inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
+         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+         if (scan_inst_16wide)
+            needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
+      }
+
+      /* Clear the flag for registers that actually got read (as expected). */
+      clear_deps_for_inst_src(scan_inst, dispatch_width,
+                              needs_dep, first_write_grf, write_len);
+
+      /* Continue the loop only if we haven't resolved all the dependencies */
+      int i;
+      for (i = 0; i < write_len; i++) {
+         if (needs_dep[i])
+            break;
+      }
+      if (i == write_len)
+         return;
+   }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ *     "[DevBW, DevCL] Errata: A destination register from a send can not be
+ *      used as a destination register until after it has been sourced by an
+ *      instruction with a different destination register."
+ */
+void
+fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
+{
+   int write_len = inst->regs_written() * dispatch_width / 8;
+   int first_write_grf = inst->dst.reg;
+   bool needs_dep[BRW_MAX_MRF];
+   assert(write_len < (int)sizeof(needs_dep) - 1);
+
+   memset(needs_dep, false, sizeof(needs_dep));
+   memset(needs_dep, true, write_len);
+   /* Walk forwards looking for writes to registers we're writing which aren't
+    * read before being written.
+    */
+   for (fs_inst *scan_inst = (fs_inst *)inst->next;
+        !scan_inst->is_tail_sentinel();
+        scan_inst = (fs_inst *)scan_inst->next) {
+      /* On control flow, force resolve all remaining dependencies and stop. */
+      if (scan_inst->is_control_flow()) {
+         for (int i = 0; i < write_len; i++)
+            if (needs_dep[i])
+               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+         return;
+      }
+
+      /* Clear the flag for registers that actually got read (as expected). */
+      clear_deps_for_inst_src(scan_inst, dispatch_width,
+                              needs_dep, first_write_grf, write_len);
+
+      /* We insert our reads as late as possible since they're reading the
+       * result of a SEND, which has massive latency.
+       */
+      if (scan_inst->dst.file == GRF &&
+          scan_inst->dst.reg >= first_write_grf &&
+          scan_inst->dst.reg < first_write_grf + write_len &&
+          needs_dep[scan_inst->dst.reg - first_write_grf]) {
+         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
+         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+      }
+
+      /* Continue the loop only if we haven't resolved all the dependencies */
+      int i;
+      for (i = 0; i < write_len; i++) {
+         if (needs_dep[i])
+            break;
+      }
+      if (i == write_len)
+         return;
+   }
+
+   /* If we hit the end of the program, resolve all remaining dependencies out
+    * of paranoia.
+    */
+   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
+   assert(last_inst->eot);
+   for (int i = 0; i < write_len; i++) {
+      if (needs_dep[i])
+         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+   }
+}
+
+void
+fs_visitor::insert_gen4_send_dependency_workarounds()
+{
+   if (intel->gen != 4 || intel->is_g4x)
+      return;
+
+   /* Note that we're done with register allocation, so GRF fs_regs always
+    * have a .reg_offset of 0.
+    */
+
+   foreach_list_safe(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      if (inst->mlen != 0 && inst->dst.file == GRF) {
+         insert_gen4_pre_send_dependency_workarounds(inst);
+         insert_gen4_post_send_dependency_workarounds(inst);
+      }
+   }
+}
+
+/**
+ * Turns the generic expression-style uniform pull constant load instruction
+ * into a hardware-specific series of instructions for loading a pull
+ * constant.
+ *
+ * The expression style allows the CSE pass before this to optimize out
+ * repeated loads from the same offset, and gives the pre-register-allocation
+ * scheduling full flexibility, while the conversion to native instructions
+ * allows the post-register-allocation scheduler the best information
+ * possible.
+ */
+void
+fs_visitor::lower_uniform_pull_constant_loads()
+{
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
+         continue;
+
+      if (intel->gen >= 7) {
+         fs_reg const_offset_reg = inst->src[1];
+         assert(const_offset_reg.file == IMM &&
+                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
+         const_offset_reg.imm.u /= 16;
+         fs_reg payload = fs_reg(this, glsl_type::uint_type);
+         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+                                    BRW_REGISTER_TYPE_UD);
+
+         fs_inst *setup1 = MOV(payload, fs_reg(g0));
+         setup1->force_writemask_all = true;
+         /* We don't need the second half of this vgrf to be filled with g1
+          * in the 16-wide case, but if we use force_uncompressed then live
+          * variable analysis won't consider this a def!
+          */
+
+         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
+                                                payload, payload,
+                                                const_offset_reg);
+
+         setup1->ir = inst->ir;
+         setup1->annotation = inst->annotation;
+         inst->insert_before(setup1);
+         setup2->ir = inst->ir;
+         setup2->annotation = inst->annotation;
+         inst->insert_before(setup2);
+         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
+         inst->src[1] = payload;
+      } else {
+         /* Before register allocation, we didn't tell the scheduler about the
+          * MRF we use.  We know it's safe to use this MRF because nothing
+          * else does except for register spill/unspill, which generates and
+          * uses its MRF within a single IR instruction.
+          */
+         inst->base_mrf = 14;
+         inst->mlen = 1;
+      }
+   }
+}
+
 void
 fs_visitor::dump_instruction(fs_inst *inst)
 {
@@ -2495,6 +2778,8 @@ fs_visitor::run()

      schedule_instructions(false);

+      lower_uniform_pull_constant_loads();
+
      assign_curb_setup();
      assign_urb_setup();

@@ -2517,6 +2802,12 @@ fs_visitor::run()
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

+   /* This must come after all optimization and register allocation, since
+    * it inserts dead code that happens to have side effects, and it does
+    * so based on the actual physical registers in use.
+    */
+   insert_gen4_send_dependency_workarounds();
+
   if (failed)
      return false;

--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -178,6 +178,7 @@ public:
   bool overwrites_reg(const fs_reg &reg);
   bool is_tex();
   bool is_math();
+   bool is_control_flow();
   bool is_send_from_grf();

   fs_reg dst;
@@ -284,6 +285,7 @@ public:
   fs_inst *IF(fs_reg src0, fs_reg src1, uint32_t condition);
   fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                uint32_t condition);
+   fs_inst *DEP_RESOLVE_MOV(int grf);

   int type_size(const struct glsl_type *type);
   fs_inst *get_instruction_generating_reg(fs_inst *start,
@@ -328,7 +330,11 @@ public:
   bool remove_duplicate_mrf_writes();
   bool virtual_grf_interferes(int a, int b);
   void schedule_instructions(bool post_reg_alloc);
+   void insert_gen4_send_dependency_workarounds();
+   void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
+   void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
   void fail(const char *msg, ...);
+   void lower_uniform_pull_constant_loads();

   void push_force_uncompressed();
   void pop_force_uncompressed();
--- a/Show More
+++ b/Show More