Add bin/test-driver to the list of files to be distributed.

Without this, the build fails for me when trying to build from a generated tar file after running just ./configure. (It's not clear to me why I didn't encounter similar breakage with previous releases.)
docs: Add release notes for 9.2.2 release
2013-10-18 16:58:32 -07:00 · 2013-10-18 16:41:15 -07:00 · 2013-10-18 16:36:31 -07:00 · 2013-10-17 11:30:26 -07:00 · 2013-10-16 15:15:05 -07:00 · 2013-10-16 15:13:29 -07:00
864 changed files with 57714 additions and 39219 deletions
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -35,7 +35,7 @@ LOCAL_C_INCLUDES += \

 # define ANDROID_VERSION (e.g., 4.0.x => 0x0400)
 LOCAL_CFLAGS += \
-	-DPACKAGE_VERSION=\"9.2.0-devel\" \
+	-DPACKAGE_VERSION=\"9.2.2\" \
 	-DPACKAGE_BUGREPORT=\"https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa\" \
 	-DANDROID_VERSION=0x0$(MESA_ANDROID_MAJOR_VERSION)0$(MESA_ANDROID_MINOR_VERSION)

--- a/CleanSpec.mk
+++ b/CleanSpec.mk
@@ -1,49 +0,0 @@
-# Copyright (C) 2013 The Android-x86 Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# If you don't need to do a full clean build but would like to touch
-# a file or delete some intermediate files, add a clean step to the end
-# of the list.  These steps will only be run once, if they haven't been
-# run before.
-#
-# E.g.:
-#     $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
-#     $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
-#
-# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
-# files that are missing or have been moved.
-#
-# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
-# Use $(OUT_DIR) to refer to the "out" directory.
-#
-# If you need to re-do something that's already mentioned, just copy
-# the command and add it to the bottom of the list.  E.g., if a change
-# that you made last week required touching a file and a change you
-# made today requires touching the same file, just copy the old
-# touch step and add it to the end of the list.
-#
-# ************************************************
-# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
-# ************************************************
-
-$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libmesa_*_intermediates)
-$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libdrm_*intermediates)
-$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/i9?5_dri_intermediates)
-$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libglapi_intermediates)
-$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/gralloc.drm_intermediates)
-$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libgralloc_drm_intermediates)
-$(call add-clean-step, rm -rf $(OUT_DIR)/host/$(HOST_OS)-$(HOST_ARCH)/obj/EXECUTABLES/mesa_*_intermediates)
-$(call add-clean-step, rm -rf $(OUT_DIR)/host/$(HOST_OS)-$(HOST_ARCH)/obj/EXECUTABLES/glsl_compiler_intermediates)
-$(call add-clean-step, rm -rf $(OUT_DIR)/host/$(HOST_OS)-$(HOST_ARCH)/obj/STATIC_LIBRARIES/libmesa_glsl_utils_intermediates)
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,6 +50,7 @@ EXTRA_FILES = \
 	bin/install-sh					\
 	bin/ltmain.sh					\
 	bin/missing					\
+	bin/test-driver					\
 	bin/ylwrap					\
 	src/glsl/glsl_parser.cpp			\
 	src/glsl/glsl_parser.h				\
@@ -57,12 +58,6 @@ EXTRA_FILES = \
 	src/glsl/glcpp/glcpp-lex.c			\
 	src/glsl/glcpp/glcpp-parse.c			\
 	src/glsl/glcpp/glcpp-parse.h			\
-	src/mesa/main/api_exec_es1.c			\
-	src/mesa/main/api_exec_es1_dispatch.h		\
-	src/mesa/main/api_exec_es1_remap_helper.h	\
-	src/mesa/main/api_exec_es2.c			\
-	src/mesa/main/api_exec_es2_dispatch.h		\
-	src/mesa/main/api_exec_es2_remap_helper.h	\
 	src/mesa/program/lex.yy.c			\
 	src/mesa/program/program_parse.tab.c		\
 	src/mesa/program/program_parse.tab.h		\
--- a/SConstruct
+++ b/SConstruct
@@ -70,7 +70,7 @@ if env['gles']:
 # Environment setup

 env.Append(CPPDEFINES = [
-    ('PACKAGE_VERSION', '\\"9.2.0-devel\\"'),
+    ('PACKAGE_VERSION', '\\"9.2.2\\"'),
    ('PACKAGE_BUGREPORT', '\\"https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa\\"'),
 ])

--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -0,0 +1,10 @@
+# Already cherry picked without -x
+d8ac987f6ab228df1a478b36c3d889992754374f glsl: Disallow uniform block layout qualifiers on non-uniform block vars.
+
+# The bug fixed by this patch does not exist in 9.2.  Discussed with Marek and
+# Brian Paul on the mesa-stable mailing list.
+89a665eb5fa176f68223bf54a472d6a0567c3546 draw: fix segfaults with aaline and aapoint stages disabled
+
+# Previously cherry picked (patch originally appeared twice on master with a
+# revert in between)
+4e5eb8ba25054ede4798fa424e6f32b23aba0f98 i965/vec4: Only zero out unused message components when there are any.
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -14,7 +14,7 @@ git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked

 # Grep for commits that were marked as a candidate for the stable tree.
-git log --reverse --pretty=%H -i --grep='^[[:space:]]*NOTE: .*[Cc]andidate' HEAD..origin/master |\
+git log --reverse --pretty=%H -i --grep='^\([[:space:]]*NOTE: .*[Cc]andidate\|CC:.*mesa-stable\)' HEAD..origin/master |\
 while read sha
 do
 	# Check to see whether the patch is on the ignore list.
--- a/configure.ac
+++ b/configure.ac
@@ -6,7 +6,7 @@ dnl Tell the user about autoconf.html in the --help output
 m4_divert_once([HELP_END], [
 See docs/autoconf.html for more details on the options for Mesa.])

-AC_INIT([Mesa], [9.2.0-devel],
+AC_INIT([Mesa], [9.2.2],
    [https://bugs.freedesktop.org/enter_bug.cgi?product=Mesa])
 AC_CONFIG_AUX_DIR([bin])
 AC_CONFIG_MACRO_DIR([m4])
@@ -31,7 +31,7 @@ AC_SUBST([OSMESA_VERSION])

 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.24
-LIBDRM_RADEON_REQUIRED=2.4.45
+LIBDRM_RADEON_REQUIRED=2.4.46
 LIBDRM_INTEL_REQUIRED=2.4.38
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
@@ -100,6 +100,7 @@ AC_MSG_RESULT([$acv_mesa_CLANG])

 dnl If we're using GCC, make sure that it is at least version 3.3.0.  Older
 dnl versions are explictly not supported.
+GEN_ASM_OFFSETS=no
 if test "x$GCC" = xyes -a "x$acv_mesa_CLANG" = xno; then
    AC_MSG_CHECKING([whether gcc version is sufficient])
    major=0
@@ -117,7 +118,12 @@ if test "x$GCC" = xyes -a "x$acv_mesa_CLANG" = xno; then
    else
        AC_MSG_RESULT([yes])
    fi
+
+    if test "x$cross_compiling" = xyes; then
+        GEN_ASM_OFFSETS=yes
+    fi
 fi
+AM_CONDITIONAL([GEN_ASM_OFFSETS], test "x$GEN_ASM_OFFSETS" = xyes)

 dnl Make sure the pkg-config macros are defined
 m4_ifndef([PKG_PROG_PKG_CONFIG],
@@ -438,7 +444,7 @@ test "x$enable_asm" = xno && AC_MSG_RESULT([no])
 # disable if cross compiling on x86/x86_64 since we must run gen_matypes
 if test "x$enable_asm" = xyes && test "x$cross_compiling" = xyes; then
    case "$host_cpu" in
-    i?86 | x86_64)
+    i?86 | x86_64 | amd64)
        enable_asm=no
        AC_MSG_RESULT([no, cross compiling])
        ;;
@@ -449,7 +455,7 @@ if test "x$enable_asm" = xyes; then
    case "$host_cpu" in
    i?86)
        case "$host_os" in
-        linux* | *freebsd* | dragonfly* | *netbsd*)
+        linux* | *freebsd* | dragonfly* | *netbsd* | openbsd*)
            test "x$enable_64bit" = xyes && asm_arch=x86_64 || asm_arch=x86
            ;;
        gnu*)
@@ -457,9 +463,9 @@ if test "x$enable_asm" = xyes; then
            ;;
        esac
        ;;
-    x86_64)
+    x86_64|amd64)
        case "$host_os" in
-        linux* | *freebsd* | dragonfly* | *netbsd*)
+        linux* | *freebsd* | dragonfly* | *netbsd* | openbsd*)
            test "x$enable_32bit" = xyes && asm_arch=x86 || asm_arch=x86_64
            ;;
        esac
@@ -478,7 +484,7 @@ if test "x$enable_asm" = xyes; then
        DEFINES="$DEFINES -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM"
        AC_MSG_RESULT([yes, x86])
        ;;
-    x86_64)
+    x86_64|amd64)
        DEFINES="$DEFINES -DUSE_X86_64_ASM"
        AC_MSG_RESULT([yes, x86_64])
        ;;
@@ -573,6 +579,11 @@ AC_ARG_ENABLE([osmesa],
        [enable OSMesa library @<:@default=disabled@:>@])],
    [enable_osmesa="$enableval"],
    [enable_osmesa=no])
+AC_ARG_ENABLE([gallium-osmesa],
+    [AS_HELP_STRING([--enable-gallium-osmesa],
+        [enable Gallium implementation of the OSMesa library @<:@default=disabled@:>@])],
+    [enable_gallium_osmesa="$enableval"],
+    [enable_gallium_osmesa=no])
 AC_ARG_ENABLE([egl],
    [AS_HELP_STRING([--disable-egl],
        [disable EGL library @<:@default=enabled@:>@])],
@@ -763,7 +774,13 @@ if test "x$enable_dri" = xyes; then
    GALLIUM_STATE_TRACKERS_DIRS="dri $GALLIUM_STATE_TRACKERS_DIRS"
 fi

-if test "x$enable_osmesa" = xyes; then
+if test "x$enable_gallium_osmesa" = xyes; then
+    if test -z "$with_gallium_drivers"; then
+        AC_MSG_ERROR([Cannot enable gallium_osmesa without Gallium])
+    fi
+    if test "x$enable_osmesa" = xyes; then
+        AC_MSG_ERROR([Cannot enable both classic and Gallium OSMesa implementations])
+    fi
    GALLIUM_STATE_TRACKERS_DIRS="osmesa $GALLIUM_STATE_TRACKERS_DIRS"
    GALLIUM_TARGET_DIRS="$GALLIUM_TARGET_DIRS osmesa"
 fi
@@ -966,7 +983,7 @@ if test "x$enable_dri" = xyes; then
        DEFINES="$DEFINES -DHAVE_ALIAS"

        case "$host_cpu" in
-        x86_64)
+        x86_64|amd64)
            if test "x$DRI_DIRS" = "xyes"; then
                DRI_DIRS="i915 i965 nouveau r200 radeon swrast"
            fi
@@ -985,7 +1002,7 @@ if test "x$enable_dri" = xyes; then
            ;;
        esac
        ;;
-    freebsd* | dragonfly* | *netbsd*)
+    freebsd* | dragonfly* | *netbsd* | openbsd*)
        DEFINES="$DEFINES -DHAVE_PTHREAD -DUSE_EXTERNAL_DXTN_LIB=1"
        DEFINES="$DEFINES -DHAVE_ALIAS"

@@ -1129,7 +1146,7 @@ x16|x32)
    ;;
 esac

-if test "x$enable_osmesa" = xyes; then
+if test "x$enable_osmesa" = xyes -o "x$enable_gallium_osmesa" = xyes; then
    # only link libraries with osmesa if shared
    if test "$enable_static" = no; then
        OSMESA_LIB_DEPS="-lm $PTHREAD_LIBS $SELINUX_LIBS $DLOPEN_LIBS"
@@ -1490,6 +1507,13 @@ AC_SUBST([EGL_NATIVE_PLATFORM])
 AC_SUBST([EGL_PLATFORMS])
 AC_SUBST([EGL_CFLAGS])

+# If we don't have the X11 platform, set this define so we don't try to include
+# the X11 headers.
+if ! echo "$egl_platforms" | grep -q 'x11'; then
+    DEFINES="$DEFINES -DMESA_EGL_NO_X11_HEADERS"
+    GL_PC_CFLAGS="$GL_PC_CFLAGS -DMESA_EGL_NO_X11_HEADERS"
+fi
+
 AC_ARG_WITH([egl-driver-dir],
    [AS_HELP_STRING([--with-egl-driver-dir=DIR],
                    [directory for EGL drivers [[default=${libdir}/egl]]])],
@@ -1566,7 +1590,7 @@ if test "x$with_gallium_drivers" = x; then
 fi
 if test "x$enable_gallium_llvm" = xauto; then
    case "$host_cpu" in
-    i*86|x86_64) enable_gallium_llvm=yes;;
+    i*86|x86_64|amd64) enable_gallium_llvm=yes;;
    esac
 fi
 if test "x$enable_gallium_llvm" = xyes; then
@@ -1577,42 +1601,53 @@ if test "x$enable_gallium_llvm" = xyes; then
    fi

    if test "x$LLVM_CONFIG" != xno; then
-	LLVM_VERSION=`$LLVM_CONFIG --version | sed 's/svn.*//g'`
-	LLVM_VERSION_INT=`echo $LLVM_VERSION | sed -e 's/\([[0-9]]\)\.\([[0-9]]\)/\10\2/g'`
+        LLVM_VERSION=`$LLVM_CONFIG --version | sed 's/svn.*//g'`
+        LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
+        LLVM_BINDIR=`$LLVM_CONFIG --bindir`
+        LLVM_CPPFLAGS=`strip_unwanted_llvm_flags "$LLVM_CONFIG --cppflags"`
+        LLVM_CFLAGS=$LLVM_CPPFLAGS   # CPPFLAGS seem to be sufficient
+        LLVM_CXXFLAGS=`strip_unwanted_llvm_flags "$LLVM_CONFIG --cxxflags"`
+        LLVM_INCLUDEDIR=`$LLVM_CONFIG --includedir`
+        LLVM_LIBDIR=`$LLVM_CONFIG --libdir`
+
+        AC_COMPUTE_INT([LLVM_VERSION_MAJOR], [LLVM_VERSION_MAJOR],
+            [#include "${LLVM_INCLUDEDIR}/llvm/Config/llvm-config.h"])
+        AC_COMPUTE_INT([LLVM_VERSION_MINOR], [LLVM_VERSION_MINOR],
+            [#include "${LLVM_INCLUDEDIR}/llvm/Config/llvm-config.h"])
+
+        if test "x${LLVM_VERSION_MAJOR}" != x; then
+            LLVM_VERSION_INT="${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}"
+        else
+            LLVM_VERSION_INT=`echo $LLVM_VERSION | sed -e 's/\([[0-9]]\)\.\([[0-9]]\)/\10\2/g'`
+        fi
+
        LLVM_COMPONENTS="engine bitwriter"
-        if $LLVM_CONFIG --components | grep -q '\<mcjit\>'; then
+        if $LLVM_CONFIG --components | grep -qw 'mcjit'; then
            LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
        fi

        if test "x$enable_opencl" = xyes; then
            LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo linker instrumentation"
            # LLVM 3.3 >= 177971 requires IRReader
-            if $LLVM_CONFIG --components | grep -q '\<irreader\>'; then
+            if $LLVM_CONFIG --components | grep -qw 'irreader'; then
                LLVM_COMPONENTS="${LLVM_COMPONENTS} irreader"
            fi
        fi
-	LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
-	LLVM_BINDIR=`$LLVM_CONFIG --bindir`
-	LLVM_CPPFLAGS=`strip_unwanted_llvm_flags "$LLVM_CONFIG --cppflags"`
-	LLVM_CFLAGS=$LLVM_CPPFLAGS   # CPPFLAGS seem to be sufficient
-	LLVM_CXXFLAGS=`strip_unwanted_llvm_flags "$LLVM_CONFIG --cxxflags"`
-	LLVM_INCLUDEDIR=`$LLVM_CONFIG --includedir`
-	LLVM_LIBDIR=`$LLVM_CONFIG --libdir`
-	DEFINES="${DEFINES} -DHAVE_LLVM=0x0$LLVM_VERSION_INT"
-	MESA_LLVM=1
+        DEFINES="${DEFINES} -DHAVE_LLVM=0x0$LLVM_VERSION_INT"
+        MESA_LLVM=1

-	dnl Check for Clang interanl headers
+        dnl Check for Clang internal headers
        if test "x$enable_opencl" = xyes; then
            if test "x$CLANG_LIBDIR" = x; then
                CLANG_LIBDIR=${LLVM_LIBDIR}
            fi
            CLANG_RESOURCE_DIR=$CLANG_LIBDIR/clang/${LLVM_VERSION}
-            AC_CHECK_FILE("$CLANG_RESOURCE_DIR/include/stddef.h",,
-                AC_MSG_ERROR([Could not find clang internal header stddef.h in $CLANG_RESOURCE_DIR Use --with-clang-libdir to specify the correct path to the clang libraries.]))
+            AS_IF([test ! -f "$CLANG_RESOURCE_DIR/include/stddef.h"],
+                [AC_MSG_ERROR([Could not find clang internal header stddef.h in $CLANG_RESOURCE_DIR Use --with-clang-libdir to specify the correct path to the clang libraries.])])
        fi
    else
-	MESA_LLVM=0
-	LLVM_VERSION_INT=0
+        MESA_LLVM=0
+        LLVM_VERSION_INT=0
    fi
 else
    MESA_LLVM=0
@@ -1687,7 +1722,7 @@ gallium_check_st() {
 gallium_require_llvm() {
    if test "x$MESA_LLVM" = x0; then
        case "$host_cpu" in
-        i*86|x86_64) AC_MSG_ERROR([LLVM is required to build $1 on x86 and x86_64]);;
+        i*86|x86_64|amd64) AC_MSG_ERROR([LLVM is required to build $1 on x86 and x86_64]);;
        esac
    fi
 }
@@ -1709,7 +1744,7 @@ radeon_llvm_check() {
    if test "$LLVM_VERSION_INT" -lt "${LLVM_REQUIRED_VERSION_MAJOR}0${LLVM_REQUIRED_VERSION_MINOR}"; then
        AC_MSG_ERROR([LLVM $LLVM_REQUIRED_VERSION_MAJOR.$LLVM_REQUIRED_VERSION_MINOR or newer is required for r600g and radeonsi.])
    fi
-    if test true && $LLVM_CONFIG --targets-built | grep -qv '\<R600\>' ; then
+    if test true && $LLVM_CONFIG --targets-built | grep -qvw 'R600' ; then
        AC_MSG_ERROR([LLVM R600 Target not enabled.  You can enable it when building the LLVM
                      sources with the --enable-experimental-targets=R600
                      configure flag])
@@ -1846,7 +1881,7 @@ if test "x$MESA_LLVM" != x0; then
    if test "x$with_llvm_shared_libs" = xyes; then
        dnl We can't use $LLVM_VERSION because it has 'svn' stripped out,
        LLVM_SO_NAME=LLVM-`$LLVM_CONFIG --version`
-        AC_CHECK_FILE("$LLVM_LIBDIR/lib$LLVM_SO_NAME.so", llvm_have_one_so=yes,)
+        AS_IF([test -f "$LLVM_LIBDIR/lib$LLVM_SO_NAME.so"], [llvm_have_one_so=yes])

        if test "x$llvm_have_one_so" = xyes; then
            dnl LLVM was built using auto*, so there is only one shared object.
@@ -1854,8 +1889,8 @@ if test "x$MESA_LLVM" != x0; then
        else
            dnl If LLVM was built with CMake, there will be one shared object per
            dnl component.
-            AC_CHECK_FILE("$LLVM_LIBDIR/libLLVMTarget.so",,
-                    AC_MSG_ERROR([Could not find llvm shared libraries:
+            AS_IF([test ! -f "$LLVM_LIBDIR/libLLVMTarget.so"],
+                    [AC_MSG_ERROR([Could not find llvm shared libraries:
 	Please make sure you have built llvm with the --enable-shared option
 	and that your llvm libraries are installed in $LLVM_LIBDIR
 	If you have installed your llvm libraries to a different directory you
@@ -1866,7 +1901,7 @@ if test "x$MESA_LLVM" != x0; then
 		--enable-opencl
 	If you do not want to build with llvm shared libraries and instead want to
 	use llvm static libraries then remove these options from your configure
-	invocation and reconfigure.]))
+	invocation and reconfigure.])])

           dnl We don't need to update LLVM_LIBS in this case because the LLVM
           dnl install uses a shared object for each compoenent and we have
@@ -1890,8 +1925,8 @@ AM_CONDITIONAL(NEED_GALLIUM_SOFTPIPE_DRIVER, test "x$HAVE_GALLIUM_SVGA" = xyes -
                                                  "x$HAVE_GALLIUM_I915" = xyes -o \
                                                  "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
 AM_CONDITIONAL(NEED_GALLIUM_LLVMPIPE_DRIVER, test "x$HAVE_GALLIUM_I915" = xyes -o \
-                                                  "x$HAVE_GALLIUM_SOFTPIPE" = xyes -a \
-                                                  "x$MESA_LLVM" = x1)
+                                                  "x$HAVE_GALLIUM_SOFTPIPE" = xyes \
+                                                  && test "x$MESA_LLVM" = x1)

 if test "x$enable_gallium_loader" = xyes; then
    GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS sw/null"
@@ -1938,9 +1973,11 @@ AC_SUBST([ELF_LIB])

 AM_CONDITIONAL(NEED_LIBPROGRAM, test "x$with_gallium_drivers" != x -o \
                                     "x$enable_xlib_glx" = xyes -o \
-                                     "x$enable_osmesa" = xyes)
+                                     "x$enable_osmesa" = xyes -o \
+                                     "x$enable_gallium_osmesa" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_OSMESA, test "x$enable_gallium_osmesa" = xyes)

 AM_CONDITIONAL(HAVE_X86_ASM, echo "$DEFINES" | grep 'X86_ASM' >/dev/null 2>&1)
 AM_CONDITIONAL(HAVE_X86_64_ASM, echo "$DEFINES" | grep 'X86_64_ASM' >/dev/null 2>&1)
@@ -2029,6 +2066,7 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/targets/gbm/Makefile
 		src/gallium/targets/opencl/Makefile
 		src/gallium/targets/osmesa/Makefile
+		src/gallium/targets/osmesa/osmesa.pc
 		src/gallium/targets/pipe-loader/Makefile
 		src/gallium/targets/libgl-xlib/Makefile
 		src/gallium/targets/vdpau-nouveau/Makefile
@@ -2127,11 +2165,17 @@ echo "        OpenVG:          $enable_openvg"

 dnl Driver info
 echo ""
-if test "x$enable_osmesa" != xno; then
+case "x$enable_osmesa$enable_gallium_osmesa" in
+xnoyes)
+        echo "        OSMesa:          lib$OSMESA_LIB (Gallium)"
+        ;;
+xyesno)
        echo "        OSMesa:          lib$OSMESA_LIB"
-else
+        ;;
+xnono)
        echo "        OSMesa:          no"
-fi
+        ;;
+esac

 if test "x$enable_dri" != xno; then
        # cleanup the drivers var
--- a/docs/README.WIN32
+++ b/docs/README.WIN32
@@ -1,6 +1,6 @@
 File: docs/README.WIN32

-Last updated: 23 April 2011
+Last updated: 21 June 2013


 Quick Start
@@ -30,6 +30,23 @@ At this time, only the gallium GDI driver is known to work.
 Source code also exists in the tree for other drivers in
 src/mesa/drivers/windows, but the status of this code is unknown.

+Recipe
+------
+
+Building on windows requires several open-source packages. These are
+steps that work as of this writing.
+
+1) install python 2.7
+2) install scons (latest)
+3) install mingw, flex, and bison
+4) install libxml2 from here: http://www.lfd.uci.edu/~gohlke/pythonlibs
+  get libxml2-python-2.9.1.win-amd64-py2.7.exe
+5) install pywin32 from here: http://www.lfd.uci.edu/~gohlke/pythonlibs
+  get pywin32-218.4.win-amd64-py2.7.exe
+6) install git
+7) download mesa from git
+  see http://www.mesa3d.org/repository.html
+8) run scons

 General
 -------
--- a/docs/extensions.html
+++ b/docs/extensions.html
@@ -32,7 +32,7 @@ The specifications follow.
 <li><a href="specs/MESA_pixmap_colormap.spec">MESA_pixmap_colormap.spec</a>
 <li><a href="specs/OLD/MESA_program_debug.spec">MESA_program_debug.spec</a> (obsolete)
 <li><a href="specs/MESA_release_buffers.spec">MESA_release_buffers.spec</a>
-<li><a href="specs/MESA_resize_buffers.spec">MESA_resize_buffers.spec</a>
+<li><a href="specs/OLD/MESA_resize_buffers.spec">MESA_resize_buffers.spec</a> (obsolete)
 <li><a href="specs/MESA_set_3dfx_mode.spec">MESA_set_3dfx_mode.spec</a>
 <li><a href="specs/MESA_shader_debug.spec">MESA_shader_debug.spec</a>
 <li><a href="specs/OLD/MESA_sprite_point.spec">MESA_sprite_point.spec</a> (obsolete)
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,24 @@

 <h1>News</h1>

+<h2>August 1, 2013</h2>
+<p>
+<a href="relnotes/9.1.6.html">Mesa 9.1.6</a> is released.
+This is a bug fix release.
+</p>
+
+<h2>July 17, 2013</h2>
+<p>
+<a href="relnotes/9.1.5.html">Mesa 9.1.5</a> is released.
+This is a bug fix release.
+</p>
+
+<h2>July 1, 2013</h2>
+<p>
+<a href="relnotes/9.1.4.html">Mesa 9.1.4</a> is released.
+This is a bug fix release.
+</p>
+
 <h2>May 21, 2013</h2>
 <p>
 <a href="relnotes/9.1.3.html">Mesa 9.1.3</a> is released.
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -22,6 +22,9 @@ The release notes summarize what's new or changed in each Mesa release.

 <ul>
 <li><a href="relnotes/9.2.html">9.2 release notes</a>
+<li><a href="relnotes/9.1.6.html">9.1.6 release notes</a>
+<li><a href="relnotes/9.1.5.html">9.1.5 release notes</a>
+<li><a href="relnotes/9.1.4.html">9.1.4 release notes</a>
 <li><a href="relnotes/9.1.3.html">9.1.3 release notes</a>
 <li><a href="relnotes/9.1.2.html">9.1.2 release notes</a>
 <li><a href="relnotes/9.1.1.html">9.1.1 release notes</a>
--- a/docs/relnotes/9.1.4.html
+++ b/docs/relnotes/9.1.4.html
@@ -0,0 +1,321 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 9.1.4 Release Notes / July 1st, 2013</h1>
+
+<p>
+Mesa 9.1.4 is a bug fix release which fixes bugs found since the 9.1.3 release.
+</p>
+<p>
+Mesa 9.1 implements the OpenGL 3.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.1.  OpenGL
+3.1 is <strong>only</strong> available if requested at context creation
+because GL_ARB_compatibility is not supported.
+</p>
+
+<h2>MD5 checksums</h2>
+<pre>
+a2c4e25d0e27918bc67f61bae04d0cb8  MesaLib-9.1.4.tar.bz2
+8c7e9ce5b05cb2223f0587396dd9dc08  MesaLib-9.1.4.tar.gz
+020459c5793d4279bdcb2daa1f7dd9f6  MesaLib-9.1.4.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=37871">Bug 37871</a> - [bisected i965] Bus error (core dumped) on oglc texdecaltile</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=42182">Bug 42182</a> - egl/opengles1/tri_x11 renders wrong</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44958">Bug 44958</a> - [SNB IVB HSW] mesa demo test texleak bus error</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=53494">Bug 53494</a> - [snb] crash in texsubimage to a large atlas in clutter</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60518">Bug 60518</a> - glDrawElements segfault when compiled into display list</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61821">Bug 61821</a> - src/mesa/drivers/dri/common/xmlpool.h:96:29: fatal error: xmlpool/options.h</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=63520">Bug 63520</a> - r300g regression (RV380): Strange rendering of light sources in Penumbra  (bisected)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=63701">Bug 63701</a> - [HSW] support new haswell graphics [8086:0a2e]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64727">Bug 64727</a> - [gm45, bisected] some piglit glsl 1.10 built-in-functions tests crash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64745">Bug 64745</a> - [llvmpipe] SIGSEGV src/gallium/state_trackers/glx/xlib/glx_api.c:1374</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64934">Bug 64934</a> - [llvmpipe] SIGSEGV src/gallium/state_trackers/glx/xlib/glx_api.c:1363</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=65173">Bug 65173</a> - segfault in _mesa_get_format_datatype and _mesa_get_color_read_type when state dumping with glretrace</li>
+
+</ul>
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-9.1.3..mesa-9.1.4
+</pre>
+
+<p>Alan Coopersmith (2):</p>
+<ul>
+  <li>integer overflow in XF86DRIOpenConnection() [CVE-2013-1993 1/2]</li>
+  <li>integer overflow in XF86DRIGetClientDriverName() [CVE-2013-1993 2/2]</li>
+</ul>
+
+<p>Alex Deucher (3):</p>
+<ul>
+  <li>radeonsi: add support for hainan chips</li>
+  <li>radeonsi: add Hainan pci ids</li>
+  <li>winsys/radeon: add env var to disable VM on Cayman/Trinity</li>
+</ul>
+
+<p>Andreas Boll (1):</p>
+<ul>
+  <li>glapi: Add some missing static_dispatch="false" annotations to es_EXT.xml</li>
+</ul>
+
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>intel: Add a null pointer check before dereferencing the pointer</li>
+</ul>
+
+<p>Armin K (1):</p>
+<ul>
+  <li>gallivm: Fix build with LLVM 3.3</li>
+</ul>
+
+<p>Brian Paul (9):</p>
+<ul>
+  <li>mesa: fix the compressed TexSubImage size checking code</li>
+  <li>st/mesa: generate GL_OUT_OF_MEMORY if we can't create the index buffer</li>
+  <li>mesa: fix error checking of DXT sRGB formats in _mesa_base_tex_format()</li>
+  <li>st/glx/xlib: check for null ctx pointer in glXIsDirect()</li>
+  <li>xlib: check for null ctx pointer in glXIsDirect()</li>
+  <li>st/glx: add null ctx check in glXDestroyContext()</li>
+  <li>xlib: add null ctx check in glXDestroyContext()</li>
+  <li>meta: move vertex array enables for mipmap generation</li>
+  <li>mesa: handle missing read buffer in _mesa_get_color_read_format/type()</li>
+</ul>
+
+<p>Bryan Cain (1):</p>
+<ul>
+  <li>nv50: initialize kick_notify callback in nv50_create</li>
+</ul>
+
+<p>Chad Versace (3):</p>
+<ul>
+  <li>egl/android: Fix error condition for EGL_ANDROID_image_native_buffer</li>
+  <li>i965: Fix glColorPointer(GL_FIXED)</li>
+  <li>intel: Return early if miptree allocation fails</li>
+</ul>
+
+<p>Chia-I Wu (1):</p>
+<ul>
+  <li>u_vbuf: fix index buffer leak</li>
+</ul>
+
+<p>Chris Forbes (8):</p>
+<ul>
+  <li>mesa: add accessor for effective stencil ref</li>
+  <li>intel: Use accessor for stencil reference values</li>
+  <li>nouveau: Use accessor for stencil reference values</li>
+  <li>radeon: Use accessor for stencil reference values</li>
+  <li>st: Use accessor for stencil reference values</li>
+  <li>swrast: Use accessor for stencil reference values</li>
+  <li>mesa: Stop clamping stencil reference value at specification time</li>
+  <li>mesa: Use accessor for stencil reference values in glGet</li>
+</ul>
+
+<p>Chí-Thanh Christopher Nguyễn (1):</p>
+<ul>
+  <li>targets/dri-i915: Force c++ linker in all cases</li>
+</ul>
+
+<p>Daniel Martin (1):</p>
+<ul>
+  <li>Fix build of swrast only without libdrm</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>i965: fix problem with constant out of bounds access (v3)</li>
+</ul>
+
+<p>Eric Anholt (10):</p>
+<ul>
+  <li>mesa: Make core Mesa allocate the texture renderbuffer wrapper.</li>
+  <li>mesa: Make gl_renderbuffers backed by EGL images use FinishRenderTexture.</li>
+  <li>i965/fs: Bake regs_written into the IR instead of recomputing it later.</li>
+  <li>i965/vs: Fix implied_mrf_writes() for integer division pre-gen6.</li>
+  <li>intel: Add support for writing to our linear-temporary-CPU-map case.</li>
+  <li>intel: Do temporary CPU maps of textures that are too big to GTT map.</li>
+  <li>intel: Avoid making tiled miptrees we won't be able to blit.</li>
+  <li>intel: Fix MRT handling of glBitmap().</li>
+  <li>intel: Fix format handling of blit glBitmap()</li>
+  <li>i965: Shut up the last release build warning.</li>
+</ul>
+
+<p>Fabian Bieler (2):</p>
+<ul>
+  <li>mesa/st: Don't copy propagate from swizzles.</li>
+  <li>mesa/program: Don't copy propagate from swizzles.</li>
+</ul>
+
+<p>Frank Henigman (1):</p>
+<ul>
+  <li>intel: initialize fs_visitor::params_remap in constructor</li>
+</ul>
+
+<p>Ian Romanick (2):</p>
+<ul>
+  <li>docs: Add 9.1.3 release md5sums</li>
+  <li>mesa: Bump version to 9.1.4</li>
+</ul>
+
+<p>José Fonseca (1):</p>
+<ul>
+  <li>scons: Fix implicit python dependency discovery on Windows.</li>
+</ul>
+
+<p>Kenneth Graunke (17):</p>
+<ul>
+  <li>mesa: Add i965 varying index patches to .cherry-ignore.</li>
+  <li>i965: Turn brw-&gt;urb.vs_size and gs_size into local variables.</li>
+  <li>i965: Use a variable for the push constant size in kB.</li>
+  <li>i965: Update URB partitioning code for Haswell's GT3 variant.</li>
+  <li>i965: Add chipset limits for the Haswell GT3 variant.</li>
+  <li>i965: Enable the Bay Trail platform.</li>
+  <li>mesa: Add a reverted commit to cherry-ignore.</li>
+  <li>vbo: Ignore PRIMITIVE_RESTART_FIXED_INDEX for glDrawArrays().</li>
+  <li>mesa: Add a helper function for determining the restart index.</li>
+  <li>vbo: Use the new primitive restart index helper function.</li>
+  <li>i965: Use the correct restart index for fixed index mode on Haswell.</li>
+  <li>mesa: Cherry-ignore a patch that got picked but squashed.</li>
+  <li>i965: Fix can_cut_index_handle_restart_index() for byte/short types.</li>
+  <li>st/mesa: Go back to using ctx-&gt;Array.RestartIndex, not _RestartIndex.</li>
+  <li>mesa: Ignore fixed-index primitive restart in ArrayElement().</li>
+  <li>mesa: Delete the ctx-&gt;Array._RestartIndex derived state.</li>
+  <li>glsl: Bail on parsing if the #version directive is bogus.</li>
+</ul>
+
+<p>Lauri Kasanen (1):</p>
+<ul>
+  <li>r600g: Correctly initialize the shader key, v2</li>
+</ul>
+
+<p>Maarten Lankhorst (4):</p>
+<ul>
+  <li>nvc0: fix up video buffer alignment requirements</li>
+  <li>nvc0: kill assert in ppp code</li>
+  <li>nvc0: set rsvd_kick correctly</li>
+  <li>nvc0: allow frame dropping in h264</li>
+</ul>
+
+<p>Marek Olšák (7):</p>
+<ul>
+  <li>radeonsi: increase array size for shader inputs and outputs</li>
+  <li>vbo: fix possible use-after-free segfault after a VAO is deleted</li>
+  <li>glsl: fix the value of gl_MaxFragmentUniformVectors</li>
+  <li>st/mesa: initialize all program constants and UBO limits</li>
+  <li>st/mesa: initialize Const.MaxColorAttachments</li>
+  <li>st/mesa: fix a couple of issues in st_bind_ubos</li>
+  <li>mesa: declare UniformBufferBindings as an array with a static size</li>
+</ul>
+
+<p>Matt Turner (3):</p>
+<ul>
+  <li>configure.ac: Remove redundant checks of enable_dri.</li>
+  <li>configure.ac: Build dricommon for DRI gallium drivers</li>
+  <li>i965: NULL check depth_mt to quiet static analysis.</li>
+</ul>
+
+<p>Michel Dänzer (3):</p>
+<ul>
+  <li>radeonsi: Fix handling of TGSI_SEMANTIC_PSIZE</li>
+  <li>radeonsi: Fix user clip planes</li>
+  <li>mesa: Note that two radeonsi fixes cannot be backported after all</li>
+</ul>
+
+<p>Mike Stroyan (1):</p>
+<ul>
+  <li>configure.ac: Build dricommon for gallium swrast</li>
+</ul>
+
+<p>Naohiro Aota (1):</p>
+<ul>
+  <li>xmlpool/build: Make sure to set mo properly</li>
+</ul>
+
+<p>Paul Berry (2):</p>
+<ul>
+  <li>glsl: Fix error checking on "flat" keyword to match GLSL ES 3.00, GLSL 1.50.</li>
+  <li>i965/gen7.5: Allow HW primitive restart for all primitive types.</li>
+</ul>
+
+<p>Paulo Zanoni (1):</p>
+<ul>
+  <li>i965: make GT3 machines work as GT3 instead of GT2</li>
+</ul>
+
+<p>Rodrigo Vivi (2):</p>
+<ul>
+  <li>i965: Add missing Haswell GT3 Desktop to IS_HSW_GT3 check.</li>
+  <li>i965: Adding more reserved PCI IDs for Haswell.</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>gallivm: fix out-of-bounds access with mirror_clamp_to_edge address mode</li>
+</ul>
+
+<p>Stéphane Marchesin (2):</p>
+<ul>
+  <li>st/xlib: Fix upside down coordinates for CopySubBuffer</li>
+  <li>st/xlib: Flush the front buffer before doing CopySubBuffer</li>
+</ul>
+
+<p>Sven Joachim (1):</p>
+<ul>
+  <li>mesa: Fix ieee fp on Alpha</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>mesa: fix type comparison errors in sub-texture error checking code</li>
+</ul>
+
+<p>Tom Stellard (2):</p>
+<ul>
+  <li>gallivm: Fix build with LLVM &gt;= r180063</li>
+  <li>r300g/compiler: Prevent regalloc from swizzling texture operands v2</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>radeon: Initialize variables in radeon_llvm_context_init.</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/9.1.5.html
+++ b/docs/relnotes/9.1.5.html
@@ -0,0 +1,140 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 9.1.5 Release Notes / July 17, 2013</h1>
+
+<p>
+Mesa 9.1.5 is a bug fix release which fixes bugs found since the 9.1.4 release.
+</p>
+<p>
+Mesa 9.1 implements the OpenGL 3.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.1.  OpenGL
+3.1 is <strong>only</strong> available if requested at context creation
+because GL_ARB_compatibility is not supported.
+</p>
+
+<h2>MD5 checksums</h2>
+<pre>
+4ed2af5943141a85a21869053a2fc2eb  MesaLib-9.1.5.tar.bz2
+47181066acf3231d74e027b2033f9455  MesaLib-9.1.5.tar.gz
+4c9c6615bd99215325250f87ed34058f  MesaLib-9.1.5.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=58384">Bug 58384</a> - [i965 Bisected]Oglc max_values(advanced.fragmentProgram.GL_MAX_PROGRAM_ENV_PARAMETERS_ARB) segfault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=62647">Bug 62647</a> - Wrong rendering of Dota 2 on Wine (apitrace attached) - Intel IVB HD4000</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=63674">Bug 63674</a> - [IVB]frozen at the first frame when run Unigine-heaven 4.0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=65910">Bug 65910</a> - Killing weston-launch causes segv in desktop-shell</li>
+
+</ul>
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-9.1.4..mesa-9.1.5
+</pre>
+
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>mesa: Return ZeroVec/dummyReg instead of NULL pointer</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>svga: check for NaN shader immediates</li>
+</ul>
+
+<p>Carl Worth (3):</p>
+<ul>
+  <li>cherry-ignore: Ignore previously backported patch</li>
+  <li>cherry-ignore: Drop two patches which we've decided not to include</li>
+  <li>mesa: Bump version to 9.1.5</li>
+</ul>
+
+<p>Chris Forbes (1):</p>
+<ul>
+  <li>i965: fix alpha test for MRT</li>
+</ul>
+
+<p>Christoph Bumiller (1):</p>
+<ul>
+  <li>r600g: x/y coordinates must be divided by block dim in dma blit</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>ra: Fix register spilling.</li>
+</ul>
+
+<p>Ian Romanick (6):</p>
+<ul>
+  <li>docs: Add 9.1.4 release md5sums</li>
+  <li>glsl: Add a gl_shader_program parameter to _mesa_uniform_{merge,split}_location_offset</li>
+  <li>glsl: Add gl_shader_program::UniformLocationBaseScale</li>
+  <li>glsl: Generate smaller values for uniform locations</li>
+  <li>i965: Be more careful with the interleaved user array upload optimization</li>
+  <li>glsl: Move all var decls to the front of the IR list in reverse order</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>glsl/builtins: Fix ARB_texture_cube_map_array built-in availability.</li>
+</ul>
+
+<p>Kristian Høgsberg (1):</p>
+<ul>
+  <li>wayland: Handle global_remove event as well</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>register_allocate: Fix the type of best_benefit.</li>
+</ul>
+
+<p>Paul Berry (1):</p>
+<ul>
+  <li>glsl ES: Fix magnitude of gl_MaxVertexUniformVectors.</li>
+</ul>
+
+<p>Richard Sandiford (3):</p>
+<ul>
+  <li>st/xlib Fix XIMage bytes-per-pixel calculation</li>
+  <li>st/xlib: Fix XImage stride calculation</li>
+  <li>st/dri/sw: Fix pitch calculation in drisw_update_tex_buffer</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>swrast: Fix memory leak.</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/9.1.6.html
+++ b/docs/relnotes/9.1.6.html
@@ -0,0 +1,168 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 9.1.6 Release Notes / August 1, 2013</h1>
+
+<p>
+Mesa 9.1.6 is a bug fix release which fixes bugs found since the 9.1.5 release.
+</p>
+<p>
+Mesa 9.1 implements the OpenGL 3.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.1.  OpenGL
+3.1 is <strong>only</strong> available if requested at context creation
+because GL_ARB_compatibility is not supported.
+</p>
+
+<h2>MD5 checksums</h2>
+<pre>
+443a2a352667294b53d56cb1a74114e9  MesaLib-9.1.6.tar.bz2
+08d3069cccd6821e5f33e0840bca0718  MesaLib-9.1.6.tar.gz
+90aa7a6d9878cdbfcb055312f356d6b9  MesaLib-9.1.6.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47824">Bug 47824</a> - osmesa using --enable-shared-glapi depends on libgl</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=62362">Bug 62362</a> - Crash when using Wayland EGL platform</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=63435">Bug 63435</a> - [Regression since 9.0] Flickering in EGL OpenGL full-screen window with swap interval 1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64087">Bug 64087</a> - Webgl conformance shader-with-non-reserved-words crash when mesa is compiled without --enable-debug</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64330">Bug 64330</a> - WebGL snake demo crash in loop_analysis.cpp:506: bool is_loop_terminator(ir_if*): assertion „inst != __null“ failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=65236">Bug 65236</a> - [i965] Rendering artifacts in VDrift/GL2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66558">Bug 66558</a> - RS690: 3D artifacts when playing SuperTuxKart</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66847">Bug 66847</a> - compilation broken with llvm 3.3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66850">Bug 66850</a> - glGenerateMipmap crashes when using GL_TEXTURE_2D_ARRAY with compressed internal format</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66921">Bug 66921</a> - [r300g] Heroes of Newerth: HiZ related corruption</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=67283">Bug 67283</a> - VDPAU doesn't work on hybrid laptop through DRI_PRIME</li>
+
+</ul>
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-9.1.5..mesa-9.1.6
+</pre>
+
+<p>Andreas Boll (1):</p>
+<ul>
+  <li>configure.ac: Require llvm-3.2 for r600g/radeonsi llvm backends</li>
+</ul>
+
+<p>Brian Paul (4):</p>
+<ul>
+  <li>mesa: handle 2D texture arrays in get_tex_rgba_compressed()</li>
+  <li>meta: handle 2D texture arrays in decompress_texture_image()</li>
+  <li>mesa: implement mipmap generation for compressed 2D array textures</li>
+  <li>mesa: improve free() cleanup in generate_mipmap_compressed()</li>
+</ul>
+
+<p>Carl Worth (7):</p>
+<ul>
+  <li>docs: Add 9.1.5 release md5sums</li>
+  <li>Merge 'origin/9.1' into stable</li>
+  <li>cherry-ignore: Drop 13 patches from the pick list</li>
+  <li>get-pick-list.sh: Include commits mentionining "CC: mesa-stable..." in pick list</li>
+  <li>get-pick-list: Allow for non-whitespace between "CC:" and "mesa-stable"</li>
+  <li>get-pick-list: Ignore commits which CC mesa-stable unless they say "9.1"</li>
+  <li>Bump version to 9.1.6</li>
+</ul>
+
+<p>Chris Forbes (5):</p>
+<ul>
+  <li>i965/Gen4: Zero extra coordinates for ir_tex</li>
+  <li>i965/vs: Fix flaky texture swizzling</li>
+  <li>i965/vs: set up sampler state pointer for Gen4/5.</li>
+  <li>i965/vs: Put lod parameter in the correct place for Gen4</li>
+  <li>i965/vs: Gen4/5: enable front colors if back colors are written</li>
+</ul>
+
+<p>Christoph Bumiller (1):</p>
+<ul>
+  <li>nv50,nvc0: s/uint16/uint32 for constant buffer offset</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>gallium/vl: add prime support</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>egl: Restore "bogus" DRI2 invalidate event code.</li>
+</ul>
+
+<p>Jeremy Huddleston Sequoia (1):</p>
+<ul>
+  <li>Apple: glFlush() is not needed with CGLFlushDrawable()</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>glsl: Classify "layout" like other identifiers.</li>
+</ul>
+
+<p>Kristian Høgsberg (1):</p>
+<ul>
+  <li>egl-wayland: Fix left-over wl_display_roundtrip() usage</li>
+</ul>
+
+<p>Maarten Lankhorst (2):</p>
+<ul>
+  <li>osmesa: link against static libglapi library too to get the gl exports</li>
+  <li>nvc0: force use of correct firmware file</li>
+</ul>
+
+<p>Marek Olšák (4):</p>
+<ul>
+  <li>r300g/swtcl: fix geometry corruption by uploading indices to a buffer</li>
+  <li>r300g/swtcl: fix a lockup in MSAA resolve</li>
+  <li>Revert "r300g: allow HiZ with a 16-bit zbuffer"</li>
+  <li>r600g: increase array size for shader inputs and outputs</li>
+</ul>
+
+<p>Matt Turner (2):</p>
+<ul>
+  <li>i965: NULL check prog on shader compilation failure.</li>
+  <li>i965/vs: Print error if vertex shader fails to compile.</li>
+</ul>
+
+<p>Paul Berry (1):</p>
+<ul>
+  <li>glsl: Handle empty if statement encountered during loop analysis.</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/9.2.1.html
+++ b/docs/relnotes/9.2.1.html
@@ -0,0 +1,206 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 9.2.1 Release Notes / (October 4, 2013)</h1>
+
+<p>
+Mesa 9.2.1 is a bug fix release which fixes bugs found since the 9.2 release.
+</p>
+<p>
+Mesa 9.2 implements the OpenGL 3.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.1.  OpenGL
+3.1 is <strong>only</strong> available if requested at context creation
+because GL_ARB_compatibility is not supported.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+e6cdfa84dfddd86e3d36ec7ff4b6478a  MesaLib-9.2.1.tar.gz
+dd4c82667d9c19c28a553b12eba3f8a0  MesaLib-9.2.1.tar.bz2
+d9af0f5607f7d275793d293057ca9ac6  MesaLib-9.2.1.zip
+</pre>
+
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66779">Bug 66779</a> - Use of uninitialized stack variable with brw_search_cache()</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68233">Bug 68233</a> - Valgrind errors in mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68250">Bug 68250</a> - Automatic mipmap generation with texture compression produces borders that fade to black</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68637">Bug 68637</a> - [Bisected IVB/HSW]Unigine demo crash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68753">Bug 68753</a> - [regression bisected] GLSL ES: structs members can't have precision qualifiers anymore in 9.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=69525">Bug 69525</a> - [GM45, bisected] Piglit tex-shadow2drect fails</li>
+
+</ul>
+
+<h2>Changes</h2>
+
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-9.2..mesa-9.2.1
+</pre>
+
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>radeon/winsys: pad IBs to a multiple of 8 DWs</li>
+</ul>
+
+<p>Andreas Boll (1):</p>
+<ul>
+  <li>os: First check for __GLIBC__ and then for PIPE_OS_BSD</li>
+</ul>
+
+<p>Anuj Phogat (1):</p>
+<ul>
+  <li>glsl: Allow precision qualifiers for sampler types</li>
+</ul>
+
+<p>Brian Paul (2):</p>
+<ul>
+  <li>docs: minor fixes for 9.2 release notes</li>
+  <li>mesa: check for bufSize &gt; 0 in _mesa_GetSynciv()</li>
+</ul>
+
+<p>Carl Worth (3):</p>
+<ul>
+  <li>cherry-ignore: Ignore a commit which appeared twice on master</li>
+  <li>Use -Bsymbolic when linking libEGL.so</li>
+  <li>mesa: Bump version to 9.2.1</li>
+</ul>
+
+<p>Chris Forbes (3):</p>
+<ul>
+  <li>i965/fs: Gen4: Zero out extra coordinates when using shadow compare</li>
+  <li>i965: Fix cube array coordinate normalization</li>
+  <li>i965: fix bogus swizzle in brw_cubemap_normalize</li>
+</ul>
+
+<p>Christoph Bumiller (2):</p>
+<ul>
+  <li>nvc0/ir: add f32 long immediate cannot saturate</li>
+  <li>nvc0: delete compute object on screen destruction</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>st/mesa: don't dereference stObj-&gt;pt if NULL</li>
+</ul>
+
+<p>Dominik Behr (1):</p>
+<ul>
+  <li>glsl: propagate max_array_access through function calls</li>
+</ul>
+
+<p>Emil Velikov (1):</p>
+<ul>
+  <li>nouveau: initialise the nouveau_transfer maps</li>
+</ul>
+
+<p>Eric Anholt (4):</p>
+<ul>
+  <li>mesa: Rip out more extension checking from texformat.c.</li>
+  <li>mesa: Don't choose S3TC for generic compression if we can't compress.</li>
+  <li>i965/gen4: Fix fragment program rectangle texture shadow compares.</li>
+  <li>i965: Reenable glBitmap() after the sRGB winsys enabling.</li>
+</ul>
+
+<p>Ian Romanick (7):</p>
+<ul>
+  <li>docs: Add 9.2 release md5sums</li>
+  <li>Add .cherry-ignore file</li>
+  <li>mesa: Note that 89a665e should not be picked</li>
+  <li>glsl: Reallow precision qualifiers on structure members</li>
+  <li>mesa: Support GL_MAX_VERTEX_OUTPUT_COMPONENTS query with ES3</li>
+  <li>mesa: Remove all traces of GL_OES_matrix_get</li>
+  <li>mesa: Don't return any data for GL_SHADER_BINARY_FORMATS</li>
+</ul>
+
+<p>Ilia Mirkin (2):</p>
+<ul>
+  <li>nv30: find first unused texcoord rather than bailing if first is used</li>
+  <li>nv30: fix inconsistent setting of push-&gt;user_priv</li>
+</ul>
+
+<p>Joakim Sindholt (1):</p>
+<ul>
+  <li>nvc0: fix blitctx memory leak</li>
+</ul>
+
+<p>Johannes Obermayr (1):</p>
+<ul>
+  <li>st/gbm: Add $(WAYLAND_CFLAGS) for HAVE_EGL_PLATFORM_WAYLAND.</li>
+</ul>
+
+<p>Kenneth Graunke (5):</p>
+<ul>
+  <li>i965/vs: Detect GRF sources in split_virtual_grfs send-from-GRF code.</li>
+  <li>i965/fs: Detect GRF sources in split_virtual_grfs send-from-GRF code.</li>
+  <li>i965/vec4: Only zero out unused message components when there are any.</li>
+  <li>i965: Fix brw_vs_prog_data_compare to actually check field members.</li>
+  <li>meta: Set correct viewport and projection in decompress_texture_image.</li>
+</ul>
+
+<p>Maarten Lankhorst (2):</p>
+<ul>
+  <li>st/dri: do not create a new context for msaa copy</li>
+  <li>nvc0: restore viewport after blit</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>r600g: fix constant buffer cache flushing</li>
+  <li>r600g: fix texture buffer object cache flushing</li>
+</ul>
+
+<p>Paul Berry (1):</p>
+<ul>
+  <li>i965: Initialize inout_offset parameter to brw_search_cache().</li>
+</ul>
+
+<p>Rico Schüller (1):</p>
+<ul>
+  <li>glx: Initialize OpenGL version to 1.0</li>
+</ul>
+
+<p>Tiziano Bacocco (1):</p>
+<ul>
+  <li>nvc0/ir: fix use after free in texture barrier insertion pass</li>
+</ul>
+
+<p>Torsten Duwe (1):</p>
+<ul>
+  <li>wayland-egl.pc requires wayland-client.pc.</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/9.2.2.html
+++ b/docs/relnotes/9.2.2.html
@@ -0,0 +1,97 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 9.2.2 Release Notes / (October 18, 2013)</h1>
+
+<p>
+Mesa 9.2.2 is a bug fix release which fixes bugs found since the 9.2.1 release.
+</p>
+<p>
+Mesa 9.2 implements the OpenGL 3.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.1.  OpenGL
+3.1 is <strong>only</strong> available if requested at context creation
+because GL_ARB_compatibility is not supported.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+</pre>
+
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=69449">Bug 69449</a> - Valgrind error in program_resource_visitor::recursion</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=70411">Bug 70411</a> - glInvalidateFramebuffer fails with GL_INVALID_ENUM</li>
+
+</ul>
+
+<h2>Changes</h2>
+
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-9.2.1..mesa-9.2.2
+</pre>
+
+<p>Brian Paul (3):</p>
+<ul>
+  <li>docs: add missing &lt;pre&gt; tag</li>
+  <li>svga: fix incorrect memcpy src in svga_buffer_upload_piecewise()</li>
+  <li>mesa: consolidate cube width=height error checking</li>
+</ul>
+
+<p>Carl Worth (2):</p>
+<ul>
+  <li>docs: Add md5sums for 9.2.1 release</li>
+  <li>Bump version to 9.2.2</li>
+</ul>
+
+<p>Constantin Baranov (1):</p>
+<ul>
+  <li>mesa: Add missing switch break in invalidate_framebuffer_storage()</li>
+</ul>
+
+<p>Eric Anholt (3):</p>
+<ul>
+  <li>i965: Don't forget the cube map padding on gen5+.</li>
+  <li>mesa: Fix compiler warnings when ALIGN's alignment is "1 &lt;&lt; value".</li>
+  <li>i965: Fix 3D texture layout by more literally copying from the spec.</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>glsl: Fix usage of the wrong union member in program_resource_visitor::recursion.</li>
+</ul>
+
+<p>Tom Stellard (1):</p>
+<ul>
+  <li>radeonsi: Use 'SI' as the LLVM processor for CIK on LLVM &lt;= 3.3</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/docs/relnotes/9.2.html
+++ b/docs/relnotes/9.2.html
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">

-<h1>Mesa 9.2 Release Notes / (date TBD)</h1>
+<h1>Mesa 9.2 Release Notes / (August 27, 2013)</h1>

 <p>
 Mesa 9.2 is a new development release.
@@ -33,7 +33,9 @@ because GL_ARB_compatibility is not supported.

 <h2>MD5 checksums</h2>
 <pre>
-tbd
+4f93c6475ec656fc1f7b93aeffc9b6c4  MesaLib-9.2.0.tar.gz
+4185b6aae890bc62a964f4b24cc1aca8  MesaLib-9.2.0.tar.bz2
+3bc5339bc98b9c37777ffd14e3a8eca4  MesaLib-9.2.0.zip
 </pre>


@@ -44,25 +46,179 @@ Note: some of the new features are only available with certain drivers.
 </p>

 <ul>
+<li>GL_ARB_shading_language_420pack in all drivers that support GLSL 1.30.</li>
 <li>GL_ARB_texture_buffer_range</li>
 <li>GL_ARB_texture_multisample</li>
 <li>GL_ARB_texture_storage_multisample</li>
 <li>GL_ARB_texture_query_lod</li>
+<li>GL_ARB_texture_storage on radeon, r200, and nouveau</li>
+<li>GL_EXT_discard_framebuffer in all OpenGL ES (all versions) drivers</li>
+<li>GL_EXT_framebuffer_multisample_blit_scaled on i965</li>
 <li>Added new freedreno gallium driver</li>
 <li>OSMesa interface for gallium llvmpipe/softpipe drivers</li>
 <li>Gallium Heads-Up Display (HUD) feature for performance monitoring</li>
+<li>Added support for UVD (2.2 and 3.0) video decoding on r600g and radeonsi through VDPAU (requires Kernel 3.10 or later)</li>
 </ul>


 <h2>Bug fixes</h2>

-<p>TBD -- This list is likely incomplete.</p>
+<p>Attempts have been made to <b>not</b> include bugs fixed in previous 9.1
+releases or bugs that were regressions during 9.2 development. This list is
+likely incomplete.</p>

+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=41787">Bug 41787</a> - [llvmpipe] stencil broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=44618">Bug 44618</a> - Cross-compilation broken by glsl builtin_compiler</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=46632">Bug 46632</a> - Make the alignment checks for the readpixel blit fastpath a bit more lenient</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47116">Bug 47116</a> - Enemy territory freezes with rs880 and commit fbebd431ec4e2e461a0cbcd5f3a04a000b8f6bbf</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47248">Bug 47248</a> - autogen missing dependency on flex and bison, causes infinite loop in glsl build</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48694">Bug 48694</a> - radeonsi_pipe.c:322:7: error: ‘PIPE_CAP_DUAL_SOURCE_BLEND’ undeclared</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50655">Bug 50655</a> - [r600g][RV670 HD3870] Ioquake games causes GPU lockup (waiting for 0x00003039 last fence id 0x00003030)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=51471">Bug 51471</a> - [965gm] Corrupted graphics in corners of screen with pixel shaders enabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=51782">Bug 51782</a> - mesa-8.0.3: fails to compile against uclibc</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=54240">Bug 54240</a> - [swrast] piglit fbo-generatemipmap-filtering regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55503">Bug 55503</a> - Constant vertex attributes broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55783">Bug 55783</a> - glEnable(GL_FRAMEBUFFER_SRGB) has no effect on the backbuffer</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55825">Bug 55825</a> - [Bisected i965]Oglc max_values(advanced.fragmentProgram.GL_MAX_PROGRAM_ALU_INSTRUCTIONS_ARB)  causes OOM-killer</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=56920">Bug 56920</a> - [sandybridge][uxa] graphics very glitchy and always flickering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=57753">Bug 57753</a> - leak in loop_analysis</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=57875">Bug 57875</a> - Second Life viewer bad rendering with git-ec83535</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=58666">Bug 58666</a> - rv670 + llvm = errors.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=58680">Bug 58680</a> - [IVB] Graphical glitches in 0 A.D</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=58872">Bug 58872</a> - Mac OS X configure: error: Couldn't find clock_gettime</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59322">Bug 59322</a> - r300g MSAA breaks Half-Life 2 in Wine</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59364">Bug 59364</a> - [bisected] Mesa build fails: clientattrib.c:33:22: fatal error: indirect.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59439">Bug 59439</a> - glCopyPixels generates no fragments (occlusion_query_meta_fragments test fails)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59440">Bug 59440</a> - glBitmap generates no fragments (occlusion_query_meta_fragments test fails)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59494">Bug 59494</a> - [Bisected]Piglit glean_depthStencil fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59592">Bug 59592</a> - Radeon HD 5670: reproducable GPU lockups with htile enabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59648">Bug 59648</a> - [SNB/IVB/HSW Bisected]Piglit spec/ARB_uniform_buffer/object_layout-std140-base-size-and-alignment fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59701">Bug 59701</a> - lp_test_arit fails on non-sse41 capable machines, breaking make check</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59737">Bug 59737</a> - [bisected] 0d108116bd80b757fb01a84a9f1946ef870b57b8 breaks osmesa when cross compiling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59740">Bug 59740</a> - [i965 Bisected]Oglc api-error(negative.glEvalMesh) fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59851">Bug 59851</a> - AC_ARG_WITH misusage leading to mesa configure failure</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59873">Bug 59873</a> - [swrast] piglit ext_framebuffer_multisample-interpolation 0 centroid-edges regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=59876">Bug 59876</a> - glGetTexLevelParameteriv broken for indirect rendering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60038">Bug 60038</a> - [osmesa] [git] building 32-bit mesa on 64 bit fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60047">Bug 60047</a> - [softpipe] piglit masked-clear regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60052">Bug 60052</a> - [Bisected]Piglit glx_extension_string_sanity fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60082">Bug 60082</a> - [  FAILED  ] DispatchSanity_test.GL31_CORE</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60086">Bug 60086</a> - Wayland platform backend crashes if there's no back buffer during dri2_swap_buffers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60098">Bug 60098</a> - [softpipe] Unexpected PIPE_CAP 78 query</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60172">Bug 60172</a> - Planeshift: triangles where grass would be</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60200">Bug 60200</a> - radeon_bo with virtual address referencing mismatch</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60212">Bug 60212</a> - [Bisected] Weston black output</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60524">Bug 60524</a> - [softpipe] piglit depthstencil-render-miplevels 146 s=z24_s8 regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60527">Bug 60527</a> - [softpipe] fbo-stencil GL_DEPTH24_STENCIL8 clear regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60633">Bug 60633</a> - EXT_texture_sRGB does not work in game The Cave on IvyBridge</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60737">Bug 60737</a> - In GLSL ES, a missing FS precision qualifier does not generate an error</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60866">Bug 60866</a> - GLSL performance issues for uniform buffer objects</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61036">Bug 61036</a> - Shader fails to build in LLVMpipe, aborts program</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61200">Bug 61200</a> - insufficient linking of libxatracker.so</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61635">Bug 61635</a> - glVertexAttribPointer(id, GL_UNSIGNED_BYTE, GL_FALSE,...) does not work</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=62466">Bug 62466</a> - r600g hyperz lockups with KSP 0.19</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=62669">Bug 62669</a> - HyperZ freeze when playing PrBoom-Plus demo with lots of monsters</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=62721">Bug 62721</a> - GPU lockup in Minecraft 1.5.1 with HyperZ</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=62830">Bug 62830</a> - [i965 bisected] Wrong Lightning on Freespace 2 SCP (patch attached)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=63124">Bug 63124</a> - [r600g] HyperZ lockup on REDWOOD in Half Life 2 Deathmatch</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=63702">Bug 63702</a> - tiling2d in radeon trash vdpau UVD textures</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64935">Bug 64935</a> - [swrast] s_texfetch.c:1335: set_fetch_functions: Assertion `texImage-&gt;FetchTexel' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=64959">Bug 64959</a> - Cannot build against EGL without X11</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=65112">Bug 65112</a> - glcpp hangs parsing line continuations</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=65958">Bug 65958</a> - GPU Lockup on Trinity 7500G</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66450">Bug 66450</a> - JUNIPER UVD accelerated  playback of MPEG 1/2 streams does not work</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66606">Bug 66606</a> - [i965 bisected]GLBenchmark 2.5.1/2.7.0 sometimes render error with gnome-session enabling SNA</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=66713">Bug 66713</a> - Team Fortress 2 crashes with r600-sb on HD4850</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=67354">Bug 67354</a> - glsl_parser.cpp is broken with bison 3.0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=67548">Bug 67548</a> - glGetAttribLocation seems to be broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=67927">Bug 67927</a> - R600_DEBUG=sb: Celestia show 2 earths, one wrongly rendered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=67934">Bug 67934</a> - [SNB/IVB/HSW 9.2 Bisected]Ogles2conform/GL2Tests/glUniform/glUniform.test fails with gnome-session enable compositing</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68162">Bug 68162</a> - [radeonsi] texture rendering is broken in Source-Engine games</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=68195">Bug 68195</a> - piglit tests vs-struct-pad and fs-struct-pad both fail</li>
+
+</ul>

 <h2>Changes</h2>

 <ul>
 <li>Removed d3d1x state tracker (unused, unmaintained and broken)</li>
+<li>Removed GL_EXT_clip_volume_hint because no driver had enabled it since
+2007.</li>
+<li>Removed GL_MESA_resize_buffers because it was only really implemented by
+the (unsupported) GDI driver.</li>
+<li>GL_EXT_separate_shader_objects has been removed from all Gallium drivers,
+    because it disallows a critical GLSL shader optimization.
+    GL_ARB_separate_shader_objects doesn't have this issue.</li>
+<li>i965 Gen6+ requires Kernel 3.6 or later. (92d2f5a)</li>
 </ul>

 </div>
--- a/docs/specs/OLD/MESA_resize_buffers.spec
+++ b/docs/specs/OLD/MESA_resize_buffers.spec
@@ -12,7 +12,7 @@ Contact

 Status

-    Shipping (since Mesa version 2.2)
+    Obsolete.

 Version

--- a/include/EGL/eglplatform.h
+++ b/include/EGL/eglplatform.h
@@ -109,8 +109,8 @@ typedef void                        *EGLNativeDisplayType;
 #ifdef MESA_EGL_NO_X11_HEADERS

 typedef void            *EGLNativeDisplayType;
-typedef khronos_uint32_t EGLNativePixmapType;
-typedef khronos_uint32_t EGLNativeWindowType;
+typedef khronos_uintptr_t EGLNativePixmapType;
+typedef khronos_uintptr_t EGLNativeWindowType;

 #else

--- a/include/GL/glext.h
+++ b/include/GL/glext.h
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -552,6 +552,8 @@ struct __DRIuseInvalidateExtensionRec {
 #define __DRI_ATTRIB_RGBA_BIT			0x01	
 #define __DRI_ATTRIB_COLOR_INDEX_BIT		0x02
 #define __DRI_ATTRIB_LUMINANCE_BIT		0x04
+#define __DRI_ATTRIB_FLOAT_BIT			0x08
+#define __DRI_ATTRIB_UNSIGNED_FLOAT_BIT		0x10

 /* __DRI_ATTRIB_CONFIG_CAVEAT */
 #define __DRI_ATTRIB_SLOW_BIT			0x01
@@ -983,7 +985,6 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_FOURCC_YUV410	0x39565559
 #define __DRI_IMAGE_FOURCC_YUV411	0x31315559
 #define __DRI_IMAGE_FOURCC_YUV420	0x32315559
-#define __DRI_IMAGE_FOURCC_YVU420	0x32315659
 #define __DRI_IMAGE_FOURCC_YUV422	0x36315559
 #define __DRI_IMAGE_FOURCC_YUV444	0x34325559
 #define __DRI_IMAGE_FOURCC_NV12		0x3231564e
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -70,3 +70,29 @@ CHIPSET(0x6664, HAINAN_6664, HAINAN)
 CHIPSET(0x6665, HAINAN_6665, HAINAN)
 CHIPSET(0x6667, HAINAN_6667, HAINAN)
 CHIPSET(0x666F, HAINAN_666F, HAINAN)
+
+CHIPSET(0x6640, BONAIRE_6640, BONAIRE)
+CHIPSET(0x6641, BONAIRE_6641, BONAIRE)
+CHIPSET(0x6649, BONAIRE_6649, BONAIRE)
+CHIPSET(0x6650, BONAIRE_6650, BONAIRE)
+CHIPSET(0x6651, BONAIRE_6651, BONAIRE)
+CHIPSET(0x6658, BONAIRE_6658, BONAIRE)
+CHIPSET(0x665C, BONAIRE_665C, BONAIRE)
+CHIPSET(0x665D, BONAIRE_665D, BONAIRE)
+
+CHIPSET(0x9830, KABINI_9830, KABINI)
+CHIPSET(0x9831, KABINI_9831, KABINI)
+CHIPSET(0x9832, KABINI_9832, KABINI)
+CHIPSET(0x9833, KABINI_9833, KABINI)
+CHIPSET(0x9834, KABINI_9834, KABINI)
+CHIPSET(0x9835, KABINI_9835, KABINI)
+CHIPSET(0x9836, KABINI_9836, KABINI)
+CHIPSET(0x9837, KABINI_9837, KABINI)
+CHIPSET(0x9838, KABINI_9838, KABINI)
+CHIPSET(0x9839, KABINI_9839, KABINI)
+CHIPSET(0x983A, KABINI_983A, KABINI)
+CHIPSET(0x983B, KABINI_983B, KABINI)
+CHIPSET(0x983C, KABINI_983C, KABINI)
+CHIPSET(0x983D, KABINI_983D, KABINI)
+CHIPSET(0x983E, KABINI_983E, KABINI)
+CHIPSET(0x983F, KABINI_983F, KABINI)
--- a/m4/ax_prog_flex.m4
+++ b/m4/ax_prog_flex.m4
@@ -53,7 +53,7 @@ AC_DEFUN([AX_PROG_FLEX], [
  AC_REQUIRE([AC_PROG_EGREP])

  AC_CACHE_CHECK([if flex is the lexer generator],[ax_cv_prog_flex],[
-    AS_IF([$LEX --version 2>/dev/null | $EGREP -q '^flex '],
+    AS_IF([$LEX --version 2>/dev/null | $EGREP -q '^\<flex\>'],
      [ax_cv_prog_flex=yes], [ax_cv_prog_flex=no])
  ])
  AS_IF([test "$ax_cv_prog_flex" = "yes"],
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -29,6 +29,10 @@ if HAVE_DRI_GLX
 SUBDIRS += glx
 endif

+if HAVE_EGL_PLATFORM_WAYLAND
+SUBDIRS += egl/wayland
+endif
+
 if HAVE_GBM
 SUBDIRS += gbm
 endif
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -21,8 +21,4 @@

 SUBDIRS=

-if HAVE_EGL_PLATFORM_WAYLAND
-SUBDIRS += wayland
-endif
-
 SUBDIRS += drivers main
--- a/src/egl/drivers/dri2/Makefile.am
+++ b/src/egl/drivers/dri2/Makefile.am
@@ -28,6 +28,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
 	-I$(top_builddir)/src/egl/wayland/wayland-drm \
 	$(DEFINES) \
+	$(VISIBILITY_CFLAGS) \
 	$(LIBDRM_CFLAGS) \
 	$(LIBUDEV_CFLAGS) \
 	$(LIBKMS_CFLAGS) \
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -75,7 +75,7 @@ EGLint dri2_to_egl_attribute_map[] = {
   0,				/* __DRI_ATTRIB_TRANSPARENT_GREEN_VALUE */
   0,				/* __DRI_ATTRIB_TRANSPARENT_BLUE_VALUE */
   0,				/* __DRI_ATTRIB_TRANSPARENT_ALPHA_VALUE */
-   0,				/* __DRI_ATTRIB_FLOAT_MODE */
+   0,				/* __DRI_ATTRIB_FLOAT_MODE (deprecated) */
   0,				/* __DRI_ATTRIB_RED_MASK */
   0,				/* __DRI_ATTRIB_GREEN_MASK */
   0,				/* __DRI_ATTRIB_BLUE_MASK */
@@ -141,7 +141,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
 	 else if (value & __DRI_ATTRIB_LUMINANCE_BIT)
 	    value = EGL_LUMINANCE_BUFFER;
 	 else
-	    /* not valid */;
+	    return NULL;
 	 _eglSetConfigKey(&base, EGL_COLOR_BUFFER_TYPE, value);
 	 break;	 

--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -38,7 +38,6 @@
 #include <xf86drm.h>
 #include <i915_drm.h>
 #include <radeon_drm.h>
-#include <gralloc_drm.h>

 #include "egl_dri2.h"
 #include "gralloc_drm.h"
@@ -57,9 +56,9 @@ get_format_bpp(int native)
   case HAL_PIXEL_FORMAT_RGB_888:
      bpp = 3;
      break;
-   case HAL_PIXEL_FORMAT_DRM_NV12:
-   case HAL_PIXEL_FORMAT_YV12:
   case HAL_PIXEL_FORMAT_RGB_565:
+   case HAL_PIXEL_FORMAT_RGBA_5551:
+   case HAL_PIXEL_FORMAT_RGBA_4444:
      bpp = 2;
      break;
   default:
@@ -340,7 +339,6 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp, _EGLContext *ctx,
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   struct dri2_egl_image *dri2_img;
   int name;
-   uint32_t offsets[3], strides[3], handles[3], tmp;
   EGLint format;

   if (ctx != NULL) {
@@ -369,12 +367,6 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp, _EGLContext *ctx,

   /* see the table in droid_add_configs_for_visuals */
   switch (buf->format) {
-   case HAL_PIXEL_FORMAT_DRM_NV12:
-       format = __DRI_IMAGE_FOURCC_NV12;
-       break;
-   case HAL_PIXEL_FORMAT_YV12:
-      format = __DRI_IMAGE_FOURCC_YVU420;
-      break;
   case HAL_PIXEL_FORMAT_BGRA_8888:
      format = __DRI_IMAGE_FORMAT_ARGB8888;
      break;
@@ -388,6 +380,8 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp, _EGLContext *ctx,
      format = __DRI_IMAGE_FORMAT_XBGR8888;
      break;
   case HAL_PIXEL_FORMAT_RGB_888:
+   case HAL_PIXEL_FORMAT_RGBA_5551:
+   case HAL_PIXEL_FORMAT_RGBA_4444:
      /* unsupported */
   default:
      _eglLog(_EGL_WARNING, "unsupported native buffer format 0x%x", buf->format);
@@ -406,70 +400,14 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp, _EGLContext *ctx,
      return NULL;
   }

-   switch (format) {
-   case __DRI_IMAGE_FORMAT_ARGB8888:
-   case __DRI_IMAGE_FORMAT_RGB565:
-   case __DRI_IMAGE_FORMAT_ABGR8888:
-   case __DRI_IMAGE_FORMAT_XBGR8888:
-       dri2_img->dri_image =
-          dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
-                           buf->width,
-                           buf->height,
-                           format,
-                           name,
-                           buf->stride,
-                           dri2_img);
-       break;
-   case __DRI_IMAGE_FOURCC_YVU420:
-       offsets[0] = offsets[1] = offsets[2] = 0;
-       strides[0] = strides[1] = strides[2] = 0;
-
-       gralloc_drm_resolve_format(buf->handle, &strides[0], &offsets[0],
-                                  &handles[0]);
-
-       /* u anv v are given in wrong order than what we need here thus this:*/
-       tmp = offsets[1];
-       offsets[1] = offsets[2];
-       offsets[2] = tmp;
-       tmp = strides[1];
-       strides[1] = strides[2];
-       strides[2] = tmp;
-
-       dri2_img->dri_image =
-          dri2_dpy->image->createImageFromNames(dri2_dpy->dri_screen,
-                           buf->width,
-                           buf->height,
-                           format,
-                           &name, 1,
-                           (int*)strides,
-                           (int*)offsets,
-                           dri2_img);
-       break;
-   case __DRI_IMAGE_FOURCC_NV12:
-       offsets[0] = offsets[1] = offsets[2] = 0;
-       strides[0] = strides[1] = strides[2] = 0;
-
-       gralloc_drm_resolve_format(buf->handle, &strides[0], &offsets[0],
-                                  &handles[0]);
-
-
-       dri2_img->dri_image =
-          dri2_dpy->image->createImageFromNames(dri2_dpy->dri_screen,
-                        buf->width,
-                        buf->height,
-                        format,
-                        &name, 1,
-                        (int*)strides,
-                        (int*)offsets,
-                        dri2_img);
-       break;
-   default:
-       /* We should never arrive here */
-      _eglLog(_EGL_WARNING, "unsupported native buffer format 0x%x",
-              buf->format);
-      break;
-   }
-
+   dri2_img->dri_image =
+      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
+					   buf->width,
+					   buf->height,
+					   format,
+					   name,
+					   buf->stride,
+					   dri2_img);
   if (!dri2_img->dri_image) {
      free(dri2_img);
      _eglError(EGL_BAD_ALLOC, "droid_create_image_mesa_drm");
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -715,8 +715,15 @@ registry_handle_global(void *data, struct wl_registry *registry, uint32_t name,
   }
 }

+static void
+registry_handle_global_remove(void *data, struct wl_registry *registry,
+			      uint32_t name)
+{
+}
+
 static const struct wl_registry_listener registry_listener = {
-	registry_handle_global
+   registry_handle_global,
+   registry_handle_global_remove
 };

 EGLBoolean
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -212,7 +212,7 @@ dri2_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
 			dri2_surf->drawable, s.data->root,
 			dri2_surf->base.Width, dri2_surf->base.Height);
   } else {
-      dri2_surf->drawable = (xcb_drawable_t)window;
+      dri2_surf->drawable = window;
   }

   if (dri2_dpy->dri2) {
@@ -743,6 +743,20 @@ dri2_swap_buffers_msc(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw,
      free(reply);
   }

+   /* Since we aren't watching for the server's invalidate events like we're
+    * supposed to (due to XCB providing no mechanism for filtering the events
+    * the way xlib does), and SwapBuffers is a common cause of invalidate
+    * events, just shove one down to the driver, even though we haven't told
+    * the driver that we're the kind of loader that provides reliable
+    * invalidate events.  This causes the driver to request buffers again at
+    * its next draw, so that we get the correct buffers if a pageflip
+    * happened.  The driver should still be using the viewport hack to catch
+    * window resizes.
+    */
+   if (dri2_dpy->flush &&
+       dri2_dpy->flush->base.version >= 3 && dri2_dpy->flush->invalidate)
+      (*dri2_dpy->flush->invalidate)(dri2_surf->dri_drawable);
+
   return swap_count;
 }

@@ -836,10 +850,10 @@ dri2_copy_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf,
   (*dri2_dpy->flush->flush)(dri2_surf->dri_drawable);

   gc = xcb_generate_id(dri2_dpy->conn);
-   xcb_create_gc(dri2_dpy->conn, gc, (xcb_drawable_t)target, 0, NULL);
+   xcb_create_gc(dri2_dpy->conn, gc, target, 0, NULL);
   xcb_copy_area(dri2_dpy->conn,
 		  dri2_surf->drawable,
-		  (xcb_drawable_t)target,
+		  target,
 		  gc,
 		  0, 0,
 		  0, 0,
--- a/src/egl/drivers/glx/Makefile.am
+++ b/src/egl/drivers/glx/Makefile.am
@@ -22,6 +22,7 @@
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/egl/main \
+	$(VISIBILITY_CFLAGS) \
 	$(X11_CFLAGS) \
 	$(DEFINES)

--- a/src/egl/main/Android.mk
+++ b/src/egl/main/Android.mk
@@ -121,13 +121,11 @@ endif
 # r300g/r600g/radeonsi
 ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),)
 gallium_DRIVERS += libmesa_winsys_radeon
-LOCAL_SHARED_LIBRARIES += libdrm_radeon
 ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),)
 gallium_DRIVERS += libmesa_pipe_r300
 endif
 ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_r600 libmesa_pipe_radeon
-LOCAL_SHARED_LIBRARIES += libstlport
+gallium_DRIVERS += libmesa_pipe_r600
 endif
 ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
 gallium_DRIVERS += libmesa_pipe_radeonsi
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -29,6 +29,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/gbm/main \
 	$(DEFINES) \
+	$(VISIBILITY_CFLAGS) \
 	$(EGL_CFLAGS) \
 	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM) \
 	-D_EGL_DRIVER_SEARCH_DIR=\"$(EGL_DRIVER_INSTALL_DIR)\" \
@@ -74,7 +75,7 @@ libEGL_la_SOURCES = \

 libEGL_la_LIBADD = \
 	$(EGL_LIB_DEPS)
-libEGL_la_LDFLAGS = -version-number 1:0 -no-undefined
+libEGL_la_LDFLAGS = -Wl,-Bsymbolic -version-number 1:0 -no-undefined

 if HAVE_EGL_PLATFORM_X11
 AM_CFLAGS += -DHAVE_X11_PLATFORM
--- a/src/egl/wayland/wayland-drm/Makefile.am
+++ b/src/egl/wayland/wayland-drm/Makefile.am
@@ -1,6 +1,7 @@
 AM_CFLAGS = -I$(top_srcdir)/src/egl/main \
 	    -I$(top_srcdir)/include \
 	    $(DEFINES) \
+	    $(VISIBILITY_CFLAGS) \
 	    $(WAYLAND_CFLAGS) 

 noinst_LTLIBRARIES = libwayland-drm.la
--- a/src/egl/wayland/wayland-egl/Makefile.am
+++ b/src/egl/wayland/wayland-egl/Makefile.am
@@ -2,6 +2,7 @@ pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = wayland-egl.pc

 AM_CFLAGS = $(DEFINES) \
+	    $(VISIBILITY_CFLAGS) \
 	    $(WAYLAND_CFLAGS)

 lib_LTLIBRARIES = libwayland-egl.la
--- a/src/egl/wayland/wayland-egl/wayland-egl.pc.in
+++ b/src/egl/wayland/wayland-egl/wayland-egl.pc.in
@@ -6,5 +6,6 @@ includedir=@includedir@
 Name: wayland-egl
 Description: Mesa wayland-egl library
 Version: @VERSION@
+Requires: wayland-client
 Libs: -L${libdir} -lwayland-egl
 Cflags: -I${includedir}
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -61,7 +61,7 @@ ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += drivers/r300
 endif
 ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
-SUBDIRS += drivers/r600 drivers/radeon
+SUBDIRS += drivers/r600
 endif
 ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += drivers/radeonsi
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -38,13 +38,17 @@ libgallium_la_SOURCES += \
 endif

 indices/u_indices_gen.c: $(srcdir)/indices/u_indices_gen.py
+	$(MKDIR_P) indices
 	$(AM_V_GEN) $(PYTHON2) $< > $@

 indices/u_unfilled_gen.c: $(srcdir)/indices/u_unfilled_gen.py
+	$(MKDIR_P) indices
 	$(AM_V_GEN) $(PYTHON2) $< > $@

 util/u_format_srgb.c: $(srcdir)/util/u_format_srgb.py
+	$(MKDIR_P) util
 	$(AM_V_GEN) $(PYTHON2) $< > $@

 util/u_format_table.c: $(srcdir)/util/u_format_table.py $(srcdir)/util/u_format_pack.py $(srcdir)/util/u_format_parse.py $(srcdir)/util/u_format.csv
+	$(MKDIR_P) util
 	$(AM_V_GEN) $(PYTHON2) $(srcdir)/util/u_format_table.py $(srcdir)/util/u_format.csv > $@
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -44,6 +44,7 @@ C_SOURCES := \
 	hud/hud_fps.c \
        hud/hud_driver_query.c \
 	os/os_misc.c \
+	os/os_process.c \
 	os/os_time.c \
 	pipebuffer/pb_buffer_fenced.c \
 	pipebuffer/pb_buffer_malloc.c \
@@ -163,6 +164,7 @@ GENERATED_SOURCES := \

 GALLIVM_SOURCES := \
        gallivm/lp_bld_arit.c \
+        gallivm/lp_bld_arit_overflow.c \
        gallivm/lp_bld_assert.c \
        gallivm/lp_bld_bitarit.c \
        gallivm/lp_bld_const.c \
@@ -171,6 +173,7 @@ GALLIVM_SOURCES := \
        gallivm/lp_bld_format_aos.c \
        gallivm/lp_bld_format_aos_array.c \
 	gallivm/lp_bld_format_float.c \
+        gallivm/lp_bld_format_srgb.c \
        gallivm/lp_bld_format_soa.c \
        gallivm/lp_bld_format_yuv.c \
        gallivm/lp_bld_gather.c \
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -111,6 +111,7 @@ struct cso_context {
   void *velements, *velements_saved;
   struct pipe_query *render_condition, *render_condition_saved;
   uint render_condition_mode, render_condition_mode_saved;
+   boolean render_condition_cond, render_condition_cond_saved;

   struct pipe_clip_state clip;
   struct pipe_clip_state clip_saved;
@@ -723,13 +724,17 @@ void cso_restore_stencil_ref(struct cso_context *ctx)
 }

 void cso_set_render_condition(struct cso_context *ctx,
-                              struct pipe_query *query, uint mode)
+                              struct pipe_query *query,
+                              boolean condition, uint mode)
 {
   struct pipe_context *pipe = ctx->pipe;

-   if (ctx->render_condition != query || ctx->render_condition_mode != mode) {
-      pipe->render_condition(pipe, query, mode);
+   if (ctx->render_condition != query ||
+       ctx->render_condition_mode != mode ||
+       ctx->render_condition_cond != condition) {
+      pipe->render_condition(pipe, query, condition, mode);
      ctx->render_condition = query;
+      ctx->render_condition_cond = condition;
      ctx->render_condition_mode = mode;
   }
 }
@@ -737,12 +742,14 @@ void cso_set_render_condition(struct cso_context *ctx,
 void cso_save_render_condition(struct cso_context *ctx)
 {
   ctx->render_condition_saved = ctx->render_condition;
+   ctx->render_condition_cond_saved = ctx->render_condition_cond;
   ctx->render_condition_mode_saved = ctx->render_condition_mode;
 }

 void cso_restore_render_condition(struct cso_context *ctx)
 {
   cso_set_render_condition(ctx, ctx->render_condition_saved,
+                            ctx->render_condition_cond_saved,
                            ctx->render_condition_mode_saved);
 }

--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -170,7 +170,8 @@ void cso_save_stencil_ref(struct cso_context *cso);
 void cso_restore_stencil_ref(struct cso_context *cso);

 void cso_set_render_condition(struct cso_context *cso,
-                              struct pipe_query *query, uint mode);
+                              struct pipe_query *query,
+                              boolean condition, uint mode);
 void cso_save_render_condition(struct cso_context *cso);
 void cso_restore_render_condition(struct cso_context *cso);

--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -58,7 +58,7 @@ draw_get_option_use_llvm(void)

 #ifdef PIPE_ARCH_X86
      util_cpu_detect();
-      /* require SSE2 due to LLVM PR6960. */
+      /* require SSE2 due to LLVM PR6960. XXX Might be fixed by now? */
      if (!util_cpu_caps.has_sse2)
         value = FALSE;
 #endif
@@ -78,6 +78,9 @@ draw_create_context(struct pipe_context *pipe, boolean try_llvm)
   if (draw == NULL)
      goto err_out;

+   /* we need correct cpu caps for disabling denorms in draw_vbo() */
+   util_cpu_detect();
+
 #if HAVE_LLVM
   if (try_llvm && draw_get_option_use_llvm()) {
      draw->llvm = draw_llvm_create(draw);
@@ -138,6 +141,7 @@ boolean draw_init(struct draw_context *draw)
   draw->clip_z = TRUE;

   draw->pt.user.planes = (float (*) [DRAW_TOTAL_CLIP_PLANES][4]) &(draw->plane[0]);
+   draw->pt.user.eltMax = ~0;

   if (!draw_pipeline_init( draw ))
      return FALSE;
@@ -738,6 +742,7 @@ draw_current_shader_clipvertex_output(const struct draw_context *draw)
 uint
 draw_current_shader_clipdistance_output(const struct draw_context *draw, int index)
 {
+   debug_assert(index < PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
   if (draw->gs.geometry_shader)
      return draw->gs.geometry_shader->clipdistance_output[index];
   return draw->vs.clipdistance_output[index];
@@ -756,6 +761,7 @@ draw_current_shader_num_written_clipdistances(const struct draw_context *draw)
 uint
 draw_current_shader_culldistance_output(const struct draw_context *draw, int index)
 {
+   debug_assert(index < PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
   if (draw->gs.geometry_shader)
      return draw->gs.geometry_shader->culldistance_output[index];
   return draw->vs.vertex_shader->culldistance_output[index];
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -792,13 +792,13 @@ draw_create_geometry_shader(struct draw_context *draw,
      if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX)
         gs->viewport_index_output = i;
      if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_CLIPDIST) {
-         if (gs->info.output_semantic_index[i] == 0)
-            gs->clipdistance_output[0] = i;
-         else
-            gs->clipdistance_output[1] = i;
+         debug_assert(gs->info.output_semantic_index[i] <
+                      PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
+         gs->clipdistance_output[gs->info.output_semantic_index[i]] = i;
      }
      if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_CULLDIST) {
-         debug_assert(gs->info.output_semantic_index[i] < Elements(gs->culldistance_output));
+         debug_assert(gs->info.output_semantic_index[i] <
+                      PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
         gs->culldistance_output[gs->info.output_semantic_index[i]] = i;
      }
   }
--- a/src/gallium/auxiliary/draw/draw_gs.h
+++ b/src/gallium/auxiliary/draw/draw_gs.h
@@ -67,8 +67,8 @@ struct draw_geometry_shader {
   struct tgsi_shader_info info;
   unsigned position_output;
   unsigned viewport_index_output;
-   unsigned clipdistance_output[2];
-   unsigned culldistance_output[2];
+   unsigned clipdistance_output[PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT];
+   unsigned culldistance_output[PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT];

   unsigned max_output_vertices;
   unsigned primitive_boundary;
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -32,6 +32,7 @@
 #include "draw_gs.h"

 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_arit_overflow.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_swizzle.h"
@@ -673,6 +674,7 @@ generate_vs(struct draw_llvm_variant *variant,

 static void
 generate_fetch(struct gallivm_state *gallivm,
+               struct draw_context *draw,
               LLVMValueRef vbuffers_ptr,
               LLVMValueRef *res,
               struct pipe_vertex_element *velem,
@@ -695,35 +697,58 @@ generate_fetch(struct gallivm_state *gallivm,
   LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
   LLVMValueRef stride;
   LLVMValueRef buffer_overflowed;
+   LLVMValueRef needed_buffer_size;
   LLVMValueRef temp_ptr =
      lp_build_alloca(gallivm,
                      lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
+   LLVMValueRef ofbit = NULL;
   struct lp_build_if_state if_ctx;

   if (velem->instance_divisor) {
-      /* array index = instance_id / instance_divisor */
-      index = LLVMBuildUDiv(builder, instance_id,
-                            lp_build_const_int32(gallivm, velem->instance_divisor),
-                            "instance_divisor");
+      /* The index equals the start instance plus the quotient of the
+       * start-relative instance number and the divisor. We compute it as:
+       * index = start_instance + ((instance_id - start_instance) / divisor)
+       */
+      LLVMValueRef current_instance;
+      index = lp_build_const_int32(gallivm, draw->start_instance);
+      current_instance = LLVMBuildSub(builder, instance_id, index, "");
+      current_instance = LLVMBuildUDiv(builder, current_instance,
+                                       lp_build_const_int32(gallivm, velem->instance_divisor),
+                                       "instance_divisor");
+      index = LLVMBuildAdd(builder, index, current_instance, "instance");
   }

-   stride = LLVMBuildMul(builder, vb_stride, index, "");
+   stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
+   stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
+   stride = lp_build_uadd_overflow(
+      gallivm, stride,
+      lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
+   needed_buffer_size = lp_build_uadd_overflow(
+      gallivm, stride,
+      lp_build_const_int32(gallivm,
+                           util_format_get_blocksize(velem->src_format)),
+      &ofbit);

-   stride = LLVMBuildAdd(builder, stride,
-                         vb_buffer_offset,
-                         "");
-   stride = LLVMBuildAdd(builder, stride,
-                         lp_build_const_int32(gallivm, velem->src_offset),
-                         "");
-
-   buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGE,
-                                     stride, buffer_size,
+   buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
+                                     needed_buffer_size, buffer_size,
                                     "buffer_overflowed");
-   /*
-   lp_build_printf(gallivm, "vbuf index = %d, stride is %d\n", indices, stride);
-   lp_build_print_value(gallivm, "   buffer size = ", buffer_size);
+   buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
+#if 0
+   lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
+                   index, vb_stride);
+   lp_build_printf(gallivm, "   vb_buffer_offset = %u, src_offset is %u\n",
+                   vb_buffer_offset,
+                   lp_build_const_int32(gallivm, velem->src_offset));
+   lp_build_print_value(gallivm, "   blocksize = ",
+                        lp_build_const_int32(
+                           gallivm,
+                           util_format_get_blocksize(velem->src_format)));
+   lp_build_printf(gallivm, "   instance_id = %u\n", instance_id);
+   lp_build_printf(gallivm, "   stride = %u\n", stride);
+   lp_build_printf(gallivm, "   buffer size = %u\n", buffer_size);
+   lp_build_printf(gallivm, "   needed_buffer_size = %u\n", needed_buffer_size);
   lp_build_print_value(gallivm, "   buffer overflowed = ", buffer_overflowed);
-   */
+#endif

   lp_build_if(&if_ctx, gallivm, buffer_overflowed);
   {
@@ -1595,6 +1620,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
   if (elts) {
      start = zero;
      end = fetch_count;
+      count = fetch_count;
   }
   else {
      end = lp_build_add(&bld, start, count);
@@ -1604,7 +1630,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,

   fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");

-   lp_build_loop_begin(&lp_loop, gallivm, start);
+   lp_build_loop_begin(&lp_loop, gallivm, zero);
   {
      LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
      LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
@@ -1612,10 +1638,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
      LLVMValueRef clipmask;   /* holds the clipmask value */
      const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];

-      if (elts)
-         io_itr = lp_loop.counter;
-      else
-         io_itr = LLVMBuildSub(builder, lp_loop.counter, start, "");
+      io_itr = lp_loop.counter;

      io = LLVMBuildGEP(builder, io_ptr, &io_itr, 1, "");
 #if DEBUG_STORE
@@ -1628,6 +1651,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
            LLVMBuildAdd(builder,
                         lp_loop.counter,
                         lp_build_const_int32(gallivm, i), "");
+         true_index = LLVMBuildAdd(builder, start, true_index, "");

         /* make sure we're not out of bounds which can happen
          * if fetch_count % 4 != 0, because on the last iteration
@@ -1647,7 +1671,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
                  gallivm,
                  lp_build_vec_type(gallivm, lp_type_int(32)), "");
            struct lp_build_if_state if_ctx;
-            index_overflowed = LLVMBuildICmp(builder, LLVMIntUGE,
+            index_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
                                             true_index, fetch_elt_max,
                                             "index_overflowed");
            
@@ -1681,7 +1705,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
            LLVMValueRef vb_index =
               lp_build_const_int32(gallivm, velem->vertex_buffer_index);
            LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
-            generate_fetch(gallivm, vbuffers_ptr,
+            generate_fetch(gallivm, draw, vbuffers_ptr,
                           &aos_attribs[j][i], velem, vb, true_index,
                           system_values.instance_id);
         }
@@ -1744,8 +1768,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
                     vs_info->num_outputs, vs_type,
                     have_clipdist);
   }
-
-   lp_build_loop_end_cond(&lp_loop, end, step, LLVMIntUGE);
+   lp_build_loop_end_cond(&lp_loop, count, step, LLVMIntUGE);

   sampler->destroy(sampler);

--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -238,6 +238,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                       const struct lp_derivatives *derivs,
                                       LLVMValueRef lod_bias, /* optional */
                                       LLVMValueRef explicit_lod, /* optional */
+                                       boolean scalar_lod,
                                       LLVMValueRef *texel)
 {
   struct draw_llvm_sampler_soa *sampler = (struct draw_llvm_sampler_soa *)base;
@@ -256,7 +257,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                       coords,
                       offsets,
                       derivs,
-                       lod_bias, explicit_lod,
+                       lod_bias, explicit_lod, scalar_lod,
                       texel);
 }

--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -831,7 +831,12 @@ static struct aaline_stage *
 aaline_stage_from_pipe(struct pipe_context *pipe)
 {
   struct draw_context *draw = (struct draw_context *) pipe->draw;
-   return aaline_stage(draw->pipeline.aaline);
+
+   if (draw) {
+      return aaline_stage(draw->pipeline.aaline);
+   } else {
+      return NULL;
+   }
 }


@@ -844,7 +849,12 @@ aaline_create_fs_state(struct pipe_context *pipe,
                       const struct pipe_shader_state *fs)
 {
   struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
-   struct aaline_fragment_shader *aafs = CALLOC_STRUCT(aaline_fragment_shader);
+   struct aaline_fragment_shader *aafs = NULL;
+
+   if (aaline == NULL)
+      return NULL;
+
+   aafs = CALLOC_STRUCT(aaline_fragment_shader);

   if (aafs == NULL)
      return NULL;
@@ -864,6 +874,10 @@ aaline_bind_fs_state(struct pipe_context *pipe, void *fs)
   struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
   struct aaline_fragment_shader *aafs = (struct aaline_fragment_shader *) fs;

+   if (aaline == NULL) {
+      return;
+   }
+
   /* save current */
   aaline->fs = aafs;
   /* pass-through */
@@ -877,14 +891,19 @@ aaline_delete_fs_state(struct pipe_context *pipe, void *fs)
   struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
   struct aaline_fragment_shader *aafs = (struct aaline_fragment_shader *) fs;

-   /* pass-through */
-   aaline->driver_delete_fs_state(pipe, aafs->driver_fs);
+   if (aafs == NULL) {
+      return;
+   }

-   if (aafs->aaline_fs)
-      aaline->driver_delete_fs_state(pipe, aafs->aaline_fs);
+   if (aaline != NULL) {
+      /* pass-through */
+      aaline->driver_delete_fs_state(pipe, aafs->driver_fs);
+
+      if (aafs->aaline_fs)
+         aaline->driver_delete_fs_state(pipe, aafs->aaline_fs);
+   }

   FREE((void*)aafs->state.tokens);
-
   FREE(aafs);
 }

@@ -895,6 +914,10 @@ aaline_bind_sampler_states(struct pipe_context *pipe,
 {
   struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);

+   if (aaline == NULL) {
+      return;
+   }
+
   /* save current */
   memcpy(aaline->state.sampler, sampler, num * sizeof(void *));
   aaline->num_samplers = num;
@@ -912,6 +935,10 @@ aaline_set_sampler_views(struct pipe_context *pipe,
   struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
   uint i;

+   if (aaline == NULL) {
+      return;
+   }
+
   /* save current */
   for (i = 0; i < num; i++) {
      pipe_sampler_view_reference(&aaline->state.sampler_views[i], views[i]);
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -308,9 +308,9 @@ aa_transform_inst(struct tgsi_transform_context *ctx,
      newInst.Src[1].Register.SwizzleY = TGSI_SWIZZLE_W;
      ctx->emit_instruction(ctx, &newInst);

-      /* KIL -tmp0.yyyy;   # if -tmp0.y < 0, KILL */
+      /* KILL_IF -tmp0.yyyy;   # if -tmp0.y < 0, KILL */
      newInst = tgsi_default_full_instruction();
-      newInst.Instruction.Opcode = TGSI_OPCODE_KIL;
+      newInst.Instruction.Opcode = TGSI_OPCODE_KILL_IF;
      newInst.Instruction.NumDstRegs = 0;
      newInst.Instruction.NumSrcRegs = 1;
      newInst.Src[0].Register.File = TGSI_FILE_TEMPORARY;
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -1,5 +1,5 @@
 /**************************************************************************
- * 
+ *
 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
@@ -10,11 +10,11 @@
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
- * 
+ *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
- * 
+ *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
 **************************************************************************/

 /**
@@ -51,10 +51,10 @@ static INLINE struct cull_stage *cull_stage( struct draw_stage *stage )
   return (struct cull_stage *)stage;
 }

-static INLINE
-boolean cull_distance_is_out(float dist)
+static INLINE boolean
+cull_distance_is_out(float dist)
 {
-   return (dist < 0) || util_is_inf_or_nan(dist);
+   return (dist < 0.0f) || util_is_inf_or_nan(dist);
 }

 /*
@@ -68,23 +68,21 @@ static void cull_point( struct draw_stage *stage,
 {
   const unsigned num_written_culldistances =
      draw_current_shader_num_written_culldistances(stage->draw);
+   unsigned i;

-   if (num_written_culldistances) {
-      unsigned i;
-      boolean culled = FALSE;
-      for (i = 0; i < num_written_culldistances; ++i) {
-         unsigned cull_idx = i / 4;
-         unsigned out_idx =
-            draw_current_shader_culldistance_output(stage->draw, cull_idx);
-         unsigned idx = i % 4;
-         float cull1 = header->v[0]->data[out_idx][idx];
-         boolean vert1_out = cull_distance_is_out(cull1);
-         if (vert1_out)
-            culled = TRUE;
-      }
-      if (!culled)
-         stage->next->point( stage->next, header );
+   debug_assert(num_written_culldistances);
+
+   for (i = 0; i < num_written_culldistances; ++i) {
+      unsigned cull_idx = i / 4;
+      unsigned out_idx =
+         draw_current_shader_culldistance_output(stage->draw, cull_idx);
+      unsigned idx = i % 4;
+      float cull1 = header->v[0]->data[out_idx][idx];
+      boolean vert1_out = cull_distance_is_out(cull1);
+      if (vert1_out)
+         return;
   }
+   stage->next->point( stage->next, header );
 }

 /*
@@ -94,29 +92,27 @@ static void cull_point( struct draw_stage *stage,
 * on primitives without faces (e.g. points and lines)
 */
 static void cull_line( struct draw_stage *stage,
-		      struct prim_header *header )
+                       struct prim_header *header )
 {
   const unsigned num_written_culldistances =
      draw_current_shader_num_written_culldistances(stage->draw);
+   unsigned i;

-   if (num_written_culldistances) {
-      unsigned i;
-      boolean culled = FALSE;
-      for (i = 0; i < num_written_culldistances; ++i) {
-         unsigned cull_idx = i / 4;
-         unsigned out_idx =
-            draw_current_shader_culldistance_output(stage->draw, cull_idx);
-         unsigned idx = i % 4;
-         float cull1 = header->v[0]->data[out_idx][idx];
-         float cull2 = header->v[1]->data[out_idx][idx];
-         boolean vert1_out = cull_distance_is_out(cull1);
-         boolean vert2_out = cull_distance_is_out(cull2);
-         if (vert1_out && vert2_out)
-            culled = TRUE;
-      }
-      if (!culled)
-         stage->next->line( stage->next, header );
+   debug_assert(num_written_culldistances);
+
+   for (i = 0; i < num_written_culldistances; ++i) {
+      unsigned cull_idx = i / 4;
+      unsigned out_idx =
+         draw_current_shader_culldistance_output(stage->draw, cull_idx);
+      unsigned idx = i % 4;
+      float cull1 = header->v[0]->data[out_idx][idx];
+      float cull2 = header->v[1]->data[out_idx][idx];
+      boolean vert1_out = cull_distance_is_out(cull1);
+      boolean vert2_out = cull_distance_is_out(cull2);
+      if (vert1_out && vert2_out)
+         return;
   }
+   stage->next->line( stage->next, header );
 }

 /*
@@ -133,7 +129,6 @@ static void cull_tri( struct draw_stage *stage,
   /* Do the distance culling */
   if (num_written_culldistances) {
      unsigned i;
-      boolean culled = FALSE;
      for (i = 0; i < num_written_culldistances; ++i) {
         unsigned cull_idx = i / 4;
         unsigned out_idx =
@@ -146,10 +141,8 @@ static void cull_tri( struct draw_stage *stage,
         boolean vert2_out = cull_distance_is_out(cull2);
         boolean vert3_out = cull_distance_is_out(cull3);
         if (vert1_out && vert2_out && vert3_out)
-            culled = TRUE;
+            return;
      }
-      if (!culled)
-         stage->next->tri( stage->next, header );
   }

   /* Do the regular face culling */
@@ -166,7 +159,7 @@ static void cull_tri( struct draw_stage *stage,
      const float fx = v1[0] - v2[0];
      const float fy = v1[1] - v2[1];

-   
+
      /* det = cross(e,f).z */
      header->det = ex * fy - ey * fx;

@@ -217,7 +210,7 @@ static void cull_first_line( struct draw_stage *stage,
   }
 }

-static void cull_first_tri( struct draw_stage *stage, 
+static void cull_first_tri( struct draw_stage *stage,
 			    struct prim_header *header )
 {
   struct cull_stage *cull = cull_stage(stage);
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -278,7 +278,7 @@ pstip_transform_inst(struct tgsi_transform_context *ctx,


      /* 
-       * Insert new MUL/TEX/KILP instructions at start of program
+       * Insert new MUL/TEX/KILL_IF instructions at start of program
       * Take gl_FragCoord, divide by 32 (stipple size), sample the
       * texture and kill fragment if needed.
       *
@@ -315,9 +315,9 @@ pstip_transform_inst(struct tgsi_transform_context *ctx,
      newInst.Src[1].Register.Index = pctx->freeSampler;
      ctx->emit_instruction(ctx, &newInst);

-      /* KIL -texTemp;   # if -texTemp < 0, KILL fragment */
+      /* KILL_IF -texTemp;   # if -texTemp < 0, KILL fragment */
      newInst = tgsi_default_full_instruction();
-      newInst.Instruction.Opcode = TGSI_OPCODE_KIL;
+      newInst.Instruction.Opcode = TGSI_OPCODE_KILL_IF;
      newInst.Instruction.NumDstRegs = 0;
      newInst.Instruction.NumSrcRegs = 1;
      newInst.Src[0].Register.File = TGSI_FILE_TEMPORARY;
@@ -402,7 +402,7 @@ pstip_update_texture(struct pstip_stage *pstip)
   /*
    * Load alpha texture.
    * Note: 0 means keep the fragment, 255 means kill it.
-    * We'll negate the texel value and use KILP which kills if value
+    * We'll negate the texel value and use KILL_IF which kills if value
    * is negative.
    */
   for (i = 0; i < 32; i++) {
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -138,7 +138,7 @@ emit_vertex( struct vbuf_stage *vbuf,
      /* Note: we really do want data[0] here, not data[pos]: 
       */
      vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0, ~0);
-      vbuf->translate->run(vbuf->translate, 0, 1, 0, vbuf->vertex_ptr);
+      vbuf->translate->run(vbuf->translate, 0, 1, 0, 0, vbuf->vertex_ptr);

      if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
      
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -55,6 +55,10 @@ struct gallivm_state;
 /** Sum of frustum planes and user-defined planes */
 #define DRAW_TOTAL_CLIP_PLANES (6 + PIPE_MAX_CLIP_PLANES)

+/**
+ * The largest possible index of a vertex that can be fetched.
+ */
+#define DRAW_MAX_FETCH_IDX 0xffffffff

 struct pipe_context;
 struct draw_vertex_shader;
@@ -306,6 +310,7 @@ struct draw_context
   } extra_shader_outputs;

   unsigned instance_id;
+   unsigned start_instance;

 #ifdef HAVE_LLVM
   struct draw_llvm *llvm;
@@ -467,14 +472,13 @@ void
 draw_stats_clipper_primitives(struct draw_context *draw,
                              const struct draw_prim_info *prim_info);

-
 /** 
 * Return index i from the index buffer.
 * If the index buffer would overflow we return the
- * index of the first element in the vb.
+ * maximum possible index.
 */
 #define DRAW_GET_IDX(_elts, _i)                   \
-   (((_i) >= draw->pt.user.eltMax) ? 0 : (_elts)[_i])
+   (((_i) >= draw->pt.user.eltMax) ? DRAW_MAX_FETCH_IDX : (_elts)[_i])

 /**
 * Return index of the given viewport clamping it
@@ -486,5 +490,20 @@ draw_clamp_viewport_idx(int idx)
   return ((PIPE_MAX_VIEWPORTS > idx || idx < 0) ? idx : 0);
 }

+/**
+ * Adds two unsigned integers and if the addition
+ * overflows then it returns the value
+ * from the overflow_value variable.
+ */
+static INLINE unsigned
+draw_overflow_uadd(unsigned a, unsigned b,
+                   unsigned overflow_value)
+{
+   unsigned res = a + b;
+   if (res < a || res < b) {
+      res = overflow_value;
+   }
+   return res;
+}

 #endif /* DRAW_PRIVATE_H */
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -345,7 +345,8 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
 /** Helper code for below */
 #define PRIM_RESTART_LOOP(elements) \
   do { \
-      for (i = start; i < end; i++) { \
+      for (j = 0; j < count; j++) {               \
+         i = draw_overflow_uadd(start, j, MAX_LOOP_IDX);  \
         if (i < elt_max && elements[i] == info->restart_index) { \
            if (cur_count > 0) { \
               /* draw elts up to prev pos */ \
@@ -377,9 +378,11 @@ draw_pt_arrays_restart(struct draw_context *draw,
   const unsigned prim = info->mode;
   const unsigned start = info->start;
   const unsigned count = info->count;
-   const unsigned end = start + count;
   const unsigned elt_max = draw->pt.user.eltMax;
-   unsigned i, cur_start, cur_count;
+   unsigned i, j, cur_start, cur_count;
+   /* The largest index within a loop using the i variable as the index.
+    * Used for overflow detection */
+   const unsigned MAX_LOOP_IDX = 0xffffffff;

   assert(info->primitive_restart);

@@ -456,8 +459,14 @@ draw_vbo(struct draw_context *draw,
   unsigned instance;
   unsigned index_limit;
   unsigned count;
+   unsigned fpstate = util_fpstate_get();
   struct pipe_draw_info resolved_info;

+   /* Make sure that denorms are treated like zeros. This is 
+    * the behavior required by D3D10. OpenGL doesn't care.
+    */
+   util_fpstate_set_denorms_to_zero(fpstate);
+
   resolve_draw_info(info, &resolved_info);
   info = &resolved_info;

@@ -508,11 +517,16 @@ draw_vbo(struct draw_context *draw,
                                     draw->pt.vertex_element,
                                     draw->pt.nr_vertex_elements,
                                     info);
-
-   if (index_limit == 0) {
+#if HAVE_LLVM
+   if (!draw->llvm)
+#endif
+   {
+      if (index_limit == 0) {
      /* one of the buffers is too small to do any valid drawing */
-      debug_warning("draw: VBO too small to draw anything\n");
-      return;
+         debug_warning("draw: VBO too small to draw anything\n");
+         util_fpstate_set(fpstate);
+         return;
+      }
   }

   /* If we're collecting stats then make sure we start from scratch */
@@ -529,6 +543,13 @@ draw_vbo(struct draw_context *draw,

   for (instance = 0; instance < info->instance_count; instance++) {
      draw->instance_id = instance + info->start_instance;
+      draw->start_instance = info->start_instance;
+      /* check for overflow */
+      if (draw->instance_id < instance ||
+          draw->instance_id < info->start_instance) {
+         /* if we overflowed, just set the instance id to the max */
+         draw->instance_id = 0xffffffff;
+      }

      draw_new_instance(draw);

@@ -544,4 +565,5 @@ draw_vbo(struct draw_context *draw,
   if (draw->collect_statistics) {
      draw->render->pipeline_statistics(draw->render, &draw->statistics);
   }
+   util_fpstate_set(fpstate);
 }
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -171,6 +171,7 @@ draw_pt_emit(struct pt_emit *emit,
   translate->run(translate,
 		  0,
 		  vertex_count,
+                  draw->start_instance,
                  draw->instance_id,
 		  hw_verts );

@@ -234,6 +235,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
   translate->run(translate,
                  0,
                  count,
+                  draw->start_instance,
                  draw->instance_id,
                  hw_verts);

@@ -253,12 +255,6 @@ draw_pt_emit_linear(struct pt_emit *emit,
        i < prim_info->primitive_count;
        start += prim_info->primitive_lengths[i], i++)
   {
-      if (draw->collect_statistics) {
-         draw->statistics.c_invocations +=
-            u_decomposed_prims_for_vertices(prim_info->prim,
-                                            prim_info->primitive_lengths[i]);
-      }
-
      render->draw_arrays(render,
                          start,
                          prim_info->primitive_lengths[i]);
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -168,6 +168,7 @@ draw_pt_fetch_run(struct pt_fetch *fetch,
   translate->run_elts( translate,
 			elts,
 			count,
+                        draw->start_instance,
                        draw->instance_id,
 			verts );
 }
@@ -195,6 +196,7 @@ draw_pt_fetch_run_linear(struct pt_fetch *fetch,
   translate->run( translate,
                   start,
                   count,
+                   draw->start_instance,
                   draw->instance_id,
                   verts );
 }
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -210,6 +210,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
   feme->translate->run_elts( feme->translate,
 			      fetch_elts,
 			      fetch_count,
+                              draw->start_instance,
                              draw->instance_id,
 			      hw_verts );

@@ -267,6 +268,7 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
   feme->translate->run( feme->translate,
                         start,
                         count,
+                         draw->start_instance,
                         draw->instance_id,
                         hw_verts );

@@ -326,6 +328,7 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
   feme->translate->run( feme->translate,
                         start,
                         count,
+                         draw->start_instance,
                         draw->instance_id,
                         hw_verts );

--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -182,12 +182,29 @@ static void so_emit_prim(struct pt_so_emit *so,

         buffer = (float *)((char *)draw->so.targets[ob]->mapping +
                            draw->so.targets[ob]->target.buffer_offset +
-                            draw->so.targets[ob]->internal_offset) + state->output[slot].dst_offset;
+                            draw->so.targets[ob]->internal_offset) +
+            state->output[slot].dst_offset;
         
         if (idx == so->pos_idx && pcp_ptr)
-            memcpy(buffer, &pre_clip_pos[start_comp], num_comps * sizeof(float));
+            memcpy(buffer, &pre_clip_pos[start_comp],
+                   num_comps * sizeof(float));
         else
-            memcpy(buffer, &input[idx][start_comp], num_comps * sizeof(float));
+            memcpy(buffer, &input[idx][start_comp],
+                   num_comps * sizeof(float));
+#if 0
+         {
+            int j;
+            debug_printf("VERT[%d], offset = %d, slot[%d] sc = %d, num_c = %d, idx = %d = [",
+                         i + draw->so.targets[ob]->emitted_vertices,
+                         draw->so.targets[ob]->internal_offset,
+                         slot, start_comp, num_comps, idx);
+            for (j = 0; j < num_comps; ++j) {
+               unsigned *ubuffer = (unsigned*)buffer;
+               debug_printf("%d (0x%x), ", ubuffer[j], ubuffer[j]);
+            }
+            debug_printf("]\n");
+         }
+#endif
      }
      for (ob = 0; ob < draw->so.num_targets; ++ob) {
         struct draw_so_target *target = draw->so.targets[ob];
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -33,6 +33,9 @@
 #define SEGMENT_SIZE 1024
 #define MAP_SIZE     256

+/* The largest possible index within an index buffer */
+#define MAX_ELT_IDX 0xffffffff
+
 struct vsplit_frontend {
   struct draw_pt_front_end base;
   struct draw_context *draw;
@@ -82,16 +85,15 @@ vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags)
 * Add a fetch element and add it to the draw elements.
 */
 static INLINE void
-vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch)
+vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch, unsigned ofbias)
 {
-   struct draw_context *draw = vsplit->draw;
   unsigned hash;

-   fetch = MIN2(fetch, draw->pt.max_index);
-
   hash = fetch % MAP_SIZE;

-   if (vsplit->cache.fetches[hash] != fetch) {
+   /* If the value isn't in the cache or it's an overflow due to the
+    * element bias */
+   if (vsplit->cache.fetches[hash] != fetch || ofbias) {
      /* update cache */
      vsplit->cache.fetches[hash] = fetch;
      vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts;
@@ -104,22 +106,109 @@ vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch)
   vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[hash];
 }

+/**
+ * Returns the base index to the elements array.
+ * The value is checked for overflows (both integer overflows
+ * and the elements array overflow).
+ */
+static INLINE unsigned
+vsplit_get_base_idx(struct vsplit_frontend *vsplit,
+                    unsigned start, unsigned fetch, unsigned *ofbit)
+{
+   struct draw_context *draw = vsplit->draw;
+   unsigned elt_idx = draw_overflow_uadd(start, fetch, MAX_ELT_IDX);
+   if (ofbit)
+      *ofbit = 0;
+
+   /* Overflown indices need to wrap to the first element
+    * in the index buffer */
+   if (elt_idx >= draw->pt.user.eltMax) {
+      if (ofbit)
+         *ofbit = 1;
+      elt_idx = 0;
+   }
+
+   return elt_idx;
+}
+
+/**
+ * Returns the element index adjusted for the element bias.
+ * The final element index is created from the actual element
+ * index, plus the element bias, clamped to maximum element
+ * index if that addition overflows.
+ */
+static INLINE unsigned
+vsplit_get_bias_idx(struct vsplit_frontend *vsplit,
+                    int idx, int bias, unsigned *ofbias)
+{
+   int res = idx + bias;
+
+   if (ofbias)
+      *ofbias = 0;
+
+   if (idx > 0 && bias > 0) {
+      if (res < idx || res < bias) {
+         res = DRAW_MAX_FETCH_IDX;
+         if (ofbias)
+            *ofbias = 1;
+      }
+   } else if (idx < 0 && bias < 0) {
+      if (res > idx || res > bias) {
+         res = DRAW_MAX_FETCH_IDX;
+         if (ofbias)
+            *ofbias = 1;
+      }
+   }
+
+   return res;
+}
+
+#define VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias)    \
+   unsigned elt_idx;                                       \
+   unsigned ofbit;                                         \
+   unsigned ofbias;                                        \
+   elt_idx = vsplit_get_base_idx(vsplit, start, fetch, &ofbit);          \
+   elt_idx = vsplit_get_bias_idx(vsplit, ofbit ? 0 : DRAW_GET_IDX(elts, elt_idx), elt_bias, &ofbias)
+
+static INLINE void
+vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
+                       unsigned start, unsigned fetch, int elt_bias)
+{
+   struct draw_context *draw = vsplit->draw;
+   VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
+   vsplit_add_cache(vsplit, elt_idx, ofbias);
+}
+
+static INLINE void
+vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts,
+                       unsigned start, unsigned fetch, int elt_bias)
+{
+   struct draw_context *draw = vsplit->draw;
+   VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
+   vsplit_add_cache(vsplit, elt_idx, ofbias);
+}
+

 /**
 * Add a fetch element and add it to the draw elements.  The fetch element is
 * in full range (uint).
 */
 static INLINE void
-vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch)
+vsplit_add_cache_uint(struct vsplit_frontend *vsplit, const uint *elts,
+                      unsigned start, unsigned fetch, int elt_bias)
 {
-   /* special care for 0xffffffff */
-   if (fetch == 0xffffffff && !vsplit->cache.has_max_fetch) {
+   struct draw_context *draw = vsplit->draw;
+   unsigned raw_elem_idx = start + fetch + elt_bias;
+   VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
+
+   /* special care for DRAW_MAX_FETCH_IDX */
+   if (raw_elem_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) {
      unsigned hash = fetch % MAP_SIZE;
-      vsplit->cache.fetches[hash] = fetch - 1; /* force update */
+      vsplit->cache.fetches[hash] = raw_elem_idx - 1; /* force update */
      vsplit->cache.has_max_fetch = TRUE;
   }

-   vsplit_add_cache(vsplit, fetch);
+   vsplit_add_cache(vsplit, elt_idx, ofbias);
 }


@@ -128,17 +217,17 @@ vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch)

 #define FUNC vsplit_run_ubyte
 #define ELT_TYPE ubyte
-#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#define ADD_CACHE(vsplit, ib, start, fetch, bias) vsplit_add_cache_ubyte(vsplit,ib,start,fetch,bias)
 #include "draw_pt_vsplit_tmp.h"

 #define FUNC vsplit_run_ushort
 #define ELT_TYPE ushort
-#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#define ADD_CACHE(vsplit, ib, start, fetch, bias) vsplit_add_cache_ushort(vsplit,ib,start,fetch, bias)
 #include "draw_pt_vsplit_tmp.h"

 #define FUNC vsplit_run_uint
 #define ELT_TYPE uint
-#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch)
+#define ADD_CACHE(vsplit, ib, start, fetch, bias) vsplit_add_cache_uint(vsplit, ib, start, fetch, bias)
 #include "draw_pt_vsplit_tmp.h"


--- a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -47,13 +47,20 @@ CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
   const unsigned start = istart;
   const unsigned end = istart + icount;

+   /* If the index buffer overflows we'll need to run
+    * through the normal paths */
+   if (start >= draw->pt.user.eltMax ||
+       end > draw->pt.user.eltMax ||
+       end < istart || end < icount)
+      return FALSE;
+
   /* use the ib directly */
   if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) {
      if (icount > vsplit->max_vertices)
         return FALSE;

-      for (i = start; i < end; i++) {
-         ELT_TYPE idx = DRAW_GET_IDX(ib, i);
+      for (i = 0; i < icount; i++) {
+         ELT_TYPE idx = DRAW_GET_IDX(ib, start + i);
         if (idx < min_index || idx > max_index) {
            debug_printf("warning: index out of range\n");
         }
@@ -82,25 +89,29 @@ CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
   fetch_start = min_index + elt_bias;
   fetch_count = max_index - min_index + 1;

+   /* Check for overflow in the fetch_start */
+   if (fetch_start < min_index || fetch_start < elt_bias)
+      return FALSE;
+
   if (!draw_elts) {
      if (min_index == 0) {
-         for (i = start; i < end; i++) {
-            ELT_TYPE idx = DRAW_GET_IDX(ib, i);
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = DRAW_GET_IDX(ib, i + start);

            if (idx < min_index || idx > max_index) {
               debug_printf("warning: index out of range\n");
            }
-            vsplit->draw_elts[i - start] = (ushort) idx;
+            vsplit->draw_elts[i] = (ushort) idx;
         }
      }
      else {
-         for (i = start; i < end; i++) {
-            ELT_TYPE idx = DRAW_GET_IDX(ib, i);
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = DRAW_GET_IDX(ib, i + start);

            if (idx < min_index || idx > max_index) {
               debug_printf("warning: index out of range\n");
            }
-            vsplit->draw_elts[i - start] = (ushort) (idx - min_index);
+            vsplit->draw_elts[i] = (ushort) (idx - min_index);
         }
      }

@@ -137,41 +148,36 @@ CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit,
   spoken = !!spoken;
   if (ibias == 0) {
      if (spoken)
-         ADD_CACHE(vsplit, DRAW_GET_IDX(ib, ispoken));
+         ADD_CACHE(vsplit, ib, 0, ispoken, 0);

-      for (i = spoken; i < icount; i++)
-         ADD_CACHE(vsplit, DRAW_GET_IDX(ib, istart + i));
+      for (i = spoken; i < icount; i++) {
+         ADD_CACHE(vsplit, ib, istart, i, 0);
+      }

      if (close)
-         ADD_CACHE(vsplit, DRAW_GET_IDX(ib, iclose));
+         ADD_CACHE(vsplit, ib, 0, iclose, 0);
   }
   else if (ibias > 0) {
      if (spoken)
-         ADD_CACHE(vsplit, (uint) DRAW_GET_IDX(ib, ispoken) + ibias);
+         ADD_CACHE(vsplit, ib, 0, ispoken, ibias);

      for (i = spoken; i < icount; i++)
-         ADD_CACHE(vsplit, (uint) DRAW_GET_IDX(ib, istart + i) + ibias);
+         ADD_CACHE(vsplit, ib, istart, i, ibias);

      if (close)
-         ADD_CACHE(vsplit, (uint) DRAW_GET_IDX(ib, iclose) + ibias);
+         ADD_CACHE(vsplit, ib, 0, iclose, ibias);
   }
   else {
      if (spoken) {
-         if ((int) ib[ispoken] < -ibias)
-            return;
-         ADD_CACHE(vsplit, DRAW_GET_IDX(ib, ispoken) + ibias);
+         ADD_CACHE(vsplit, ib, 0, ispoken, ibias);
      }

      for (i = spoken; i < icount; i++) {
-         if ((int) DRAW_GET_IDX(ib, istart + i) < -ibias)
-            return;
-         ADD_CACHE(vsplit, DRAW_GET_IDX(ib, istart + i) + ibias);
+         ADD_CACHE(vsplit, ib, istart, i, ibias);
      }

      if (close) {
-         if ((int) DRAW_GET_IDX(ib, iclose) < -ibias)
-            return;
-         ADD_CACHE(vsplit, DRAW_GET_IDX(ib, iclose) + ibias);
+         ADD_CACHE(vsplit, ib, 0, iclose, ibias);
      }
   }

--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -86,12 +86,12 @@ draw_create_vertex_shader(struct draw_context *draw,
            found_clipvertex = TRUE;
            vs->clipvertex_output = i;
         } else if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_CLIPDIST) {
-            if (vs->info.output_semantic_index[i] == 0)
-               vs->clipdistance_output[0] = i;
-            else
-               vs->clipdistance_output[1] = i;
+            debug_assert(vs->info.output_semantic_index[i] <
+                         PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
+            vs->clipdistance_output[vs->info.output_semantic_index[i]] = i;
         } else if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_CULLDIST) {
-            debug_assert(vs->info.output_semantic_index[i] < Elements(vs->culldistance_output));
+            debug_assert(vs->info.output_semantic_index[i] <
+                         PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
            vs->culldistance_output[vs->info.output_semantic_index[i]] = i;
         }
      }
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -112,8 +112,8 @@ struct draw_vertex_shader {
   unsigned position_output;
   unsigned edgeflag_output;
   unsigned clipvertex_output;
-   unsigned clipdistance_output[2];
-   unsigned culldistance_output[2];
+   unsigned clipdistance_output[PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT];
+   unsigned culldistance_output[PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT];
   /* Extracted from shader:
    */
   const float (*immediates)[4];
--- a/src/gallium/auxiliary/draw/draw_vs_variant.c
+++ b/src/gallium/auxiliary/draw/draw_vs_variant.c
@@ -168,6 +168,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,
   vsvg->fetch->run_elts( vsvg->fetch, 
                          elts,
                          count,
+                          vsvg->draw->start_instance,
                          vsvg->draw->instance_id,
                          temp_buffer );

@@ -211,6 +212,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,

   vsvg->emit->run( vsvg->emit,
                    0, count,
+                    vsvg->draw->start_instance,
                    vsvg->draw->instance_id,
                    output_buffer );

@@ -234,6 +236,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
   vsvg->fetch->run( vsvg->fetch, 
                     start,
                     count,
+                     vsvg->draw->start_instance,
                     vsvg->draw->instance_id,
                     temp_buffer );

@@ -274,6 +277,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
   
   vsvg->emit->run( vsvg->emit,
                    0, count,
+                    vsvg->draw->start_instance,
                    vsvg->draw->instance_id,
                    output_buffer );

--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -62,6 +62,7 @@
 #include "lp_bld_debug.h"
 #include "lp_bld_bitarit.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_flow.h"


 #define EXP_POLY_DEGREE 5
@@ -2305,19 +2306,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
   /*
    * This should be faster but all denormals will end up as infinity.
    */
-   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
-        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
+   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;
-      const char *intrinsic = NULL;

-      if (type.length == 4) {
-         intrinsic = "llvm.x86.sse.rsqrt.ps";
-      }
-      else {
-         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
-      }
+      /* rsqrt(1.0) != 1.0 here */
+      res = lp_build_fast_rsqrt(bld, a);
+
      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
@@ -2337,8 +2333,6 @@ lp_build_rsqrt(struct lp_build_context *bld,

         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");

-         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-
         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
@@ -2349,11 +2343,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }
-      else {
-         /* rsqrt(1.0) != 1.0 here */
-         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-
-      }

      return res;
   }
@@ -2361,6 +2350,58 @@ lp_build_rsqrt(struct lp_build_context *bld,
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }

+/**
+ * Return whether a fast (inaccurate) rsqrt instruction is available.
+ * Callers may want to avoid lp_build_fast_rsqrt() when it is not:
+ * e.g. to calculate x^0.5 one may do rsqrt_fast(x) * x, but without the
+ * instruction that would result in sqrt/div/mul, so it is obviously
+ * much better to just call sqrt, skipping both div and mul.
+ */
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type)
+{
+   assert(type.floating);
+
+   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+      return true;
+   }
+   return false;
+}
+
+
+/**
+ * Generate 1/sqrt(a) with the fast rsqrt intrinsic if available, otherwise
+ * via rcp(sqrt(a)). Undefined for values < 0, infinity for +0.
+ * Precision is limited, only ~10 bits guaranteed
+ * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
+ */
+LLVMValueRef
+lp_build_fast_rsqrt(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   const struct lp_type type = bld->type;
+
+   assert(lp_check_value(type, a));
+
+   if (lp_build_fast_rsqrt_available(type)) {
+      const char *intrinsic = NULL;
+
+      if (type.length == 4) {   /* 4 x 32-bit: SSE variant */
+         intrinsic = "llvm.x86.sse.rsqrt.ps";
+      }
+      else {                    /* otherwise 8 x 32-bit: AVX variant */
+         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+      }
+      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+   }
+   else {
+      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
+   }
+   return lp_build_rcp(bld, lp_build_sqrt(bld, a));   /* slow fallback */
+}
+

 /**
 * Generate sin(a) using SSE2
@@ -2561,15 +2602,14 @@ lp_build_sin(struct lp_build_context *bld,
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
-    * y = _mm_add_ps(y,y2);
+    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
-   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
-   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
+   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
-   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
+   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
@@ -2779,14 +2819,14 @@ lp_build_cos(struct lp_build_context *bld,
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
-    * y = _mm_add_ps(y,y2);
+    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
-   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
+   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
-   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
+   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
@@ -2855,7 +2895,7 @@ lp_build_log(struct lp_build_context *bld,
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
-static LLVMValueRef
+LLVMValueRef
 lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -231,6 +231,19 @@ LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a);

+boolean
+lp_build_fast_rsqrt_available(struct lp_type type);
+
+LLVMValueRef
+lp_build_fast_rsqrt(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+LLVMValueRef
+lp_build_polynomial(struct lp_build_context *bld,
+                    LLVMValueRef x,
+                    const double *coeffs,
+                    unsigned num_coeffs);
+
 LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a);
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
@@ -0,0 +1,151 @@
+/**************************************************************************
+ *
+ * Copyright 2013
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper arithmetic functions.
+ *
+ * The functions in this file implement arithmetic operations with support
+ * for overflow detection and reporting.
+ *
+ */
+
+#include "lp_bld_arit_overflow.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_init.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_bitarit.h"
+
+#include "util/u_memory.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_string.h"
+#include "util/u_cpu_detect.h"
+
+#include <float.h>
+
+/* Emit <intr_prefix>.i<width>(a, b); return result, flag overflow in ofbit. */
+static LLVMValueRef
+build_binary_int_overflow(struct gallivm_state *gallivm,
+                          const char *intr_prefix,
+                          LLVMValueRef a,
+                          LLVMValueRef b,
+                          LLVMValueRef *ofbit)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   char intr_str[256];
+   LLVMTypeRef type_ref;
+   LLVMTypeKind type_kind;
+   unsigned type_width;
+   LLVMTypeRef oelems[2];
+   LLVMValueRef oresult;
+   LLVMTypeRef otype;
+
+   debug_assert(LLVMTypeOf(a) == LLVMTypeOf(b));
+   type_ref = LLVMTypeOf(a);
+   type_kind = LLVMGetTypeKind(type_ref);
+
+   debug_assert(type_kind == LLVMIntegerTypeKind);
+   type_width = LLVMGetIntTypeWidth(type_ref);
+
+   debug_assert(type_width == 16 || type_width == 32 || type_width == 64);
+
+   util_snprintf(intr_str, sizeof intr_str, "%s.i%u",
+                 intr_prefix, type_width);   /* prefix + ".i<width>" suffix */
+
+   oelems[0] = type_ref;                                /* arithmetic result */
+   oelems[1] = LLVMInt1TypeInContext(gallivm->context); /* overflow flag, i1 */
+
+   otype = LLVMStructTypeInContext(gallivm->context, oelems, 2, FALSE);
+   oresult = lp_build_intrinsic_binary(builder, intr_str,
+                                       otype, a, b);
+   if (ofbit) {            /* a NULL ofbit means the flag is not reported */
+      if (*ofbit) {        /* chained call: OR into the existing flag */
+         *ofbit = LLVMBuildOr(
+            builder, *ofbit,
+            LLVMBuildExtractValue(builder, oresult, 1, ""), "");
+      } else {             /* first call in a chain: just store the flag */
+         *ofbit = LLVMBuildExtractValue(builder, oresult, 1, "");
+      }
+   }
+
+   return LLVMBuildExtractValue(builder, oresult, 0, "");
+}
+
+/**
+ * Performs unsigned addition of two integers and reports
+ * overflow if detected.
+ *
+ * The values @a and @b must be of the same integer type. The
+ * overflow bit is reported through the IN/OUT @ofbit parameter:
+ * - if @ofbit itself is NULL, the overflow bit is discarded,
+ * - if it's pointing to a null value, the overflow bit is simply
+ *   stored inside the variable it's pointing to,
+ * - if it's pointing to a valid value, then that variable,
+ *   which must be of i1 type, is ORed with the newly detected
+ *   overflow bit. This allows chaining a number of overflow
+ *   functions together, testing the overflow bit only once.
+ */
+LLVMValueRef
+lp_build_uadd_overflow(struct gallivm_state *gallivm,
+                       LLVMValueRef a,
+                       LLVMValueRef b,
+                       LLVMValueRef *ofbit)
+{
+   return build_binary_int_overflow(gallivm, "llvm.uadd.with.overflow",
+                                    a, b, ofbit);
+}
+
+/**
+ * Performs unsigned multiplication of two integers and
+ * reports overflow if detected.
+ *
+ * The values @a and @b must be of the same integer type. The
+ * overflow bit is reported through the IN/OUT @ofbit parameter:
+ * - if @ofbit itself is NULL, the overflow bit is discarded,
+ * - if it's pointing to a null value, the overflow bit is simply
+ *   stored inside the variable it's pointing to,
+ * - if it's pointing to a valid value, then that variable,
+ *   which must be of i1 type, is ORed with the newly detected
+ *   overflow bit. This allows chaining a number of overflow
+ *   functions together, testing the overflow bit only once.
+ */
+LLVMValueRef
+lp_build_umul_overflow(struct gallivm_state *gallivm,
+                       LLVMValueRef a,
+                       LLVMValueRef b,
+                       LLVMValueRef *ofbit)
+{
+   return build_binary_int_overflow(gallivm, "llvm.umul.with.overflow",
+                                    a, b, ofbit);
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ *
+ * Copyright 2013 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper arithmetic functions with support for overflow detection
+ * and reporting.
+ *
+ * @author Zack Rusin <zackr@vmware.com>
+ */
+
+
+#ifndef LP_BLD_ARIT_OVERFLOW_H
+#define LP_BLD_ARIT_OVERFLOW_H
+
+
+#include "gallivm/lp_bld.h"
+
+struct gallivm_state;
+
+LLVMValueRef
+lp_build_uadd_overflow(struct gallivm_state *gallivm,
+                       LLVMValueRef a,
+                       LLVMValueRef b,
+                       LLVMValueRef *ofbit);
+
+LLVMValueRef
+lp_build_umul_overflow(struct gallivm_state *gallivm,
+                       LLVMValueRef a,
+                       LLVMValueRef b,
+                       LLVMValueRef *ofbit);
+
+#endif /* !LP_BLD_ARIT_OVERFLOW_H */
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -79,82 +79,6 @@



-/**
- * Byte swap on element. It will construct a call to intrinsic llvm.bswap
- * based on the type.
- *
- * @param res           element to byte swap.
- * @param type          int16_t, int32_t, int64_t, float or double
- * @param 
- */
-LLVMValueRef
-lp_build_bswap(struct gallivm_state *gallivm,
-               LLVMValueRef res,
-               struct lp_type type)
-{
-   LLVMTypeRef int_type = LLVMIntTypeInContext(gallivm->context,
-                                               type.width);
-   const char *intrinsic = NULL;
-   if (type.width == 8)
-      return res;
-   if (type.width == 16)
-      intrinsic = "llvm.bswap.i16";
-   else if (type.width == 32)
-     intrinsic = "llvm.bswap.i32";
-   else if (type.width == 64)
-      intrinsic = "llvm.bswap.i64";
-
-   assert (intrinsic != NULL);
-
-   /* In case of a floating-point type cast to a int of same size and then
-    * cast back to fp type.
-    */
-   if (type.floating)
-      res = LLVMBuildBitCast(gallivm->builder, res, int_type, "");
-   res = lp_build_intrinsic_unary(gallivm->builder, intrinsic, int_type, res);
-   if (type.floating)
-      res = LLVMBuildBitCast(gallivm->builder, res,
-                             lp_build_elem_type(gallivm, type), "");
-   return res;
-}
-
-
-/**
- * Byte swap every element in the vector.
- *
- * @param packed        <vector> to convert
- * @param src_type      <vector> type of int16_t, int32_t, int64_t, float or
- *                      double
- * @param dst_type      <vector> type to return
- */
-LLVMValueRef
-lp_build_bswap_vec(struct gallivm_state *gallivm,
-                   LLVMValueRef packed,
-                   struct lp_type src_type_vec,
-                   struct lp_type dst_type_vec)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef dst_type = lp_build_elem_type(gallivm, dst_type_vec);
-   LLVMValueRef res;
-
-   if (src_type_vec.length == 1) {
-      res = lp_build_bswap(gallivm, packed, src_type_vec);
-      res = LLVMBuildBitCast(gallivm->builder, res, dst_type, "");
-   } else {
-      unsigned i;
-      res = LLVMGetUndef(lp_build_vec_type(gallivm, dst_type_vec));
-      for (i = 0; i < src_type_vec.length; ++i) {
-         LLVMValueRef index = lp_build_const_int32(gallivm, i);
-         LLVMValueRef elem = LLVMBuildExtractElement(builder, packed, index, "");
-         elem = lp_build_bswap(gallivm, elem, src_type_vec);
-         elem = LLVMBuildBitCast(gallivm->builder, elem, dst_type, "");
-         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
-      }
-   }
-   return res;
-}
-
-
 /**
 * Converts int16 half-float to float32
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -42,17 +42,6 @@

 struct lp_type;

-LLVMValueRef
-lp_build_bswap(struct gallivm_state *gallivm,
-               LLVMValueRef res,
-               struct lp_type type);
-
-LLVMValueRef
-lp_build_bswap_vec(struct gallivm_state *gallivm,
-                   LLVMValueRef packed,
-                   struct lp_type src_type,
-                   struct lp_type dst_type);
-
 LLVMValueRef
 lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src);
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -188,7 +188,7 @@ lp_build_mask_value(struct lp_build_mask_context *mask)
 /**
 * Update boolean mask with given value (bitwise AND).
 * Typically used to update the quad's pixel alive/killed mask
- * after depth testing, alpha testing, TGSI_OPCODE_KIL, etc.
+ * after depth testing, alpha testing, TGSI_OPCODE_KILL_IF, etc.
 */
 void
 lp_build_mask_update(struct lp_build_mask_context *mask,
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -158,4 +158,16 @@ lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
                         LLVMValueRef src,
                         LLVMValueRef *dst);

+LLVMValueRef
+lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
+                              const struct util_format_description *dst_fmt,
+                              struct lp_type src_type,
+                              LLVMValueRef *src);
+
+LLVMValueRef
+lp_build_srgb_to_linear(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src);
+
+
 #endif /* !LP_BLD_FORMAT_H */
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -139,12 +139,12 @@ format_matches_type(const struct util_format_description *desc,


 /**
- * Unpack a single pixel into its RGBA components.
+ * Unpack a single pixel into its XYZW components.
 *
 * @param desc  the pixel format for the packed pixel value
 * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
 *
- * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector.
+ * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
 */
 static INLINE LLVMValueRef
 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
@@ -159,7 +159,6 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,

   boolean normalized;
   boolean needs_uitofp;
-   unsigned shift;
   unsigned i;

   /* TODO: Support more formats */
@@ -172,10 +171,6 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    * matches floating point size */
   assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));

-#ifdef PIPE_ARCH_BIG_ENDIAN
-   packed = lp_build_bswap(gallivm, packed, lp_type_uint(32));
-#endif
-
   /* Broadcast the packed value to all four channels
    * before: packed = BGRA
    * after: packed = {BGRA, BGRA, BGRA, BGRA}
@@ -194,11 +189,11 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
   /* Initialize vector constants */
   normalized = FALSE;
   needs_uitofp = FALSE;
-   shift = 0;

   /* Loop over 4 color components */
   for (i = 0; i < 4; ++i) {
      unsigned bits = desc->channel[i].size;
+      unsigned shift = desc->channel[i].shift;

      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
@@ -224,12 +219,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
         else
            scales[i] =  lp_build_const_float(gallivm, 1.0);
      }
-
-      shift += bits;
   }

-   /* Ex: convert packed = {BGRA, BGRA, BGRA, BGRA}
-    * into masked = {B, G, R, A}
+   /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
+    * into masked = {X, Y, Z, W}
    */
   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
@@ -276,7 +269,6 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
   LLVMValueRef shifts[4];
   LLVMValueRef scales[4];
   boolean normalized;
-   unsigned shift;
   unsigned i, j;

   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
@@ -302,9 +294,9 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
                                       LLVMConstVector(swizzles, 4), "");

   normalized = FALSE;
-   shift = 0;
   for (i = 0; i < 4; ++i) {
      unsigned bits = desc->channel[i].size;
+      unsigned shift = desc->channel[i].shift;

      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
@@ -325,8 +317,6 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
         else
            scales[i] = lp_build_const_float(gallivm, 1.0);
      }
-
-      shift += bits;
   }

   if (normalized)
@@ -410,16 +400,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,

      packed = lp_build_gather(gallivm, type.length/4,
                               format_desc->block.bits, type.width*4,
-                               base_ptr, offset);
+                               base_ptr, offset, TRUE);

      assert(format_desc->block.bits <= vec_len);

      packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
-#ifdef PIPE_ARCH_BIG_ENDIAN
-      if (type.floating)
-         packed = lp_build_bswap_vec(gallivm, packed, type,
-                                    lp_type_float_vec(type.width, vec_len));
-#endif
      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
   }

@@ -453,7 +438,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,

         packed = lp_build_gather_elem(gallivm, num_pixels,
                                       format_desc->block.bits, 32,
-                                       base_ptr, offset, k);
+                                       base_ptr, offset, k, FALSE);

         tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
                                                  format_desc,
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -40,58 +40,6 @@
 #include "pipe/p_state.h"


-#ifdef PIPE_ARCH_BIG_ENDIAN
-static LLVMValueRef
-lp_build_read_int_bswap(struct gallivm_state *gallivm,
-                        LLVMValueRef base_ptr,
-                        unsigned src_width,
-                        LLVMTypeRef src_type,
-                        unsigned i,
-                        LLVMTypeRef dst_type)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef index = lp_build_const_int32(gallivm, i);
-   LLVMValueRef ptr = LLVMBuildGEP(builder, base_ptr, &index, 1, "");
-   LLVMValueRef res = LLVMBuildLoad(builder, ptr, "");
-   res = lp_build_bswap(gallivm, res, lp_type_uint(src_width));
-   return LLVMBuildBitCast(builder, res, dst_type, "");
-}
-
-static LLVMValueRef
-lp_build_fetch_read_big_endian(struct gallivm_state *gallivm,
-                               struct lp_type src_type,
-                               LLVMValueRef base_ptr)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   unsigned src_width = src_type.width;
-   unsigned length = src_type.length;
-   LLVMTypeRef src_elem_type = LLVMIntTypeInContext(gallivm->context, src_width);
-   LLVMTypeRef dst_elem_type = lp_build_elem_type (gallivm, src_type);
-   LLVMTypeRef src_ptr_type = LLVMPointerType(src_elem_type, 0);
-   LLVMValueRef res;
-
-   base_ptr = LLVMBuildPointerCast(builder, base_ptr, src_ptr_type, "");
-   if (length == 1) {
-      /* Scalar */
-      res = lp_build_read_int_bswap(gallivm, base_ptr, src_width, src_elem_type,
-                                    0, dst_elem_type);
-   } else {
-      /* Vector */
-      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-      unsigned i;
-
-      res = LLVMGetUndef(dst_vec_type);
-      for (i = 0; i < length; ++i) {
-         LLVMValueRef index = lp_build_const_int32(gallivm, i);
-         LLVMValueRef elem = lp_build_read_int_bswap(gallivm, base_ptr, src_width,
-                                                     src_elem_type, i, dst_elem_type);
-         res = LLVMBuildInsertElement(builder, res, elem, index, "");
-      }
-   }
-
-   return res;
-}
-#endif

 /**
 * @brief lp_build_fetch_rgba_aos_array
@@ -124,13 +72,9 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,

   /* Read whole vector from memory, unaligned */
   ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
-#ifdef PIPE_ARCH_BIG_ENDIAN
-   res = lp_build_fetch_read_big_endian(gallivm, src_type, ptr);
-#else
   ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
   res = LLVMBuildLoad(builder, ptr, "");
   lp_set_load_alignment(res, src_type.width / 8);
-#endif

   /* Truncate doubles to float */
   if (src_type.floating && src_type.width == 64) {
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -115,7 +115,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
-   unsigned start;
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
@@ -128,9 +127,9 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
-   start = 0;
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      const unsigned width = format_desc->channel[chan].size;
+      const unsigned start = format_desc->channel[chan].shift;
      const unsigned stop = start + width;
      LLVMValueRef input;

@@ -164,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
          */

         if (type.floating) {
-            if(format_desc->channel[chan].normalized)
-               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
-            else
-               input = LLVMBuildSIToFP(builder, input,
-                                       lp_build_vec_type(gallivm, type), "");
+            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+               assert(width == 8);
+               if (format_desc->swizzle[3] == chan) {
+                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+               }
+               else {
+                  struct lp_type conv_type = lp_uint_type(type);
+                  input = lp_build_srgb_to_linear(gallivm, conv_type, input);
+               }
+            }
+            else {
+               if(format_desc->channel[chan].normalized)
+                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+               else
+                  input = LLVMBuildSIToFP(builder, input,
+                                          lp_build_vec_type(gallivm, type), "");
+            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
@@ -256,8 +267,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
      }

      inputs[chan] = input;
-
-      start = stop;
   }

   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
@@ -291,7 +300,11 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,

   /* Decode the input vector components */
   for (chan = 0; chan < 4; ++chan) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
      unsigned start = chan*8;
+#else
+      unsigned start = (3-chan)*8;
+#endif
      unsigned stop = start + 8;
      LLVMValueRef input;

@@ -343,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
@@ -360,13 +374,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,

      /*
       * gather the texels from the texture
-       * Ex: packed = {BGRA, BGRA, BGRA, BGRA}.
+       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
+      assert(format_desc->block.bits <= type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               type.width,
-                               base_ptr, offset);
+                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
@@ -391,7 +406,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
-                               type.width, base_ptr, offset);
+                               type.width, base_ptr, offset,
+                               FALSE);
      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
@@ -418,14 +434,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length,
-                                  32, type.width, base_ptr, offset);
+                                  32, type.width, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length,
-                                  32, type.width, base_ptr, offset);
+                                  32, type.width, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
@@ -0,0 +1,344 @@
+/**************************************************************************
+ *
+ * Copyright 2013 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Format conversion code for srgb formats.
+ *
+ * Functions for converting from srgb to linear and vice versa.
+ * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
+ *
+ * srgb->linear:
+ * cl = cs / 12.92,                 cs <= 0.04045
+ * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
+ *
+ * linear->srgb:
+ * if (isnan(cl)) {
+ *    Map IEEE-754 Not-a-number to zero.
+ *    cs = 0.0;
+ * } else if (cl > 1.0) {
+ *    cs = 1.0;
+ * } else if (cl < 0.0) {
+ *    cs = 0.0;
+ * } else if (cl < 0.0031308) {
+ *    cs = 12.92 * cl;
+ * } else {
+ *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
+ * }
+ *
+ * This does not need to be accurate, however at least for d3d10
+ * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
+ * 1) For srgb->linear, it is required that the error on the srgb side is
+ *    not larger than 0.5f, which I interpret to mean that if you map the
+ *    value back to srgb from linear using the ideal conversion, it would not
+ *    be off by more than 0.5f (that is, it would map to the same 8-bit
+ *    integer value as it was before conversion to linear).
+ * 2) linear->srgb is permitted an error of 0.6f, which luckily looks like
+ *    quite a large allowance.
+ * 3) Additionally, all srgb values converted to linear and back must result
+ *    in the same value as they were originally.
+ *
+ * @author Roland Scheidegger <sroland@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_format.h"
+
+
+
+/**
+ * Convert srgb int values to linear float values.
+ * There are several ways to do this, e.g.
+ * - table
+ * - doing the pow() with int-to-float and float-to-int tricks
+ *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
+ * - just using standard polynomial approximation
+ *   (3rd order polynomial is required for crappy but just sufficient accuracy)
+ * @param src_type  type of src; its element width must be 32 (asserted).
+ * @param src   integer (vector) value(s), 8 bit values unpacked to 32 bit.
+ * @return the linear value(s) as 32 bit float (vector).
+ */
+LLVMValueRef
+lp_build_srgb_to_linear(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src)
+{
+   struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
+   struct lp_build_context f32_bld;
+   LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
+   double coeffs[4] = {0.0023f,
+                       0.0030f / 255.0f,
+                       0.6935f / (255.0f * 255.0f),
+                       0.3012f / (255.0f * 255.0f * 255.0f)
+   };
+
+   assert(src_type.width == 32);
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+
+   /*
+    * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
+    * ( poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
+    * (found with octave polyfit and some magic as I couldn't get the error
+    * function right). Using the above mentioned error function, the values stay
+    * within +-0.35, except for the lowest values - hence tweaking linear segment
+    * to cover the first 16 instead of the first 11 values (the error stays
+    * just about acceptable there too).
+    * Hence: lin = src > 15 ? poly : src / 12.6
+    * This function really only makes sense for vectors, should use LUT otherwise.
+    * All in all (including float conversion) 11 instructions (with sse4.1),
+    * 6 constants (polynomial could be done with 1 instruction less at the cost
+    * of slightly worse dependency chain, fma should also help).
+    */
+   /* the 1/255 mul is folded into the constants (see coeffs and lin_const) */
+   srcf = lp_build_int_to_float(&f32_bld, src);
+   lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
+   part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
+
+   part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);
+   /* inputs up to 15 take the linear segment, everything above the polynomial */
+   lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
+   is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
+   return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
+}
+
+
+/**
+ * Convert linear float values to srgb int values.
+ * There are several ways to do this, e.g.
+ * - use table (based on exponent/highest order mantissa bits) and do
+ *   linear interpolation (https://gist.github.com/rygorous/2203834)
+ * - Chebyshev polynomial, or approximation using reciprocals
+ * - using int-to-float and float-to-int tricks for pow()
+ *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
+ * @param src_type  type of src.
+ * @param src   float (vector) value(s) to convert.
+ * @return srgb value(s), scaled by 255 and rounded to integer.
+ */
+static LLVMValueRef
+lp_build_linear_to_srgb(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context f32_bld;
+   LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
+
+   lp_build_context_init(&f32_bld, gallivm, src_type);
+   /* out of range inputs are clamped to [0,1] before the conversion */
+   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
+
+   if (0) {
+      /*
+       * (disabled) using int-to-float and float-to-int trick for pow().
+       * This is much more accurate than necessary thanks to the correction,
+       * but it most certainly makes no sense without rsqrt available.
+       * Bonus points if you understand how this works...
+       * All in all (including min/max clamp, conversion) 19 instructions.
+       */
+
+      float exp_f = 2.0f / 3.0f;
+      /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
+      float exp2f_c = 1.30438178253e+19f;
+      float coeff_f = 0.62996f;
+      LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
+      struct lp_type int_type = lp_int_type(src_type);
+
+      /*
+       * First calculate approx x^(8/12)
+       */
+      exponent = lp_build_const_vec(gallivm, src_type, exp_f);
+      coeff = lp_build_const_vec(gallivm, src_type,
+                                 exp2f_c * powf(coeff_f, 1.0f / exp_f));
+
+      /* premultiply src */
+      tmp = lp_build_mul(&f32_bld, coeff, src);
+      /* "log2" */
+      tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
+      tmp = lp_build_int_to_float(&f32_bld, tmp);
+      /* multiply for pow */
+      tmp = lp_build_mul(&f32_bld, tmp, exponent);
+      /* "exp2" */
+      pow_approx = lp_build_itrunc(&f32_bld, tmp);
+      pow_approx = LLVMBuildBitCast(builder, pow_approx,
+                                    lp_build_vec_type(gallivm, src_type), "");
+
+      /*
+       * Since that pow was inaccurate (like 3 bits, though each sqrt step would
+       * give another bit), compensate the error (which is why we chose another
+       * exponent in the first place).
+       */
+      /* x * x^(8/12) = x^(20/12) */
+      pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
+
+      /* x * x * x^(-4/12) = x^(20/12) */
+      /* Should avoid using rsqrt if it's not available, but
+       * using x * x^(4/12) * x^(4/12) instead will change error weight */
+      tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
+      x2 = lp_build_mul(&f32_bld, src, src);
+      pow_2 = lp_build_mul(&f32_bld, x2, tmp);
+
+      /* average the values so the errors cancel out, compensate bias,
+       * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
+       * for conversion to int in here */
+      tmp = lp_build_add(&f32_bld, pow_1, pow_2);
+      coeff = lp_build_const_vec(gallivm, src_type,
+                                 1.0f / (3.0f * coeff_f) * 0.999852f *
+                                 powf(1.055f * 255.0f, 4.0f));
+      pow_final = lp_build_mul(&f32_bld, tmp, coeff);
+
+      /* x^(5/12) = rsqrt(rsqrt(x^(20/12))) */
+      if (lp_build_fast_rsqrt_available(src_type)) {
+         pow_final = lp_build_fast_rsqrt(&f32_bld,
+                        lp_build_fast_rsqrt(&f32_bld, pow_final));
+      }
+      else {
+         pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
+      }
+      pow_final = lp_build_add(&f32_bld, pow_final,
+                               lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
+   }
+
+   else {
+      /*
+       * using "rational polynomial" approximation here.
+       * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
+       * factoring in the 255.0 mul and the scaling mul.
+       * (a is closer to actual value so has higher weight than b.)
+       * Note: the constants are magic values. They were found empirically,
+       * possibly could be improved but good enough (be VERY careful with
+       * error metric if you'd want to tweak them, they also MUST fit with
+       * the crappy polynomial above for srgb->linear since it is required
+       * that each srgb value maps back to the same value).
+       * This function has an error of max +-0.17 (and we'd only require +-0.6),
+       * for the approximated srgb->linear values the error is naturally larger
+       * (+-0.42) but still accurate enough (required +-0.5 essentially).
+       * All in all (including min/max clamp, conversion) 15 instructions.
+       * FMA would help (minus 2 instructions).
+       */
+
+      LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;
+
+      if (lp_build_fast_rsqrt_available(src_type)) {
+         tmp = lp_build_fast_rsqrt(&f32_bld, src);
+         x05 = lp_build_mul(&f32_bld, src, tmp);
+      }
+      else {
+         /*
+          * I don't really expect this to be practical without rsqrt
+          * but there's no reason for triple punishment so at least
+          * save the otherwise resulting division and unnecessary mul...
+          */
+         x05 = lp_build_sqrt(&f32_bld, src);
+      }
+
+      tmp = lp_build_mul(&f32_bld, x05, src);
+      if (lp_build_fast_rsqrt_available(src_type)) {
+         x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
+      }
+      else {
+         x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
+      }
+
+      a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
+      b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
+      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
+
+      tmp = lp_build_mul(&f32_bld, a_const, x0375);
+      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
+      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
+      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
+   }
+
+   /* linear part is easy */
+   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
+   lin = lp_build_mul(&f32_bld, src, lin_const);
+
+   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
+   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
+   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
+   /* NOTE(review): sign = 0 presumably marks the result unsigned - confirm */
+   f32_bld.type.sign = 0;
+   return lp_build_iround(&f32_bld, tmp);
+}
+
+
+/**
+ * Convert linear float soa values to packed srgb AoS values.
+ * Only packed 4x8bit formats are handled (rgba and rgbx plus swizzles).
+ *
+ * @param dst_fmt   destination format; its swizzles and channel shifts are used.
+ * @param src_type  type of the src values.
+ * @param src   float SoA (vector) values to convert, src[3] being alpha.
+ * @return the packed value(s), an integer (vector) of type lp_int_type(src_type).
+ */
+LLVMValueRef
+lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
+                              const struct util_format_description *dst_fmt,
+                              struct lp_type src_type,
+                              LLVMValueRef *src)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned chan;
+   struct lp_build_context f32_bld;
+   struct lp_type int32_type = lp_int_type(src_type);
+   LLVMValueRef tmpsrgb[4], alpha, dst;
+
+   lp_build_context_init(&f32_bld, gallivm, src_type);
+
+   /* rgb is subject to linear->srgb conversion, alpha is not */
+   for (chan = 0; chan < 3; chan++) {
+      tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, src[chan]);
+   }
+   /* can't use lp_build_conv since we want to keep values as 32bit here
+    * so we can interleave with rgb to go from SoA->AoS. */
+   alpha = lp_build_clamp(&f32_bld, src[3], f32_bld.zero, f32_bld.one);
+   alpha = lp_build_mul(&f32_bld, alpha,
+                        lp_build_const_vec(gallivm, src_type, 255.0f));
+   tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);
+
+   dst = lp_build_zero(gallivm, int32_type);
+   for (chan = 0; chan < dst_fmt->nr_channels; chan++) {
+      if (dst_fmt->swizzle[chan] <= UTIL_FORMAT_SWIZZLE_W) {
+         unsigned ls;
+         LLVMValueRef shifted, shift_val;
+         ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;
+         shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);
+         shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");
+         dst = LLVMBuildOr(builder, dst, shifted, "");
+      }
+   }
+   return dst;
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -497,7 +497,7 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
   assert(format_desc->block.width == 2);
   assert(format_desc->block.height == 1);

-   packed = lp_build_gather(gallivm, n, 32, 32, base_ptr, offset);
+   packed = lp_build_gather(gallivm, n, 32, 32, base_ptr, offset, FALSE);

   (void)j;

--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -78,7 +78,8 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
                     unsigned dst_width,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets,
-                     unsigned i)
+                     unsigned i,
+                     boolean vector_justify)
 {
   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
@@ -97,10 +98,12 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
      res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
   } else if (src_width < dst_width) {
      res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
+      if (vector_justify) {
 #ifdef PIPE_ARCH_BIG_ENDIAN
-      res = LLVMBuildShl(gallivm->builder, res,
-                         LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
+         res = LLVMBuildShl(gallivm->builder, res,
+                            LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
 #endif
+      }
   }

   return res;
@@ -112,11 +115,20 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
 * Use for fetching texels from a texture.
 * For SSE, typical values are length=4, src_width=32, dst_width=32.
 *
+ * When src_width < dst_width, the return value can be justified in
+ * one of two ways:
+ * "integer justification" is used when the caller treats the destination
+ * as a packed integer bitmask, as described by the channels' "shift" and
+ * "width" fields;
+ * "vector justification" is used when the caller casts the destination
+ * to a vector and needs channel X to be in vector element 0.
+ *
 * @param length length of the offsets
 * @param src_width src element width in bits
 * @param dst_width result element width in bits (src will be expanded to fit)
 * @param base_ptr base pointer, should be a i8 pointer type.
 * @param offsets vector with offsets
+ * @param vector_justify select vector rather than integer justification
 */
 LLVMValueRef
 lp_build_gather(struct gallivm_state *gallivm,
@@ -124,7 +136,8 @@ lp_build_gather(struct gallivm_state *gallivm,
                unsigned src_width,
                unsigned dst_width,
                LLVMValueRef base_ptr,
-                LLVMValueRef offsets)
+                LLVMValueRef offsets,
+                boolean vector_justify)
 {
   LLVMValueRef res;

@@ -132,7 +145,7 @@ lp_build_gather(struct gallivm_state *gallivm,
      /* Scalar */
      return lp_build_gather_elem(gallivm, length,
                                  src_width, dst_width,
-                                  base_ptr, offsets, 0);
+                                  base_ptr, offsets, 0, vector_justify);
   } else {
      /* Vector */

@@ -146,7 +159,7 @@ lp_build_gather(struct gallivm_state *gallivm,
         LLVMValueRef elem;
         elem = lp_build_gather_elem(gallivm, length,
                                     src_width, dst_width,
-                                     base_ptr, offsets, i);
+                                     base_ptr, offsets, i, vector_justify);
         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
      }
   }
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -47,7 +47,8 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
                     unsigned dst_width,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets,
-                     unsigned i);
+                     unsigned i,
+                     boolean vector_justify);

 LLVMValueRef
 lp_build_gather(struct gallivm_state *gallivm,
@@ -55,7 +56,8 @@ lp_build_gather(struct gallivm_state *gallivm,
                unsigned src_width,
                unsigned dst_width,
                LLVMValueRef base_ptr,
-                LLVMValueRef offsets);
+                LLVMValueRef offsets,
+                boolean vector_justify);

 LLVMValueRef
 lp_build_gather_values(struct gallivm_state * gallivm,
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -49,7 +49,7 @@
 *   - MC-JIT supports limited OSes (MacOSX and Linux)
 * - standard JIT in LLVM 3.1, with backports
 */
-#if defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390)
+#if defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390) || defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
 #  define USE_MCJIT 1
 #  define HAVE_AVX 0
 #elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT))
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -215,7 +215,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2];
   LLVMBuilderRef builder = bld->gallivm->builder;
@@ -235,6 +235,8 @@ lp_build_rho(struct lp_build_sample_context *bld,

   /* Note that all simplified calculations will only work for isotropic filtering */

+   assert(bld->num_lods != length);
+
   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
@@ -248,14 +250,14 @@ lp_build_rho(struct lp_build_sample_context *bld,
       * Cube map code did already everything except size mul and per-quad extraction.
       */
      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      perquadf_bld->type, cube_rho, 0);
+                                      levelf_bld->type, cube_rho, 0);
      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-         rho = lp_build_sqrt(perquadf_bld, rho);
+         rho = lp_build_sqrt(levelf_bld, rho);
      }
      /* Could optimize this for single quad just skip the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-                                            perquadf_bld->type, float_size, index0);
-      rho = lp_build_mul(perquadf_bld, cubesize, rho);
+                                            levelf_bld->type, float_size, index0);
+      rho = lp_build_mul(levelf_bld, cubesize, rho);
   }
   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -289,12 +291,12 @@ lp_build_rho(struct lp_build_sample_context *bld,
         }
         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         perquadf_bld->type, rho_vec, 0);
+                                         levelf_bld->type, rho_vec, 0);
         /*
          * note that as long as we don't care about per-pixel lod could reduce math
          * more (at some shuffle cost), but for now only do sqrt after packing.
          */
-         rho = lp_build_sqrt(perquadf_bld, rho);
+         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         rho_vec = ddmax[0];
@@ -309,7 +311,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
          * since we can't handle per-pixel rho/lod from now on (TODO).
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         perquadf_bld->type, rho_vec, 0);
+                                         levelf_bld->type, rho_vec, 0);
      }
   }
   else {
@@ -381,8 +383,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         perquadf_bld->type, rho_vec, 0);
-         rho = lp_build_sqrt(perquadf_bld, rho);
+                                         levelf_bld->type, rho_vec, 0);
+         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -462,7 +464,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
               }
            }
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            perquadf_bld->type, rho, 0);
+                                            levelf_bld->type, rho, 0);
         }
         else {
            if (dims <= 1) {
@@ -652,11 +654,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,

 {
   LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef lod;

-   *out_lod_ipart = bld->perquadi_bld.zero;
-   *out_lod_fpart = perquadf_bld->zero;
+   *out_lod_ipart = bld->leveli_bld.zero;
+   *out_lod_fpart = levelf_bld->zero;

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
@@ -666,12 +668,15 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
         bld->dynamic_state->min_lod(bld->dynamic_state,
                                     bld->gallivm, sampler_unit);

-      lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
+      lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
-         lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                                         perquadf_bld->type, explicit_lod, 0);
+         if (bld->num_lods != bld->coord_type.length)
+            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+                                            levelf_bld->type, explicit_lod, 0);
+         else
+            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;
@@ -694,29 +699,29 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-               *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
-               *out_lod_fpart = perquadf_bld->zero;
+               *out_lod_ipart = lp_build_ilog2(levelf_bld, rho);
+               *out_lod_fpart = levelf_bld->zero;
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
-               lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
+               lp_build_brilinear_rho(levelf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               return;
            }
         }

         if (0) {
-            lod = lp_build_log2(perquadf_bld, rho);
+            lod = lp_build_log2(levelf_bld, rho);
         }
         else {
-            lod = lp_build_fast_log2(perquadf_bld, rho);
+            lod = lp_build_fast_log2(levelf_bld, rho);
         }

         /* add shader lod bias */
         if (lod_bias) {
            lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                  perquadf_bld->type, lod_bias, 0);
+                  levelf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }
@@ -726,7 +731,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
         LLVMValueRef sampler_lod_bias =
            bld->dynamic_state->lod_bias(bld->dynamic_state,
                                         bld->gallivm, sampler_unit);
-         sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+         sampler_lod_bias = lp_build_broadcast_scalar(levelf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }
@@ -736,33 +741,33 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
-         max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
+         max_lod = lp_build_broadcast_scalar(levelf_bld, max_lod);

-         lod = lp_build_min(perquadf_bld, lod, max_lod);
+         lod = lp_build_min(levelf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
-         min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
+         min_lod = lp_build_broadcast_scalar(levelf_bld, min_lod);

-         lod = lp_build_max(perquadf_bld, lod, min_lod);
+         lod = lp_build_max(levelf_bld, lod, min_lod);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
-         lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
+         lp_build_brilinear_lod(levelf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
-         lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
+         lp_build_ifloor_fract(levelf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
-      *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
+      *out_lod_ipart = lp_build_iround(levelf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");
@@ -784,20 +789,20 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out)
 {
-   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   LLVMValueRef first_level, last_level, level;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
-   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
-   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
+   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

-   level = lp_build_add(perquadi_bld, lod_ipart, first_level);
+   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
+   *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
 }


@@ -815,8 +820,8 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           LLVMValueRef *level1_out)
 {
   LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *leveli_bld = &bld->leveli_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;
@@ -825,11 +830,11 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
-   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
-   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
+   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

-   *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
-   *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
+   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
+   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -843,7 +848,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
    * converting to our lp_bld_logic helpers.
    */
 #if HAVE_LLVM < 0x0301
-   assert(perquadi_bld->type.length == 1);
+   assert(leveli_bld->type.length == 1);
 #endif

   /* *level0_out < first_level */
@@ -858,7 +863,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
-                                      perquadf_bld->zero, *lod_fpart_inout, "");
+                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -872,7 +877,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
-                                      perquadf_bld->zero, *lod_fpart_inout, "");
+                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
@@ -1087,7 +1092,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
-                                                 bld->perquadi_bld.type,
+                                                 bld->leveli_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
@@ -1116,7 +1121,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
         */
         assert(bld->num_lods == bld->coord_bld.type.length);
         if (bld->dims == 1) {
-            assert(bld->int_size_bld.type.length == 1);
+            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            /* vector shift with variable shift count alert... */
@@ -1131,10 +1136,9 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
            }
-            int_size_vec = lp_build_concat(bld->gallivm,
-                                           tmp,
-                                           bld->int_size_in_bld.type,
-                                           bld->num_lods);
+            *out_size = lp_build_concat(bld->gallivm, tmp,
+                                        bld->int_size_in_bld.type,
+                                        bld->num_lods);
         }
      }
   }
@@ -1218,10 +1222,10 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
-            *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
-                                                   coord_type, size, 1);
+            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+                                                    coord_type, size, 1);
            if (dims == 3) {
-               *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -268,13 +268,13 @@ struct lp_build_sample_context
   struct lp_type texel_type;
   struct lp_build_context texel_bld;

-   /** Float per-quad type */
-   struct lp_type perquadf_type;
-   struct lp_build_context perquadf_bld;
+   /** Float level type */
+   struct lp_type levelf_type;
+   struct lp_build_context levelf_bld;

-   /** Int per-quad type */
-   struct lp_type perquadi_type;
-   struct lp_build_context perquadi_bld;
+   /** Int level type */
+   struct lp_type leveli_type;
+   struct lp_build_context leveli_bld;

   /* Common dynamic state values */
   LLVMValueRef row_stride_array;
@@ -477,6 +477,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                    const struct lp_derivatives *derivs,
                    LLVMValueRef lod_bias,
                    LLVMValueRef explicit_lod,
+                    boolean scalar_lod,
                    LLVMValueRef texel_out[4]);


--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -531,7 +531,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                              bld->texel_type.length,
                              bld->format_desc->block.bits,
                              bld->texel_type.width,
-                              data_ptr, offset);
+                              data_ptr, offset, TRUE);

      rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
   }
@@ -893,7 +893,7 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       bld->texel_type.width,
-                                       data_ptr, offset[k][j][i]);
+                                       data_ptr, offset[k][j][i], TRUE);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
@@ -1422,8 +1422,8 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
-                                                     bld->perquadf_bld.type, 256.0);
-      LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
+                                                     bld->levelf_bld.type, 256.0);
+      LLVMTypeRef i32vec_type = bld->leveli_bld.vec_type;
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
      unsigned num_quads = bld->coord_bld.type.length / 4;
@@ -1433,9 +1433,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");

      /* need_lerp = lod_fpart > 0 */
-      if (num_quads == 1) {
+      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
-                                   lod_fpart, bld->perquadi_bld.zero,
+                                   lod_fpart, bld->leveli_bld.zero,
                                   "need_lerp");
      }
      else {
@@ -1450,9 +1450,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          * lod_fpart values have same sign.
          * We can however then skip the greater than comparison.
          */
-         lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
-                                  bld->perquadi_bld.zero);
-         need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
+         lod_fpart = lp_build_max(&bld->leveli_bld, lod_fpart,
+                                  bld->leveli_bld.zero);
+         need_lerp = lp_build_any_true_range(&bld->leveli_bld, bld->num_lods, lod_fpart);
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
@@ -1462,9 +1462,6 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

         /* sample the second mipmap level */
-         lp_build_mipmap_level_sizes(bld, ilevel1,
-                                     &size1,
-                                     &row_stride1_vec, &img_stride1_vec);
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
@@ -1511,7 +1508,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,

         /* interpolate samples from the two mipmap levels */

-         if (num_quads == 1) {
+         if (num_quads == 1 && bld->num_lods == 1) {
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
            lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);

@@ -1526,17 +1523,16 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 #endif
         }
         else {
-            const unsigned num_chans_per_quad = 4 * 4;
-            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->perquadi_bld.type.length);
+            unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
+            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->leveli_bld.type.length);
            LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];

            /* Take the LSB of lod_fpart */
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");

            /* Broadcast each lod weight into their respective channels */
-            assert(u8n_bld.type.length == num_quads * num_chans_per_quad);
            for (i = 0; i < u8n_bld.type.length; ++i) {
-               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_quad);
+               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
            }
            lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
                                               LLVMConstVector(shuffle, u8n_bld.type.length), "");
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -979,17 +979,17 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
-      unsigned num_quads = bld->coord_bld.type.length / 4;

      /* need_lerp = lod_fpart > 0 */
-      if (num_quads == 1) {
+      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
-                                   lod_fpart, bld->perquadf_bld.zero,
+                                   lod_fpart, bld->levelf_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
-          * We'll do mip filtering if any of the quads need it.
+          * We'll do mip filtering if any of the quads (or individual
+          * pixels in case of per-pixel lod) need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it.
          */
@@ -998,13 +998,13 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          * negative values which would screw up filtering if not all
          * lod_fpart values have same sign.
          */
-         lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
-                                  bld->perquadf_bld.zero);
-         need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+         lod_fpart = lp_build_max(&bld->levelf_bld, lod_fpart,
+                                  bld->levelf_bld.zero);
+         need_lerp = lp_build_compare(bld->gallivm, bld->levelf_bld.type,
                                      PIPE_FUNC_GREATER,
-                                      lod_fpart, bld->perquadf_bld.zero);
-         need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
-     }
+                                      lod_fpart, bld->levelf_bld.zero);
+         need_lerp = lp_build_any_true_range(&bld->leveli_bld, bld->num_lods, need_lerp);
+      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
@@ -1036,10 +1036,11 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,

         /* interpolate samples from the two mipmap levels */

-         lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
-                                                           bld->perquadf_bld.type,
-                                                           bld->texel_bld.type,
-                                                           lod_fpart);
+         if (bld->num_lods != bld->coord_type.length)
+            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+                                                              bld->levelf_bld.type,
+                                                              bld->texel_bld.type,
+                                                              lod_fpart);

         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -1143,7 +1144,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
                            mip_filter,
                            lod_ipart, lod_fpart);
   } else {
-      *lod_ipart = bld->perquadi_bld.zero;
+      *lod_ipart = bld->leveli_bld.zero;
   }

   /*
@@ -1166,7 +1167,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
      else {
         first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                       bld->gallivm, texture_index);
-         first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+         first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
         *ilevel0 = first_level;
      }
      break;
@@ -1295,7 +1296,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
                     const LLVMValueRef *offsets,
                     LLVMValueRef *colors_out)
 {
-   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+   struct lp_build_context *perquadi_bld = &bld->leveli_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   unsigned dims = bld->dims, chan;
   unsigned target = bld->static_texture_state->target;
@@ -1305,10 +1306,14 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
   LLVMValueRef width, height, depth, i, j;
   LLVMValueRef offset, out_of_bounds, out1;

-   /* XXX just like ordinary sampling, we don't handle per-pixel lod (yet). */
   if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
-      ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
-                                         perquadi_bld->type, explicit_lod, 0);
+      if (bld->num_lods != int_coord_bld->type.length) {
+         ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
+                                            perquadi_bld->type, explicit_lod, 0);
+      }
+      else {
+         ilevel = explicit_lod;
+      }
      lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel);
   }
   else {
@@ -1489,6 +1494,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                    const struct lp_derivatives *derivs, /* optional */
                    LLVMValueRef lod_bias, /* optional */
                    LLVMValueRef explicit_lod, /* optional */
+                    boolean scalar_lod,
                    LLVMValueRef texel_out[4])
 {
   unsigned dims = texture_dims(static_texture_state->target);
@@ -1529,10 +1535,6 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
   bld.float_size_in_type.length = dims > 1 ? 4 : 1;
   bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
   bld.texel_type = type;
-   bld.perquadf_type = type;
-   /* we want native vector size to be able to use our intrinsics */
-   bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
-   bld.perquadi_type = lp_int_type(bld.perquadf_type);

   /* always using the first channel hopefully should be safe,
    * if not things WILL break in other places anyway.
@@ -1563,21 +1565,51 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
      debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
   }

+   /*
+    * This is all a bit complicated; different paths are chosen for performance
+    * reasons.
+    * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
+    * everything (the last two options are equivalent for 4-wide case).
+    * If there's per-quad lod but we split to 4-wide so we can use AoS, the
+    * per-quad lod is calculated first and the lod value is extracted afterwards.
+    * As far as lod handling in the further sample/filter code is concerned,
+    * this makes that case basically the same as the 1 lod for everything case.
+    * Different lod handling mostly shows up when building mipmap sizes
+    * (lp_build_mipmap_level_sizes() and friends) and also in filtering
+    * (getting the fractional part of the lod to the right texels).
+    */
+
   /*
    * There are other situations where at least the multiple int lods could be
    * avoided like min and max lod being equal.
    */
-   if ((is_fetch && explicit_lod && bld.static_texture_state->target != PIPE_BUFFER) ||
-       (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+   if (explicit_lod && !scalar_lod &&
+       ((is_fetch && bld.static_texture_state->target != PIPE_BUFFER) ||
+        (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+      bld.num_lods = type.length;
+   /* TODO: for true scalar_lod should only use 1 lod value */
+   else if ((is_fetch && explicit_lod && bld.static_texture_state->target != PIPE_BUFFER ) ||
+            (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
      bld.num_lods = num_quads;
   }
   else {
      bld.num_lods = 1;
   }

+   bld.levelf_type = type;
+   /* we want native vector size to be able to use our intrinsics */
+   if (bld.num_lods != type.length) {
+      bld.levelf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+   }
+   bld.leveli_type = lp_int_type(bld.levelf_type);
   bld.float_size_type = bld.float_size_in_type;
-   bld.float_size_type.length = bld.num_lods > 1 ? type.length :
-                                   bld.float_size_in_type.length;
+   /* Note: size vectors may not be native. They contain minified w/h/d/_ values;
+    * with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/..., so up to 8x4f32 */
+   if (bld.num_lods > 1) {
+      bld.float_size_type.length = bld.num_lods == type.length ?
+                                      bld.num_lods * bld.float_size_in_type.length :
+                                      type.length;
+   }
   bld.int_size_type = lp_int_type(bld.float_size_type);

   lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
@@ -1590,8 +1622,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
   lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
   lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
   lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
-   lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
-   lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);
+   lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
+   lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);

   /* Get the dynamic state */
   tex_width = dynamic_state->width(dynamic_state, gallivm, texture_index);
@@ -1735,14 +1767,31 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
         bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
         bld4.texel_type = bld.texel_type;
         bld4.texel_type.length = 4;
-         bld4.perquadf_type = type4;
+         bld4.levelf_type = type4;
         /* we want native vector size to be able to use our intrinsics */
-         bld4.perquadf_type.length = 1;
-         bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+         bld4.levelf_type.length = 1;
+         bld4.leveli_type = lp_int_type(bld4.levelf_type);

-         bld4.num_lods = 1;
-         bld4.int_size_type = bld4.int_size_in_type;
+         if (explicit_lod && !scalar_lod &&
+             ((is_fetch && bld.static_texture_state->target != PIPE_BUFFER) ||
+              (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+            bld4.num_lods = type4.length;
+         else
+            bld4.num_lods = 1;
+
+         bld4.levelf_type = type4;
+         /* we want native vector size to be able to use our intrinsics */
+         if (bld4.num_lods != type4.length) {
+            bld4.levelf_type.length = 1;
+         }
+         bld4.leveli_type = lp_int_type(bld4.levelf_type);
         bld4.float_size_type = bld4.float_size_in_type;
+         if (bld4.num_lods > 1) {
+            bld4.float_size_type.length = bld4.num_lods == type4.length ?
+                                            bld4.num_lods * bld4.float_size_in_type.length :
+                                            type4.length;
+         }
+         bld4.int_size_type = lp_int_type(bld4.float_size_type);

         lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
         lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
@@ -1754,15 +1803,15 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
         lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
         lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
         lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
-         lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
-         lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+         lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
+         lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef s4, t4, r4;
-            LLVMValueRef lod_iparts, lod_fparts = NULL;
-            LLVMValueRef ilevel0s, ilevel1s = NULL;
-            LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+            LLVMValueRef lod_ipart4, lod_fpart4 = NULL;
+            LLVMValueRef ilevel04, ilevel14 = NULL;
            LLVMValueRef offsets4[4] = { NULL };
+            unsigned num_lods = bld4.num_lods;

            s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
            t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
@@ -1777,27 +1826,27 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                  }
               }
            }
-            lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
-            ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+            lod_ipart4 = lp_build_extract_range(gallivm, lod_ipart, num_lods * i, num_lods);
+            ilevel04 = lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-               ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
-               lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+               ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
+               lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
            }

            if (use_aos) {
               /* do sampling/filtering with fixed pt arithmetic */
               lp_build_sample_aos(&bld4, sampler_index,
                                   s4, t4, r4, offsets4,
-                                   lod_iparts, lod_fparts,
-                                   ilevel0s, ilevel1s,
+                                   lod_ipart4, lod_fpart4,
+                                   ilevel04, ilevel14,
                                   texelout4);
            }

            else {
               lp_build_sample_general(&bld4, sampler_index,
                                       s4, t4, r4, offsets4,
-                                       lod_iparts, lod_fparts,
-                                       ilevel0s, ilevel1s,
+                                       lod_ipart4, lod_fpart4,
+                                       ilevel04, ilevel14,
                                       texelout4);
            }
            for (j = 0; j < 4; j++) {
@@ -1864,6 +1913,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
   lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));

   if (explicit_lod) {
+      /* FIXME: this needs to honor per-element lod */
      lod = LLVMBuildExtractElement(gallivm->builder, explicit_lod, lp_build_const_int32(gallivm, 0), "");
      first_level = dynamic_state->first_level(dynamic_state, gallivm, texture_unit);
      lod = lp_build_broadcast_scalar(&bld_int_vec,
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -217,6 +217,20 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld,

      a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), "");

+      /*
+       * Vector element 0 is always channel X.
+       *
+       *                        76 54 32 10 (array numbering)
+       * Little endian reg in:  YX YX YX YX
+       * Little endian reg out: YY YY YY YY if shift right (shift == -1)
+       *                        XX XX XX XX if shift left (shift == 1)
+       *
+       *                        01 23 45 67 (array numbering)
+       * Big endian reg in:     XY XY XY XY
+       * Big endian reg out:    YY YY YY YY if shift left (shift == 1)
+       *                        XX XX XX XX if shift right (shift == -1)
+       *
+       */
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
      shift = channel == 0 ? 1 : -1;
 #else
@@ -240,10 +254,23 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
      /*
       * Bit mask and recursive shifts
       *
+       * Little-endian registers:
+       *
+       *   7654 3210
+       *   WZYX WZYX .... WZYX  <= input
+       *   00Y0 00Y0 .... 00Y0  <= mask
+       *   00YY 00YY .... 00YY  <= shift right 1 (shift amount -1)
+       *   YYYY YYYY .... YYYY  <= shift left 2 (shift amount 2)
+       *
+       * Big-endian registers:
+       *
+       *   0123 4567
       *   XYZW XYZW .... XYZW  <= input
-       *   0Y00 0Y00 .... 0Y00
-       *   YY00 YY00 .... YY00
-       *   YYYY YYYY .... YYYY  <= output
+       *   0Y00 0Y00 .... 0Y00  <= mask
+       *   YY00 YY00 .... YY00  <= shift left 1 (shift amount 1)
+       *   YYYY YYYY .... YYYY  <= shift right 2 (shift amount -2)
+       *
+       * shifts[] gives little-endian shift amounts; we need to negate for big-endian.
       */
      struct lp_type type4;
      const int shifts[4][2] = {
@@ -274,14 +301,15 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
         LLVMValueRef tmp = NULL;
         int shift = shifts[channel][i];

-#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         /* See endianness diagram above */
+#ifdef PIPE_ARCH_BIG_ENDIAN
         shift = -shift;
 #endif

         if(shift > 0)
-            tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
+            tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
         if(shift < 0)
-            tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
+            tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");

         assert(tmp);
         if(tmp)
@@ -474,21 +502,39 @@ lp_build_swizzle_aos(struct lp_build_context *bld,

      /*
       * Mask and shift the channels, trying to group as many channels in the
-       * same shift as possible
+       * same shift as possible.  The shift amount is positive for shifts left
+       * and negative for shifts right.
       */
      for (shift = -3; shift <= 3; ++shift) {
         uint64_t mask = 0;

         assert(type4.width <= sizeof(mask)*8);

+         /*
+          * Vector element numbers follow the XYZW order, so 0 is always X, etc.
+          * After widening 4 times we have:
+          *
+          *                                3210
+          * Little-endian register layout: WZYX
+          *
+          *                                0123
+          * Big-endian register layout:    XYZW
+          *
+          * For little-endian, higher-numbered channels are obtained by a shift right
+          * (negative shift amount) and lower-numbered channels by a shift left
+          * (positive shift amount).  The opposite is true for big-endian.
+          */
         for (chan = 0; chan < 4; ++chan) {
-            /* FIXME: big endian */
-            if (swizzles[chan] < 4 &&
-                chan - swizzles[chan] == shift) {
+            if (swizzles[chan] < 4) {
+               /* We need to move channel swizzles[chan] into channel chan */
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-               mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
+               if (swizzles[chan] - chan == -shift) {
+                  mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
+               }
 #else
-               mask |= ((1ULL << type.width) - 1) << (type4.width - type.width) >> (swizzles[chan] * type.width);
+               if (swizzles[chan] - chan == shift) {
+                  mask |= ((1ULL << type.width) - 1) << (type4.width - type.width) >> (swizzles[chan] * type.width);
+               }
 #endif
            }
         }
@@ -502,21 +548,11 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
            masked = LLVMBuildAnd(builder, a,
                                  lp_build_const_int_vec(bld->gallivm, type4, mask), "");
            if (shift > 0) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
               shifted = LLVMBuildShl(builder, masked,
                                      lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
-#else
-               shifted = LLVMBuildLShr(builder, masked,
-                                       lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
-#endif
            } else if (shift < 0) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
               shifted = LLVMBuildLShr(builder, masked,
                                       lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
-#else
-               shifted = LLVMBuildShl(builder, masked,
-                                      lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
-#endif
            } else {
               shifted = masked;
            }
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -390,11 +390,8 @@ lp_build_emit_fetch_texoffset(
   if (chan_index == LP_CHAN_ALL) {
      swizzle = ~0;
   } else {
+      assert(chan_index < TGSI_SWIZZLE_W);
      swizzle = tgsi_util_get_src_register_swizzle(&reg.Register, chan_index);
-      if (swizzle > 2) {
-         assert(0 && "invalid swizzle in emit_fetch_texoffset()");
-         return bld_base->base.undef;
-      }
   }

   assert(off->Index <= bld_base->info->file_max[off->File]);
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -184,6 +184,7 @@ struct lp_build_sampler_soa
                        const struct lp_derivatives *derivs,
                        LLVMValueRef lod_bias, /* optional */
                        LLVMValueRef explicit_lod, /* optional */
+                        boolean scalar_lod,
                        LLVMValueRef *texel);

   void
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -396,7 +396,7 @@ frc_emit(
                                       TGSI_OPCODE_SUB, emit_data->args[0], tmp);
 }

-/* TGSI_OPCODE_KIL */
+/* TGSI_OPCODE_KILL_IF */

 static void
 kil_fetch_args(
@@ -419,7 +419,7 @@ kil_fetch_args(
   emit_data->dst_type = LLVMVoidTypeInContext(bld_base->base.gallivm->context);
 }

-/* TGSI_OPCODE_KILP */
+/* TGSI_OPCODE_KILL */

 static void
 kilp_fetch_args(
@@ -633,8 +633,6 @@ rsq_emit(
   struct lp_build_tgsi_context * bld_base,
   struct lp_build_emit_data * emit_data)
 {
-   emit_data->args[0] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS,
-                                               emit_data->args[0]);
   if (bld_base->rsq_action.emit) {
      bld_base->rsq_action.emit(&bld_base->rsq_action, bld_base, emit_data);
   } else {
@@ -871,8 +869,8 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
   bld_base->op_actions[TGSI_OPCODE_EX2].fetch_args = scalar_unary_fetch_args;
   bld_base->op_actions[TGSI_OPCODE_IF].fetch_args = scalar_unary_fetch_args;
   bld_base->op_actions[TGSI_OPCODE_UIF].fetch_args = scalar_unary_fetch_args;
-   bld_base->op_actions[TGSI_OPCODE_KIL].fetch_args = kil_fetch_args;
-   bld_base->op_actions[TGSI_OPCODE_KILP].fetch_args = kilp_fetch_args;
+   bld_base->op_actions[TGSI_OPCODE_KILL_IF].fetch_args = kil_fetch_args;
+   bld_base->op_actions[TGSI_OPCODE_KILL].fetch_args = kilp_fetch_args;
   bld_base->op_actions[TGSI_OPCODE_RCP].fetch_args = scalar_unary_fetch_args;
   bld_base->op_actions[TGSI_OPCODE_SIN].fetch_args = scalar_unary_fetch_args;
   bld_base->op_actions[TGSI_OPCODE_LG2].fetch_args = scalar_unary_fetch_args;
@@ -1161,14 +1159,9 @@ iset_emit_cpu(
   struct lp_build_emit_data * emit_data,
   unsigned pipe_func)
 {
-   LLVMValueRef nz = lp_build_const_vec(bld_base->base.gallivm,
-					bld_base->int_bld.type, ~0U);
   LLVMValueRef cond = lp_build_cmp(&bld_base->int_bld, pipe_func,
                                    emit_data->args[0], emit_data->args[1]);
-   emit_data->output[emit_data->chan] = lp_build_select(&bld_base->int_bld,
-                                          cond,
-                                          nz,
-                                          bld_base->int_bld.zero);
+   emit_data->output[emit_data->chan] = cond;
 }

 /* TGSI_OPCODE_IMAX (CPU Only) */
@@ -1354,9 +1347,6 @@ rcp_emit_cpu(
 }

 /* Reciprical squareroot (CPU Only) */
-
-/* This is not the same as TGSI_OPCODE_RSQ, which requres the argument to be
- * greater than or equal to 0 */
 static void
 recip_sqrt_emit_cpu(
   const struct lp_build_tgsi_action * action,
@@ -1620,14 +1610,9 @@ uset_emit_cpu(
   struct lp_build_emit_data * emit_data,
   unsigned pipe_func)
 {
-   LLVMValueRef nz = lp_build_const_vec(bld_base->base.gallivm,
-					bld_base->uint_bld.type, ~0U);
   LLVMValueRef cond = lp_build_cmp(&bld_base->uint_bld, pipe_func,
                                    emit_data->args[0], emit_data->args[1]);
-   emit_data->output[emit_data->chan] = lp_build_select(&bld_base->uint_bld,
-                                          cond,
-					  nz,
-                                          bld_base->uint_bld.zero);
+   emit_data->output[emit_data->chan] = cond;
 }


--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -657,12 +657,10 @@ lp_emit_instruction_aos(
   case TGSI_OPCODE_DDY:
      return FALSE;

-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
+   case TGSI_OPCODE_KILL:
      return FALSE;

-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
+   case TGSI_OPCODE_KILL_IF:
      return FALSE;

   case TGSI_OPCODE_PK2H:
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1026,9 +1026,9 @@ emit_fetch_immediate(
   }

   if (stype == TGSI_TYPE_UNSIGNED) {
-      res = LLVMConstBitCast(res, bld_base->uint_bld.vec_type);
+      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
   } else if (stype == TGSI_TYPE_SIGNED) {
-      res = LLVMConstBitCast(res, bld_base->int_bld.vec_type);
+      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
   }
   return res;
 }
@@ -1576,6 +1576,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
   LLVMValueRef offsets[3] = { NULL };
   struct lp_derivatives derivs;
   struct lp_derivatives *deriv_ptr = NULL;
+   boolean scalar_lod;
   unsigned num_coords, num_derivs, num_offsets;
   unsigned i;

@@ -1693,6 +1694,9 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
      }
   }

+   /* TODO: use scalar lod if explicit_lod, lod_bias or derivs are broadcasted scalars */
+   scalar_lod = bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT;
+
   bld->sampler->emit_fetch_texel(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  bld->bld_base.base.type,
@@ -1701,7 +1705,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
                                  coords,
                                  offsets,
                                  deriv_ptr,
-                                  lod_bias, explicit_lod,
+                                  lod_bias, explicit_lod, scalar_lod,
                                  texel);
 }

@@ -1719,6 +1723,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
   LLVMValueRef offsets[3] = { NULL };
   struct lp_derivatives derivs;
   struct lp_derivatives *deriv_ptr = NULL;
+   boolean scalar_lod;
   unsigned num_coords, num_offsets, num_derivs;
   unsigned i;

@@ -1784,13 +1789,6 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
      return;
   }

-   /*
-    * unlike old-style tex opcodes the texture/sampler indices
-    * always come from src1 and src2 respectively.
-    */
-   texture_unit = inst->Src[1].Register.Index;
-   sampler_unit = inst->Src[2].Register.Index;
-
   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
      lod_bias = lp_build_emit_fetch( &bld->bld_base, inst, 3, 0 );
      explicit_lod = NULL;
@@ -1843,6 +1841,9 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
      }
   }

+   /* TODO: use scalar lod if explicit_lod, lod_bias or derivs are broadcasted scalars */
+   scalar_lod = bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT;
+
   bld->sampler->emit_fetch_texel(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  bld->bld_base.base.type,
@@ -1851,7 +1852,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
                                  coords,
                                  offsets,
                                  deriv_ptr,
-                                  lod_bias, explicit_lod,
+                                  lod_bias, explicit_lod, scalar_lod,
                                  texel);
 }

@@ -1866,6 +1867,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
   LLVMValueRef explicit_lod = NULL;
   LLVMValueRef coords[3];
   LLVMValueRef offsets[3] = { NULL };
+   boolean scalar_lod;
   unsigned num_coords;
   unsigned dims;
   unsigned i;
@@ -1934,6 +1936,9 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
      }
   }

+   /* TODO: use scalar lod if explicit_lod is broadcasted scalar */
+   scalar_lod = bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT;
+
   bld->sampler->emit_fetch_texel(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  bld->bld_base.base.type,
@@ -1942,7 +1947,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
                                  coords,
                                  offsets,
                                  NULL,
-                                  NULL, explicit_lod,
+                                  NULL, explicit_lod, scalar_lod,
                                  texel);
 }

@@ -2038,7 +2043,7 @@ near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
 * Kill fragment if any of the src register values are negative.
 */
 static void
-emit_kil(
+emit_kill_if(
   struct lp_build_tgsi_soa_context *bld,
   const struct tgsi_full_instruction *inst,
   int pc)
@@ -2091,13 +2096,12 @@ emit_kil(


 /**
- * Predicated fragment kill.
- * XXX Actually, we do an unconditional kill (as in tgsi_exec.c).
+ * Unconditional fragment kill.
 * The only predication is the execution mask which will apply if
 * we're inside a loop or conditional.
 */
 static void
-emit_kilp(struct lp_build_tgsi_soa_context *bld,
+emit_kill(struct lp_build_tgsi_soa_context *bld,
          int pc)
 {
   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
@@ -2315,25 +2319,25 @@ ddy_emit(
 }

 static void
-kilp_emit(
+kill_emit(
   const struct lp_build_tgsi_action * action,
   struct lp_build_tgsi_context * bld_base,
   struct lp_build_emit_data * emit_data)
 {
   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);

-   emit_kilp(bld, bld_base->pc - 1);
+   emit_kill(bld, bld_base->pc - 1);
 }

 static void
-kil_emit(
+kill_if_emit(
   const struct lp_build_tgsi_action * action,
   struct lp_build_tgsi_context * bld_base,
   struct lp_build_emit_data * emit_data)
 {
   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);

-   emit_kil(bld, emit_data->inst, bld_base->pc - 1);
+   emit_kill_if(bld, emit_data->inst, bld_base->pc - 1);
 }

 static void
@@ -3164,8 +3168,8 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
   bld.bld_base.op_actions[TGSI_OPCODE_ENDSWITCH].emit = endswitch_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_IF].emit = if_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_UIF].emit = uif_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_KIL].emit = kil_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_KILP].emit = kilp_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_KILL_IF].emit = kill_if_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_KILL].emit = kill_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_NRM].emit = nrm_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_NRM4].emit = nrm_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_RET].emit = ret_emit;
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -33,6 +33,8 @@
 * Set GALLIUM_HUD=help for more info.
 */

+#include <stdio.h>
+
 #include "hud/hud_context.h"
 #include "hud/hud_private.h"
 #include "hud/font.h"
@@ -106,8 +108,8 @@ hud_draw_colored_prims(struct hud_context *hud, unsigned prim,
   hud->constants.color[1] = g;
   hud->constants.color[2] = b;
   hud->constants.color[3] = a;
-   hud->constants.translate[0] = xoffset;
-   hud->constants.translate[1] = yoffset;
+   hud->constants.translate[0] = (float) xoffset;
+   hud->constants.translate[1] = (float) yoffset;
   hud->constants.scale[0] = 1;
   hud->constants.scale[1] = yscale;
   cso_set_constant_buffer(cso, PIPE_SHADER_VERTEX, 0, &hud->constbuf);
@@ -127,10 +129,10 @@ hud_draw_colored_quad(struct hud_context *hud, unsigned prim,
                      float r, float g, float b, float a)
 {
   float buffer[] = {
-      x1, y1,
-      x1, y2,
-      x2, y2,
-      x2, y1,
+      (float) x1, (float) y1,
+      (float) x1, (float) y2,
+      (float) x2, (float) y2,
+      (float) x2, (float) y1,
   };

   hud_draw_colored_prims(hud, prim, buffer, 4, r, g, b, a, 0, 0, 1);
@@ -145,17 +147,17 @@ hud_draw_background_quad(struct hud_context *hud,

   assert(hud->bg.num_vertices + 4 <= hud->bg.max_num_vertices);

-   vertices[num++] = x1;
-   vertices[num++] = y1;
+   vertices[num++] = (float) x1;
+   vertices[num++] = (float) y1;

-   vertices[num++] = x1;
-   vertices[num++] = y2;
+   vertices[num++] = (float) x1;
+   vertices[num++] = (float) y2;

-   vertices[num++] = x2;
-   vertices[num++] = y2;
+   vertices[num++] = (float) x2;
+   vertices[num++] = (float) y2;

-   vertices[num++] = x2;
-   vertices[num++] = y1;
+   vertices[num++] = (float) x2;
+   vertices[num++] = (float) y1;

   hud->bg.num_vertices += num/2;
 }
@@ -200,25 +202,25 @@ hud_draw_string(struct hud_context *hud, unsigned x, unsigned y,

      assert(hud->text.num_vertices + num/4 + 4 <= hud->text.max_num_vertices);

-      vertices[num++] = x1;
-      vertices[num++] = y1;
-      vertices[num++] = tx1;
-      vertices[num++] = ty1;
+      vertices[num++] = (float) x1;
+      vertices[num++] = (float) y1;
+      vertices[num++] = (float) tx1;
+      vertices[num++] = (float) ty1;

-      vertices[num++] = x1;
-      vertices[num++] = y2;
-      vertices[num++] = tx1;
-      vertices[num++] = ty2;
+      vertices[num++] = (float) x1;
+      vertices[num++] = (float) y2;
+      vertices[num++] = (float) tx1;
+      vertices[num++] = (float) ty2;

-      vertices[num++] = x2;
-      vertices[num++] = y2;
-      vertices[num++] = tx2;
-      vertices[num++] = ty2;
+      vertices[num++] = (float) x2;
+      vertices[num++] = (float) y2;
+      vertices[num++] = (float) tx2;
+      vertices[num++] = (float) ty2;

-      vertices[num++] = x2;
-      vertices[num++] = y1;
-      vertices[num++] = tx2;
-      vertices[num++] = ty1;
+      vertices[num++] = (float) x2;
+      vertices[num++] = (float) y1;
+      vertices[num++] = (float) tx2;
+      vertices[num++] = (float) ty1;

      x += hud->font.glyph_width;
      s++;
@@ -316,25 +318,25 @@ hud_pane_accumulate_vertices(struct hud_context *hud,

   /* draw border */
   assert(hud->whitelines.num_vertices + num/2 + 8 <= hud->whitelines.max_num_vertices);
-   line_verts[num++] = pane->x1;
-   line_verts[num++] = pane->y1;
-   line_verts[num++] = pane->x2;
-   line_verts[num++] = pane->y1;
+   line_verts[num++] = (float) pane->x1;
+   line_verts[num++] = (float) pane->y1;
+   line_verts[num++] = (float) pane->x2;
+   line_verts[num++] = (float) pane->y1;

-   line_verts[num++] = pane->x2;
-   line_verts[num++] = pane->y1;
-   line_verts[num++] = pane->x2;
-   line_verts[num++] = pane->y2;
+   line_verts[num++] = (float) pane->x2;
+   line_verts[num++] = (float) pane->y1;
+   line_verts[num++] = (float) pane->x2;
+   line_verts[num++] = (float) pane->y2;

-   line_verts[num++] = pane->x1;
-   line_verts[num++] = pane->y2;
-   line_verts[num++] = pane->x2;
-   line_verts[num++] = pane->y2;
+   line_verts[num++] = (float) pane->x1;
+   line_verts[num++] = (float) pane->y2;
+   line_verts[num++] = (float) pane->x2;
+   line_verts[num++] = (float) pane->y2;

-   line_verts[num++] = pane->x1;
-   line_verts[num++] = pane->y1;
-   line_verts[num++] = pane->x1;
-   line_verts[num++] = pane->y2;
+   line_verts[num++] = (float) pane->x1;
+   line_verts[num++] = (float) pane->y1;
+   line_verts[num++] = (float) pane->x1;
+   line_verts[num++] = (float) pane->y2;

   /* draw horizontal lines inside the graph */
   for (i = 0; i <= 5; i++) {
@@ -405,8 +407,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)

   hud->fb_width = tex->width0;
   hud->fb_height = tex->height0;
-   hud->constants.two_div_fb_width = 2.0 / hud->fb_width;
-   hud->constants.two_div_fb_height = 2.0 / hud->fb_height;
+   hud->constants.two_div_fb_width = 2.0f / hud->fb_width;
+   hud->constants.two_div_fb_height = 2.0f / hud->fb_height;

   cso_save_framebuffer(cso);
   cso_save_sample_mask(cso);
@@ -456,7 +458,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
   cso_set_geometry_shader_handle(cso, NULL);
   cso_set_vertex_shader_handle(cso, hud->vs);
   cso_set_vertex_elements(cso, 2, hud->velems);
-   cso_set_render_condition(cso, NULL, 0);
+   cso_set_render_condition(cso, NULL, FALSE, 0);
   cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, 1,
                         &hud->font_sampler_view);
   cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, 1, sampler_states);
@@ -486,7 +488,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
      hud->constants.color[0] = 0;
      hud->constants.color[1] = 0;
      hud->constants.color[2] = 0;
-      hud->constants.color[3] = 0.666;
+      hud->constants.color[3] = 0.666f;
      hud->constants.translate[0] = 0;
      hud->constants.translate[1] = 0;
      hud->constants.scale[0] = 1;
@@ -562,7 +564,7 @@ void
 hud_pane_set_max_value(struct hud_pane *pane, uint64_t value)
 {
   pane->max_value = value;
-   pane->yscale = -(int)pane->inner_height / (double)pane->max_value;
+   pane->yscale = -(int)pane->inner_height / (float)pane->max_value;
 }

 static struct hud_pane *
@@ -634,8 +636,8 @@ hud_graph_add_value(struct hud_graph *gr, uint64_t value)
      gr->vertices[1] = gr->vertices[(gr->index-1)*2+1];
      gr->index = 1;
   }
-   gr->vertices[(gr->index)*2+0] = gr->index*2;
-   gr->vertices[(gr->index)*2+1] = value;
+   gr->vertices[(gr->index)*2+0] = (float) (gr->index * 2);
+   gr->vertices[(gr->index)*2+1] = (float) value;
   gr->index++;

   if (gr->num_vertices < gr->pane->max_num_vertices) {
@@ -715,8 +717,8 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
    */
   period_env = getenv("GALLIUM_HUD_PERIOD");
   if (period_env) {
-      float p = atof(period_env);
-      if (p >= 0.0) {
+      float p = (float) atof(period_env);
+      if (p >= 0.0f) {
         period = (unsigned) (p * 1000 * 1000);
      }
   }
@@ -959,7 +961,8 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
   hud->fs_color =
         util_make_fragment_passthrough_shader(pipe,
                                               TGSI_SEMANTIC_COLOR,
-                                               TGSI_INTERPOLATE_CONSTANT);
+                                               TGSI_INTERPOLATE_CONSTANT,
+                                               TRUE);

   {
      /* Read a texture and do .xxxx swizzling. */
--- a/src/gallium/auxiliary/hud/hud_cpu.c
+++ b/src/gallium/auxiliary/hud/hud_cpu.c
@@ -116,6 +116,12 @@ query_cpu_load(struct hud_graph *gr)
   }
 }

+static void
+free_query_data(void *p)
+{
+   FREE(p); /* forward to FREE() so the callback is not libc free() itself */
+}
+
 void
 hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index)
 {
@@ -144,7 +150,11 @@ hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index)
   }

   gr->query_new_value = query_cpu_load;
-   gr->free_query_data = free;
+
+   /* Don't use free() as our callback as that messes up Gallium's
+    * memory debugger.  Use simple free_query_data() wrapper.
+    */
+   gr->free_query_data = free_query_data;

   info = gr->query_data;
   info->cpu_index = cpu_index;
--- a/src/gallium/auxiliary/hud/hud_fps.c
+++ b/src/gallium/auxiliary/hud/hud_fps.c
@@ -52,7 +52,7 @@ query_fps(struct hud_graph *gr)
         info->frames = 0;
         info->last_time = now;

-         hud_graph_add_value(gr, fps);
+         hud_graph_add_value(gr, (uint64_t) fps);
      }
   }
   else {
@@ -60,6 +60,12 @@ query_fps(struct hud_graph *gr)
   }
 }

+static void
+free_query_data(void *p)
+{
+   FREE(p); /* forward to FREE() so the callback is not libc free() itself */
+}
+
 void
 hud_fps_graph_install(struct hud_pane *pane)
 {
@@ -76,7 +82,11 @@ hud_fps_graph_install(struct hud_pane *pane)
   }

   gr->query_new_value = query_fps;
-   gr->free_query_data = free;
+
+   /* Don't use free() as our callback as that messes up Gallium's
+    * memory debugger.  Use simple free_query_data() wrapper.
+    */
+   gr->free_query_data = free_query_data;

   hud_pane_add_graph(pane, gr);
 }
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -42,7 +42,7 @@ struct hud_graph {
   char name[128];
   void *query_data;
   void (*query_new_value)(struct hud_graph *gr);
-   void (*free_query_data)(void *ptr);
+   void (*free_query_data)(void *ptr); /**< do not use ordinary free() */

   /* mutable variables */
   unsigned num_vertices;
--- a/src/gallium/auxiliary/indices/u_indices.c
+++ b/src/gallium/auxiliary/indices/u_indices.c
@@ -150,9 +150,26 @@ int u_index_translator( unsigned hw_mask,
 }


-
-
-
+/**
+ * If a driver does not support a particular gallium primitive type
+ * (such as PIPE_PRIM_QUAD_STRIP) this function can be used to help
+ * convert the primitive into a simpler type (like PIPE_PRIM_TRIANGLES).
+ *
+ * The generator function generates a number of ushort or uint indexes
+ * for drawing the new type of primitive.
+ *
+ * \param hw_mask  a bitmask of (1 << PIPE_PRIM_x) values that indicates
+ *                 which kinds of primitives are supported by the driver.
+ * \param prim  the PIPE_PRIM_x that the user wants to draw
+ * \param start  index of first vertex to draw
+ * \param nr  number of vertices to draw
+ * \param in_pv  user's provoking vertex (PV_FIRST/LAST)
+ * \param out_pv  desired provoking vertex for the hardware (PV_FIRST/LAST)
+ * \param out_prim  returns the new primitive type for the driver
+ * \param out_index_size  returns OUT_USHORT or OUT_UINT
+ * \param out_nr  returns new number of vertices to draw
+ * \param out_generate  returns pointer to the generator function
+ */
 int u_index_generator( unsigned hw_mask,
                       unsigned prim,
                       unsigned start,
--- a/src/gallium/auxiliary/indices/u_unfilled_indices.c
+++ b/src/gallium/auxiliary/indices/u_unfilled_indices.c
@@ -151,7 +151,14 @@ int u_unfilled_translator( unsigned prim,
 }


-
+/**
+ * Utility for converting unfilled polygons into points, lines, triangles.
+ * Few drivers have direct support for OpenGL's glPolygonMode.
+ * This function helps with converting triangles into points or lines
+ * when the front and back fill modes are the same.  When there's
+ * different front/back fill modes, that can be handled with the
+ * 'draw' module.
+ */
 int u_unfilled_generator( unsigned prim,
                          unsigned start,
                          unsigned nr,
--- a/src/gallium/auxiliary/os/os_process.c
+++ b/src/gallium/auxiliary/os/os_process.c
@@ -0,0 +1,92 @@
+/**************************************************************************
+ *
+ * Copyright 2013 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_config.h"
+#include "os/os_process.h"
+#include "util/u_memory.h"
+
+#if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+#  include <windows.h>
+#elif defined(__GLIBC__)
+#  include <errno.h>
+#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE)
+#  include <stdlib.h>
+#else
+#warning unexpected platform in os_process.c
+#endif
+
+
+/**
+ * Return the name of the current process.
+ * \param procname  returns the process name (NUL-terminated on success)
+ * \param size  size of the procname buffer in bytes; must be non-zero
+ * \return  TRUE on success, FALSE if the name could not be determined
+ */
+boolean
+os_get_process_name(char *procname, size_t size)
+{
+   const char *name;
+#if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   char szProcessPath[MAX_PATH];
+   char *lpProcessName;
+   char *lpProcessExt;
+   DWORD cchPath;
+
+   /* 0 = failure, >= buffer size = truncation: path unusable either way. */
+   cchPath = GetModuleFileNameA(NULL, szProcessPath, Elements(szProcessPath));
+   if (cchPath == 0 || cchPath >= Elements(szProcessPath)) {
+      return FALSE;
+   }
+   /* Keep only the file name, without directory or extension. */
+   lpProcessName = strrchr(szProcessPath, '\\');
+   lpProcessName = lpProcessName ? lpProcessName + 1 : szProcessPath;
+   lpProcessExt = strrchr(lpProcessName, '.');
+   if (lpProcessExt) {
+      *lpProcessExt = '\0';
+   }
+   name = lpProcessName;
+#elif defined(__GLIBC__)
+   name = program_invocation_short_name;
+#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE)
+   /* *BSD and OS X */
+   name = getprogname();
+#else
+#warning unexpected platform in os_process.c
+   return FALSE;
+#endif
+
+   assert(size > 0);
+   assert(procname);
+
+   if (name && procname && size > 0) {
+      strncpy(procname, name, size);
+      procname[size - 1] = '\0'; /* strncpy() may leave it unterminated */
+      return TRUE;
+   }
+   return FALSE;
+}
--- a/src/gallium/auxiliary/os/os_process.h
+++ b/src/gallium/auxiliary/os/os_process.h
@@ -0,0 +1,40 @@
+/**************************************************************************
+ *
+ * Copyright 2013 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef OS_PROCESS_H
+#define OS_PROCESS_H
+
+
+#include "pipe/p_compiler.h"
+
+/** Store the current process name in procname (size bytes); TRUE on success. */
+extern boolean
+os_get_process_name(char *procname, size_t size);
+
+
+#endif /* OS_PROCESS_H */
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -84,7 +84,7 @@ os_time_get_nano(void)
 void
 os_time_sleep(int64_t usecs)
 {
-   DWORD dwMilliseconds = (usecs + 999) / 1000;
+   DWORD dwMilliseconds = (DWORD) ((usecs + 999) / 1000);
   /* Avoid Sleep(O) as that would cause to sleep for an undetermined duration */
   if (dwMilliseconds) {
      Sleep(dwMilliseconds);
--- a/src/gallium/auxiliary/postprocess/filters.h
+++ b/src/gallium/auxiliary/postprocess/filters.h
@@ -30,8 +30,9 @@

 #include "postprocess/postprocess.h"

-typedef void (*pp_init_func) (struct pp_queue_t *, unsigned int,
+typedef bool (*pp_init_func) (struct pp_queue_t *, unsigned int,
                              unsigned int);
+typedef void (*pp_free_func) (struct pp_queue_t *, unsigned int);

 struct pp_filter_t
 {
@@ -41,18 +42,19 @@ struct pp_filter_t
   unsigned int verts;          /* How many are vertex shaders */
   pp_init_func init;           /* Init function */
   pp_func main;                /* Run function */
+   pp_free_func free;           /* Free function */
 };

 /*	Order matters. Put new filters in a suitable place. */

 static const struct pp_filter_t pp_filters[PP_FILTERS] = {
-/*    name			inner	shaders	verts	init			run */
-   { "pp_noblue",		0,	2,	1,	pp_noblue_init,		pp_nocolor },
-   { "pp_nogreen",		0,	2,	1,	pp_nogreen_init,	pp_nocolor },
-   { "pp_nored",		0,	2,	1,	pp_nored_init,		pp_nocolor },
-   { "pp_celshade",		0,	2,	1,	pp_celshade_init,	pp_nocolor },
-   { "pp_jimenezmlaa",		2,	5,	2,	pp_jimenezmlaa_init,	pp_jimenezmlaa },
-   { "pp_jimenezmlaa_color",	2,	5,	2,	pp_jimenezmlaa_init_color, pp_jimenezmlaa_color },
+/*    name			inner	shaders	verts	init			run                       free   */
+   { "pp_noblue",		0,	2,	1,	pp_noblue_init,		pp_nocolor,               pp_nocolor_free },
+   { "pp_nogreen",		0,	2,	1,	pp_nogreen_init,	pp_nocolor,               pp_nocolor_free },
+   { "pp_nored",		0,	2,	1,	pp_nored_init,		pp_nocolor,               pp_nocolor_free },
+   { "pp_celshade",		0,	2,	1,	pp_celshade_init,	pp_nocolor,               pp_celshade_free },
+   { "pp_jimenezmlaa",		2,	5,	2,	pp_jimenezmlaa_init,	pp_jimenezmlaa,           pp_jimenezmlaa_free },
+   { "pp_jimenezmlaa_color",	2,	5,	2,	pp_jimenezmlaa_init_color, pp_jimenezmlaa_color,  pp_jimenezmlaa_free },
 };

 #endif
--- a/src/gallium/auxiliary/postprocess/postprocess.h
+++ b/src/gallium/auxiliary/postprocess/postprocess.h
@@ -53,11 +53,13 @@ struct pp_queue_t

   struct pipe_resource *depth; /* depth of original input */
   struct pipe_resource *stencil;       /* stencil shared by inner_tmps */
+   struct pipe_resource *constbuf;      /* MLAA constant buffer */
+   struct pipe_resource *areamaptex;    /* MLAA area map texture */

   struct pipe_surface *tmps[2], *inner_tmps[3], *stencils;

   void ***shaders;             /* Shaders in TGSI form */
-   unsigned int *verts;
+   unsigned int *filters;       /* Active filter to filters.h mapping. */
   struct program *p;

   bool fbos_init;
@@ -75,6 +77,14 @@ void pp_debug(const char *, ...);
 struct program *pp_init_prog(struct pp_queue_t *, struct pipe_context *pipe,
                             struct cso_context *);
 void pp_init_fbos(struct pp_queue_t *, unsigned int, unsigned int);
+void pp_blit(struct pipe_context *pipe,
+             struct pipe_resource *src_tex,
+             int srcX0, int srcY0,
+             int srcX1, int srcY1,
+             int srcZ0,
+             struct pipe_surface *dst,
+             int dstX0, int dstY0,
+             int dstX1, int dstY1);

 /* The filters */

@@ -88,14 +98,20 @@ void pp_jimenezmlaa_color(struct pp_queue_t *, struct pipe_resource *,

 /* The filter init functions */

-void pp_celshade_init(struct pp_queue_t *, unsigned int, unsigned int);
+bool pp_celshade_init(struct pp_queue_t *, unsigned int, unsigned int);

-void pp_nored_init(struct pp_queue_t *, unsigned int, unsigned int);
-void pp_nogreen_init(struct pp_queue_t *, unsigned int, unsigned int);
-void pp_noblue_init(struct pp_queue_t *, unsigned int, unsigned int);
+bool pp_nored_init(struct pp_queue_t *, unsigned int, unsigned int);
+bool pp_nogreen_init(struct pp_queue_t *, unsigned int, unsigned int);
+bool pp_noblue_init(struct pp_queue_t *, unsigned int, unsigned int);

-void pp_jimenezmlaa_init(struct pp_queue_t *, unsigned int, unsigned int);
-void pp_jimenezmlaa_init_color(struct pp_queue_t *, unsigned int,
+bool pp_jimenezmlaa_init(struct pp_queue_t *, unsigned int, unsigned int);
+bool pp_jimenezmlaa_init_color(struct pp_queue_t *, unsigned int,
                               unsigned int);

+/* The filter free functions */
+
+void pp_celshade_free(struct pp_queue_t *, unsigned int);
+void pp_nocolor_free(struct pp_queue_t *, unsigned int);
+void pp_jimenezmlaa_free(struct pp_queue_t *, unsigned int);
+
 #endif
--- a/src/gallium/auxiliary/postprocess/pp_celshade.c
+++ b/src/gallium/auxiliary/postprocess/pp_celshade.c
@@ -30,9 +30,17 @@
 #include "postprocess/pp_filters.h"

 /** Init function */
-void
+bool
 pp_celshade_init(struct pp_queue_t *ppq, unsigned int n, unsigned int val)
 {
   ppq->shaders[n][1] =
      pp_tgsi_to_state(ppq->p->pipe, celshade, false, "celshade");
+
+   return (ppq->shaders[n][1] != NULL) ? TRUE : FALSE;
+}
+
+/** Free function */
+void
+pp_celshade_free(struct pp_queue_t *ppq, unsigned int n)
+{
 }
--- a/Show More
+++ b/Show More