i965: add ARB_texture_rgb10_a2ui support

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Brian Paul <brianp@vmware.com>
meta: allow CopyTexSubImage on integer formats
2012-07-22 00:10:27 -07:00 · 2012-07-22 00:10:27 -07:00 · 2012-07-21 16:50:22 -07:00 · 2012-07-21 16:50:22 -07:00 · 2012-07-21 16:50:21 -07:00 · 2012-07-21 16:50:07 -07:00
498 changed files with 20260 additions and 11488 deletions
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -3,6 +3,7 @@
  (tab-width . 8)
  (c-basic-offset . 3)
  (c-file-style . "stroustrup")
+  (fill-column . 78)
  (eval . (progn
 	    (c-set-offset 'innamespace '0)
 	    (c-set-offset 'inline-open '0)))
--- a/Makefile.am
+++ b/Makefile.am
@@ -58,9 +58,9 @@ PACKAGE_NAME = MesaLib-$(PACKAGE_VERSION)
 EXTRA_FILES = \
 	aclocal.m4					\
 	configure					\
-	src/glsl/glsl_parser.cpp			\
+	src/glsl/glsl_parser.cc				\
 	src/glsl/glsl_parser.h				\
-	src/glsl/glsl_lexer.cpp				\
+	src/glsl/glsl_lexer.cc				\
 	src/glsl/glcpp/glcpp-lex.c			\
 	src/glsl/glcpp/glcpp-parse.c			\
 	src/glsl/glcpp/glcpp-parse.h			\
@@ -80,7 +80,7 @@ IGNORE_FILES = \

 parsers: configure
 	-@touch $(top_builddir)/configs/current
-	$(MAKE) -C src/glsl glsl_parser.cpp glsl_parser.h glsl_lexer.cpp
+	$(MAKE) -C src/glsl glsl_parser.cc glsl_parser.h glsl_lexer.cc
 	$(MAKE) -C src/glsl/glcpp glcpp-lex.c glcpp-parse.c glcpp-parse.h
 	$(MAKE) -C src/mesa program/lex.yy.c program/program_parse.tab.c program/program_parse.tab.h

--- a/configs/current.in
+++ b/configs/current.in
@@ -47,7 +47,6 @@ DLOPEN_LIBS = @DLOPEN_LIBS@

 # Source selection
 MESA_ASM_FILES = @MESA_ASM_FILES@
-GLAPI_ASM_SOURCES = @GLAPI_ASM_SOURCES@

 # Misc tools and flags
 MAKE = @MAKE@
@@ -147,8 +146,8 @@ VG_LIB_DEPS = $(EXTRA_LIB_PATH) @VG_LIB_DEPS@
 GLAPI_LIB_DEPS = $(EXTRA_LIB_PATH) @GLAPI_LIB_DEPS@

 # DRI dependencies
-MESA_MODULES = @MESA_MODULES@
 DRI_LIB_DEPS = $(EXTRA_LIB_PATH) @DRI_LIB_DEPS@
+GALLIUM_DRI_LIB_DEPS = $(EXTRA_LIB_PATH) @GALLIUM_DRI_LIB_DEPS@
 LIBDRM_CFLAGS = @LIBDRM_CFLAGS@
 LIBDRM_LIB = @LIBDRM_LIBS@
 DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@
--- a/configure.ac
+++ b/configure.ac
@@ -33,7 +33,7 @@ USER_CXXFLAGS="$CXXFLAGS"
 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.24
 LIBDRM_RADEON_REQUIRED=2.4.31
-LIBDRM_INTEL_REQUIRED=2.4.34
+LIBDRM_INTEL_REQUIRED=2.4.37
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED=2.4.33
 DRI2PROTO_REQUIRED=2.6
@@ -454,7 +454,6 @@ AC_ARG_ENABLE([asm],
 )
 asm_arch=""
 MESA_ASM_FILES=""
-GLAPI_ASM_SOURCES=""
 AC_MSG_CHECKING([whether to enable assembly])
 test "x$enable_asm" = xno && AC_MSG_RESULT([no])
 # disable if cross compiling on x86/x86_64 since we must run gen_matypes
@@ -503,19 +502,16 @@ if test "x$enable_asm" = xyes; then
    x86)
        DEFINES="$DEFINES -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM"
        MESA_ASM_FILES='$(X86_FILES)'
-        GLAPI_ASM_SOURCES='$(X86_API)'
        AC_MSG_RESULT([yes, x86])
        ;;
    x86_64)
        DEFINES="$DEFINES -DUSE_X86_64_ASM"
        MESA_ASM_FILES='$(X86_64_FILES)'
-        GLAPI_ASM_SOURCES='$(X86-64_API)'
        AC_MSG_RESULT([yes, x86_64])
        ;;
    sparc)
        DEFINES="$DEFINES -DUSE_SPARC_ASM"
        MESA_ASM_FILES='$(SPARC_FILES)'
-        GLAPI_ASM_SOURCES='$(SPARC_API)'
        AC_MSG_RESULT([yes, sparc])
        ;;
    *)
@@ -524,7 +520,6 @@ if test "x$enable_asm" = xyes; then
    esac
 fi
 AC_SUBST([MESA_ASM_FILES])
-AC_SUBST([GLAPI_ASM_SOURCES])

 dnl PIC code macro
 MESA_PIC_FLAGS
@@ -1030,29 +1025,12 @@ xyesno)
        GL_PC_CFLAGS="$X11_INCLUDES"

        # XCB can only be used from pkg-config
-        PKG_CHECK_MODULES([XCB],[x11-xcb xcb-glx])
+        PKG_CHECK_MODULES([XCB],[x11-xcb xcb-glx >= 1.8.1])
        GL_PC_REQ_PRIV="$GL_PC_REQ_PRIV x11-xcb xcb-glx"
        X11_INCLUDES="$X11_INCLUDES $XCB_CFLAGS"
        GL_LIB_DEPS="$GL_LIB_DEPS $XCB_LIBS"
    fi

-    # Check to see if the xcb-glx library is new enough to support
-    # GLX_ARB_create_context.  This bit of hackery is necessary until XCB 1.8
-    # is released.
-    save_CPPFLAGS="$CPPFLAGS"
-    save_LDFLAGS="$LDFLAGS"
-    CPPFLAGS="$CPPFLAGS $X11_INCLUDES"
-    LDFLAGS="$LDFLAGS $GL_LIB_DEPS"
-    AC_CHECK_LIB(xcb-glx, xcb_glx_create_context_attribs_arb_checked,
-        [HAVE_XCB_GLX_CREATE_CONTEXT=yes],
-        [HAVE_XCB_GLX_CREATE_CONTEXT=no])
-    CPPFLAGS="$save_CPPFLAGS"
-    LDFLAGS="$save_LDFLAGS"
-
-    if test x$HAVE_XCB_GLX_CREATE_CONTEXT = xyes; then
-        X11_INCLUDES="$X11_INCLUDES -DHAVE_XCB_GLX_CREATE_CONTEXT"
-    fi
-
    # need DRM libs, -lpthread, etc.
    GL_LIB_DEPS="$GL_LIB_DEPS $LIBDRM_LIBS -lm -lpthread $DLOPEN_LIBS"
    GL_PC_LIB_PRIV="-lm -lpthread $DLOPEN_LIBS"
@@ -1061,8 +1039,6 @@ esac

 # This is outside the case (above) so that it is invoked even for non-GLX
 # builds.
-AM_CONDITIONAL(HAVE_XCB_GLX_CREATE_CONTEXT,
-    test x$HAVE_XCB_GLX_CREATE_CONTEXT = xyes)
 AM_CONDITIONAL(HAVE_XF86VIDMODE, test "x$HAVE_XF86VIDMODE" = xyes)

 GLESv1_CM_LIB_DEPS="$LIBDRM_LIBS -lm -lpthread $DLOPEN_LIBS"
@@ -1084,17 +1060,17 @@ GLAPI_LIB_DEPS="-lpthread $SELINUX_LIBS"
 AC_SUBST([GLAPI_LIB_DEPS])


-dnl Setup default DRI CFLAGS
-DRI_LIB_DEPS='$(TOP)/src/mesa/libmesa.a'
-MESA_MODULES='$(TOP)/src/mesa/libmesa.a'
+dnl dri libraries are linking with mesa
+DRI_LIB_DEPS='$(TOP)/src/mesa/libmesa.la'
+GALLIUM_DRI_LIB_DEPS='$(TOP)/src/mesa/libmesa.a'

+dnl ... or dricore?
 if test "x$enable_dri" = xyes && test "x$driglx_direct" = xyes ; then
-    DRI_LIB_DEPS="-L\$(TOP)/\$(LIB_DIR) -ldricore$VERSION"
-    MESA_MODULES="\$(TOP)/\$(LIB_DIR)/libdricore$VERSION.so"
+    DRI_LIB_DEPS="\$(TOP)/src/mesa/libdricore/libdricore${VERSION}.la"
+    GALLIUM_DRI_LIB_DEPS="\$(TOP)/\$(LIB_DIR)/libdricore${VERSION}.so"
    HAVE_DRICORE=yes
 fi
 AM_CONDITIONAL(HAVE_DRICORE, test x$HAVE_DRICORE = xyes)
-AC_SUBST([MESA_MODULES])

 AC_SUBST([HAVE_XF86VIDMODE])

@@ -1269,10 +1245,12 @@ if test "x$enable_dri" = xyes; then

    # put all the necessary libs together
    DRI_LIB_DEPS="$DRI_LIB_DEPS $SELINUX_LIBS $LIBDRM_LIBS $EXPAT_LIB -lm -lpthread $DLOPEN_LIBS"
+    GALLIUM_DRI_LIB_DEPS="$GALLIUM_DRI_LIB_DEPS $SELINUX_LIBS $LIBDRM_LIBS $EXPAT_LIB -lm -lpthread $DLOPEN_LIBS"
 fi
 AC_SUBST([DRI_DIRS])
 AC_SUBST([EXPAT_INCLUDES])
 AC_SUBST([DRI_LIB_DEPS])
+AC_SUBST([GALLIUM_DRI_LIB_DEPS])

 case $DRI_DIRS in
 *i915*|*i965*)
@@ -1370,10 +1348,14 @@ if test "x$enable_osmesa" = xyes; then
    OSMESA_MESA_DEPS=""
    OSMESA_PC_LIB_PRIV="-lm -lpthread $SELINUX_LIBS $DLOPEN_LIBS"
 fi
+
+OSMESA_VERSION=`echo "$VERSION" | $SED 's/\./:/g'`
+
 AC_SUBST([OSMESA_LIB_DEPS])
 AC_SUBST([OSMESA_MESA_DEPS])
 AC_SUBST([OSMESA_PC_REQ])
 AC_SUBST([OSMESA_PC_LIB_PRIV])
+AC_SUBST([OSMESA_VERSION])

 dnl
 dnl gbm configuration
@@ -1746,8 +1728,9 @@ for plat in $egl_platforms; do
 				  [AC_MSG_ERROR([cannot find libwayland-client])])
 		GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS sw/wayland"

-		m4_ifdef([WAYLAND_SCANNER_RULES],
-		         [WAYLAND_SCANNER_RULES(['$(top_srcdir)/src/egl/wayland/wayland-drm/protocol'])])
+                WAYLAND_PREFIX=`$PKG_CONFIG --variable=prefix wayland-client`
+                AC_PATH_PROG([WAYLAND_SCANNER], [wayland-scanner],,
+                             [${WAYLAND_PREFIX}/bin$PATH_SEPARATOR$PATH])
 		;;

 	x11)
@@ -1864,10 +1847,14 @@ if test "x$enable_gallium_llvm" = xyes; then
 	    dnl We can't use $LLVM_VERSION because it has 'svn' stripped out,
 	    LLVM_LIBS="-lLLVM-`$LLVM_CONFIG --version`"
 	else
-	    LLVM_LIBS="`$LLVM_CONFIG --libs engine bitwriter`"
-            if test "x$enable_opencl" = xyes; then
-                LLVM_LIBS="${LLVM_LIBS} `llvm-config --libs ipo linker instrumentation`"
+            LLVM_COMPONENTS="engine bitwriter"
+            if $LLVM_CONFIG --components | grep -q '\<mcjit\>'; then
+                LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
            fi
+            if test "x$enable_opencl" = xyes; then
+                LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo linker instrumentation"
+            fi
+            LLVM_LIBS="`$LLVM_CONFIG --libs ${LLVM_COMPONENTS}`"
 	fi
 	LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
 	LLVM_BINDIR=`$LLVM_CONFIG --bindir`
@@ -2024,7 +2011,7 @@ if test "x$with_gallium_drivers" != x; then
                USE_R600_LLVM_COMPILER=yes;
            fi
            if test "x$enable_opencl" = xyes -a "x$with_llvm_shared_libs" = xno; then
-                LLVM_LIBS="${LLVM_LIBS} `llvm-config --libs bitreader asmparser`"
+                LLVM_LIBS="${LLVM_LIBS} `$LLVM_CONFIG --libs bitreader asmparser`"
            fi
            gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600" "va-r600"
            ;;
@@ -2166,9 +2153,12 @@ AC_CONFIG_FILES([configs/current
 		src/egl/wayland/wayland-egl/Makefile
 		src/egl/wayland/wayland-egl/wayland-egl.pc
 		src/egl/wayland/wayland-drm/Makefile
+		src/glsl/Makefile
+		src/glsl/glcpp/Makefile
 		src/glsl/tests/Makefile
 		src/glx/Makefile
 		src/glx/tests/Makefile
+		src/mapi/glapi/Makefile
 		src/mapi/glapi/gen/Makefile
 		src/mapi/shared-glapi/Makefile
 		src/mapi/glapi/tests/Makefile
@@ -2188,6 +2178,7 @@ AC_CONFIG_FILES([configs/current
 		src/mesa/drivers/dri/r200/Makefile
 		src/mesa/drivers/dri/radeon/Makefile
 		src/mesa/drivers/dri/swrast/Makefile
+		src/mesa/drivers/osmesa/Makefile
 		src/mesa/drivers/x11/Makefile
 		src/mesa/gl.pc
 		src/mesa/osmesa.pc])
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -42,8 +42,8 @@ GLX_ARB_create_context (GLX 1.4 is required)          DONE

 GL 3.1:

-GLSL 1.40                                             missing: UBOS, inverse(),
-                                                      highp change
+GLSL 1.40                                             needs UBOs (in progress)
+Forward compatible context support/deprecations       not started
 Instanced drawing (GL_ARB_draw_instanced)             DONE (i965, gallium, swrast)
 Buffer copying (GL_ARB_copy_buffer)                   DONE (i965, r300, r600, swrast)
 Primitive restart (GL_NV_primitive_restart)           DONE (i965, r600)
@@ -78,10 +78,10 @@ GL_ARB_explicit_attrib_location                       DONE (i915, i965, r300, r6
 GL_ARB_occlusion_query2                               DONE (r300, r600, swrast)
 GL_ARB_sampler_objects                                DONE (i965, r300, r600)
 GL_ARB_shader_bit_encoding                            DONE
-GL_ARB_texture_rgb10_a2ui                             DONE (r600)
+GL_ARB_texture_rgb10_a2ui                             DONE (i965, r600)
 GL_ARB_texture_swizzle                                DONE (same as EXT version) (i965, r300, r600, swrast)
-GL_ARB_timer_query                                    ~60% done (the EXT variant)
-GL_ARB_instanced_arrays                               DONE (r300, r600)
+GL_ARB_timer_query                                    DONE
+GL_ARB_instanced_arrays                               DONE (i965, r300, r600)
 GL_ARB_vertex_type_2_10_10_10_rev                     DONE (r600)


@@ -91,6 +91,7 @@ GLSL 4.0                                             not started
 GL_ARB_texture_query_lod                             not started
 GL_ARB_draw_buffers_blend                            DONE (i965, r600, softpipe)
 GL_ARB_draw_indirect                                 not started
+GL_ARB_gpu_shader5                                   not started
 GL_ARB_gpu_shader_fp64                               not started
 GL_ARB_sample_shading                                not started
 GL_ARB_shader_subroutine                             not started
@@ -99,7 +100,7 @@ GL_ARB_texture_buffer_object_rgb32                   not started
 GL_ARB_texture_cube_map_array                        not started
 GL_ARB_texture_gather                                not started
 GL_ARB_transform_feedback2                           DONE
-GL_ARB_transform_feedback3                           not started
+GL_ARB_transform_feedback3                           DONE


 GL 4.1:
@@ -114,12 +115,13 @@ GL_ARB_viewport_array                                not started


 GL 4.2:
+
 GLSL 4.2                                             not started
 GL_ARB_texture_compression_bptc                      not started
 GL_ARB_compressed_texture_pixel_storage              not started
 GL_ARB_shader_atomic_counters                        not started
 GL_ARB_texture_storage                               DONE (r300, r600, swrast)
-GL_ARB_transform_feedback_instanced                  not started
+GL_ARB_transform_feedback_instanced                  DONE
 GL_ARB_base_instance                                 DONE (nv50, nvc0, r600, radeonsi)
 GL_ARB_shader_image_load_store                       not started
 GL_ARB_conservative_depth                            DONE (softpipe)
--- a/docs/WL_bind_wayland_display.spec
+++ b/docs/WL_bind_wayland_display.spec
@@ -56,12 +56,27 @@ New Procedures and Functions
    EGLBoolean eglUnbindWaylandDisplayWL(EGLDisplay dpy,
                                         struct wl_display *display);

+    EGLBoolean eglQueryWaylandBufferWL(EGLDisplay dpy,
+                                       struct wl_buffer *buffer,
+                                       EGLint attribute, EGLint *value);
+
 New Tokens

    Accepted as <target> in eglCreateImageKHR

        EGL_WAYLAND_BUFFER_WL                   0x31D5

+    Accepted in the <attrib_list> parameter of eglCreateImageKHR:
+
+        EGL_WAYLAND_PLANE_WL                    0x31D6
+
+    Possible values for EGL_TEXTURE_FORMAT:
+
+        EGL_TEXTURE_Y_U_V_WL                    0x31D7
+        EGL_TEXTURE_Y_UV_WL                     0x31D8
+        EGL_TEXTURE_Y_XUXV_WL                   0x31D9
+
+
 Additions to the EGL 1.4 Specification:

    To bind a server side wl_display to an EGLDisplay, call
@@ -80,9 +95,67 @@ Additions to the EGL 1.4 Specification:
    eglUnbindWaylandDisplayWL returns EGL_FALSE when there is no
    wl_display bound to the EGLDisplay currently otherwise EGL_TRUE.

-    Import a wl_buffer by calling eglCreateImageKHR with
-    wl_buffer as EGLClientBuffer, EGL_WAYLAND_BUFFER_WL as the target,
-    NULL context and an empty attribute_list.
+    A wl_buffer can have several planes, typically in case of planar
+    YUV formats.  Depending on the exact YUV format in use, the
+    compositor will have to create one or more EGLImages for the
+    various planes.  The eglQueryWaylandBufferWL function should be
+    used to first query the wl_buffer texture format using
+    EGL_TEXTURE_FORMAT as the attribute.  If the wl_buffer object is
+    not an EGL wl_buffer (wl_shm and other wayland extensions can
+    create wl_buffer objects of different types), this query will
+    return EGL_FALSE.  In that case the wl_buffer can not be used with
+    EGL and the compositor should have another way to get the buffer
+    contents.
+
+    If eglQueryWaylandBufferWL succeeds, the returned value will be
+    one of EGL_TEXTURE_RGB, EGL_TEXTURE_RGBA, EGL_TEXTURE_Y_U_V_WL,
+    EGL_TEXTURE_Y_UV_WL, EGL_TEXTURE_Y_XUXV_WL.  The value returned
+    describes how many EGLImages must be used, which components will
+    be sampled from each EGLImage and how they map to rgba components
+    in the shader.  The naming convention separates planes by _ and
+    within each plane, the order of R, G, B, A, Y, U, and V indicates
+    how those components map to the rgba value returned by the
+    sampler.  X indicates that the corresponding component in the rgba
+    value isn't used.
+
+    RGB and RGBA buffer types:
+
+        EGL_TEXTURE_RGB
+                One plane, samples RGB from the texture to rgb in the
+                shader.  Alpha channel is not valid.
+
+        EGL_TEXTURE_RGBA
+                One plane, samples RGBA from the texture to rgba in the
+                shader.
+
+    YUV buffer types:
+
+        EGL_TEXTURE_Y_U_V_WL
+                Three planes, samples Y from the first plane to r in
+                the shader, U from the second plane to r, and V from
+                the third plane to r.
+
+        EGL_TEXTURE_Y_UV_WL
+                Two planes, samples Y from the first plane to r in
+                the shader, U and V from the second plane to rg.
+
+        EGL_TEXTURE_Y_XUXV_WL
+                Two planes, samples Y from the first plane to r in
+                the shader, U and V from the second plane to g and a.
+
+    After querying the wl_buffer layout, create EGLImages for the
+    planes by calling eglCreateImageKHR with wl_buffer as
+    EGLClientBuffer, EGL_WAYLAND_BUFFER_WL as the target, NULL
+    context.  If no attributes are given, an EGLImage will be created
+    for the first plane.  For multi-planar buffers, specify the plane
+    to create the EGLImage for by using the EGL_WAYLAND_PLANE_WL
+    attribute.  The value of the attribute is the index of the plane,
+    as defined by the buffer format.  Writing to an EGLImage created
+    from a wl_buffer in any way (such as glTexImage2D, binding the
+    EGLImage as a renderbuffer etc) will result in undefined behavior.
+
+    Further, eglQueryWaylandBufferWL accepts attributes EGL_WIDTH and
+    EGL_HEIGHT to query the width and height of the wl_buffer.

 Issues

@@ -90,3 +163,13 @@ Revision History

    Version 1, March 1, 2011
        Initial draft (Benjamin Franzke)
+    Version 2, July 5, 2012
+        Add EGL_WAYLAND_PLANE_WL attribute to allow creating an EGLImage
+        for different planes of planar buffer. (Kristian Høgsberg)
+    Version 3, July 10, 2012
+        Add eglQueryWaylandBufferWL and the various buffer
+        formats. (Kristian Høgsberg)
+    Version 4, July 19, 2012
+        Use EGL_TEXTURE_FORMAT, EGL_TEXTURE_RGB, and EGL_TEXTURE_RGBA,
+        and just define the new YUV texture formats.  Add support for
+        EGL_WIDTH and EGL_HEIGHT in the query attributes (Kristian Høgsberg)
--- a/docs/helpwanted.html
+++ b/docs/helpwanted.html
@@ -46,6 +46,19 @@ the latest Mesa code and run tests (such as piglit) then report issues to
 the mailing list.
 </ol>

+<p>
+You can find some further To-do lists here:
+</p>
+<ul>
+  <li><a href="http://cgit.freedesktop.org/mesa/mesa/tree/docs/GL3.txt"
+    target="_parent"><b>GL3.txt</b></a></li>
+  <li><a href="http://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/docs/llvm-todo.txt"
+    target="_parent"><b>LLVMpipe - todo</b></a></li>
+  <li><a href="http://dri.freedesktop.org/wiki/MissingFunctionality"
+    target="_parent"><b>MissingFunctionality</b></a></li>
+  <li><a href="http://dri.freedesktop.org/wiki/R300ToDo"
+    target="_parent"><b>R300ToDo</b></a></li>
+</ul>

 <p>
 If you want to do something new in Mesa, first join the Mesa developer's
--- a/docs/news.html
+++ b/docs/news.html
@@ -9,6 +9,14 @@

 <h1>News</h1>

+<h2>July 10, 2012</h2>
+
+<p>
+<a href="relnotes-8.0.4.html">Mesa 8.0.4</a> is released.
+This is a bug fix release.
+</p>
+
+
 <h2>May 18, 2012</h2>

 <p>
--- a/docs/relnotes-8.0.4.html
+++ b/docs/relnotes-8.0.4.html
@@ -0,0 +1,201 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="mesa.css">
+</head>
+<body>
+
+<h1>Mesa 8.0.4 Release Notes / July 10, 2012</h1>
+
+<p>
+Mesa 8.0.4 is a bug fix release which fixes bugs found since the 8.0.3 release.
+</p>
+<p>
+Mesa 8.0.4 implements the OpenGL 3.0 API, but the version reported by
+glGetString(GL_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.0.
+</p>
+<p>
+See the <a href="install.html">Compiling/Installing page</a> for prerequisites
+for DRI hardware acceleration.
+</p>
+
+
+<h2>MD5 checksums</h2>
+<pre>
+02b96082d2f1ad72e7385f4022afafb9  MesaLib-8.0.4.tar.gz
+d546f988adfdf986cff45b1efa2d8a46  MesaLib-8.0.4.tar.bz2
+1f0fdabe6e8019d4de6c16e20e74d163  MesaLib-8.0.4.zip
+</pre>
+
+<h2>New features</h2>
+<p>None.</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45967">Bug 45967</a> - piglit getteximage-invalid-format-for-packed-type regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=47742">Bug 47742</a> - [softpipe] piglit fbo-generatemipmap-array regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48141">Bug 48141</a> - [vmwgfx] src/gallium/auxiliary/util/u_inlines.h:256:pipe_buffer_map_range: Assertion `offset + length &lt;= buffer-&gt;width0' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=48472">Bug 48472</a> - GPU Lockup while running demo (rzr - the scene is dead) in wine</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50033">Bug 50033</a> - src/mesa/state_tracker/st_cb_fbo.c:379:st_render_texture: Assertion `strb-&gt;rtt_level &lt;= strb-&gt;texture-&gt;last_level' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50621">Bug 50621</a> - Mesa fails its test suite with a buffer overflow.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=50298">Bug 50298</a> - [ILK IVB bisected]Ogles2conform GL/sin/sin_float_vert_xvary.test regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=51574">Bug 51574</a> - ir_loop_jump constructor assigns member variable to itself</li>
+
+<!-- <li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=">Bug </a> - </li> -->
+
+</ul>
+
+
+<h2>Changes</h2>
+<p>The full set of changes can be viewed by using the following GIT command:</p>
+
+<pre>
+  git log mesa-8.0.3..mesa-8.0.4
+</pre>
+
+<p>Andreas Betz (1):</p>
+<ul>
+  <li>vega: fix 565 color unpacking bug</li>
+</ul>
+
+<p>Antoine Labour (2):</p>
+<ul>
+  <li>meta: Cleanup the resources we allocate.</li>
+  <li>mesa: Free uniforms correclty.</li>
+</ul>
+
+<p>Brian Paul (22):</p>
+<ul>
+  <li>docs: add link to 8.0.3 release notes</li>
+  <li>mesa: fix Z32_FLOAT -&gt; uint conversion functions</li>
+  <li>draw: fix primitive restart bug by using the index buffer offset</li>
+  <li>st/mesa: fix glDrawPixels(GL_DEPTH_COMPONENT) color output</li>
+  <li>svga: fix synchronization bug between sampler views and surfaces</li>
+  <li>mesa: new _mesa_error_check_format_and_type() function</li>
+  <li>mesa: add missing GL_UNSIGNED_INT_10F_11F_11F_REV case</li>
+  <li>mesa: fix missing return value in getteximage_error_check()</li>
+  <li>st/mesa: pass GL_MAP_INVALIDATE_RANGE_BIT to gallium drivers</li>
+  <li>svga: add 0.5 in float-&gt;int conversion of sample min/max lod</li>
+  <li>svga: fix min/max lod clamping</li>
+  <li>svga: change PIPE_CAPF_MAX_TEXTURE_LOD_BIAS from 16.0 to 15.0</li>
+  <li>st/mesa: add fallback pipe formats for (compressed) R, RG formats</li>
+  <li>st/mesa: copy num_immediates field when copying the immediates array</li>
+  <li>svga: move svga_texture() casts/calls in svga_surface_copy()</li>
+  <li>svga: reset vertex buffer offset in svga_release_user_upl_buffers()</li>
+  <li>st/mesa: don't set PIPE_BIND_DISPLAY_TARGET for user-created renderbuffers</li>
+  <li>st/mesa: use private pipe_sampler_view in decompress_with_blit()</li>
+  <li>st/mesa: add null pointer check in st_texture_image_map()</li>
+  <li>st/mesa: fix mipmap image size computation w.r.t. texture arrays</li>
+  <li>draw: fix missing immediates bug in polygon stipple code</li>
+  <li>st/mesa: fix max_offset computation for base vertex</li>
+</ul>
+
+<p>Christoph Bumiller (1):</p>
+<ul>
+  <li>nv50: handle NEG,ABS modifiers for short RCP encoding</li>
+</ul>
+
+<p>Dylan Noblesmith (1):</p>
+<ul>
+  <li>mesa: require GL_MAX_SAMPLES &gt;= 4 for GL 3.0</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>i965/vs: Fix texelFetchOffset()</li>
+</ul>
+
+<p>Ian Romanick (5):</p>
+<ul>
+  <li>docs: Add 8.0.3 release md5sums</li>
+  <li>glx/tests: Fix off-by-one error in allocating extension string buffer</li>
+  <li>glsl: Remove spurious printf messages</li>
+  <li>glsl: Fix pi/2 constant in acos built-in function</li>
+  <li>mesa: Bump version number to 8.0.4</li>
+</ul>
+
+<p>José Fonseca (2):</p>
+<ul>
+  <li>mesa: Avoid void * arithmetic.</li>
+  <li>draw: Ensure that prepare is always run after LLVM garbagge collection.</li>
+</ul>
+
+<p>Kenneth Graunke (15):</p>
+<ul>
+  <li>mesa: Check for a negative "size" parameter in glCopyBufferSubData().</li>
+  <li>i965: Fix brw_swap_cmod() for LE/GE comparisons.</li>
+  <li>glsl: Remove unused ir_loop_jump::loop pointer.</li>
+  <li>ralloc: Fix ralloc_parent() of memory allocated out of the NULL context.</li>
+  <li>mesa: Restore depth texture state on glPopAttrib(GL_TEXTURE_BIT).</li>
+  <li>glsl/builtins: Fix textureGrad() for Array samplers.</li>
+  <li>mesa: Unbind ARB_copy_buffer and transform feedback buffers on delete.</li>
+  <li>mesa: Support BindBuffer{Base,Offset,Range} with a buffer of 0.</li>
+  <li>mesa: Unbind ARB_transform_feedback2 binding points on Delete too.</li>
+  <li>meta: Fix GL_RENDERBUFFER binding in decompress_texture_image().</li>
+  <li>i965/fs: Fix texelFetchOffset() on pre-Gen7.</li>
+  <li>i965/vs: Fix texelFetchOffset() on pre-Gen7.</li>
+  <li>i965/fs: Fix user-defined FS outputs with less than four components.</li>
+  <li>glsl: Hook up loop_variable_state destructor to plug a memory leak.</li>
+  <li>glsl: Don't trust loop analysis in the presence of function calls.</li>
+</ul>
+
+<p>Kurt Roeckx (1):</p>
+<ul>
+  <li>i830: Fix crash for GL_STENCIL_TEST in i830Enable()</li>
+</ul>
+
+<p>Lukas Rössler (1):</p>
+<ul>
+  <li>glu: fix two Clang warnings</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>mesa: allow exposing GL3 without EXT_texture_integer</li>
+  <li>st/mesa: don't do srgb-&gt;linear conversion in decompress_with_blit</li>
+</ul>
+
+<p>Paul Seidler (1):</p>
+<ul>
+  <li>tests: include mesa headers</li>
+</ul>
+
+<p>Stéphane Marchesin (3):</p>
+<ul>
+  <li>glx: Handle a null reply in QueryVersion.</li>
+  <li>i915g: Don't invert signalled/unsignalled fences</li>
+  <li>i915g: Don't avoid flushing when we have a pending fence.</li>
+</ul>
+
+<p>Thomas Gstädtner (1):</p>
+<ul>
+  <li>gallium/targets: pass ldflags parameter to MKLIB</li>
+</ul>
+
+<p>Vadim Girlin (2):</p>
+<ul>
+  <li>st/mesa: set stObj-&gt;lastLevel in guess_and_alloc_texture</li>
+  <li>r600g: check gpr count limit</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>st/mesa: Fix uninitialized members in glsl_to_tgsi_visitor constructor.</li>
+</ul>
+
+</body>
+</html>
--- a/docs/relnotes-8.1.html
+++ b/docs/relnotes-8.1.html
@@ -32,12 +32,17 @@ Note: some of the new features are only available with certain drivers.
 </p>

 <ul>
-<li>GL_ARB_base_instance extension</li>
-<li>GL_NV_read_buffer extension for ES 2.0</li>
+<li>GL_ARB_base_instance</li>
+<li>GL_ARB_blend_func_extended</li>
+<li>GL_ARB_debug_output</li>
 <li>GL_ARB_shader_bit_encoding</li>
+<li>GL_ARB_timer_query</li>
+<li>GL_ARB_transform_feedback3</li>
+<li>GL_ARB_transform_feedback_instanced</li>
 <li>GL_EXT_unpack_subimage for ES 2.0</li>
 <li>GL_EXT_read_format_bgra for ES 1.1 and 2.0</li>
-<li>GL_ARB_debug_output</li>
+<li>GL_EXT_texture_rg for ES 2.x</li>
+<li>GL_NV_read_buffer for ES 2.0</li>
 </ul>


--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -15,6 +15,7 @@ The release notes summarize what's new or changed in each Mesa release.

 <ul>
 <li><a href="relnotes-8.1.html">8.1 release notes</a>
+<li><a href="relnotes-8.0.4.html">8.0.4 release notes</a>
 <li><a href="relnotes-8.0.3.html">8.0.3 release notes</a>
 <li><a href="relnotes-8.0.2.html">8.0.2 release notes</a>
 <li><a href="relnotes-8.0.1.html">8.0.1 release notes</a>
--- a/doxygen/glsl.doxy
+++ b/doxygen/glsl.doxy
@@ -11,9 +11,8 @@ PROJECT_NAME           = "Mesa GLSL module"
 #---------------------------------------------------------------------------
 INPUT                  = ../src/glsl/
 RECURSIVE              = NO
-EXCLUDE                = ../src/glsl/glsl_lexer.cpp \
-                         ../src/glsl/glsl_lexer.h \
-                         ../src/glsl/glsl_parser.cpp \
+EXCLUDE                = ../src/glsl/glsl_lexer.cc \
+                         ../src/glsl/glsl_parser.cc \
                         ../src/glsl/glsl_parser.h
 EXCLUDE_PATTERNS       =
 #---------------------------------------------------------------------------
--- a/include/EGL/eglmesaext.h
+++ b/include/EGL/eglmesaext.h
@@ -112,14 +112,24 @@ typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETDRMDISPLAYMESA) (int fd);
 #ifndef EGL_WL_bind_wayland_display
 #define EGL_WL_bind_wayland_display 1

-#define EGL_WAYLAND_BUFFER_WL			0x31D5 /* eglCreateImageKHR target */
+#define EGL_WAYLAND_BUFFER_WL		0x31D5 /* eglCreateImageKHR target */
+#define EGL_WAYLAND_PLANE_WL		0x31D6 /* eglCreateImageKHR attribute */
+
+#define EGL_TEXTURE_Y_U_V_WL            0x31D7
+#define EGL_TEXTURE_Y_UV_WL             0x31D8
+#define EGL_TEXTURE_Y_XUXV_WL           0x31D9
+
 struct wl_display;
+struct wl_buffer;
 #ifdef EGL_EGLEXT_PROTOTYPES
 EGLAPI EGLBoolean EGLAPIENTRY eglBindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display);
 EGLAPI EGLBoolean EGLAPIENTRY eglUnbindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display);
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryWaylandBufferWL(EGLDisplay dpy, struct wl_buffer *buffer, EGLint attribute, EGLint *value);
 #endif
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLBINDWAYLANDDISPLAYWL) (EGLDisplay dpy, struct wl_display *display);
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLUNBINDWAYLANDDISPLAYWL) (EGLDisplay dpy, struct wl_display *display);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYWAYLANDBUFFERWL) (EGLDisplay dpy, struct wl_buffer *buffer, EGLint attribute, EGLint *value);
+
 #endif

 #ifndef EGL_NOK_swap_region
--- a/include/GL/glext.h
+++ b/include/GL/glext.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif

 /*
-** Copyright (c) 2007-2011 The Khronos Group Inc.
+** Copyright (c) 2007-2012 The Khronos Group Inc.
 ** 
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -29,9 +29,9 @@ extern "C" {
 */

 /* Header file version number, required by OpenGL ABI for Linux */
-/* glext.h last updated $Date: 2011-08-08 00:34:29 -0700 (Mon, 08 Aug 2011) $ */
+/* glext.h last updated $Date: 2012-04-26 00:59:42 -0700 (Thu, 26 Apr 2012) $ */
 /* Current version at http://www.opengl.org/registry/ */
-#define GL_GLEXT_VERSION 72
+#define GL_GLEXT_VERSION 81
 /* Function declaration macros - to move into glplatform.h */

 #if defined(_WIN32) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
@@ -516,8 +516,6 @@ extern "C" {
 #define GL_MINOR_VERSION                  0x821C
 #define GL_NUM_EXTENSIONS                 0x821D
 #define GL_CONTEXT_FLAGS                  0x821E
-#define GL_DEPTH_BUFFER                   0x8223
-#define GL_STENCIL_BUFFER                 0x8224
 #define GL_COMPRESSED_RED                 0x8225
 #define GL_COMPRESSED_RG                  0x8226
 #define GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT 0x0001
@@ -1021,6 +1019,7 @@ extern "C" {
 /* reuse GL_MAX_VERTEX_UNIFORM_VECTORS */
 /* reuse GL_MAX_VARYING_VECTORS */
 /* reuse GL_MAX_FRAGMENT_UNIFORM_VECTORS */
+/* reuse GL_RGB565 */
 /* Reuse tokens from ARB_get_program_binary */
 /* reuse GL_PROGRAM_BINARY_RETRIEVABLE_HINT */
 /* reuse GL_PROGRAM_BINARY_LENGTH */
@@ -1884,8 +1883,10 @@ extern "C" {
 #endif

 #ifndef GL_ARB_copy_buffer
-#define GL_COPY_READ_BUFFER               0x8F36
-#define GL_COPY_WRITE_BUFFER              0x8F37
+#define GL_COPY_READ_BUFFER_BINDING       0x8F36
+#define GL_COPY_READ_BUFFER               GL_COPY_READ_BUFFER_BINDING
+#define GL_COPY_WRITE_BUFFER_BINDING      0x8F37
+#define GL_COPY_WRITE_BUFFER              GL_COPY_WRITE_BUFFER_BINDING
 #endif

 #ifndef GL_ARB_shader_texture_lod
@@ -2133,8 +2134,10 @@ extern "C" {

 #ifndef GL_ARB_transform_feedback2
 #define GL_TRANSFORM_FEEDBACK             0x8E22
-#define GL_TRANSFORM_FEEDBACK_BUFFER_PAUSED 0x8E23
-#define GL_TRANSFORM_FEEDBACK_BUFFER_ACTIVE 0x8E24
+#define GL_TRANSFORM_FEEDBACK_PAUSED      0x8E23
+#define GL_TRANSFORM_FEEDBACK_BUFFER_PAUSED GL_TRANSFORM_FEEDBACK_PAUSED
+#define GL_TRANSFORM_FEEDBACK_ACTIVE      0x8E24
+#define GL_TRANSFORM_FEEDBACK_BUFFER_ACTIVE GL_TRANSFORM_FEEDBACK_ACTIVE
 #define GL_TRANSFORM_FEEDBACK_BINDING     0x8E25
 #endif

@@ -2158,6 +2161,7 @@ extern "C" {
 #define GL_MAX_VERTEX_UNIFORM_VECTORS     0x8DFB
 #define GL_MAX_VARYING_VECTORS            0x8DFC
 #define GL_MAX_FRAGMENT_UNIFORM_VECTORS   0x8DFD
+#define GL_RGB565                         0x8D62
 #endif

 #ifndef GL_ARB_get_program_binary
@@ -5313,6 +5317,167 @@ extern "C" {
 #define GL_SCALED_RESOLVE_NICEST_EXT      0x90BB
 #endif

+#ifndef GL_NV_path_rendering
+#define GL_PATH_FORMAT_SVG_NV             0x9070
+#define GL_PATH_FORMAT_PS_NV              0x9071
+#define GL_STANDARD_FONT_NAME_NV          0x9072
+#define GL_SYSTEM_FONT_NAME_NV            0x9073
+#define GL_FILE_NAME_NV                   0x9074
+#define GL_PATH_STROKE_WIDTH_NV           0x9075
+#define GL_PATH_END_CAPS_NV               0x9076
+#define GL_PATH_INITIAL_END_CAP_NV        0x9077
+#define GL_PATH_TERMINAL_END_CAP_NV       0x9078
+#define GL_PATH_JOIN_STYLE_NV             0x9079
+#define GL_PATH_MITER_LIMIT_NV            0x907A
+#define GL_PATH_DASH_CAPS_NV              0x907B
+#define GL_PATH_INITIAL_DASH_CAP_NV       0x907C
+#define GL_PATH_TERMINAL_DASH_CAP_NV      0x907D
+#define GL_PATH_DASH_OFFSET_NV            0x907E
+#define GL_PATH_CLIENT_LENGTH_NV          0x907F
+#define GL_PATH_FILL_MODE_NV              0x9080
+#define GL_PATH_FILL_MASK_NV              0x9081
+#define GL_PATH_FILL_COVER_MODE_NV        0x9082
+#define GL_PATH_STROKE_COVER_MODE_NV      0x9083
+#define GL_PATH_STROKE_MASK_NV            0x9084
+#define GL_PATH_SAMPLE_QUALITY_NV         0x9085
+#define GL_PATH_STROKE_BOUND_NV           0x9086
+#define GL_PATH_STROKE_OVERSAMPLE_COUNT_NV 0x9087
+#define GL_COUNT_UP_NV                    0x9088
+#define GL_COUNT_DOWN_NV                  0x9089
+#define GL_PATH_OBJECT_BOUNDING_BOX_NV    0x908A
+#define GL_CONVEX_HULL_NV                 0x908B
+#define GL_MULTI_HULLS_NV                 0x908C
+#define GL_BOUNDING_BOX_NV                0x908D
+#define GL_TRANSLATE_X_NV                 0x908E
+#define GL_TRANSLATE_Y_NV                 0x908F
+#define GL_TRANSLATE_2D_NV                0x9090
+#define GL_TRANSLATE_3D_NV                0x9091
+#define GL_AFFINE_2D_NV                   0x9092
+#define GL_PROJECTIVE_2D_NV               0x9093
+#define GL_AFFINE_3D_NV                   0x9094
+#define GL_PROJECTIVE_3D_NV               0x9095
+#define GL_TRANSPOSE_AFFINE_2D_NV         0x9096
+#define GL_TRANSPOSE_PROJECTIVE_2D_NV     0x9097
+#define GL_TRANSPOSE_AFFINE_3D_NV         0x9098
+#define GL_TRANSPOSE_PROJECTIVE_3D_NV     0x9099
+#define GL_UTF8_NV                        0x909A
+#define GL_UTF16_NV                       0x909B
+#define GL_BOUNDING_BOX_OF_BOUNDING_BOXES_NV 0x909C
+#define GL_PATH_COMMAND_COUNT_NV          0x909D
+#define GL_PATH_COORD_COUNT_NV            0x909E
+#define GL_PATH_DASH_ARRAY_COUNT_NV       0x909F
+#define GL_PATH_COMPUTED_LENGTH_NV        0x90A0
+#define GL_PATH_FILL_BOUNDING_BOX_NV      0x90A1
+#define GL_PATH_STROKE_BOUNDING_BOX_NV    0x90A2
+#define GL_SQUARE_NV                      0x90A3
+#define GL_ROUND_NV                       0x90A4
+#define GL_TRIANGULAR_NV                  0x90A5
+#define GL_BEVEL_NV                       0x90A6
+#define GL_MITER_REVERT_NV                0x90A7
+#define GL_MITER_TRUNCATE_NV              0x90A8
+#define GL_SKIP_MISSING_GLYPH_NV          0x90A9
+#define GL_USE_MISSING_GLYPH_NV           0x90AA
+#define GL_PATH_ERROR_POSITION_NV         0x90AB
+#define GL_PATH_FOG_GEN_MODE_NV           0x90AC
+#define GL_ACCUM_ADJACENT_PAIRS_NV        0x90AD
+#define GL_ADJACENT_PAIRS_NV              0x90AE
+#define GL_FIRST_TO_REST_NV               0x90AF
+#define GL_PATH_GEN_MODE_NV               0x90B0
+#define GL_PATH_GEN_COEFF_NV              0x90B1
+#define GL_PATH_GEN_COLOR_FORMAT_NV       0x90B2
+#define GL_PATH_GEN_COMPONENTS_NV         0x90B3
+#define GL_PATH_STENCIL_FUNC_NV           0x90B7
+#define GL_PATH_STENCIL_REF_NV            0x90B8
+#define GL_PATH_STENCIL_VALUE_MASK_NV     0x90B9
+#define GL_PATH_STENCIL_DEPTH_OFFSET_FACTOR_NV 0x90BD
+#define GL_PATH_STENCIL_DEPTH_OFFSET_UNITS_NV 0x90BE
+#define GL_PATH_COVER_DEPTH_FUNC_NV       0x90BF
+#define GL_PATH_DASH_OFFSET_RESET_NV      0x90B4
+#define GL_MOVE_TO_RESETS_NV              0x90B5
+#define GL_MOVE_TO_CONTINUES_NV           0x90B6
+#define GL_CLOSE_PATH_NV                  0x00
+#define GL_MOVE_TO_NV                     0x02
+#define GL_RELATIVE_MOVE_TO_NV            0x03
+#define GL_LINE_TO_NV                     0x04
+#define GL_RELATIVE_LINE_TO_NV            0x05
+#define GL_HORIZONTAL_LINE_TO_NV          0x06
+#define GL_RELATIVE_HORIZONTAL_LINE_TO_NV 0x07
+#define GL_VERTICAL_LINE_TO_NV            0x08
+#define GL_RELATIVE_VERTICAL_LINE_TO_NV   0x09
+#define GL_QUADRATIC_CURVE_TO_NV          0x0A
+#define GL_RELATIVE_QUADRATIC_CURVE_TO_NV 0x0B
+#define GL_CUBIC_CURVE_TO_NV              0x0C
+#define GL_RELATIVE_CUBIC_CURVE_TO_NV     0x0D
+#define GL_SMOOTH_QUADRATIC_CURVE_TO_NV   0x0E
+#define GL_RELATIVE_SMOOTH_QUADRATIC_CURVE_TO_NV 0x0F
+#define GL_SMOOTH_CUBIC_CURVE_TO_NV       0x10
+#define GL_RELATIVE_SMOOTH_CUBIC_CURVE_TO_NV 0x11
+#define GL_SMALL_CCW_ARC_TO_NV            0x12
+#define GL_RELATIVE_SMALL_CCW_ARC_TO_NV   0x13
+#define GL_SMALL_CW_ARC_TO_NV             0x14
+#define GL_RELATIVE_SMALL_CW_ARC_TO_NV    0x15
+#define GL_LARGE_CCW_ARC_TO_NV            0x16
+#define GL_RELATIVE_LARGE_CCW_ARC_TO_NV   0x17
+#define GL_LARGE_CW_ARC_TO_NV             0x18
+#define GL_RELATIVE_LARGE_CW_ARC_TO_NV    0x19
+#define GL_RESTART_PATH_NV                0xF0
+#define GL_DUP_FIRST_CUBIC_CURVE_TO_NV    0xF2
+#define GL_DUP_LAST_CUBIC_CURVE_TO_NV     0xF4
+#define GL_RECT_NV                        0xF6
+#define GL_CIRCULAR_CCW_ARC_TO_NV         0xF8
+#define GL_CIRCULAR_CW_ARC_TO_NV          0xFA
+#define GL_CIRCULAR_TANGENT_ARC_TO_NV     0xFC
+#define GL_ARC_TO_NV                      0xFE
+#define GL_RELATIVE_ARC_TO_NV             0xFF
+#define GL_BOLD_BIT_NV                    0x01
+#define GL_ITALIC_BIT_NV                  0x02
+#define GL_GLYPH_WIDTH_BIT_NV             0x01
+#define GL_GLYPH_HEIGHT_BIT_NV            0x02
+#define GL_GLYPH_HORIZONTAL_BEARING_X_BIT_NV 0x04
+#define GL_GLYPH_HORIZONTAL_BEARING_Y_BIT_NV 0x08
+#define GL_GLYPH_HORIZONTAL_BEARING_ADVANCE_BIT_NV 0x10
+#define GL_GLYPH_VERTICAL_BEARING_X_BIT_NV 0x20
+#define GL_GLYPH_VERTICAL_BEARING_Y_BIT_NV 0x40
+#define GL_GLYPH_VERTICAL_BEARING_ADVANCE_BIT_NV 0x80
+#define GL_GLYPH_HAS_KERNING_NV           0x100
+#define GL_FONT_X_MIN_BOUNDS_NV           0x00010000
+#define GL_FONT_Y_MIN_BOUNDS_NV           0x00020000
+#define GL_FONT_X_MAX_BOUNDS_NV           0x00040000
+#define GL_FONT_Y_MAX_BOUNDS_NV           0x00080000
+#define GL_FONT_UNITS_PER_EM_NV           0x00100000
+#define GL_FONT_ASCENDER_NV               0x00200000
+#define GL_FONT_DESCENDER_NV              0x00400000
+#define GL_FONT_HEIGHT_NV                 0x00800000
+#define GL_FONT_MAX_ADVANCE_WIDTH_NV      0x01000000
+#define GL_FONT_MAX_ADVANCE_HEIGHT_NV     0x02000000
+#define GL_FONT_UNDERLINE_POSITION_NV     0x04000000
+#define GL_FONT_UNDERLINE_THICKNESS_NV    0x08000000
+#define GL_FONT_HAS_KERNING_NV            0x10000000
+#endif
+
+#ifndef GL_AMD_pinned_memory
+#define GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD 0x9160
+#endif
+
+#ifndef GL_AMD_stencil_operation_extended
+#define GL_SET_AMD                        0x874A
+#define GL_REPLACE_VALUE_AMD              0x874B
+#define GL_STENCIL_OP_VALUE_AMD           0x874C
+#define GL_STENCIL_BACK_OP_VALUE_AMD      0x874D
+#endif
+
+#ifndef GL_AMD_vertex_shader_viewport_index
+#endif
+
+#ifndef GL_AMD_vertex_shader_layer
+#endif
+
+#ifndef GL_NV_bindless_texture
+#endif
+
+#ifndef GL_NV_shader_atomic_float
+#endif
+

 /*************************************************************/

@@ -5419,14 +5584,14 @@ typedef GLintptr GLvdpauSurfaceNV;
 #ifndef GL_VERSION_1_2
 #define GL_VERSION_1_2 1
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glBlendColor (GLclampf red, GLclampf green, GLclampf blue, GLclampf alpha);
+GLAPI void APIENTRY glBlendColor (GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha);
 GLAPI void APIENTRY glBlendEquation (GLenum mode);
 GLAPI void APIENTRY glDrawRangeElements (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices);
 GLAPI void APIENTRY glTexImage3D (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid *pixels);
 GLAPI void APIENTRY glTexSubImage3D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const GLvoid *pixels);
 GLAPI void APIENTRY glCopyTexSubImage3D (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height);
 #endif /* GL_GLEXT_PROTOTYPES */
-typedef void (APIENTRYP PFNGLBLENDCOLORPROC) (GLclampf red, GLclampf green, GLclampf blue, GLclampf alpha);
+typedef void (APIENTRYP PFNGLBLENDCOLORPROC) (GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha);
 typedef void (APIENTRYP PFNGLBLENDEQUATIONPROC) (GLenum mode);
 typedef void (APIENTRYP PFNGLDRAWRANGEELEMENTSPROC) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices);
 typedef void (APIENTRYP PFNGLTEXIMAGE3DPROC) (GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const GLvoid *pixels);
@@ -5508,7 +5673,7 @@ typedef void (APIENTRYP PFNGLRESETMINMAXPROC) (GLenum target);
 #define GL_VERSION_1_3 1
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glActiveTexture (GLenum texture);
-GLAPI void APIENTRY glSampleCoverage (GLclampf value, GLboolean invert);
+GLAPI void APIENTRY glSampleCoverage (GLfloat value, GLboolean invert);
 GLAPI void APIENTRY glCompressedTexImage3D (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid *data);
 GLAPI void APIENTRY glCompressedTexImage2D (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data);
 GLAPI void APIENTRY glCompressedTexImage1D (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const GLvoid *data);
@@ -5518,7 +5683,7 @@ GLAPI void APIENTRY glCompressedTexSubImage1D (GLenum target, GLint level, GLint
 GLAPI void APIENTRY glGetCompressedTexImage (GLenum target, GLint level, GLvoid *img);
 #endif /* GL_GLEXT_PROTOTYPES */
 typedef void (APIENTRYP PFNGLACTIVETEXTUREPROC) (GLenum texture);
-typedef void (APIENTRYP PFNGLSAMPLECOVERAGEPROC) (GLclampf value, GLboolean invert);
+typedef void (APIENTRYP PFNGLSAMPLECOVERAGEPROC) (GLfloat value, GLboolean invert);
 typedef void (APIENTRYP PFNGLCOMPRESSEDTEXIMAGE3DPROC) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const GLvoid *data);
 typedef void (APIENTRYP PFNGLCOMPRESSEDTEXIMAGE2DPROC) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const GLvoid *data);
 typedef void (APIENTRYP PFNGLCOMPRESSEDTEXIMAGE1DPROC) (GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const GLvoid *data);
@@ -5613,7 +5778,7 @@ typedef void (APIENTRYP PFNGLMULTTRANSPOSEMATRIXDPROC) (const GLdouble *m);
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glBlendFuncSeparate (GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha);
 GLAPI void APIENTRY glMultiDrawArrays (GLenum mode, const GLint *first, const GLsizei *count, GLsizei primcount);
-GLAPI void APIENTRY glMultiDrawElements (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* *indices, GLsizei primcount);
+GLAPI void APIENTRY glMultiDrawElements (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei primcount);
 GLAPI void APIENTRY glPointParameterf (GLenum pname, GLfloat param);
 GLAPI void APIENTRY glPointParameterfv (GLenum pname, const GLfloat *params);
 GLAPI void APIENTRY glPointParameteri (GLenum pname, GLint param);
@@ -5621,7 +5786,7 @@ GLAPI void APIENTRY glPointParameteriv (GLenum pname, const GLint *params);
 #endif /* GL_GLEXT_PROTOTYPES */
 typedef void (APIENTRYP PFNGLBLENDFUNCSEPARATEPROC) (GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha);
 typedef void (APIENTRYP PFNGLMULTIDRAWARRAYSPROC) (GLenum mode, const GLint *first, const GLsizei *count, GLsizei primcount);
-typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSPROC) (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* *indices, GLsizei primcount);
+typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSPROC) (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei primcount);
 typedef void (APIENTRYP PFNGLPOINTPARAMETERFPROC) (GLenum pname, GLfloat param);
 typedef void (APIENTRYP PFNGLPOINTPARAMETERFVPROC) (GLenum pname, const GLfloat *params);
 typedef void (APIENTRYP PFNGLPOINTPARAMETERIPROC) (GLenum pname, GLint param);
@@ -5791,7 +5956,7 @@ GLAPI void APIENTRY glGetVertexAttribPointerv (GLuint index, GLenum pname, GLvoi
 GLAPI GLboolean APIENTRY glIsProgram (GLuint program);
 GLAPI GLboolean APIENTRY glIsShader (GLuint shader);
 GLAPI void APIENTRY glLinkProgram (GLuint program);
-GLAPI void APIENTRY glShaderSource (GLuint shader, GLsizei count, const GLchar* *string, const GLint *length);
+GLAPI void APIENTRY glShaderSource (GLuint shader, GLsizei count, const GLchar* const *string, const GLint *length);
 GLAPI void APIENTRY glUseProgram (GLuint program);
 GLAPI void APIENTRY glUniform1f (GLint location, GLfloat v0);
 GLAPI void APIENTRY glUniform2f (GLint location, GLfloat v0, GLfloat v1);
@@ -5885,7 +6050,7 @@ typedef void (APIENTRYP PFNGLGETVERTEXATTRIBPOINTERVPROC) (GLuint index, GLenum
 typedef GLboolean (APIENTRYP PFNGLISPROGRAMPROC) (GLuint program);
 typedef GLboolean (APIENTRYP PFNGLISSHADERPROC) (GLuint shader);
 typedef void (APIENTRYP PFNGLLINKPROGRAMPROC) (GLuint program);
-typedef void (APIENTRYP PFNGLSHADERSOURCEPROC) (GLuint shader, GLsizei count, const GLchar* *string, const GLint *length);
+typedef void (APIENTRYP PFNGLSHADERSOURCEPROC) (GLuint shader, GLsizei count, const GLchar* const *string, const GLint *length);
 typedef void (APIENTRYP PFNGLUSEPROGRAMPROC) (GLuint program);
 typedef void (APIENTRYP PFNGLUNIFORM1FPROC) (GLint location, GLfloat v0);
 typedef void (APIENTRYP PFNGLUNIFORM2FPROC) (GLint location, GLfloat v0, GLfloat v1);
@@ -5981,7 +6146,7 @@ GLAPI void APIENTRY glBeginTransformFeedback (GLenum primitiveMode);
 GLAPI void APIENTRY glEndTransformFeedback (void);
 GLAPI void APIENTRY glBindBufferRange (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
 GLAPI void APIENTRY glBindBufferBase (GLenum target, GLuint index, GLuint buffer);
-GLAPI void APIENTRY glTransformFeedbackVaryings (GLuint program, GLsizei count, const GLchar* *varyings, GLenum bufferMode);
+GLAPI void APIENTRY glTransformFeedbackVaryings (GLuint program, GLsizei count, const GLchar* const *varyings, GLenum bufferMode);
 GLAPI void APIENTRY glGetTransformFeedbackVarying (GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name);
 GLAPI void APIENTRY glClampColor (GLenum target, GLenum clamp);
 GLAPI void APIENTRY glBeginConditionalRender (GLuint id, GLenum mode);
@@ -6040,7 +6205,7 @@ typedef void (APIENTRYP PFNGLBEGINTRANSFORMFEEDBACKPROC) (GLenum primitiveMode);
 typedef void (APIENTRYP PFNGLENDTRANSFORMFEEDBACKPROC) (void);
 typedef void (APIENTRYP PFNGLBINDBUFFERRANGEPROC) (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
 typedef void (APIENTRYP PFNGLBINDBUFFERBASEPROC) (GLenum target, GLuint index, GLuint buffer);
-typedef void (APIENTRYP PFNGLTRANSFORMFEEDBACKVARYINGSPROC) (GLuint program, GLsizei count, const GLchar* *varyings, GLenum bufferMode);
+typedef void (APIENTRYP PFNGLTRANSFORMFEEDBACKVARYINGSPROC) (GLuint program, GLsizei count, const GLchar* const *varyings, GLenum bufferMode);
 typedef void (APIENTRYP PFNGLGETTRANSFORMFEEDBACKVARYINGPROC) (GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name);
 typedef void (APIENTRYP PFNGLCLAMPCOLORPROC) (GLenum target, GLenum clamp);
 typedef void (APIENTRYP PFNGLBEGINCONDITIONALRENDERPROC) (GLuint id, GLenum mode);
@@ -6157,13 +6322,13 @@ typedef void (APIENTRYP PFNGLVERTEXATTRIBDIVISORPROC) (GLuint index, GLuint divi
 /* ARB_transform_feedback2 */
 /* ARB_transform_feedback3 */
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glMinSampleShading (GLclampf value);
+GLAPI void APIENTRY glMinSampleShading (GLfloat value);
 GLAPI void APIENTRY glBlendEquationi (GLuint buf, GLenum mode);
 GLAPI void APIENTRY glBlendEquationSeparatei (GLuint buf, GLenum modeRGB, GLenum modeAlpha);
 GLAPI void APIENTRY glBlendFunci (GLuint buf, GLenum src, GLenum dst);
 GLAPI void APIENTRY glBlendFuncSeparatei (GLuint buf, GLenum srcRGB, GLenum dstRGB, GLenum srcAlpha, GLenum dstAlpha);
 #endif /* GL_GLEXT_PROTOTYPES */
-typedef void (APIENTRYP PFNGLMINSAMPLESHADINGPROC) (GLclampf value);
+typedef void (APIENTRYP PFNGLMINSAMPLESHADINGPROC) (GLfloat value);
 typedef void (APIENTRYP PFNGLBLENDEQUATIONIPROC) (GLuint buf, GLenum mode);
 typedef void (APIENTRYP PFNGLBLENDEQUATIONSEPARATEIPROC) (GLuint buf, GLenum modeRGB, GLenum modeAlpha);
 typedef void (APIENTRYP PFNGLBLENDFUNCIPROC) (GLuint buf, GLenum src, GLenum dst);
@@ -6288,9 +6453,9 @@ typedef void (APIENTRYP PFNGLMULTTRANSPOSEMATRIXDARBPROC) (const GLdouble *m);
 #ifndef GL_ARB_multisample
 #define GL_ARB_multisample 1
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glSampleCoverageARB (GLclampf value, GLboolean invert);
+GLAPI void APIENTRY glSampleCoverageARB (GLfloat value, GLboolean invert);
 #endif /* GL_GLEXT_PROTOTYPES */
-typedef void (APIENTRYP PFNGLSAMPLECOVERAGEARBPROC) (GLclampf value, GLboolean invert);
+typedef void (APIENTRYP PFNGLSAMPLECOVERAGEARBPROC) (GLfloat value, GLboolean invert);
 #endif

 #ifndef GL_ARB_texture_env_add
@@ -6909,7 +7074,7 @@ typedef GLboolean (APIENTRYP PFNGLISVERTEXARRAYPROC) (GLuint array);
 #ifndef GL_ARB_uniform_buffer_object
 #define GL_ARB_uniform_buffer_object 1
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glGetUniformIndices (GLuint program, GLsizei uniformCount, const GLchar* *uniformNames, GLuint *uniformIndices);
+GLAPI void APIENTRY glGetUniformIndices (GLuint program, GLsizei uniformCount, const GLchar* const *uniformNames, GLuint *uniformIndices);
 GLAPI void APIENTRY glGetActiveUniformsiv (GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params);
 GLAPI void APIENTRY glGetActiveUniformName (GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName);
 GLAPI GLuint APIENTRY glGetUniformBlockIndex (GLuint program, const GLchar *uniformBlockName);
@@ -6917,7 +7082,7 @@ GLAPI void APIENTRY glGetActiveUniformBlockiv (GLuint program, GLuint uniformBlo
 GLAPI void APIENTRY glGetActiveUniformBlockName (GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformBlockName);
 GLAPI void APIENTRY glUniformBlockBinding (GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
 #endif /* GL_GLEXT_PROTOTYPES */
-typedef void (APIENTRYP PFNGLGETUNIFORMINDICESPROC) (GLuint program, GLsizei uniformCount, const GLchar* *uniformNames, GLuint *uniformIndices);
+typedef void (APIENTRYP PFNGLGETUNIFORMINDICESPROC) (GLuint program, GLsizei uniformCount, const GLchar* const *uniformNames, GLuint *uniformIndices);
 typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMSIVPROC) (GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params);
 typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMNAMEPROC) (GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName);
 typedef GLuint (APIENTRYP PFNGLGETUNIFORMBLOCKINDEXPROC) (GLuint program, const GLchar *uniformBlockName);
@@ -6952,12 +7117,12 @@ typedef void (APIENTRYP PFNGLCOPYBUFFERSUBDATAPROC) (GLenum readTarget, GLenum w
 GLAPI void APIENTRY glDrawElementsBaseVertex (GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLint basevertex);
 GLAPI void APIENTRY glDrawRangeElementsBaseVertex (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices, GLint basevertex);
 GLAPI void APIENTRY glDrawElementsInstancedBaseVertex (GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLsizei primcount, GLint basevertex);
-GLAPI void APIENTRY glMultiDrawElementsBaseVertex (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* *indices, GLsizei primcount, const GLint *basevertex);
+GLAPI void APIENTRY glMultiDrawElementsBaseVertex (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei primcount, const GLint *basevertex);
 #endif /* GL_GLEXT_PROTOTYPES */
 typedef void (APIENTRYP PFNGLDRAWELEMENTSBASEVERTEXPROC) (GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLint basevertex);
 typedef void (APIENTRYP PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC) (GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const GLvoid *indices, GLint basevertex);
 typedef void (APIENTRYP PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC) (GLenum mode, GLsizei count, GLenum type, const GLvoid *indices, GLsizei primcount, GLint basevertex);
-typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC) (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* *indices, GLsizei primcount, const GLint *basevertex);
+typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC) (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* const *indices, GLsizei primcount, const GLint *basevertex);
 #endif

 #ifndef GL_ARB_fragment_coord_conventions
@@ -7031,9 +7196,9 @@ typedef void (APIENTRYP PFNGLBLENDFUNCSEPARATEIARBPROC) (GLuint buf, GLenum srcR
 #ifndef GL_ARB_sample_shading
 #define GL_ARB_sample_shading 1
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glMinSampleShadingARB (GLclampf value);
+GLAPI void APIENTRY glMinSampleShadingARB (GLfloat value);
 #endif /* GL_GLEXT_PROTOTYPES */
-typedef void (APIENTRYP PFNGLMINSAMPLESHADINGARBPROC) (GLclampf value);
+typedef void (APIENTRYP PFNGLMINSAMPLESHADINGARBPROC) (GLfloat value);
 #endif

 #ifndef GL_ARB_texture_cube_map_array
@@ -7360,14 +7525,14 @@ typedef void (APIENTRYP PFNGLGETQUERYINDEXEDIVPROC) (GLenum target, GLuint index
 GLAPI void APIENTRY glReleaseShaderCompiler (void);
 GLAPI void APIENTRY glShaderBinary (GLsizei count, const GLuint *shaders, GLenum binaryformat, const GLvoid *binary, GLsizei length);
 GLAPI void APIENTRY glGetShaderPrecisionFormat (GLenum shadertype, GLenum precisiontype, GLint *range, GLint *precision);
-GLAPI void APIENTRY glDepthRangef (GLclampf n, GLclampf f);
-GLAPI void APIENTRY glClearDepthf (GLclampf d);
+GLAPI void APIENTRY glDepthRangef (GLfloat n, GLfloat f);
+GLAPI void APIENTRY glClearDepthf (GLfloat d);
 #endif /* GL_GLEXT_PROTOTYPES */
 typedef void (APIENTRYP PFNGLRELEASESHADERCOMPILERPROC) (void);
 typedef void (APIENTRYP PFNGLSHADERBINARYPROC) (GLsizei count, const GLuint *shaders, GLenum binaryformat, const GLvoid *binary, GLsizei length);
 typedef void (APIENTRYP PFNGLGETSHADERPRECISIONFORMATPROC) (GLenum shadertype, GLenum precisiontype, GLint *range, GLint *precision);
-typedef void (APIENTRYP PFNGLDEPTHRANGEFPROC) (GLclampf n, GLclampf f);
-typedef void (APIENTRYP PFNGLCLEARDEPTHFPROC) (GLclampf d);
+typedef void (APIENTRYP PFNGLDEPTHRANGEFPROC) (GLfloat n, GLfloat f);
+typedef void (APIENTRYP PFNGLCLEARDEPTHFPROC) (GLfloat d);
 #endif

 #ifndef GL_ARB_get_program_binary
@@ -7387,7 +7552,7 @@ typedef void (APIENTRYP PFNGLPROGRAMPARAMETERIPROC) (GLuint program, GLenum pnam
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glUseProgramStages (GLuint pipeline, GLbitfield stages, GLuint program);
 GLAPI void APIENTRY glActiveShaderProgram (GLuint pipeline, GLuint program);
-GLAPI GLuint APIENTRY glCreateShaderProgramv (GLenum type, GLsizei count, const GLchar* *strings);
+GLAPI GLuint APIENTRY glCreateShaderProgramv (GLenum type, GLsizei count, const GLchar* const *strings);
 GLAPI void APIENTRY glBindProgramPipeline (GLuint pipeline);
 GLAPI void APIENTRY glDeleteProgramPipelines (GLsizei n, const GLuint *pipelines);
 GLAPI void APIENTRY glGenProgramPipelines (GLsizei n, GLuint *pipelines);
@@ -7448,7 +7613,7 @@ GLAPI void APIENTRY glGetProgramPipelineInfoLog (GLuint pipeline, GLsizei bufSiz
 #endif /* GL_GLEXT_PROTOTYPES */
 typedef void (APIENTRYP PFNGLUSEPROGRAMSTAGESPROC) (GLuint pipeline, GLbitfield stages, GLuint program);
 typedef void (APIENTRYP PFNGLACTIVESHADERPROGRAMPROC) (GLuint pipeline, GLuint program);
-typedef GLuint (APIENTRYP PFNGLCREATESHADERPROGRAMVPROC) (GLenum type, GLsizei count, const GLchar* *strings);
+typedef GLuint (APIENTRYP PFNGLCREATESHADERPROGRAMVPROC) (GLenum type, GLsizei count, const GLchar* const *strings);
 typedef void (APIENTRYP PFNGLBINDPROGRAMPIPELINEPROC) (GLuint pipeline);
 typedef void (APIENTRYP PFNGLDELETEPROGRAMPIPELINESPROC) (GLsizei n, const GLuint *pipelines);
 typedef void (APIENTRYP PFNGLGENPROGRAMPIPELINESPROC) (GLsizei n, GLuint *pipelines);
@@ -7543,8 +7708,8 @@ GLAPI void APIENTRY glViewportIndexedfv (GLuint index, const GLfloat *v);
 GLAPI void APIENTRY glScissorArrayv (GLuint first, GLsizei count, const GLint *v);
 GLAPI void APIENTRY glScissorIndexed (GLuint index, GLint left, GLint bottom, GLsizei width, GLsizei height);
 GLAPI void APIENTRY glScissorIndexedv (GLuint index, const GLint *v);
-GLAPI void APIENTRY glDepthRangeArrayv (GLuint first, GLsizei count, const GLclampd *v);
-GLAPI void APIENTRY glDepthRangeIndexed (GLuint index, GLclampd n, GLclampd f);
+GLAPI void APIENTRY glDepthRangeArrayv (GLuint first, GLsizei count, const GLdouble *v);
+GLAPI void APIENTRY glDepthRangeIndexed (GLuint index, GLdouble n, GLdouble f);
 GLAPI void APIENTRY glGetFloati_v (GLenum target, GLuint index, GLfloat *data);
 GLAPI void APIENTRY glGetDoublei_v (GLenum target, GLuint index, GLdouble *data);
 #endif /* GL_GLEXT_PROTOTYPES */
@@ -7554,8 +7719,8 @@ typedef void (APIENTRYP PFNGLVIEWPORTINDEXEDFVPROC) (GLuint index, const GLfloat
 typedef void (APIENTRYP PFNGLSCISSORARRAYVPROC) (GLuint first, GLsizei count, const GLint *v);
 typedef void (APIENTRYP PFNGLSCISSORINDEXEDPROC) (GLuint index, GLint left, GLint bottom, GLsizei width, GLsizei height);
 typedef void (APIENTRYP PFNGLSCISSORINDEXEDVPROC) (GLuint index, const GLint *v);
-typedef void (APIENTRYP PFNGLDEPTHRANGEARRAYVPROC) (GLuint first, GLsizei count, const GLclampd *v);
-typedef void (APIENTRYP PFNGLDEPTHRANGEINDEXEDPROC) (GLuint index, GLclampd n, GLclampd f);
+typedef void (APIENTRYP PFNGLDEPTHRANGEARRAYVPROC) (GLuint first, GLsizei count, const GLdouble *v);
+typedef void (APIENTRYP PFNGLDEPTHRANGEINDEXEDPROC) (GLuint index, GLdouble n, GLdouble f);
 typedef void (APIENTRYP PFNGLGETFLOATI_VPROC) (GLenum target, GLuint index, GLfloat *data);
 typedef void (APIENTRYP PFNGLGETDOUBLEI_VPROC) (GLenum target, GLuint index, GLdouble *data);
 #endif
@@ -7573,12 +7738,12 @@ typedef GLsync (APIENTRYP PFNGLCREATESYNCFROMCLEVENTARBPROC) (struct _cl_context
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glDebugMessageControlARB (GLenum source, GLenum type, GLenum severity, GLsizei count, const GLuint *ids, GLboolean enabled);
 GLAPI void APIENTRY glDebugMessageInsertARB (GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *buf);
-GLAPI void APIENTRY glDebugMessageCallbackARB (GLDEBUGPROCARB callback, GLvoid *userParam);
+GLAPI void APIENTRY glDebugMessageCallbackARB (GLDEBUGPROCARB callback, const GLvoid *userParam);
 GLAPI GLuint APIENTRY glGetDebugMessageLogARB (GLuint count, GLsizei bufsize, GLenum *sources, GLenum *types, GLuint *ids, GLenum *severities, GLsizei *lengths, GLchar *messageLog);
 #endif /* GL_GLEXT_PROTOTYPES */
 typedef void (APIENTRYP PFNGLDEBUGMESSAGECONTROLARBPROC) (GLenum source, GLenum type, GLenum severity, GLsizei count, const GLuint *ids, GLboolean enabled);
 typedef void (APIENTRYP PFNGLDEBUGMESSAGEINSERTARBPROC) (GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *buf);
-typedef void (APIENTRYP PFNGLDEBUGMESSAGECALLBACKARBPROC) (GLDEBUGPROCARB callback, GLvoid *userParam);
+typedef void (APIENTRYP PFNGLDEBUGMESSAGECALLBACKARBPROC) (GLDEBUGPROCARB callback, const GLvoid *userParam);
 typedef GLuint (APIENTRYP PFNGLGETDEBUGMESSAGELOGARBPROC) (GLuint count, GLsizei bufsize, GLenum *sources, GLenum *types, GLuint *ids, GLenum *severities, GLsizei *lengths, GLchar *messageLog);
 #endif

@@ -7725,9 +7890,9 @@ typedef void (APIENTRYP PFNGLTEXTURESTORAGE3DEXTPROC) (GLuint texture, GLenum ta
 #ifndef GL_EXT_blend_color
 #define GL_EXT_blend_color 1
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glBlendColorEXT (GLclampf red, GLclampf green, GLclampf blue, GLclampf alpha);
+GLAPI void APIENTRY glBlendColorEXT (GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha);
 #endif /* GL_GLEXT_PROTOTYPES */
-typedef void (APIENTRYP PFNGLBLENDCOLOREXTPROC) (GLclampf red, GLclampf green, GLclampf blue, GLclampf alpha);
+typedef void (APIENTRYP PFNGLBLENDCOLOREXTPROC) (GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha);
 #endif

 #ifndef GL_EXT_polygon_offset
@@ -11480,6 +11645,166 @@ typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSINDIRECTAMDPROC) (GLenum mode, GLe
 #define GL_EXT_framebuffer_multisample_blit_scaled 1
 #endif

+#ifndef GL_NV_path_rendering
+#define GL_NV_path_rendering 1
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI GLuint APIENTRY glGenPathsNV (GLsizei range);
+GLAPI void APIENTRY glDeletePathsNV (GLuint path, GLsizei range);
+GLAPI GLboolean APIENTRY glIsPathNV (GLuint path);
+GLAPI void APIENTRY glPathCommandsNV (GLuint path, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+GLAPI void APIENTRY glPathCoordsNV (GLuint path, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+GLAPI void APIENTRY glPathSubCommandsNV (GLuint path, GLsizei commandStart, GLsizei commandsToDelete, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+GLAPI void APIENTRY glPathSubCoordsNV (GLuint path, GLsizei coordStart, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+GLAPI void APIENTRY glPathStringNV (GLuint path, GLenum format, GLsizei length, const GLvoid *pathString);
+GLAPI void APIENTRY glPathGlyphsNV (GLuint firstPathName, GLenum fontTarget, const GLvoid *fontName, GLbitfield fontStyle, GLsizei numGlyphs, GLenum type, const GLvoid *charcodes, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+GLAPI void APIENTRY glPathGlyphRangeNV (GLuint firstPathName, GLenum fontTarget, const GLvoid *fontName, GLbitfield fontStyle, GLuint firstGlyph, GLsizei numGlyphs, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+GLAPI void APIENTRY glWeightPathsNV (GLuint resultPath, GLsizei numPaths, const GLuint *paths, const GLfloat *weights);
+GLAPI void APIENTRY glCopyPathNV (GLuint resultPath, GLuint srcPath);
+GLAPI void APIENTRY glInterpolatePathsNV (GLuint resultPath, GLuint pathA, GLuint pathB, GLfloat weight);
+GLAPI void APIENTRY glTransformPathNV (GLuint resultPath, GLuint srcPath, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glPathParameterivNV (GLuint path, GLenum pname, const GLint *value);
+GLAPI void APIENTRY glPathParameteriNV (GLuint path, GLenum pname, GLint value);
+GLAPI void APIENTRY glPathParameterfvNV (GLuint path, GLenum pname, const GLfloat *value);
+GLAPI void APIENTRY glPathParameterfNV (GLuint path, GLenum pname, GLfloat value);
+GLAPI void APIENTRY glPathDashArrayNV (GLuint path, GLsizei dashCount, const GLfloat *dashArray);
+GLAPI void APIENTRY glPathStencilFuncNV (GLenum func, GLint ref, GLuint mask);
+GLAPI void APIENTRY glPathStencilDepthOffsetNV (GLfloat factor, GLfloat units);
+GLAPI void APIENTRY glStencilFillPathNV (GLuint path, GLenum fillMode, GLuint mask);
+GLAPI void APIENTRY glStencilStrokePathNV (GLuint path, GLint reference, GLuint mask);
+GLAPI void APIENTRY glStencilFillPathInstancedNV (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLenum fillMode, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glStencilStrokePathInstancedNV (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLint reference, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glPathCoverDepthFuncNV (GLenum func);
+GLAPI void APIENTRY glPathColorGenNV (GLenum color, GLenum genMode, GLenum colorFormat, const GLfloat *coeffs);
+GLAPI void APIENTRY glPathTexGenNV (GLenum texCoordSet, GLenum genMode, GLint components, const GLfloat *coeffs);
+GLAPI void APIENTRY glPathFogGenNV (GLenum genMode);
+GLAPI void APIENTRY glCoverFillPathNV (GLuint path, GLenum coverMode);
+GLAPI void APIENTRY glCoverStrokePathNV (GLuint path, GLenum coverMode);
+GLAPI void APIENTRY glCoverFillPathInstancedNV (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glCoverStrokePathInstancedNV (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+GLAPI void APIENTRY glGetPathParameterivNV (GLuint path, GLenum pname, GLint *value);
+GLAPI void APIENTRY glGetPathParameterfvNV (GLuint path, GLenum pname, GLfloat *value);
+GLAPI void APIENTRY glGetPathCommandsNV (GLuint path, GLubyte *commands);
+GLAPI void APIENTRY glGetPathCoordsNV (GLuint path, GLfloat *coords);
+GLAPI void APIENTRY glGetPathDashArrayNV (GLuint path, GLfloat *dashArray);
+GLAPI void APIENTRY glGetPathMetricsNV (GLbitfield metricQueryMask, GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLsizei stride, GLfloat *metrics);
+GLAPI void APIENTRY glGetPathMetricRangeNV (GLbitfield metricQueryMask, GLuint firstPathName, GLsizei numPaths, GLsizei stride, GLfloat *metrics);
+GLAPI void APIENTRY glGetPathSpacingNV (GLenum pathListMode, GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLfloat advanceScale, GLfloat kerningScale, GLenum transformType, GLfloat *returnedSpacing);
+GLAPI void APIENTRY glGetPathColorGenivNV (GLenum color, GLenum pname, GLint *value);
+GLAPI void APIENTRY glGetPathColorGenfvNV (GLenum color, GLenum pname, GLfloat *value);
+GLAPI void APIENTRY glGetPathTexGenivNV (GLenum texCoordSet, GLenum pname, GLint *value);
+GLAPI void APIENTRY glGetPathTexGenfvNV (GLenum texCoordSet, GLenum pname, GLfloat *value);
+GLAPI GLboolean APIENTRY glIsPointInFillPathNV (GLuint path, GLuint mask, GLfloat x, GLfloat y);
+GLAPI GLboolean APIENTRY glIsPointInStrokePathNV (GLuint path, GLfloat x, GLfloat y);
+GLAPI GLfloat APIENTRY glGetPathLengthNV (GLuint path, GLsizei startSegment, GLsizei numSegments);
+GLAPI GLboolean APIENTRY glPointAlongPathNV (GLuint path, GLsizei startSegment, GLsizei numSegments, GLfloat distance, GLfloat *x, GLfloat *y, GLfloat *tangentX, GLfloat *tangentY);
+#endif /* GL_GLEXT_PROTOTYPES */
+typedef GLuint (APIENTRYP PFNGLGENPATHSNVPROC) (GLsizei range);
+typedef void (APIENTRYP PFNGLDELETEPATHSNVPROC) (GLuint path, GLsizei range);
+typedef GLboolean (APIENTRYP PFNGLISPATHNVPROC) (GLuint path);
+typedef void (APIENTRYP PFNGLPATHCOMMANDSNVPROC) (GLuint path, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+typedef void (APIENTRYP PFNGLPATHCOORDSNVPROC) (GLuint path, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+typedef void (APIENTRYP PFNGLPATHSUBCOMMANDSNVPROC) (GLuint path, GLsizei commandStart, GLsizei commandsToDelete, GLsizei numCommands, const GLubyte *commands, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+typedef void (APIENTRYP PFNGLPATHSUBCOORDSNVPROC) (GLuint path, GLsizei coordStart, GLsizei numCoords, GLenum coordType, const GLvoid *coords);
+typedef void (APIENTRYP PFNGLPATHSTRINGNVPROC) (GLuint path, GLenum format, GLsizei length, const GLvoid *pathString);
+typedef void (APIENTRYP PFNGLPATHGLYPHSNVPROC) (GLuint firstPathName, GLenum fontTarget, const GLvoid *fontName, GLbitfield fontStyle, GLsizei numGlyphs, GLenum type, const GLvoid *charcodes, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+typedef void (APIENTRYP PFNGLPATHGLYPHRANGENVPROC) (GLuint firstPathName, GLenum fontTarget, const GLvoid *fontName, GLbitfield fontStyle, GLuint firstGlyph, GLsizei numGlyphs, GLenum handleMissingGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
+typedef void (APIENTRYP PFNGLWEIGHTPATHSNVPROC) (GLuint resultPath, GLsizei numPaths, const GLuint *paths, const GLfloat *weights);
+typedef void (APIENTRYP PFNGLCOPYPATHNVPROC) (GLuint resultPath, GLuint srcPath);
+typedef void (APIENTRYP PFNGLINTERPOLATEPATHSNVPROC) (GLuint resultPath, GLuint pathA, GLuint pathB, GLfloat weight);
+typedef void (APIENTRYP PFNGLTRANSFORMPATHNVPROC) (GLuint resultPath, GLuint srcPath, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLPATHPARAMETERIVNVPROC) (GLuint path, GLenum pname, const GLint *value);
+typedef void (APIENTRYP PFNGLPATHPARAMETERINVPROC) (GLuint path, GLenum pname, GLint value);
+typedef void (APIENTRYP PFNGLPATHPARAMETERFVNVPROC) (GLuint path, GLenum pname, const GLfloat *value);
+typedef void (APIENTRYP PFNGLPATHPARAMETERFNVPROC) (GLuint path, GLenum pname, GLfloat value);
+typedef void (APIENTRYP PFNGLPATHDASHARRAYNVPROC) (GLuint path, GLsizei dashCount, const GLfloat *dashArray);
+typedef void (APIENTRYP PFNGLPATHSTENCILFUNCNVPROC) (GLenum func, GLint ref, GLuint mask);
+typedef void (APIENTRYP PFNGLPATHSTENCILDEPTHOFFSETNVPROC) (GLfloat factor, GLfloat units);
+typedef void (APIENTRYP PFNGLSTENCILFILLPATHNVPROC) (GLuint path, GLenum fillMode, GLuint mask);
+typedef void (APIENTRYP PFNGLSTENCILSTROKEPATHNVPROC) (GLuint path, GLint reference, GLuint mask);
+typedef void (APIENTRYP PFNGLSTENCILFILLPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLenum fillMode, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLSTENCILSTROKEPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLint reference, GLuint mask, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLPATHCOVERDEPTHFUNCNVPROC) (GLenum func);
+typedef void (APIENTRYP PFNGLPATHCOLORGENNVPROC) (GLenum color, GLenum genMode, GLenum colorFormat, const GLfloat *coeffs);
+typedef void (APIENTRYP PFNGLPATHTEXGENNVPROC) (GLenum texCoordSet, GLenum genMode, GLint components, const GLfloat *coeffs);
+typedef void (APIENTRYP PFNGLPATHFOGGENNVPROC) (GLenum genMode);
+typedef void (APIENTRYP PFNGLCOVERFILLPATHNVPROC) (GLuint path, GLenum coverMode);
+typedef void (APIENTRYP PFNGLCOVERSTROKEPATHNVPROC) (GLuint path, GLenum coverMode);
+typedef void (APIENTRYP PFNGLCOVERFILLPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLCOVERSTROKEPATHINSTANCEDNVPROC) (GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLenum coverMode, GLenum transformType, const GLfloat *transformValues);
+typedef void (APIENTRYP PFNGLGETPATHPARAMETERIVNVPROC) (GLuint path, GLenum pname, GLint *value);
+typedef void (APIENTRYP PFNGLGETPATHPARAMETERFVNVPROC) (GLuint path, GLenum pname, GLfloat *value);
+typedef void (APIENTRYP PFNGLGETPATHCOMMANDSNVPROC) (GLuint path, GLubyte *commands);
+typedef void (APIENTRYP PFNGLGETPATHCOORDSNVPROC) (GLuint path, GLfloat *coords);
+typedef void (APIENTRYP PFNGLGETPATHDASHARRAYNVPROC) (GLuint path, GLfloat *dashArray);
+typedef void (APIENTRYP PFNGLGETPATHMETRICSNVPROC) (GLbitfield metricQueryMask, GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLsizei stride, GLfloat *metrics);
+typedef void (APIENTRYP PFNGLGETPATHMETRICRANGENVPROC) (GLbitfield metricQueryMask, GLuint firstPathName, GLsizei numPaths, GLsizei stride, GLfloat *metrics);
+typedef void (APIENTRYP PFNGLGETPATHSPACINGNVPROC) (GLenum pathListMode, GLsizei numPaths, GLenum pathNameType, const GLvoid *paths, GLuint pathBase, GLfloat advanceScale, GLfloat kerningScale, GLenum transformType, GLfloat *returnedSpacing);
+typedef void (APIENTRYP PFNGLGETPATHCOLORGENIVNVPROC) (GLenum color, GLenum pname, GLint *value);
+typedef void (APIENTRYP PFNGLGETPATHCOLORGENFVNVPROC) (GLenum color, GLenum pname, GLfloat *value);
+typedef void (APIENTRYP PFNGLGETPATHTEXGENIVNVPROC) (GLenum texCoordSet, GLenum pname, GLint *value);
+typedef void (APIENTRYP PFNGLGETPATHTEXGENFVNVPROC) (GLenum texCoordSet, GLenum pname, GLfloat *value);
+typedef GLboolean (APIENTRYP PFNGLISPOINTINFILLPATHNVPROC) (GLuint path, GLuint mask, GLfloat x, GLfloat y);
+typedef GLboolean (APIENTRYP PFNGLISPOINTINSTROKEPATHNVPROC) (GLuint path, GLfloat x, GLfloat y);
+typedef GLfloat (APIENTRYP PFNGLGETPATHLENGTHNVPROC) (GLuint path, GLsizei startSegment, GLsizei numSegments);
+typedef GLboolean (APIENTRYP PFNGLPOINTALONGPATHNVPROC) (GLuint path, GLsizei startSegment, GLsizei numSegments, GLfloat distance, GLfloat *x, GLfloat *y, GLfloat *tangentX, GLfloat *tangentY);
+#endif
+
+#ifndef GL_AMD_pinned_memory
+#define GL_AMD_pinned_memory 1
+#endif
+
+#ifndef GL_AMD_stencil_operation_extended
+#define GL_AMD_stencil_operation_extended 1
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glStencilOpValueAMD (GLenum face, GLuint value);
+#endif /* GL_GLEXT_PROTOTYPES */
+typedef void (APIENTRYP PFNGLSTENCILOPVALUEAMDPROC) (GLenum face, GLuint value);
+#endif
+
+#ifndef GL_AMD_vertex_shader_viewport_index
+#define GL_AMD_vertex_shader_viewport_index 1
+#endif
+
+#ifndef GL_AMD_vertex_shader_layer
+#define GL_AMD_vertex_shader_layer 1
+#endif
+
+#ifndef GL_NV_bindless_texture
+#define GL_NV_bindless_texture 1
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI GLuint64 APIENTRY glGetTextureHandleNV (GLuint texture);
+GLAPI GLuint64 APIENTRY glGetTextureSamplerHandleNV (GLuint texture, GLuint sampler);
+GLAPI void APIENTRY glMakeTextureHandleResidentNV (GLuint64 handle);
+GLAPI void APIENTRY glMakeTextureHandleNonResidentNV (GLuint64 handle);
+GLAPI GLuint64 APIENTRY glGetImageHandleNV (GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum format);
+GLAPI void APIENTRY glMakeImageHandleResidentNV (GLuint64 handle, GLenum access);
+GLAPI void APIENTRY glMakeImageHandleNonResidentNV (GLuint64 handle);
+GLAPI void APIENTRY glUniformHandleui64NV (GLint location, GLuint64 value);
+GLAPI void APIENTRY glUniformHandleui64vNV (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniformHandleui64NV (GLuint program, GLint location, GLuint64 value);
+GLAPI void APIENTRY glProgramUniformHandleui64vNV (GLuint program, GLint location, GLsizei count, const GLuint64 *values);
+GLAPI GLboolean APIENTRY glIsTextureHandleResidentNV (GLuint64 handle);
+GLAPI GLboolean APIENTRY glIsImageHandleResidentNV (GLuint64 handle);
+#endif /* GL_GLEXT_PROTOTYPES */
+typedef GLuint64 (APIENTRYP PFNGLGETTEXTUREHANDLENVPROC) (GLuint texture);
+typedef GLuint64 (APIENTRYP PFNGLGETTEXTURESAMPLERHANDLENVPROC) (GLuint texture, GLuint sampler);
+typedef void (APIENTRYP PFNGLMAKETEXTUREHANDLERESIDENTNVPROC) (GLuint64 handle);
+typedef void (APIENTRYP PFNGLMAKETEXTUREHANDLENONRESIDENTNVPROC) (GLuint64 handle);
+typedef GLuint64 (APIENTRYP PFNGLGETIMAGEHANDLENVPROC) (GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum format);
+typedef void (APIENTRYP PFNGLMAKEIMAGEHANDLERESIDENTNVPROC) (GLuint64 handle, GLenum access);
+typedef void (APIENTRYP PFNGLMAKEIMAGEHANDLENONRESIDENTNVPROC) (GLuint64 handle);
+typedef void (APIENTRYP PFNGLUNIFORMHANDLEUI64NVPROC) (GLint location, GLuint64 value);
+typedef void (APIENTRYP PFNGLUNIFORMHANDLEUI64VNVPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMHANDLEUI64NVPROC) (GLuint program, GLint location, GLuint64 value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORMHANDLEUI64VNVPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *values);
+typedef GLboolean (APIENTRYP PFNGLISTEXTUREHANDLERESIDENTNVPROC) (GLuint64 handle);
+typedef GLboolean (APIENTRYP PFNGLISIMAGEHANDLERESIDENTNVPROC) (GLuint64 handle);
+#endif
+
+#ifndef GL_NV_shader_atomic_float
+#define GL_NV_shader_atomic_float 1
+#endif
+

 #ifdef __cplusplus
 }
--- a/include/GL/glxext.h
+++ b/include/GL/glxext.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif

 /*
-** Copyright (c) 2007-2010 The Khronos Group Inc.
+** Copyright (c) 2007-2012 The Khronos Group Inc.
 ** 
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -48,9 +48,9 @@ extern "C" {
 /*************************************************************/

 /* Header file version number, required by OpenGL ABI for Linux */
-/* glxext.h last updated 2010/08/06 */
+/* glxext.h last updated 2012/02/29 */
 /* Current version at http://www.opengl.org/registry/ */
-#define GLX_GLXEXT_VERSION 32
+#define GLX_GLXEXT_VERSION 33

 #ifndef GLX_VERSION_1_3
 #define GLX_WINDOW_BIT                     0x00000001
@@ -440,6 +440,10 @@ extern "C" {
 #define GLX_CONTEXT_ES2_PROFILE_BIT_EXT    0x00000004
 #endif

+#ifndef GLX_EXT_swap_control_tear
+#define GLX_LATE_SWAPS_TEAR_EXT            0x20F3
+#endif
+

 /*************************************************************/

@@ -964,9 +968,9 @@ typedef void ( * PFNGLXRELEASEVIDEOCAPTUREDEVICENVPROC) (Display *dpy, GLXVideoC
 #ifndef GLX_EXT_swap_control
 #define GLX_EXT_swap_control 1
 #ifdef GLX_GLXEXT_PROTOTYPES
-extern int glXSwapIntervalEXT (Display *dpy, GLXDrawable drawable, int interval);
+extern void glXSwapIntervalEXT (Display *dpy, GLXDrawable drawable, int interval);
 #endif /* GLX_GLXEXT_PROTOTYPES */
-typedef int ( * PFNGLXSWAPINTERVALEXTPROC) (Display *dpy, GLXDrawable drawable, int interval);
+typedef void ( * PFNGLXSWAPINTERVALEXTPROC) (Display *dpy, GLXDrawable drawable, int interval);
 #endif

 #ifndef GLX_NV_copy_image
@@ -985,6 +989,10 @@ typedef void ( * PFNGLXCOPYIMAGESUBDATANVPROC) (Display *dpy, GLXContext srcCtx,
 #define GLX_NV_multisample_coverage 1
 #endif

+#ifndef GLX_EXT_swap_control_tear
+#define GLX_EXT_swap_control_tear 1
+#endif
+

 #ifdef __cplusplus
 }
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -808,9 +808,27 @@ struct __DRIdri2LoaderExtensionRec {
 #define __DRI_CTX_ATTRIB_MINOR_VERSION		1
 #define __DRI_CTX_ATTRIB_FLAGS			2

+/**
+ * \requires __DRI2_ROBUSTNESS.
+ */
+#define __DRI_CTX_ATTRIB_RESET_STRATEGY		3
+
 #define __DRI_CTX_FLAG_DEBUG			0x00000001
 #define __DRI_CTX_FLAG_FORWARD_COMPATIBLE	0x00000002

+/**
+ * \requires __DRI2_ROBUSTNESS.
+ */
+#define __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS	0x00000004
+
+/**
+ * \name Context reset strategies.
+ */
+/*@{*/
+#define __DRI_CTX_RESET_NO_NOTIFICATION		0
+#define __DRI_CTX_RESET_LOSE_CONTEXT		1
+/*@}*/
+
 /**
 * \name Reasons that __DRIdri2Extension::createContextAttribs might fail
 */
@@ -894,19 +912,26 @@ struct __DRIdri2ExtensionRec {
 * extensions.
 */
 #define __DRI_IMAGE "DRI_IMAGE"
-#define __DRI_IMAGE_VERSION 4
+#define __DRI_IMAGE_VERSION 5

 /**
 * These formats correspond to the similarly named MESA_FORMAT_*
 * tokens, except in the native endian of the CPU.  For example, on
 * little endian __DRI_IMAGE_FORMAT_XRGB8888 corresponds to
 * MESA_FORMAT_XRGB8888, but MESA_FORMAT_XRGB8888_REV on big endian.
+ *
+ * __DRI_IMAGE_FORMAT_NONE is for images that aren't directly usable
+ * by the driver (YUV planar formats) but serve as a base image for
+ * creating sub-images for the different planes within the image.
 */
 #define __DRI_IMAGE_FORMAT_RGB565       0x1001
 #define __DRI_IMAGE_FORMAT_XRGB8888     0x1002
 #define __DRI_IMAGE_FORMAT_ARGB8888     0x1003
 #define __DRI_IMAGE_FORMAT_ABGR8888     0x1004
 #define __DRI_IMAGE_FORMAT_XBGR8888     0x1005
+#define __DRI_IMAGE_FORMAT_R8           0x1006 /* Since version 5 */
+#define __DRI_IMAGE_FORMAT_GR88         0x1007
+#define __DRI_IMAGE_FORMAT_NONE         0x1008

 #define __DRI_IMAGE_USE_SHARE		0x0001
 #define __DRI_IMAGE_USE_SCANOUT		0x0002
@@ -921,6 +946,8 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_ATTRIB_HANDLE	0x2001
 #define __DRI_IMAGE_ATTRIB_NAME		0x2002
 #define __DRI_IMAGE_ATTRIB_FORMAT	0x2003 /* available in versions 3+ */
+#define __DRI_IMAGE_ATTRIB_WIDTH	0x2004 /* available in versions 5+ */
+#define __DRI_IMAGE_ATTRIB_HEIGHT	0x2005

 typedef struct __DRIimageRec          __DRIimage;
 typedef struct __DRIimageExtensionRec __DRIimageExtension;
@@ -963,6 +990,26 @@ struct __DRIimageExtensionRec {
    * \since 4
    */
   int (*write)(__DRIimage *image, const void *buf, size_t count);
+
+   /**
+    * Create an image out of a sub-region of a parent image.  This
+    * entry point lets us create individual __DRIimages for different
+    * planes in a planar buffer (typically yuv), for example.  While a
+    * sub-image shares the underlying buffer object with the parent
+    * image and other sibling sub-images, the life times of parent and
+    * sub-images are not dependent.  Destroying the parent or a
+    * sub-image doesn't affect other images.  The underlying buffer
+    * object is freed when no __DRIimage remains that references it.
+    *
+    * Sub-images may overlap, but rendering to overlapping sub-images
+    * is undefined.
+    *
+    * \since 5
+    */
+    __DRIimage *(*createSubImage)(__DRIimage *image,
+                                  int width, int height, int format,
+                                  int offset, int pitch,
+                                  void *loaderPrivate);
 };


@@ -1000,4 +1047,21 @@ struct __DRI2configQueryExtensionRec {
   int (*configQueryi)(__DRIscreen *screen, const char *var, GLint *val);
   int (*configQueryf)(__DRIscreen *screen, const char *var, GLfloat *val);
 };
+
+/**
+ * Robust context driver extension.
+ *
+ * Existence of this extension means the driver can accept the
+ * \c __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS flag and the
+ * \c __DRI_CTX_ATTRIB_RESET_STRATEGY attribute in
+ * \c __DRIdri2ExtensionRec::createContextAttribs.
+ */
+#define __DRI2_ROBUSTNESS "DRI_Robustness"
+#define __DRI2_ROBUSTNESS_VERSION 1
+
+typedef struct __DRIrobustnessExtensionRec __DRIrobustnessExtension;
+struct __DRIrobustnessExtensionRec {
+   __DRIextension base;
+};
+
 #endif
--- a/include/GLES2/gl2ext.h
+++ b/include/GLES2/gl2ext.h
@@ -1,7 +1,7 @@
 #ifndef __gl2ext_h_
 #define __gl2ext_h_

-/* $Revision: 15049 $ on $Date:: 2011-07-06 17:28:16 -0700 #$ */
+/* $Revision: 18099 $ on $Date:: 2012-06-06 09:16:19 -0700 #$ */

 #ifdef __cplusplus
 extern "C" {
@@ -93,6 +93,9 @@ typedef void* GLeglImageOES;
 #define GL_DEPTH24_STENCIL8_OES                                 0x88F0
 #endif

+/* GL_OES_required_internalformat */
+/* No new tokens introduced by this extension. */
+
 /* GL_OES_rgb8_rgba8 */
 #ifndef GL_OES_rgb8_rgba8
 #define GL_RGB8_OES                                             0x8051
@@ -207,6 +210,37 @@ typedef void* GLeglImageOES;
 #define GL_MAX_SAMPLES_ANGLE                                    0x8D57
 #endif

+/* GL_ANGLE_instanced_arrays */
+#ifndef GL_ANGLE_instanced_arrays 
+#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR_ANGLE                    0x88FE
+#endif
+
+/* GL_ANGLE_pack_reverse_row_order */
+#ifndef GL_ANGLE_pack_reverse_row_order 
+#define GL_PACK_REVERSE_ROW_ORDER_ANGLE                         0x93A4
+#endif
+
+/* GL_ANGLE_texture_compression_dxt3 */
+#ifndef GL_ANGLE_texture_compression_dxt3 
+#define GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE                      0x83F2
+#endif
+
+/* GL_ANGLE_texture_compression_dxt5 */
+#ifndef GL_ANGLE_texture_compression_dxt5 
+#define GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE                      0x83F3
+#endif
+
+/* GL_ANGLE_texture_usage */
+#ifndef GL_ANGLE_texture_usage 
+#define GL_TEXTURE_USAGE_ANGLE                                  0x93A2
+#define GL_FRAMEBUFFER_ATTACHMENT_ANGLE                         0x93A3
+#endif
+
+/* GL_ANGLE_translated_shader_source */
+#ifndef GL_ANGLE_translated_shader_source 
+#define GL_TRANSLATED_SHADER_SOURCE_LENGTH_ANGLE                0x93A0
+#endif
+
 /*------------------------------------------------------------------------*
 * APPLE extension tokens
 *------------------------------------------------------------------------*/
@@ -261,6 +295,29 @@ typedef void* GLeglImageOES;
 #define GL_MAX_EXT                                              0x8008
 #endif

+/* GL_EXT_color_buffer_half_float */
+#ifndef GL_EXT_color_buffer_half_float
+#define GL_RGBA16F_EXT                                          0x881A
+#define GL_RGB16F_EXT                                           0x881B
+#define GL_RG16F_EXT                                            0x822F
+#define GL_R16F_EXT                                             0x822D
+#define GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE_EXT            0x8211
+#define GL_UNSIGNED_NORMALIZED_EXT                              0x8C17
+#endif
+
+/* GL_EXT_debug_label */
+#ifndef GL_EXT_debug_label
+#define GL_PROGRAM_PIPELINE_OBJECT_EXT                          0x8A4F
+#define GL_PROGRAM_OBJECT_EXT                                   0x8B40
+#define GL_SHADER_OBJECT_EXT                                    0x8B48
+#define GL_BUFFER_OBJECT_EXT                                    0x9151
+#define GL_QUERY_OBJECT_EXT                                     0x9153
+#define GL_VERTEX_ARRAY_OBJECT_EXT                              0x9154
+#endif
+
+/* GL_EXT_debug_marker */
+/* No new tokens introduced by this extension. */
+
 /* GL_EXT_discard_framebuffer */
 #ifndef GL_EXT_discard_framebuffer
 #define GL_COLOR_EXT                                            0x1800
@@ -268,9 +325,26 @@ typedef void* GLeglImageOES;
 #define GL_STENCIL_EXT                                          0x1802
 #endif

+/* GL_EXT_multisampled_render_to_texture */
+#ifndef GL_EXT_multisampled_render_to_texture
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_SAMPLES_EXT           0x8D6C
+#define GL_RENDERBUFFER_SAMPLES_EXT                             0x9133
+#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_EXT               0x9134
+#define GL_MAX_SAMPLES_EXT                                      0x9135
+#endif
+
 /* GL_EXT_multi_draw_arrays */
 /* No new tokens introduced by this extension. */

+/* GL_EXT_occlusion_query_boolean */
+#ifndef GL_EXT_occlusion_query_boolean
+#define GL_ANY_SAMPLES_PASSED_EXT                               0x8C2F
+#define GL_ANY_SAMPLES_PASSED_CONSERVATIVE_EXT                  0x8D6A
+#define GL_CURRENT_QUERY_EXT                                    0x8865
+#define GL_QUERY_RESULT_EXT                                     0x8866
+#define GL_QUERY_RESULT_AVAILABLE_EXT                           0x8867
+#endif
+
 /* GL_EXT_read_format_bgra */
 #ifndef GL_EXT_read_format_bgra
 #define GL_BGRA_EXT                                             0x80E1
@@ -278,9 +352,53 @@ typedef void* GLeglImageOES;
 #define GL_UNSIGNED_SHORT_1_5_5_5_REV_EXT                       0x8366
 #endif

+/* GL_EXT_robustness */
+#ifndef GL_EXT_robustness
+/* reuse GL_NO_ERROR */
+#define GL_GUILTY_CONTEXT_RESET_EXT                             0x8253
+#define GL_INNOCENT_CONTEXT_RESET_EXT                           0x8254
+#define GL_UNKNOWN_CONTEXT_RESET_EXT                            0x8255
+#define GL_CONTEXT_ROBUST_ACCESS_EXT                            0x90F3
+#define GL_RESET_NOTIFICATION_STRATEGY_EXT                      0x8256
+#define GL_LOSE_CONTEXT_ON_RESET_EXT                            0x8252
+#define GL_NO_RESET_NOTIFICATION_EXT                            0x8261
+#endif
+
+/* GL_EXT_separate_shader_objects */
+#ifndef GL_EXT_separate_shader_objects
+#define GL_VERTEX_SHADER_BIT_EXT                                0x00000001
+#define GL_FRAGMENT_SHADER_BIT_EXT                              0x00000002
+#define GL_ALL_SHADER_BITS_EXT                                  0xFFFFFFFF
+#define GL_PROGRAM_SEPARABLE_EXT                                0x8258
+#define GL_ACTIVE_PROGRAM_EXT                                   0x8259
+#define GL_PROGRAM_PIPELINE_BINDING_EXT                         0x825A
+#endif
+
 /* GL_EXT_shader_texture_lod */
 /* No new tokens introduced by this extension. */

+/* GL_EXT_shadow_samplers */
+#ifndef GL_EXT_shadow_samplers
+#define GL_TEXTURE_COMPARE_MODE_EXT                             0x884C
+#define GL_TEXTURE_COMPARE_FUNC_EXT                             0x884D
+#define GL_COMPARE_REF_TO_TEXTURE_EXT                           0x884E
+#define GL_SAMPLER_2D_SHADOW_EXT                                0x8B62
+#endif
+
+/* GL_EXT_sRGB */
+#ifndef GL_EXT_sRGB
+#define GL_SRGB_EXT                                             0x8C40
+#define GL_SRGB_ALPHA_EXT                                       0x8C42
+#define GL_SRGB8_ALPHA8_EXT                                     0x8C43
+#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING_EXT            0x8210
+#endif
+
+/* GL_EXT_texture_compression_dxt1 */
+#ifndef GL_EXT_texture_compression_dxt1
+#define GL_COMPRESSED_RGB_S3TC_DXT1_EXT                         0x83F0
+#define GL_COMPRESSED_RGBA_S3TC_DXT1_EXT                        0x83F1
+#endif
+
 /* GL_EXT_texture_filter_anisotropic */
 #ifndef GL_EXT_texture_filter_anisotropic
 #define GL_TEXTURE_MAX_ANISOTROPY_EXT                           0x84FE
@@ -292,17 +410,46 @@ typedef void* GLeglImageOES;
 #define GL_BGRA_EXT                                             0x80E1
 #endif

+/* GL_EXT_texture_rg */
+#ifndef GL_EXT_texture_rg
+#define GL_RED_EXT                                              0x1903
+#define GL_RG_EXT                                               0x8227
+#define GL_R8_EXT                                               0x8229
+#define GL_RG8_EXT                                              0x822B
+#endif
+
+/* GL_EXT_texture_storage */
+#ifndef GL_EXT_texture_storage
+#define GL_TEXTURE_IMMUTABLE_FORMAT_EXT                         0x912F
+#define GL_ALPHA8_EXT                                           0x803C  
+#define GL_LUMINANCE8_EXT                                       0x8040
+#define GL_LUMINANCE8_ALPHA8_EXT                                0x8045
+#define GL_RGBA32F_EXT                                          0x8814  
+#define GL_RGB32F_EXT                                           0x8815
+#define GL_ALPHA32F_EXT                                         0x8816
+#define GL_LUMINANCE32F_EXT                                     0x8818
+#define GL_LUMINANCE_ALPHA32F_EXT                               0x8819
+/* reuse GL_RGBA16F_EXT */
+/* reuse GL_RGB16F_EXT */
+#define GL_ALPHA16F_EXT                                         0x881C
+#define GL_LUMINANCE16F_EXT                                     0x881E
+#define GL_LUMINANCE_ALPHA16F_EXT                               0x881F
+#define GL_RGB10_A2_EXT                                         0x8059  
+#define GL_RGB10_EXT                                            0x8052
+#define GL_BGRA8_EXT                                            0x93A1
+#define GL_R8_EXT                                               0x8229
+#define GL_RG8_EXT                                              0x822B
+#define GL_R32F_EXT                                             0x822E  
+#define GL_RG32F_EXT                                            0x8230
+#define GL_R16F_EXT                                             0x822D
+#define GL_RG16F_EXT                                            0x822F
+#endif
+
 /* GL_EXT_texture_type_2_10_10_10_REV */
 #ifndef GL_EXT_texture_type_2_10_10_10_REV
 #define GL_UNSIGNED_INT_2_10_10_10_REV_EXT                      0x8368
 #endif

-/* GL_EXT_texture_compression_dxt1 */
-#ifndef GL_EXT_texture_compression_dxt1
-#define GL_COMPRESSED_RGB_S3TC_DXT1_EXT                         0x83F0
-#define GL_COMPRESSED_RGBA_S3TC_DXT1_EXT                        0x83F1
-#endif
-
 /* GL_EXT_unpack_subimage */
 #ifndef GL_EXT_unpack_subimage
 #define GL_UNPACK_ROW_LENGTH                                    0x0CF2
@@ -319,6 +466,15 @@ typedef void* GLeglImageOES;
 #define GL_SHADER_BINARY_DMP                                    0x9250
 #endif

+/*------------------------------------------------------------------------*
+ * FJ extension tokens
+ *------------------------------------------------------------------------*/
+
+/* GL_FJ_shader_binary_GCCSO */
+#ifndef GL_FJ_shader_binary_GCCSO
+#define GCCSO_SHADER_BINARY_FJ                                  0x9260
+#endif
+
 /*------------------------------------------------------------------------*
 * IMG extension tokens
 *------------------------------------------------------------------------*/
@@ -631,6 +787,11 @@ typedef void (GL_APIENTRYP PFNGLGETBUFFERPOINTERVOESPROC) (GLenum target, GLenum
 #define GL_OES_packed_depth_stencil 1
 #endif

+/* GL_OES_required_internalformat */
+#ifndef GL_OES_required_internalformat
+#define GL_OES_required_internalformat 1
+#endif
+
 /* GL_OES_rgb8_rgba8 */
 #ifndef GL_OES_rgb8_rgba8
 #define GL_OES_rgb8_rgba8 1
@@ -790,6 +951,45 @@ GL_APICALL void GL_APIENTRY glRenderbufferStorageMultisampleANGLE (GLenum target
 typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEANGLEPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
 #endif

+#ifndef GL_ANGLE_instanced_arrays 
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glDrawArraysInstancedANGLE (GLenum mode, GLint first, GLsizei count, GLsizei primcount);
+GL_APICALL void GL_APIENTRY glDrawElementsInstancedANGLE (GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei primcount);
+GL_APICALL void GL_APIENTRY glVertexAttribDivisorANGLE (GLuint index, GLuint divisor);
+#endif
+typedef void (GL_APIENTRYP PFLGLDRAWARRAYSINSTANCEDANGLEPROC) (GLenum mode, GLint first, GLsizei count, GLsizei primcount);
+typedef void (GL_APIENTRYP PFLGLDRAWELEMENTSINSTANCEDANGLEPROC) (GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei primcount);
+typedef void (GL_APIENTRYP PFLGLVERTEXATTRIBDIVISORANGLEPROC) (GLuint index, GLuint divisor);
+#endif
+
+/* GL_ANGLE_pack_reverse_row_order */
+#ifndef GL_ANGLE_pack_reverse_row_order 
+#define GL_ANGLE_pack_reverse_row_order 1
+#endif
+
+/* GL_ANGLE_texture_compression_dxt3 */
+#ifndef GL_ANGLE_texture_compression_dxt3 
+#define GL_ANGLE_texture_compression_dxt3 1
+#endif
+
+/* GL_ANGLE_texture_compression_dxt5 */
+#ifndef GL_ANGLE_texture_compression_dxt5 
+#define GL_ANGLE_texture_compression_dxt5 1
+#endif
+
+/* GL_ANGLE_texture_usage */
+#ifndef GL_ANGLE_texture_usage 
+#define GL_ANGLE_texture_usage 1
+#endif
+
+#ifndef GL_ANGLE_translated_shader_source 
+#define GL_ANGLE_translated_shader_source 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glGetTranslatedShaderSourceANGLE (GLuint shader, GLsizei bufsize, GLsizei *length, GLchar *source);
+#endif
+typedef void (GL_APIENTRYP PFLGLGETTRANSLATEDSHADERSOURCEANGLEPROC) (GLuint shader, GLsizei bufsize, GLsizei *length, GLchar *source);
+#endif
+
 /*------------------------------------------------------------------------*
 * APPLE extension functions
 *------------------------------------------------------------------------*/
@@ -843,6 +1043,35 @@ typedef void (GL_APIENTRYP PFNGLRESOLVEMULTISAMPLEFRAMEBUFFERAPPLEPROC) (void);
 #define GL_EXT_blend_minmax 1
 #endif

+/* GL_EXT_color_buffer_half_float */
+#ifndef GL_EXT_color_buffer_half_float
+#define GL_EXT_color_buffer_half_float 1
+#endif
+
+/* GL_EXT_debug_label */
+#ifndef GL_EXT_debug_label
+#define GL_EXT_debug_label 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glLabelObjectEXT (GLenum type, GLuint object, GLsizei length, const GLchar *label);
+GL_APICALL void GL_APIENTRY glGetObjectLabelEXT (GLenum type, GLuint object, GLsizei bufSize, GLsizei *length, GLchar *label);
+#endif
+typedef void (GL_APIENTRYP PFNGLLABELOBJECTEXTPROC) (GLenum type, GLuint object, GLsizei length, const GLchar *label);
+typedef void (GL_APIENTRYP PFNGLGETOBJECTLABELEXTPROC) (GLenum type, GLuint object, GLsizei bufSize, GLsizei *length, GLchar *label);
+#endif
+
+/* GL_EXT_debug_marker */
+#ifndef GL_EXT_debug_marker
+#define GL_EXT_debug_marker 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glInsertEventMarkerEXT (GLsizei length, const GLchar *marker);
+GL_APICALL void GL_APIENTRY glPushGroupMarkerEXT (GLsizei length, const GLchar *marker);
+GL_APICALL void GL_APIENTRY glPopGroupMarkerEXT (void);
+#endif
+typedef void (GL_APIENTRYP PFNGLINSERTEVENTMARKEREXTPROC) (GLsizei length, const GLchar *marker);
+typedef void (GL_APIENTRYP PFNGLPUSHGROUPMARKEREXTPROC) (GLsizei length, const GLchar *marker);
+typedef void (GL_APIENTRYP PFNGLPOPGROUPMARKEREXTPROC) (void);
+#endif
+
 /* GL_EXT_discard_framebuffer */
 #ifndef GL_EXT_discard_framebuffer
 #define GL_EXT_discard_framebuffer 1
@@ -852,6 +1081,17 @@ GL_APICALL void GL_APIENTRY glDiscardFramebufferEXT (GLenum target, GLsizei numA
 typedef void (GL_APIENTRYP PFNGLDISCARDFRAMEBUFFEREXTPROC) (GLenum target, GLsizei numAttachments, const GLenum *attachments);
 #endif

+/* GL_EXT_multisampled_render_to_texture */
+#ifndef GL_EXT_multisampled_render_to_texture
+#define GL_EXT_multisampled_render_to_texture 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glRenderbufferStorageMultisampleEXT (GLenum, GLsizei, GLenum, GLsizei, GLsizei);
+GL_APICALL void GL_APIENTRY glFramebufferTexture2DMultisampleEXT (GLenum, GLenum, GLenum, GLuint, GLint, GLsizei);
+#endif
+typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC) (GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEEXTPROC) (GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLsizei samples);
+#endif
+
 #ifndef GL_EXT_multi_draw_arrays
 #define GL_EXT_multi_draw_arrays 1
 #ifdef GL_GLEXT_PROTOTYPES
@@ -862,16 +1102,134 @@ typedef void (GL_APIENTRYP PFNGLMULTIDRAWARRAYSEXTPROC) (GLenum mode, GLint *fir
 typedef void (GL_APIENTRYP PFNGLMULTIDRAWELEMENTSEXTPROC) (GLenum mode, const GLsizei *count, GLenum type, const GLvoid* *indices, GLsizei primcount);
 #endif

+/* GL_EXT_occlusion_query_boolean */
+#ifndef GL_EXT_occlusion_query_boolean
+#define GL_EXT_occlusion_query_boolean 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glGenQueriesEXT (GLsizei n, GLuint *ids);
+GL_APICALL void GL_APIENTRY glDeleteQueriesEXT (GLsizei n, const GLuint *ids);
+GL_APICALL GLboolean GL_APIENTRY glIsQueryEXT (GLuint id);
+GL_APICALL void GL_APIENTRY glBeginQueryEXT (GLenum target, GLuint id);
+GL_APICALL void GL_APIENTRY glEndQueryEXT (GLenum target);
+GL_APICALL void GL_APIENTRY glGetQueryivEXT (GLenum target, GLenum pname, GLint *params);
+GL_APICALL void GL_APIENTRY glGetQueryObjectuivEXT (GLuint id, GLenum pname, GLuint *params);
+#endif
+typedef void (GL_APIENTRYP PFNGLGENQUERIESEXTPROC) (GLsizei n, GLuint *ids);
+typedef void (GL_APIENTRYP PFNGLDELETEQUERIESEXTPROC) (GLsizei n, const GLuint *ids);
+typedef GLboolean (GL_APIENTRYP PFNGLISQUERYEXTPROC) (GLuint id);
+typedef void (GL_APIENTRYP PFNGLBEGINQUERYEXTPROC) (GLenum target, GLuint id);
+typedef void (GL_APIENTRYP PFNGLENDQUERYEXTPROC) (GLenum target);
+typedef void (GL_APIENTRYP PFNGLGETQUERYIVEXTPROC) (GLenum target, GLenum pname, GLint *params);
+typedef void (GL_APIENTRYP PFNGLGETQUERYOBJECTUIVEXTPROC) (GLuint id, GLenum pname, GLuint *params);
+#endif
+
 /* GL_EXT_read_format_bgra */
 #ifndef GL_EXT_read_format_bgra
 #define GL_EXT_read_format_bgra 1
 #endif

+/* GL_EXT_robustness */
+#ifndef GL_EXT_robustness
+#define GL_EXT_robustness 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL GLenum GL_APIENTRY glGetGraphicsResetStatusEXT (void);
+GL_APICALL void GL_APIENTRY glReadnPixelsEXT (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLsizei bufSize, void *data);
+GL_APICALL void GL_APIENTRY glGetnUniformfvEXT (GLuint program, GLint location, GLsizei bufSize, float *params);
+GL_APICALL void GL_APIENTRY glGetnUniformivEXT (GLuint program, GLint location, GLsizei bufSize, GLint *params);
+#endif
+typedef GLenum (GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSEXTPROC) (void);
+typedef void (GL_APIENTRYP PFNGLREADNPIXELSEXTPROC) (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLsizei bufSize, void *data);
+typedef void (GL_APIENTRYP PFNGLGETNUNIFORMFVEXTPROC) (GLuint program, GLint location, GLsizei bufSize, float *params);
+typedef void (GL_APIENTRYP PFNGLGETNUNIFORMIVEXTPROC) (GLuint program, GLint location, GLsizei bufSize, GLint *params);
+#endif
+
+/* GL_EXT_separate_shader_objects */
+#ifndef GL_EXT_separate_shader_objects
+#define GL_EXT_separate_shader_objects 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glUseProgramStagesEXT (GLuint pipeline, GLbitfield stages, GLuint program);
+GL_APICALL void GL_APIENTRY glActiveShaderProgramEXT (GLuint pipeline, GLuint program);
+GL_APICALL GLuint GL_APIENTRY glCreateShaderProgramvEXT (GLenum type, GLsizei count, const GLchar **strings);
+GL_APICALL void GL_APIENTRY glBindProgramPipelineEXT (GLuint pipeline);
+GL_APICALL void GL_APIENTRY glDeleteProgramPipelinesEXT (GLsizei n, const GLuint *pipelines);
+GL_APICALL void GL_APIENTRY glGenProgramPipelinesEXT (GLsizei n, GLuint *pipelines);
+GL_APICALL GLboolean GL_APIENTRY glIsProgramPipelineEXT (GLuint pipeline);
+GL_APICALL void GL_APIENTRY glProgramParameteriEXT (GLuint program, GLenum pname, GLint value);
+GL_APICALL void GL_APIENTRY glGetProgramPipelineivEXT (GLuint pipeline, GLenum pname, GLint *params);
+GL_APICALL void GL_APIENTRY glProgramUniform1iEXT (GLuint program, GLint location, GLint x);
+GL_APICALL void GL_APIENTRY glProgramUniform2iEXT (GLuint program, GLint location, GLint x, GLint y);
+GL_APICALL void GL_APIENTRY glProgramUniform3iEXT (GLuint program, GLint location, GLint x, GLint y, GLint z);
+GL_APICALL void GL_APIENTRY glProgramUniform4iEXT (GLuint program, GLint location, GLint x, GLint y, GLint z, GLint w);
+GL_APICALL void GL_APIENTRY glProgramUniform1fEXT (GLuint program, GLint location, GLfloat x);
+GL_APICALL void GL_APIENTRY glProgramUniform2fEXT (GLuint program, GLint location, GLfloat x, GLfloat y);
+GL_APICALL void GL_APIENTRY glProgramUniform3fEXT (GLuint program, GLint location, GLfloat x, GLfloat y, GLfloat z);
+GL_APICALL void GL_APIENTRY glProgramUniform4fEXT (GLuint program, GLint location, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+GL_APICALL void GL_APIENTRY glProgramUniform1ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GL_APICALL void GL_APIENTRY glProgramUniform2ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GL_APICALL void GL_APIENTRY glProgramUniform3ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GL_APICALL void GL_APIENTRY glProgramUniform4ivEXT (GLuint program, GLint location, GLsizei count, const GLint *value);
+GL_APICALL void GL_APIENTRY glProgramUniform1fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glProgramUniform2fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glProgramUniform3fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glProgramUniform4fvEXT (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glProgramUniformMatrix2fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glProgramUniformMatrix3fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glProgramUniformMatrix4fvEXT (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+GL_APICALL void GL_APIENTRY glValidateProgramPipelineEXT (GLuint pipeline);
+GL_APICALL void GL_APIENTRY glGetProgramPipelineInfoLogEXT (GLuint pipeline, GLsizei bufSize, GLsizei *length, GLchar *infoLog);
+#endif
+typedef void (GL_APIENTRYP PFNGLUSEPROGRAMSTAGESEXTPROC) (GLuint pipeline, GLbitfield stages, GLuint program);
+typedef void (GL_APIENTRYP PFNGLACTIVESHADERPROGRAMEXTPROC) (GLuint pipeline, GLuint program);
+typedef GLuint (GL_APIENTRYP PFNGLCREATESHADERPROGRAMVEXTPROC) (GLenum type, GLsizei count, const GLchar **strings);
+typedef void (GL_APIENTRYP PFNGLBINDPROGRAMPIPELINEEXTPROC) (GLuint pipeline);
+typedef void (GL_APIENTRYP PFNGLDELETEPROGRAMPIPELINESEXTPROC) (GLsizei n, const GLuint *pipelines);
+typedef void (GL_APIENTRYP PFNGLGENPROGRAMPIPELINESEXTPROC) (GLsizei n, GLuint *pipelines);
+typedef GLboolean (GL_APIENTRYP PFNGLISPROGRAMPIPELINEEXTPROC) (GLuint pipeline);
+typedef void (GL_APIENTRYP PFNGLPROGRAMPARAMETERIEXTPROC) (GLuint program, GLenum pname, GLint value);
+typedef void (GL_APIENTRYP PFNGLGETPROGRAMPIPELINEIVEXTPROC) (GLuint pipeline, GLenum pname, GLint *params);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM1IEXTPROC) (GLuint program, GLint location, GLint x);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM2IEXTPROC) (GLuint program, GLint location, GLint x, GLint y);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM3IEXTPROC) (GLuint program, GLint location, GLint x, GLint y, GLint z);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM4IEXTPROC) (GLuint program, GLint location, GLint x, GLint y, GLint z, GLint w);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM1FEXTPROC) (GLuint program, GLint location, GLfloat x);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM2FEXTPROC) (GLuint program, GLint location, GLfloat x, GLfloat y);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM3FEXTPROC) (GLuint program, GLint location, GLfloat x, GLfloat y, GLfloat z);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM4FEXTPROC) (GLuint program, GLint location, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM1IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM2IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM3IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM4IVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLint *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM1FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM2FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM3FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORM4FVEXTPROC) (GLuint program, GLint location, GLsizei count, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORMMATRIX2FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORMMATRIX3FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLPROGRAMUNIFORMMATRIX4FVEXTPROC) (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value);
+typedef void (GL_APIENTRYP PFNGLVALIDATEPROGRAMPIPELINEEXTPROC) (GLuint pipeline);
+typedef void (GL_APIENTRYP PFNGLGETPROGRAMPIPELINEINFOLOGEXTPROC) (GLuint pipeline, GLsizei bufSize, GLsizei *length, GLchar *infoLog);
+#endif
+
 /* GL_EXT_shader_texture_lod */
 #ifndef GL_EXT_shader_texture_lod
 #define GL_EXT_shader_texture_lod 1
 #endif

+/* GL_EXT_shadow_samplers */
+#ifndef GL_EXT_shadow_samplers
+#define GL_EXT_shadow_samplers 1
+#endif
+
+/* GL_EXT_sRGB */
+#ifndef GL_EXT_sRGB
+#define GL_EXT_sRGB 1
+#endif
+
+/* GL_EXT_texture_compression_dxt1 */
+#ifndef GL_EXT_texture_compression_dxt1
+#define GL_EXT_texture_compression_dxt1 1
+#endif
+
 /* GL_EXT_texture_filter_anisotropic */
 #ifndef GL_EXT_texture_filter_anisotropic
 #define GL_EXT_texture_filter_anisotropic 1
@@ -882,16 +1240,35 @@ typedef void (GL_APIENTRYP PFNGLMULTIDRAWELEMENTSEXTPROC) (GLenum mode, const GL
 #define GL_EXT_texture_format_BGRA8888 1
 #endif

+/* GL_EXT_texture_rg */
+#ifndef GL_EXT_texture_rg
+#define GL_EXT_texture_rg 1
+#endif
+
+/* GL_EXT_texture_storage */
+#ifndef GL_EXT_texture_storage
+#define GL_EXT_texture_storage 1
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glTexStorage1DEXT (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+GL_APICALL void GL_APIENTRY glTexStorage2DEXT (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+GL_APICALL void GL_APIENTRY glTexStorage3DEXT (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+GL_APICALL void GL_APIENTRY glTextureStorage1DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+GL_APICALL void GL_APIENTRY glTextureStorage2DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+GL_APICALL void GL_APIENTRY glTextureStorage3DEXT (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+#endif
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGE1DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGE2DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLTEXSTORAGE3DEXTPROC) (GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE1DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE2DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLTEXTURESTORAGE3DEXTPROC) (GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth);
+#endif
+
 /* GL_EXT_texture_type_2_10_10_10_REV */
 #ifndef GL_EXT_texture_type_2_10_10_10_REV
 #define GL_EXT_texture_type_2_10_10_10_REV 1
 #endif

-/* GL_EXT_texture_compression_dxt1 */
-#ifndef GL_EXT_texture_compression_dxt1
-#define GL_EXT_texture_compression_dxt1 1
-#endif
-
 /* GL_EXT_unpack_subimage */
 #ifndef GL_EXT_unpack_subimage
 #define GL_EXT_unpack_subimage 1
@@ -906,6 +1283,15 @@ typedef void (GL_APIENTRYP PFNGLMULTIDRAWELEMENTSEXTPROC) (GLenum mode, const GL
 #define GL_DMP_shader_binary 1
 #endif

+/*------------------------------------------------------------------------*
+ * FJ extension functions
+ *------------------------------------------------------------------------*/
+
+/* GL_FJ_shader_binary_GCCSO */
+#ifndef GL_FJ_shader_binary_GCCSO
+#define GL_FJ_shader_binary_GCCSO 1
+#endif
+
 /*------------------------------------------------------------------------*
 * IMG extension functions
 *------------------------------------------------------------------------*/
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -368,7 +368,6 @@ def generate(env):
        ccflags += [
            '-Wall',
            '-Wno-long-long',
-            '-ffast-math',
            '-fmessage-length=0', # be nice to Eclipse
        ]
        cflags += [
@@ -405,7 +404,6 @@ def generate(env):
                '/GL-', # disable whole program optimization
            ]
        ccflags += [
-            '/fp:fast', # fast floating point 
            '/W3', # warning level
            #'/Wp64', # enable 64 bit porting warnings
            '/wd4996', # disable deprecated POSIX name warnings
@@ -516,7 +514,7 @@ def generate(env):
    createInstallMethods(env)

    env.PkgCheckModules('X11', ['x11', 'xext', 'xdamage', 'xfixes'])
-    env.PkgCheckModules('XCB', ['x11-xcb', 'xcb-glx'])
+    env.PkgCheckModules('XCB', ['x11-xcb', 'xcb-glx >= 1.8.1'])
    env.PkgCheckModules('XF86VIDMODE', ['xxf86vm'])
    env.PkgCheckModules('DRM', ['libdrm >= 2.4.24'])
    env.PkgCheckModules('DRM_INTEL', ['libdrm_intel >= 2.4.30'])
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -178,7 +178,15 @@ def generate(env):
                pass
            env.MergeFlags(cppflags)

-            env.ParseConfig('llvm-config --libs engine bitwriter')
+            components = ['engine', 'bitwriter', 'x86asmprinter']
+
+            if llvm_version >= distutils.version.LooseVersion('3.2'):
+                components.append('instrumentation')
+
+            if llvm_version >= distutils.version.LooseVersion('3.1'):
+                components.append('mcjit')
+
+            env.ParseConfig('llvm-config --libs ' + ' '.join(components))
            env.ParseConfig('llvm-config --ldflags')
        except OSError:
            print 'scons: llvm-config version %s failed' % llvm_version
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -958,6 +958,32 @@ dri2_release_tex_image(_EGLDriver *drv,
   return EGL_TRUE;
 }

+/* Wrap an existing __DRIimage in a freshly allocated dri2_egl_image.
+ * Returns the embedded _EGLImage, or NULL when dri_image is NULL or the
+ * wrapper cannot be allocated (EGL_BAD_ALLOC) or initialized. */
+static _EGLImage *
+dri2_create_image(_EGLDisplay *disp, __DRIimage *dri_image)
+{
+   struct dri2_egl_image *wrapper = NULL;
+
+   /* A missing DRI image is reported like an allocation failure. */
+   if (dri_image != NULL)
+      wrapper = malloc(sizeof *wrapper);
+
+   if (wrapper == NULL) {
+      _eglError(EGL_BAD_ALLOC, "dri2_create_image");
+      return NULL;
+   }
+
+   if (_eglInitImage(&wrapper->base, disp)) {
+      wrapper->dri_image = dri_image;
+      return &wrapper->base;
+   }
+
+   free(wrapper);
+   return NULL;
+}
+
 static _EGLImage *
 dri2_create_image_khr_renderbuffer(_EGLDisplay *disp, _EGLContext *ctx,
 				   EGLClientBuffer buffer,
@@ -965,79 +991,29 @@ dri2_create_image_khr_renderbuffer(_EGLDisplay *disp, _EGLContext *ctx,
 {
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_image *dri2_img;
   GLuint renderbuffer = (GLuint) (uintptr_t) buffer;
+   __DRIimage *dri_image;

   if (renderbuffer == 0) {
      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
      return EGL_NO_IMAGE_KHR;
   }

-   dri2_img = malloc(sizeof *dri2_img);
-   if (!dri2_img) {
-      _eglError(EGL_BAD_ALLOC, "dri2_create_image_khr");
-      return EGL_NO_IMAGE_KHR;
-   }
-
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(dri2_img);
-      return EGL_NO_IMAGE_KHR;
-   }
-
-   dri2_img->dri_image = 
+   dri_image =
      dri2_dpy->image->createImageFromRenderbuffer(dri2_ctx->dri_context,
-						   renderbuffer,
-						   dri2_img);
+                                                   renderbuffer, NULL);

-   return &dri2_img->base;
-}
-
-static _EGLImage *
-dri2_create_image_drm_name(_EGLDisplay *disp, _EGLContext *ctx,
-			   EGLint name,
-                           const _EGLImageAttribs *attrs,
-                           EGLint format,
-                           EGLint pitch)
-{
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   struct dri2_egl_image *dri2_img;
-
-   (void) ctx;
-
-   dri2_img = malloc(sizeof *dri2_img);
-   if (!dri2_img) {
-      _eglError(EGL_BAD_ALLOC, "dri2_create_image_mesa_drm");
-      return NULL;
-   }
-
-   if (!_eglInitImage(&dri2_img->base, disp)) {
-      free(dri2_img);
-      return NULL;
-   }
-
-   dri2_img->dri_image =
-      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
-					   attrs->Width,
-					   attrs->Height,
-					   format,
-					   name,
-					   pitch,
-					   dri2_img);
-   if (dri2_img->dri_image == NULL) {
-      free(dri2_img);
-      _eglError(EGL_BAD_ALLOC, "dri2_create_image_mesa_drm");
-      return NULL;
-   }
-
-   return &dri2_img->base;
+   return dri2_create_image(disp, dri_image);
 }

 static _EGLImage *
 dri2_create_image_mesa_drm_buffer(_EGLDisplay *disp, _EGLContext *ctx,
 				  EGLClientBuffer buffer, const EGLint *attr_list)
 {
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   EGLint format, name, pitch, err;
   _EGLImageAttribs attrs;
+   __DRIimage *dri_image;

   name = (EGLint) (uintptr_t) buffer;

@@ -1063,52 +1039,135 @@ dri2_create_image_mesa_drm_buffer(_EGLDisplay *disp, _EGLContext *ctx,
      return NULL;
   }

-   return dri2_create_image_drm_name (disp, ctx, name, &attrs, format, pitch);
+   dri_image =
+      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
+					   attrs.Width,
+					   attrs.Height,
+					   format,
+					   name,
+					   pitch,
+					   NULL);
+
+   return dri2_create_image(disp, dri_image);
 }

 #ifdef HAVE_WAYLAND_PLATFORM
+
+/* This structure describes how a wl_buffer maps to one or more
+ * __DRIimages.  A wl_drm_buffer stores the wl_drm format code and the
+ * offsets and strides of the planes in the buffer.  This table maps a
+ * wl_drm format code to a description of the planes in the buffer
+ * that lets us create a __DRIimage for each of the planes. */
+
+static const struct wl_drm_format_descriptor {
+   uint32_t wl_format;
+   EGLint components;
+   int nplanes;
+   struct {
+      int buffer_index;
+      int width_shift;
+      int height_shift;
+      uint32_t dri_format;
+      int cpp;
+   } planes[3];
+} wl_drm_formats[] = {
+   { WL_DRM_FORMAT_ARGB8888, EGL_TEXTURE_RGBA, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 }, } },
+
+   { WL_DRM_FORMAT_XRGB8888, EGL_TEXTURE_RGB, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB8888, 4 }, } },
+
+   { WL_DRM_FORMAT_YUV410, EGL_TEXTURE_Y_U_V_WL, 3,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 2, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 } } },
+
+   { WL_DRM_FORMAT_YUV411, EGL_TEXTURE_Y_U_V_WL, 3,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 2, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 } } },
+
+   { WL_DRM_FORMAT_YUV420, EGL_TEXTURE_Y_U_V_WL, 3,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 2, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 } } },
+
+   { WL_DRM_FORMAT_YUV422, EGL_TEXTURE_Y_U_V_WL, 3,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 2, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 } } },
+
+   { WL_DRM_FORMAT_YUV444, EGL_TEXTURE_Y_U_V_WL, 3,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 2, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 } } },
+
+   { WL_DRM_FORMAT_NV12, EGL_TEXTURE_Y_UV_WL, 2,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } },
+
+   { WL_DRM_FORMAT_NV16, EGL_TEXTURE_Y_UV_WL, 2,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
+       { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } },
+
+   /* For YUYV buffers, we set up two overlapping DRI images and treat
+    * them as planar buffers in the compositors.  Plane 0 is GR88 and
+    * samples YU or YV pairs and places Y into the R component, while
+    * plane 1 is ARGB and samples YUYV clusters and places U into
+    * the G component and V into A.  This lets the
+    * texture sampler interpolate the Y components correctly when
+    * sampling from plane 0, and interpolate U and V correctly when
+    * sampling from plane 1. */
+   { WL_DRM_FORMAT_YUYV, EGL_TEXTURE_Y_XUXV_WL, 2,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 },
+       { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }
+};
+
 static _EGLImage *
 dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, _EGLContext *ctx,
 				    EGLClientBuffer _buffer,
 				    const EGLint *attr_list)
 {
-   struct wl_buffer *buffer = (struct wl_buffer *) _buffer;
+   struct wl_drm_buffer *buffer = (struct wl_drm_buffer *) _buffer;
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
   __DRIimage *dri_image;
   _EGLImageAttribs attrs;
-   EGLint format, name, stride, pitch, err;
+   EGLint err;
+   uint32_t format;
+   int32_t offset, stride, plane, width, height;
+   int cpp, index;
+   const struct wl_drm_format_descriptor *f;

-   if (!wayland_buffer_is_drm(buffer))
+   if (!wayland_buffer_is_drm(&buffer->buffer))
       return NULL;

-   dri_image = wayland_drm_buffer_get_buffer(buffer);
-   
-   dri2_dpy->image->queryImage(dri_image, __DRI_IMAGE_ATTRIB_NAME, &name);
-   dri2_dpy->image->queryImage(dri_image, __DRI_IMAGE_ATTRIB_STRIDE, &stride);
-
   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   if (err != EGL_SUCCESS)
-      return NULL;
-
-   attrs.Width = buffer->width;
-   attrs.Height = buffer->height;
-
-   switch (wayland_drm_buffer_get_format(buffer)) {
-   case WL_DRM_FORMAT_ARGB8888:
-      format = __DRI_IMAGE_FORMAT_ARGB8888;
-      break;
-   case WL_DRM_FORMAT_XRGB8888:
-      format = __DRI_IMAGE_FORMAT_XRGB8888;
-      break;
-   default:
-      _eglError(EGL_BAD_PARAMETER,
-		"dri2_create_image_khr: unsupported wl_buffer format");
+   plane = attrs.PlaneWL;
+   if (err != EGL_SUCCESS) {
+      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_wayland_wl_buffer");
      return NULL;
   }

-   pitch = stride / 4;
+   f = buffer->driver_format;
+   if (plane < 0 || plane >= f->nplanes) {
+      _eglError(EGL_BAD_PARAMETER,
+                "dri2_create_image_wayland_wl_buffer (plane out of bounds)");
+      return NULL;
+   }

-   return dri2_create_image_drm_name(disp, ctx, name, &attrs, format, pitch);
+   width = buffer->buffer.width >> f->planes[plane].width_shift;
+   height = buffer->buffer.height >> f->planes[plane].height_shift;
+   format = f->planes[plane].dri_format;
+   cpp = f->planes[plane].cpp;
+   index = f->planes[plane].buffer_index;
+   offset = buffer->offset[index];
+   stride = buffer->stride[index];
+
+   dri_image = dri2_dpy->image->createSubImage(buffer->driver_buffer,
+                                               width, height, format,
+                                               offset, stride / cpp, NULL);
+
+   return dri2_create_image(disp, dri_image);
 }
 #endif

@@ -1261,43 +1320,39 @@ dri2_export_drm_image_mesa(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img,

 #ifdef HAVE_WAYLAND_PLATFORM

-static void *
+static void
 dri2_wl_reference_buffer(void *user_data, uint32_t name,
-			 int32_t width, int32_t height,
-			 uint32_t stride, uint32_t format)
+                         struct wl_drm_buffer *buffer)
 {
   _EGLDisplay *disp = user_data;
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   __DRIimage *image;
-   int dri_format;
+   int i;

-   switch (format) {
-   case WL_DRM_FORMAT_ARGB8888:
-      dri_format =__DRI_IMAGE_FORMAT_ARGB8888;
-      break;
-   case WL_DRM_FORMAT_XRGB8888:
-      dri_format = __DRI_IMAGE_FORMAT_XRGB8888;
-      break;
-   default:
-      return NULL;	   
-   }
+   for (i = 0; i < ARRAY_SIZE(wl_drm_formats); i++)
+      if (wl_drm_formats[i].wl_format == buffer->format) {
+         buffer->driver_format = &wl_drm_formats[i];
+         break;
+      }

-   image = dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
-						width, height, 
-						dri_format, name, stride / 4,
-						NULL);
+   if (buffer->driver_format == NULL)
+      return;

-   return image;
+   buffer->driver_buffer =
+      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
+                                           buffer->buffer.width,
+                                           buffer->buffer.height, 
+                                           __DRI_IMAGE_FORMAT_NONE, name,
+                                           buffer->stride[0] / 4,
+                                           NULL);
 }

 static void
-dri2_wl_release_buffer(void *user_data, void *buffer)
+dri2_wl_release_buffer(void *user_data, struct wl_drm_buffer *buffer)
 {
   _EGLDisplay *disp = user_data;
-   __DRIimage *image = buffer;
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);

-   dri2_dpy->image->destroyImage(image);
+   dri2_dpy->image->destroyImage(buffer->driver_buffer);
 }

 static struct wayland_drm_callbacks wl_drm_callbacks = {
@@ -1346,6 +1401,33 @@ dri2_unbind_wayland_display_wl(_EGLDriver *drv, _EGLDisplay *disp,

   return EGL_TRUE;
 }
+
+static EGLBoolean
+dri2_query_wayland_buffer_wl(_EGLDriver *drv, _EGLDisplay *disp,
+                             struct wl_buffer *_buffer,
+                             EGLint attribute, EGLint *value)
+{
+   struct wl_drm_buffer *buffer = (struct wl_drm_buffer *) _buffer;
+   const struct wl_drm_format_descriptor *format;
+   /* Stores the queried attribute in *value; EGL_TRUE only when it did so. */
+   if (!wayland_buffer_is_drm(&buffer->buffer))
+      return EGL_FALSE;
+   /* From here on the buffer passed the wayland_buffer_is_drm() check. */
+   format = buffer->driver_format;
+   switch (attribute) {
+   case EGL_TEXTURE_FORMAT:
+      *value = format->components;
+      return EGL_TRUE;
+   case EGL_WIDTH:
+      *value = buffer->buffer.width;
+      return EGL_TRUE;
+   case EGL_HEIGHT:
+      *value = buffer->buffer.height;
+      return EGL_TRUE;
+   }
+
+   return EGL_FALSE;
+}
 #endif

 static void
@@ -1443,6 +1525,7 @@ _eglBuiltInDriverDRI2(const char *args)
 #ifdef HAVE_WAYLAND_PLATFORM
   dri2_drv->base.API.BindWaylandDisplayWL = dri2_bind_wayland_display_wl;
   dri2_drv->base.API.UnbindWaylandDisplayWL = dri2_unbind_wayland_display_wl;
+   dri2_drv->base.API.QueryWaylandBufferWL = dri2_query_wayland_buffer_wl;
 #endif

   dri2_drv->base.Name = "DRI2";
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -108,6 +108,8 @@ dri2_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,

   switch (type) {
   case EGL_WINDOW_BIT:
+      if (!window)
+         return NULL;
      surf = gbm_dri_surface((struct gbm_surface *) window);
      dri2_surf->gbm_surf = surf;
      dri2_surf->base.Width =  surf->base.width;
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -19,7 +19,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.

-GLAPI_LIB = ../mapi/glapi/libglapi.a
+GLAPI_LIB = ../mapi/glapi/libglapi.la

 if HAVE_XF86VIDMODE
 EXTRA_DEFINES_XF86VIDMODE = -DXF86VIDMODE
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -940,6 +940,7 @@ eglGetProcAddress(const char *procname)
 #ifdef EGL_WL_bind_wayland_display
      { "eglBindWaylandDisplayWL", (_EGLProc) eglBindWaylandDisplayWL },
      { "eglUnbindWaylandDisplayWL", (_EGLProc) eglUnbindWaylandDisplayWL },
+      { "eglQueryWaylandBufferWL", (_EGLProc) eglQueryWaylandBufferWL },
 #endif
      { "eglPostSubBufferNV", (_EGLProc) eglPostSubBufferNV },
      { NULL, NULL }
@@ -1540,6 +1541,25 @@ eglUnbindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display)

   RETURN_EGL_EVAL(disp, ret);
 }
+
+EGLBoolean EGLAPIENTRY
+eglQueryWaylandBufferWL(EGLDisplay dpy,struct wl_buffer *buffer,
+                        EGLint attribute, EGLint *value)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLDriver *drv;
+   EGLBoolean ret;
+
+   _EGL_CHECK_DISPLAY(disp, EGL_FALSE, drv);
+   assert(disp->Extensions.WL_bind_wayland_display);
+
+   if (!buffer)
+      RETURN_EGL_ERROR(disp, EGL_BAD_PARAMETER, EGL_FALSE);
+
+   ret = drv->API.QueryWaylandBufferWL(drv, disp, buffer, attribute, value);
+
+   RETURN_EGL_EVAL(disp, ret);
+}
 #endif


--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -123,6 +123,7 @@ typedef EGLBoolean (*ExportDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _
 struct wl_display;
 typedef EGLBoolean (*BindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display);
 typedef EGLBoolean (*UnbindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display);
+typedef EGLBoolean (*QueryWaylandBufferWL_t)(_EGLDriver *drv, _EGLDisplay *displ, struct wl_buffer *buffer, EGLint attribute, EGLint *value);
 #endif

 typedef EGLBoolean (*PostSubBufferNV_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surface, EGLint x, EGLint y, EGLint width, EGLint height);
@@ -199,6 +200,7 @@ struct _egl_api
 #ifdef EGL_WL_bind_wayland_display
   BindWaylandDisplayWL_t BindWaylandDisplayWL;
   UnbindWaylandDisplayWL_t UnbindWaylandDisplayWL;
+   QueryWaylandBufferWL_t QueryWaylandBufferWL;
 #endif

   PostSubBufferNV_t PostSubBufferNV;
--- a/src/egl/main/eglimage.c
+++ b/src/egl/main/eglimage.c
@@ -88,6 +88,11 @@ _eglParseImageAttribList(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
         attrs->DRMBufferStrideMESA = val;
         break;

+      /* EGL_WL_bind_wayland_display */
+      case EGL_WAYLAND_PLANE_WL:
+         attrs->PlaneWL = val;
+         break;
+
      default:
         /* unknown attrs are ignored */
         break;
--- a/src/egl/main/eglimage.h
+++ b/src/egl/main/eglimage.h
@@ -50,6 +50,9 @@ struct _egl_image_attribs
   EGLint DRMBufferFormatMESA;
   EGLint DRMBufferUseMESA;
   EGLint DRMBufferStrideMESA;
+
+   /* EGL_WL_bind_wayland_display */
+   EGLint PlaneWL;
 };

 /**
--- a/src/egl/wayland/wayland-drm/Makefile.am
+++ b/src/egl/wayland/wayland-drm/Makefile.am
@@ -12,4 +12,11 @@ BUILT_SOURCES = wayland-drm-protocol.c \
 		wayland-drm-server-protocol.h
 CLEANFILES = $(BUILT_SOURCES)

-@wayland_scanner_rules@
+%-protocol.c : %.xml
+	$(AM_V_GEN)$(WAYLAND_SCANNER) code < $< > $@
+
+%-server-protocol.h : %.xml
+	$(AM_V_GEN)$(WAYLAND_SCANNER) server-header < $< > $@
+
+%-client-protocol.h : %.xml
+	$(AM_V_GEN)$(WAYLAND_SCANNER) client-header < $< > $@
--- a/src/egl/wayland/wayland-drm/wayland-drm.c
+++ b/src/egl/wayland/wayland-drm/wayland-drm.c
@@ -56,22 +56,13 @@ struct wl_drm {
 	struct wayland_drm_callbacks *callbacks;
 };

-struct wl_drm_buffer {
-	struct wl_buffer buffer;
-	struct wl_drm *drm;
-	uint32_t format;
-
-	void *driver_buffer;
-};
-
 static void
 destroy_buffer(struct wl_resource *resource)
 {
 	struct wl_drm_buffer *buffer = resource->data;
 	struct wl_drm *drm = buffer->drm;

-	drm->callbacks->release_buffer(drm->user_data,
-				       buffer->driver_buffer);
+	drm->callbacks->release_buffer(drm->user_data, buffer);
 	free(buffer);
 }

@@ -101,24 +92,16 @@ const static struct wl_buffer_interface drm_buffer_interface = {
 };

 static void
-drm_create_buffer(struct wl_client *client, struct wl_resource *resource,
-		  uint32_t id, uint32_t name, int32_t width, int32_t height,
-		  uint32_t stride, uint32_t format)
+create_buffer(struct wl_client *client, struct wl_resource *resource,
+              uint32_t id, uint32_t name, int32_t width, int32_t height,
+              uint32_t format,
+              int32_t offset0, int32_t stride0,
+              int32_t offset1, int32_t stride1,
+              int32_t offset2, int32_t stride2)
 {
 	struct wl_drm *drm = resource->data;
 	struct wl_drm_buffer *buffer;

-	switch (format) {
-	case WL_DRM_FORMAT_ARGB8888:
-	case WL_DRM_FORMAT_XRGB8888:
-		break;
-	default:
-		wl_resource_post_error(resource,
-				       WL_DRM_ERROR_INVALID_FORMAT,
-				       "invalid format");
-		return;
-	}
-
 	buffer = calloc(1, sizeof *buffer);
 	if (buffer == NULL) {
 		wl_resource_post_no_memory(resource);
@@ -129,12 +112,14 @@ drm_create_buffer(struct wl_client *client, struct wl_resource *resource,
 	buffer->buffer.width = width;
 	buffer->buffer.height = height;
 	buffer->format = format;
+	buffer->offset[0] = offset0;
+	buffer->stride[0] = stride0;
+	buffer->offset[1] = offset1;
+	buffer->stride[1] = stride1;
+	buffer->offset[2] = offset2;
+	buffer->stride[2] = stride2;

-	buffer->driver_buffer =
-		drm->callbacks->reference_buffer(drm->user_data, name,
-						 width, height,
-						 stride, format);
-
+        drm->callbacks->reference_buffer(drm->user_data, name, buffer);
 	if (buffer->driver_buffer == NULL) {
 		wl_resource_post_error(resource,
 				       WL_DRM_ERROR_INVALID_NAME,
@@ -154,6 +139,56 @@ drm_create_buffer(struct wl_client *client, struct wl_resource *resource,
 	wl_client_add_resource(resource->client, &buffer->buffer.resource);
 }

+static void
+drm_create_buffer(struct wl_client *client, struct wl_resource *resource,
+		  uint32_t id, uint32_t name, int32_t width, int32_t height,
+		  uint32_t stride, uint32_t format)
+{
+        switch (format) {
+        case WL_DRM_FORMAT_ARGB8888:
+        case WL_DRM_FORMAT_XRGB8888:
+        case WL_DRM_FORMAT_YUYV:
+                break;
+        default:
+                wl_resource_post_error(resource,
+                                       WL_DRM_ERROR_INVALID_FORMAT,
+                                       "invalid format");
+           return;
+        }
+
+        create_buffer(client, resource, id,
+                      name, width, height, format, 0, stride, 0, 0, 0, 0);
+}
+
+static void
+drm_create_planar_buffer(struct wl_client *client,
+                         struct wl_resource *resource,
+                         uint32_t id, uint32_t name,
+                         int32_t width, int32_t height, uint32_t format,
+                         int32_t offset0, int32_t stride0,
+                         int32_t offset1, int32_t stride1,
+                         int32_t offset2, int32_t stride2)
+{
+        switch (format) {
+	case WL_DRM_FORMAT_YUV410:
+	case WL_DRM_FORMAT_YUV411:
+	case WL_DRM_FORMAT_YUV420:
+	case WL_DRM_FORMAT_YUV422:
+	case WL_DRM_FORMAT_YUV444:
+	case WL_DRM_FORMAT_NV12:
+        case WL_DRM_FORMAT_NV16:
+                break;
+        default:
+                wl_resource_post_error(resource,
+                                       WL_DRM_ERROR_INVALID_FORMAT,
+                                       "invalid format");
+           return;
+        }
+
+        create_buffer(client, resource, id, name, width, height, format,
+                      offset0, stride0, offset1, stride1, offset2, stride2);
+}
+
 static void
 drm_authenticate(struct wl_client *client,
 		 struct wl_resource *resource, uint32_t id)
@@ -170,7 +205,8 @@ drm_authenticate(struct wl_client *client,

 const static struct wl_drm_interface drm_interface = {
 	drm_authenticate,
-	drm_create_buffer
+	drm_create_buffer,
+        drm_create_planar_buffer
 };

 static void
@@ -186,6 +222,14 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id)
 			       WL_DRM_FORMAT_ARGB8888);
 	wl_resource_post_event(resource, WL_DRM_FORMAT,
 			       WL_DRM_FORMAT_XRGB8888);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_YUV410);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_YUV411);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_YUV420);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_YUV422);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_YUV444);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_NV12);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_NV16);
+        wl_resource_post_event(resource, WL_DRM_FORMAT, WL_DRM_FORMAT_YUYV);
 }

 struct wl_drm *
--- a/src/egl/wayland/wayland-drm/wayland-drm.h
+++ b/src/egl/wayland/wayland-drm/wayland-drm.h
@@ -1,22 +1,91 @@
 #ifndef WAYLAND_DRM_H
 #define WAYLAND_DRM_H

-#include "egldisplay.h"
-#include "eglimage.h"
-
 #include <wayland-server.h>
-#include "wayland-drm-server-protocol.h"
+
+#ifndef WL_DRM_FORMAT_ENUM
+#define WL_DRM_FORMAT_ENUM
+enum wl_drm_format {
+	WL_DRM_FORMAT_C8 = 0x20203843,
+	WL_DRM_FORMAT_RGB332 = 0x38424752,
+	WL_DRM_FORMAT_BGR233 = 0x38524742,
+	WL_DRM_FORMAT_XRGB4444 = 0x32315258,
+	WL_DRM_FORMAT_XBGR4444 = 0x32314258,
+	WL_DRM_FORMAT_RGBX4444 = 0x32315852,
+	WL_DRM_FORMAT_BGRX4444 = 0x32315842,
+	WL_DRM_FORMAT_ARGB4444 = 0x32315241,
+	WL_DRM_FORMAT_ABGR4444 = 0x32314241,
+	WL_DRM_FORMAT_RGBA4444 = 0x32314152,
+	WL_DRM_FORMAT_BGRA4444 = 0x32314142,
+	WL_DRM_FORMAT_XRGB1555 = 0x35315258,
+	WL_DRM_FORMAT_XBGR1555 = 0x35314258,
+	WL_DRM_FORMAT_RGBX5551 = 0x35315852,
+	WL_DRM_FORMAT_BGRX5551 = 0x35315842,
+	WL_DRM_FORMAT_ARGB1555 = 0x35315241,
+	WL_DRM_FORMAT_ABGR1555 = 0x35314241,
+	WL_DRM_FORMAT_RGBA5551 = 0x35314152,
+	WL_DRM_FORMAT_BGRA5551 = 0x35314142,
+	WL_DRM_FORMAT_RGB565 = 0x36314752,
+	WL_DRM_FORMAT_BGR565 = 0x36314742,
+	WL_DRM_FORMAT_RGB888 = 0x34324752,
+	WL_DRM_FORMAT_BGR888 = 0x34324742,
+	WL_DRM_FORMAT_XRGB8888 = 0x34325258,
+	WL_DRM_FORMAT_XBGR8888 = 0x34324258,
+	WL_DRM_FORMAT_RGBX8888 = 0x34325852,
+	WL_DRM_FORMAT_BGRX8888 = 0x34325842,
+	WL_DRM_FORMAT_ARGB8888 = 0x34325241,
+	WL_DRM_FORMAT_ABGR8888 = 0x34324241,
+	WL_DRM_FORMAT_RGBA8888 = 0x34324152,
+	WL_DRM_FORMAT_BGRA8888 = 0x34324142,
+	WL_DRM_FORMAT_XRGB2101010 = 0x30335258,
+	WL_DRM_FORMAT_XBGR2101010 = 0x30334258,
+	WL_DRM_FORMAT_RGBX1010102 = 0x30335852,
+	WL_DRM_FORMAT_BGRX1010102 = 0x30335842,
+	WL_DRM_FORMAT_ARGB2101010 = 0x30335241,
+	WL_DRM_FORMAT_ABGR2101010 = 0x30334241,
+	WL_DRM_FORMAT_RGBA1010102 = 0x30334152,
+	WL_DRM_FORMAT_BGRA1010102 = 0x30334142,
+	WL_DRM_FORMAT_YUYV = 0x56595559,
+	WL_DRM_FORMAT_YVYU = 0x55595659,
+	WL_DRM_FORMAT_UYVY = 0x59565955,
+	WL_DRM_FORMAT_VYUY = 0x59555956,
+	WL_DRM_FORMAT_AYUV = 0x56555941,
+	WL_DRM_FORMAT_NV12 = 0x3231564e,
+	WL_DRM_FORMAT_NV21 = 0x3132564e,
+	WL_DRM_FORMAT_NV16 = 0x3631564e,
+	WL_DRM_FORMAT_NV61 = 0x3136564e,
+	WL_DRM_FORMAT_YUV410 = 0x39565559,
+	WL_DRM_FORMAT_YVU410 = 0x39555659,
+	WL_DRM_FORMAT_YUV411 = 0x31315559,
+	WL_DRM_FORMAT_YVU411 = 0x31315659,
+	WL_DRM_FORMAT_YUV420 = 0x32315559,
+	WL_DRM_FORMAT_YVU420 = 0x32315659,
+	WL_DRM_FORMAT_YUV422 = 0x36315559,
+	WL_DRM_FORMAT_YVU422 = 0x36315659,
+	WL_DRM_FORMAT_YUV444 = 0x34325559,
+	WL_DRM_FORMAT_YVU444 = 0x34325659,
+};
+#endif /* WL_DRM_FORMAT_ENUM */

 struct wl_drm;

+struct wl_drm_buffer {
+	struct wl_buffer buffer;
+	struct wl_drm *drm;
+	uint32_t format;
+        const void *driver_format;
+        int32_t offset[3];
+        int32_t stride[3];
+	void *driver_buffer;
+};
+
 struct wayland_drm_callbacks {
 	int (*authenticate)(void *user_data, uint32_t id);

-	void *(*reference_buffer)(void *user_data, uint32_t name,
-				  int32_t width, int32_t height,
-				  uint32_t stride, uint32_t format);
+	void (*reference_buffer)(void *user_data, uint32_t name,
+                                 struct wl_drm_buffer *buffer);

-	void (*release_buffer)(void *user_data, void *buffer);
+	void (*release_buffer)(void *user_data, struct wl_drm_buffer *buffer);
 };

 struct wl_drm *
--- a/src/egl/wayland/wayland-drm/protocol/wayland-drm.xml
+++ b/src/egl/wayland/wayland-drm/protocol/wayland-drm.xml
@@ -119,6 +119,22 @@
      <arg name="format" type="uint"/>
    </request>

+    <!-- Create a planar wayland buffer (one offset/stride pair per plane)
+         for the named DRM buffer, which must have a flink ioctl name -->
+    <request name="create_planar_buffer">
+      <arg name="id" type="new_id" interface="wl_buffer"/>
+      <arg name="name" type="uint"/>
+      <arg name="width" type="int"/>
+      <arg name="height" type="int"/>
+      <arg name="format" type="uint"/>
+      <arg name="offset0" type="int"/>
+      <arg name="stride0" type="int"/>
+      <arg name="offset1" type="int"/>
+      <arg name="stride1" type="int"/>
+      <arg name="offset2" type="int"/>
+      <arg name="stride2" type="int"/>
+    </request>
+
    <!-- Notification of the path of the drm device which is used by
         the server.  The client should use this device for creating
         local buffers.  Only buffers created from this device should
--- a/src/gallium/auxiliary/Android.mk
+++ b/src/gallium/auxiliary/Android.mk
@@ -44,8 +44,7 @@ $(LOCAL_GENERATED_SOURCES): PRIVATE_CUSTOM_TOOL = $(PRIVATE_PYTHON) $^ > $@

 $(intermediates)/indices/u_indices_gen.c \
 $(intermediates)/indices/u_unfilled_gen.c \
-$(intermediates)/util/u_format_srgb.c \
-$(intermediates)/util/u_half.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py
+$(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py
 	$(transform-generated-source)

 $(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -39,6 +39,4 @@ util/u_format_srgb.c: util/u_format_srgb.py
 util/u_format_table.c: util/u_format_table.py util/u_format_pack.py util/u_format_parse.py util/u_format.csv
 	$(PYTHON2) util/u_format_table.py util/u_format.csv > $@

-util/u_half.c: util/u_half.py
-	$(PYTHON2) util/u_half.py > $@
 # DO NOT DELETE
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -155,8 +155,7 @@ GENERATED_SOURCES := \
 	indices/u_indices_gen.c \
 	indices/u_unfilled_gen.c \
 	util/u_format_srgb.c \
-	util/u_format_table.c \
-	util/u_half.c
+	util/u_format_table.c

 GALLIVM_SOURCES := \
        gallivm/lp_bld_arit.c \
@@ -166,6 +165,7 @@ GALLIVM_SOURCES := \
        gallivm/lp_bld_conv.c \
        gallivm/lp_bld_flow.c \
        gallivm/lp_bld_format_aos.c \
+        gallivm/lp_bld_format_aos_array.c \
        gallivm/lp_bld_format_soa.c \
        gallivm/lp_bld_format_yuv.c \
        gallivm/lp_bld_gather.c \
@@ -188,7 +188,6 @@ GALLIVM_SOURCES := \
        gallivm/lp_bld_type.c \
        draw/draw_llvm.c \
        draw/draw_llvm_sample.c \
-        draw/draw_llvm_translate.c \
        draw/draw_vs_llvm.c \
        draw/draw_pt_fetch_shade_pipeline_llvm.c

--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -35,13 +35,6 @@ env.CodeGenerate(
    command = python_cmd + ' $SCRIPT $SOURCE > $TARGET'
 )

-env.CodeGenerate(
-    target = 'util/u_half.c',
-    script = 'util/u_half.py',
-    source = [],
-    command = python_cmd + ' $SCRIPT > $TARGET'
-)
-
 env.Depends('util/u_format_table.c', [
    '#src/gallium/auxiliary/util/u_format_parse.py',
    'util/u_format_pack.py', 
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -42,6 +42,7 @@

 #if HAVE_LLVM
 #include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_limits.h"
 #include "draw_llvm.h"

 static boolean
@@ -69,8 +70,7 @@ draw_get_option_use_llvm(void)
 * Create new draw module context with gallivm state for LLVM JIT.
 */
 static struct draw_context *
-draw_create_context(struct pipe_context *pipe, boolean try_llvm,
-                    struct gallivm_state *gallivm)
+draw_create_context(struct pipe_context *pipe, boolean try_llvm)
 {
   struct draw_context *draw = CALLOC_STRUCT( draw_context );
   if (draw == NULL)
@@ -78,16 +78,7 @@ draw_create_context(struct pipe_context *pipe, boolean try_llvm,

 #if HAVE_LLVM
   if (try_llvm && draw_get_option_use_llvm()) {
-      if (!gallivm) {
-         gallivm = gallivm_create();
-         draw->own_gallivm = gallivm;
-      }
-
-      if (!gallivm)
-         goto err_destroy;
-
-      draw->llvm = draw_llvm_create(draw, gallivm);
-
+      draw->llvm = draw_llvm_create(draw);
      if (!draw->llvm)
         goto err_destroy;
   }
@@ -113,7 +104,7 @@ err_out:
 struct draw_context *
 draw_create(struct pipe_context *pipe)
 {
-   return draw_create_context(pipe, TRUE, NULL);
+   return draw_create_context(pipe, TRUE);
 }


@@ -123,17 +114,7 @@ draw_create(struct pipe_context *pipe)
 struct draw_context *
 draw_create_no_llvm(struct pipe_context *pipe)
 {
-   return draw_create_context(pipe, FALSE, NULL);
-}
-
-
-/**
- * Create new draw module context with gallivm state for LLVM JIT.
- */
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm)
-{
-   return draw_create_context(pipe, TRUE, gallivm);
+   return draw_create_context(pipe, FALSE);
 }


@@ -212,9 +193,6 @@ void draw_destroy( struct draw_context *draw )
 #ifdef HAVE_LLVM
   if (draw->llvm)
      draw_llvm_destroy( draw->llvm );
-
-   if (draw->own_gallivm)
-      gallivm_destroy(draw->own_gallivm);
 #endif

   FREE( draw );
@@ -830,3 +808,43 @@ draw_set_mapped_texture(struct draw_context *draw,
                                row_stride, img_stride, data);
 #endif
 }
+
+/**
+ * XXX: Results for PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS may be wrong: there
+ * are two ways of setting textures, and drivers typically only support one.
+ */
+int
+draw_get_shader_param_no_llvm(unsigned shader, enum pipe_shader_cap param)
+{
+   switch(shader) {
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_GEOMETRY:
+      return tgsi_exec_get_shader_param(param);
+   default:
+      return 0;
+   }
+}
+
+/**
+ * XXX: Results for PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS may be wrong: there
+ * are two ways of setting textures, and drivers typically only support one.
+ */
+int
+draw_get_shader_param(unsigned shader, enum pipe_shader_cap param)
+{
+
+#ifdef HAVE_LLVM
+   if (draw_get_option_use_llvm()) {
+      switch(shader) {
+      case PIPE_SHADER_VERTEX:
+      case PIPE_SHADER_GEOMETRY:
+         return gallivm_get_shader_param(param);
+      default:
+         return 0;
+      }
+   }
+#endif
+
+   return draw_get_shader_param_no_llvm(shader, param);
+}
+
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,7 +48,6 @@ struct draw_vertex_shader;
 struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
-struct gallivm_state;

 /*
 * structure to contain driver internal information 
@@ -67,9 +66,6 @@ struct draw_context *draw_create( struct pipe_context *pipe );

 struct draw_context *draw_create_no_llvm(struct pipe_context *pipe);

-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm);
-
 void draw_destroy( struct draw_context *draw );

 void draw_flush(struct draw_context *draw);
@@ -277,16 +273,10 @@ boolean draw_need_pipeline(const struct draw_context *draw,
                           const struct pipe_rasterizer_state *rasterizer,
                           unsigned prim );

-static INLINE int
-draw_get_shader_param(unsigned shader, enum pipe_shader_cap param)
-{
-   switch(shader) {
-   case PIPE_SHADER_VERTEX:
-   case PIPE_SHADER_GEOMETRY:
-      return tgsi_exec_get_shader_param(param);
-   default:
-      return 0;
-   }
-}
+int
+draw_get_shader_param(unsigned shader, enum pipe_shader_cap param);
+
+int
+draw_get_shader_param_no_llvm(unsigned shader, enum pipe_shader_cap param);

 #endif /* DRAW_CONTEXT_H */
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -36,11 +36,6 @@
 #include "pipe/p_context.h"
 #include "util/u_simple_list.h"

-#include <llvm-c/Core.h>
-#include <llvm-c/Analysis.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/ExecutionEngine.h>
-

 struct draw_llvm;
 struct llvm_vertex_shader;
@@ -220,6 +215,14 @@ struct draw_llvm_variant_list_item

 struct draw_llvm_variant
 {
+   struct gallivm_state *gallivm;
+
+   /* LLVM JIT builder types */
+   LLVMTypeRef context_ptr_type;
+   LLVMTypeRef buffer_ptr_type;
+   LLVMTypeRef vb_ptr_type;
+   LLVMTypeRef vertex_header_ptr_type;
+
   LLVMValueRef function;
   LLVMValueRef function_elts;
   draw_jit_vert_func jit_func;
@@ -249,16 +252,8 @@ struct draw_llvm {

   struct draw_jit_context jit_context;

-   struct gallivm_state *gallivm;
-
   struct draw_llvm_variant_list_item vs_variants_list;
   int nr_variants;
-
-   /* LLVM JIT builder types */
-   LLVMTypeRef context_ptr_type;
-   LLVMTypeRef buffer_ptr_type;
-   LLVMTypeRef vb_ptr_type;
-   LLVMTypeRef vertex_header_ptr_type;
 };


@@ -270,7 +265,7 @@ llvm_vertex_shader(struct draw_vertex_shader *vs)


 struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm);
+draw_llvm_create(struct draw_context *draw);

 void
 draw_llvm_destroy(struct draw_llvm *llvm);
@@ -286,11 +281,6 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
 struct draw_llvm_variant_key *
 draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store);

-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
-                         LLVMValueRef vbuffer,
-                         enum pipe_format from_format);
-
 struct lp_build_sampler_soa *
 draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
                             LLVMValueRef context_ptr);
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -173,8 +173,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                       unsigned unit,
                                       unsigned num_coords,
                                       const LLVMValueRef *coords,
-                                       const LLVMValueRef *ddx,
-                                       const LLVMValueRef *ddy,
+                                       const struct lp_derivatives *derivs,
                                       LLVMValueRef lod_bias, /* optional */
                                       LLVMValueRef explicit_lod, /* optional */
                                       LLVMValueRef *texel)
@@ -189,7 +188,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                       type,
                       unit,
                       num_coords, coords,
-                       ddx, ddy,
+                       derivs,
                       lod_bias, explicit_lod,
                       texel);
 }
@@ -201,6 +200,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
 static void
 draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
                                      struct gallivm_state *gallivm,
+                                      struct lp_type type,
                                      unsigned unit,
                                      LLVMValueRef explicit_lod, /* optional */
                                      LLVMValueRef *sizes_out)
@@ -212,6 +212,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
   lp_build_size_query_soa(gallivm,
                           &sampler->dynamic_state.static_state[unit],
                           &sampler->dynamic_state.base,
+			   type,
                           unit,
                           explicit_lod,
                           sizes_out);
--- a/src/gallium/auxiliary/draw/draw_llvm_translate.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_translate.c
@@ -1,506 +0,0 @@
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "draw_llvm.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_struct.h"
-#include "gallivm/lp_bld_format.h"
-#include "gallivm/lp_bld_debug.h"
-#include "gallivm/lp_bld_type.h"
-
-#include "util/u_memory.h"
-#include "util/u_format.h"
-#include "pipe/p_state.h"
-
-
-#define DRAW_DBG 0
-
-static  LLVMValueRef
-from_64_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   return LLVMBuildFPTrunc(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-from_32_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0) , "");
-   return LLVMBuildLoad(gallivm->builder, bc, "");
-}
-
-static INLINE LLVMValueRef
-from_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
-   return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
-   return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-
-static INLINE LLVMValueRef
-from_8_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
-   LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-from_16_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-from_32_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-from_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
-   LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-from_16_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-from_32_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-from_32_fixed(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
-                                      LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
-   LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
-   return LLVMBuildFDiv(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 65536.0), "");
-}
-
-static LLVMValueRef
-to_64_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPExt(gallivm->builder, l, LLVMDoubleTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-to_32_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   return LLVMBuildLoad(gallivm->builder, fp, "");
-}
-
-static INLINE LLVMValueRef
-to_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 8), "");
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-to_16_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 32), "");
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-to_32_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 32), "");
-
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-to_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
-   LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 8), "");
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-to_16_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 16), "");
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-to_32_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 32), "");
-
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-to_32_fixed(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
-   LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
-   LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
-                                          LLVMIntTypeInContext(gallivm->context, 32), "");
-
-   return LLVMBuildFMul(gallivm->builder, uscaled,
-                        lp_build_const_float(gallivm, 65536.0), "");
-}
-
-typedef LLVMValueRef (*from_func)(struct gallivm_state *, LLVMValueRef);
-typedef  LLVMValueRef (*to_func)(struct gallivm_state *, LLVMValueRef);
-
-/* so that underneath can avoid function calls which are prohibited
- * for static initialization we need this conversion */
-enum ll_type {
-   LL_Double,
-   LL_Float,
-   LL_Int32,
-   LL_Int16,
-   LL_Int8
-};
-
-static INLINE LLVMTypeRef
-ll_type_to_llvm(struct gallivm_state *gallivm, enum ll_type type)
-{
-   switch (type) {
-   case LL_Double:
-      return LLVMDoubleTypeInContext(gallivm->context);
-   case LL_Float:
-      return LLVMFloatTypeInContext(gallivm->context);
-   case LL_Int32:
-      return LLVMInt32TypeInContext(gallivm->context);
-   case LL_Int16:
-      return LLVMIntTypeInContext(gallivm->context, 16);
-   case LL_Int8:
-      return LLVMIntTypeInContext(gallivm->context, 8);
-   }
-   return LLVMIntTypeInContext(gallivm->context, 8);
-}
-
-static INLINE int
-ll_type_size(enum ll_type type)
-{
-   switch (type) {
-   case LL_Double:
-      return 8;
-   case LL_Float:
-      return 4;
-   case LL_Int32:
-      return 4;
-   case LL_Int16:
-      return 2;
-   case LL_Int8:
-      return 1;
-   }
-   return 1;
-}
-
-struct draw_llvm_translate {
-   int format;
-   from_func from;
-   to_func to;
-   enum ll_type type;
-   int num_components;
-} translates[] =
-{
-   {PIPE_FORMAT_R64_FLOAT,          from_64_float, to_64_float, LL_Double, 1},
-   {PIPE_FORMAT_R64G64_FLOAT,       from_64_float, to_64_float, LL_Double, 2},
-   {PIPE_FORMAT_R64G64B64_FLOAT,    from_64_float, to_64_float, LL_Double, 3},
-   {PIPE_FORMAT_R64G64B64A64_FLOAT, from_64_float, to_64_float, LL_Double, 4},
-   {PIPE_FORMAT_R32_FLOAT,          from_32_float, to_32_float, LL_Float, 1},
-   {PIPE_FORMAT_R32G32_FLOAT,       from_32_float, to_32_float, LL_Float, 2},
-   {PIPE_FORMAT_R32G32B32_FLOAT,    from_32_float, to_32_float, LL_Float, 3},
-   {PIPE_FORMAT_R32G32B32A32_FLOAT, from_32_float, to_32_float, LL_Float, 4},
-
-   {PIPE_FORMAT_R32_UNORM,          from_32_unorm, to_32_unorm, LL_Int32, 1},
-   {PIPE_FORMAT_R32G32_UNORM,       from_32_unorm, to_32_unorm, LL_Int32, 2},
-   {PIPE_FORMAT_R32G32B32_UNORM,    from_32_unorm, to_32_unorm, LL_Int32, 3},
-   {PIPE_FORMAT_R32G32B32A32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 4},
-
-   {PIPE_FORMAT_R32_USCALED,          from_32_uscaled, to_32_uscaled, LL_Int32, 1},
-   {PIPE_FORMAT_R32G32_USCALED,       from_32_uscaled, to_32_uscaled, LL_Int32, 2},
-   {PIPE_FORMAT_R32G32B32_USCALED,    from_32_uscaled, to_32_uscaled, LL_Int32, 3},
-   {PIPE_FORMAT_R32G32B32A32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 4},
-
-   {PIPE_FORMAT_R32_SNORM,          from_32_snorm, to_32_snorm, LL_Int32, 1},
-   {PIPE_FORMAT_R32G32_SNORM,       from_32_snorm, to_32_snorm, LL_Int32, 2},
-   {PIPE_FORMAT_R32G32B32_SNORM,    from_32_snorm, to_32_snorm, LL_Int32, 3},
-   {PIPE_FORMAT_R32G32B32A32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 4},
-
-   {PIPE_FORMAT_R32_SSCALED,          from_32_sscaled, to_32_sscaled, LL_Int32, 1},
-   {PIPE_FORMAT_R32G32_SSCALED,       from_32_sscaled, to_32_sscaled, LL_Int32, 2},
-   {PIPE_FORMAT_R32G32B32_SSCALED,    from_32_sscaled, to_32_sscaled, LL_Int32, 3},
-   {PIPE_FORMAT_R32G32B32A32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 4},
-
-   {PIPE_FORMAT_R16_UNORM,          from_16_unorm, to_16_unorm, LL_Int16, 1},
-   {PIPE_FORMAT_R16G16_UNORM,       from_16_unorm, to_16_unorm, LL_Int16, 2},
-   {PIPE_FORMAT_R16G16B16_UNORM,    from_16_unorm, to_16_unorm, LL_Int16, 3},
-   {PIPE_FORMAT_R16G16B16A16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 4},
-
-   {PIPE_FORMAT_R16_USCALED,          from_16_uscaled, to_16_uscaled, LL_Int16, 1},
-   {PIPE_FORMAT_R16G16_USCALED,       from_16_uscaled, to_16_uscaled, LL_Int16, 2},
-   {PIPE_FORMAT_R16G16B16_USCALED,    from_16_uscaled, to_16_uscaled, LL_Int16, 3},
-   {PIPE_FORMAT_R16G16B16A16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 4},
-
-   {PIPE_FORMAT_R16_SNORM,          from_16_snorm, to_16_snorm, LL_Int16, 1},
-   {PIPE_FORMAT_R16G16_SNORM,       from_16_snorm, to_16_snorm, LL_Int16, 2},
-   {PIPE_FORMAT_R16G16B16_SNORM,    from_16_snorm, to_16_snorm, LL_Int16, 3},
-   {PIPE_FORMAT_R16G16B16A16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 4},
-
-   {PIPE_FORMAT_R16_SSCALED,          from_16_sscaled, to_16_sscaled, LL_Int16, 1},
-   {PIPE_FORMAT_R16G16_SSCALED,       from_16_sscaled, to_16_sscaled, LL_Int16, 2},
-   {PIPE_FORMAT_R16G16B16_SSCALED,    from_16_sscaled, to_16_sscaled, LL_Int16, 3},
-   {PIPE_FORMAT_R16G16B16A16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 4},
-
-   {PIPE_FORMAT_R8_UNORM,       from_8_unorm, to_8_unorm, LL_Int8, 1},
-   {PIPE_FORMAT_R8G8_UNORM,     from_8_unorm, to_8_unorm, LL_Int8, 2},
-   {PIPE_FORMAT_R8G8B8_UNORM,   from_8_unorm, to_8_unorm, LL_Int8, 3},
-   {PIPE_FORMAT_R8G8B8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
-
-   {PIPE_FORMAT_R8_USCALED,       from_8_uscaled, to_8_uscaled, LL_Int8, 1},
-   {PIPE_FORMAT_R8G8_USCALED,     from_8_uscaled, to_8_uscaled, LL_Int8, 2},
-   {PIPE_FORMAT_R8G8B8_USCALED,   from_8_uscaled, to_8_uscaled, LL_Int8, 3},
-   {PIPE_FORMAT_R8G8B8A8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 4},
-
-   {PIPE_FORMAT_R8_SNORM,       from_8_snorm, to_8_snorm, LL_Int8, 1},
-   {PIPE_FORMAT_R8G8_SNORM,     from_8_snorm, to_8_snorm, LL_Int8, 2},
-   {PIPE_FORMAT_R8G8B8_SNORM,   from_8_snorm, to_8_snorm, LL_Int8, 3},
-   {PIPE_FORMAT_R8G8B8A8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 4},
-
-   {PIPE_FORMAT_R8_SSCALED,       from_8_sscaled, to_8_sscaled, LL_Int8, 1},
-   {PIPE_FORMAT_R8G8_SSCALED,     from_8_sscaled, to_8_sscaled, LL_Int8, 2},
-   {PIPE_FORMAT_R8G8B8_SSCALED,   from_8_sscaled, to_8_sscaled, LL_Int8, 3},
-   {PIPE_FORMAT_R8G8B8A8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 4},
-
-   {PIPE_FORMAT_R32_FIXED,          from_32_fixed, to_32_fixed, LL_Int32, 1},
-   {PIPE_FORMAT_R32G32_FIXED,       from_32_fixed, to_32_fixed, LL_Int32, 2},
-   {PIPE_FORMAT_R32G32B32_FIXED,    from_32_fixed, to_32_fixed, LL_Int32, 3},
-   {PIPE_FORMAT_R32G32B32A32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 4},
-};
-
-
-static LLVMValueRef
-fetch(struct gallivm_state *gallivm,
-      LLVMValueRef ptr, int val_size, int nr_components,
-      from_func func)
-{
-   int i;
-   int offset = 0;
-   LLVMValueRef res =
-      LLVMConstNull(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4));
-   LLVMValueRef defaults[4];
-
-   defaults[0] =
-   defaults[1] =
-   defaults[2] = lp_build_const_float(gallivm, 0.0);
-   defaults[3] = lp_build_const_float(gallivm, 1.0);
-
-   for (i = 0; i < nr_components; ++i) {
-      LLVMValueRef src_index = lp_build_const_int32(gallivm, offset);
-      LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
-      LLVMValueRef src_tmp;
-      LLVMValueRef component;
-
-      src_tmp = LLVMBuildGEP(gallivm->builder, ptr, &src_index, 1, "src_tmp");
-
-      /* convert src_tmp to float */
-      component = func(gallivm, src_tmp);
-
-      /* vec.comp = component */
-      res = LLVMBuildInsertElement(gallivm->builder,
-                                   res,
-                                   component,
-                                   dst_index, "");
-      offset += val_size;
-   }
-   for (; i < 4; ++i) {
-      LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
-      res = LLVMBuildInsertElement(gallivm->builder,
-                                   res,
-                                   defaults[i],
-                                   dst_index, "");
-   }
-   return res;
-}
-
-
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
-                         LLVMValueRef vbuffer,
-                         enum pipe_format from_format)
-{
-   const struct util_format_description *format_desc;
-   LLVMValueRef zero;
-   int i;
-   struct lp_type type = lp_float32_vec4_type();
-
-   /*
-    * The above can only cope with straight arrays: no bitfields,
-    * swizzles, or half floats.
-    */
-
-   for (i = 0; i < Elements(translates); ++i) {
-      if (translates[i].format == from_format) {
-         /*LLVMTypeRef type = ll_type_to_llvm(translates[i].type);*/
-         return fetch(gallivm,
-                      vbuffer,
-                      ll_type_size(translates[i].type),
-                      translates[i].num_components,
-                      translates[i].from);
-      }
-   }
-
-
-   /*
-    * This doesn't handle anything bigger than 32bits, or half floats
-    * yet.
-    *
-    * TODO: unify all this code into lp_build_fetch_rgba_aos().
-    */
-
-   format_desc = util_format_description(from_format);
-   zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
-   return lp_build_fetch_rgba_aos(gallivm, format_desc, type, vbuffer, zero, zero, zero);
-}
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -47,8 +47,8 @@
 #include "tgsi/tgsi_scan.h"

 #ifdef HAVE_LLVM
-#include <llvm-c/ExecutionEngine.h>
 struct draw_llvm;
+struct gallivm_state;
 #endif


@@ -301,7 +301,6 @@ struct draw_context

 #ifdef HAVE_LLVM
   struct draw_llvm *llvm;
-   struct gallivm_state *own_gallivm;
 #endif

   struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -230,7 +230,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
   llvm_vert_info.stride = fpme->vertex_size;
   llvm_vert_info.verts =
      (struct vertex_header *)MALLOC(fpme->vertex_size *
-                                     align(fetch_info->count,  4));
+                                     align(fetch_info->count,  lp_native_vector_width / 32));
   if (!llvm_vert_info.verts) {
      assert(0);
      return;
@@ -423,7 +423,7 @@ draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw)
 {
   struct llvm_middle_end *fpme = 0;

-   if (!draw->llvm || !draw->llvm->gallivm->engine)
+   if (!draw->llvm)
      return NULL;

   fpme = CALLOC_STRUCT( llvm_middle_end );
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
 {
-   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
+   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,

   /* TODO: optimize the constant case */

-   if(type.width * type.length == 128) {
-      if(type.floating) {
-         if(type.width == 32 && util_cpu_caps.has_sse)
+   if (type.floating && util_cpu_caps.has_sse) {
+      if (type.width == 32) {
+         if (type.length == 1) {
+            intrinsic = "llvm.x86.sse.min.ss";
+            intr_size = 128;
+         }
+         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
-         if(type.width == 64 && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.min.pd";
+            intr_size = 128;
+         }
+         else {
+            intrinsic = "llvm.x86.avx.min.ps.256";
+            intr_size = 256;
+         }
      }
-      else {
-         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pminu.b";
-         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+      if (type.width == 64 && util_cpu_caps.has_sse2) {
+         if (type.length == 1) {
+            intrinsic = "llvm.x86.sse2.min.sd";
+            intr_size = 128;
+         }
+         else if (type.length == 2 || !util_cpu_caps.has_avx) {
+            intrinsic = "llvm.x86.sse2.min.pd";
+            intr_size = 128;
+         }
+         else {
+            intrinsic = "llvm.x86.avx.min.pd.256";
+            intr_size = 256;
+         }
+      }
+   }
+   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+      intr_size = 128;
+      if ((type.width == 8 || type.width == 16) &&
+          (type.width * type.length <= 64) &&
+          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+                      __FUNCTION__);
+      }
+      if (type.width == 8 && !type.sign) {
+         intrinsic = "llvm.x86.sse2.pminu.b";
+      }
+      else if (type.width == 16 && type.sign) {
+         intrinsic = "llvm.x86.sse2.pmins.w";
+      }
+      if (util_cpu_caps.has_sse4_1) {
+         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
-         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+         }
+         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
-         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pmins.w";
-         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+         }
+         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
-         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+         }
+         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
+         }
      }
   }

-   if(intrinsic)
-      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+   if(intrinsic) {
+      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+                                                 type,
+                                                 intr_size, a, b);
+   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
 {
-   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
+   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,

   /* TODO: optimize the constant case */

-   if(type.width * type.length == 128) {
-      if(type.floating) {
-         if(type.width == 32 && util_cpu_caps.has_sse)
+   if (type.floating && util_cpu_caps.has_sse) {
+      if (type.width == 32) {
+         if (type.length == 1) {
+            intrinsic = "llvm.x86.sse.max.ss";
+            intr_size = 128;
+         }
+         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64 && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.max.pd";
+            intr_size = 128;
+         }
+         else {
+            intrinsic = "llvm.x86.avx.max.ps.256";
+            intr_size = 256;
+         }
      }
-      else {
-         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+      if (type.width == 64 && util_cpu_caps.has_sse2) {
+         if (type.length == 1) {
+            intrinsic = "llvm.x86.sse2.max.sd";
+            intr_size = 128;
+         }
+         else if (type.length == 2 || !util_cpu_caps.has_avx) {
+            intrinsic = "llvm.x86.sse2.max.pd";
+            intr_size = 128;
+         }
+         else {
+            intrinsic = "llvm.x86.avx.max.pd.256";
+            intr_size = 256;
+         }
+      }
+   }
+   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+      intr_size = 128;
+      if ((type.width == 8 || type.width == 16) &&
+          (type.width * type.length <= 64) &&
+          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+                      __FUNCTION__);
+      }
+      if (type.width == 8 && !type.sign) {
+         intrinsic = "llvm.x86.sse2.pmaxu.b";
+         intr_size = 128;
+      }
+      else if (type.width == 16 && type.sign) {
+         intrinsic = "llvm.x86.sse2.pmaxs.w";
+      }
+      if (util_cpu_caps.has_sse4_1) {
+         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+         }
+         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
-            intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+         }
+         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+         }
+         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
+         }
      }
   }

-   if(intrinsic)
-      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+   if(intrinsic) {
+      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+                                                 type,
+                                                 intr_size, a, b);
+   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
 }


-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Callers should avoid this operation whenever possible.
+ */
 LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
-                    LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+                        LLVMValueRef a)
 {
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
-   unsigned i;
+   unsigned i, length;
+   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,

   assert(!bld->type.norm);

-   index = lp_build_const_int32(bld->gallivm, 0);
-   res = LLVMBuildExtractElement(builder, a, index, "");
+   /*
+    * for byte vectors can do much better with psadbw.
+    * Using repeated shuffle/adds here. Note with multiple vectors
+    * this can be done more efficiently as outlined in the intel
+    * optimization manual.
+    * Note: could cause data rearrangement if used with smaller element
+    * sizes.
+    */

-   for (i = 1; i < type.length; i++) {
-      index = lp_build_const_int32(bld->gallivm, i);
-      if (type.floating)
-         res = LLVMBuildFAdd(builder, res,
-                            LLVMBuildExtractElement(builder,
-                                                    a, index, ""),
-                            "");
-      else
-         res = LLVMBuildAdd(builder, res,
-                            LLVMBuildExtractElement(builder,
-                                                    a, index, ""),
-                            "");
+   vecres = a;
+   length = type.length / 2;
+   while (length > 1) {
+      LLVMValueRef vec1, vec2;
+      for (i = 0; i < length; i++) {
+         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+      }
+      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+                                    LLVMConstVector(shuffles1, length), "");
+      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+                                    LLVMConstVector(shuffles2, length), "");
+      if (type.floating) {
+         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+      }
+      else {
+         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+      }
+      length = length >> 1;
   }

+   /* always have vector of size 2 here */
+   assert(length == 1);
+
+   index = lp_build_const_int32(bld->gallivm, 0);
+   res = LLVMBuildExtractElement(builder, vecres, index, "");
+   index = lp_build_const_int32(bld->gallivm, 1);
+   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+   if (type.floating)
+      res = LLVMBuildFAdd(builder, res, elem2, "");
+    else
+      res = LLVMBuildAdd(builder, res, elem2, "");
+
   return res;
 }

+/**
+ * Horizontal sums of 4 float4 vectors: result[i] = sum of elements of src[i].
+ * Uses the shuffle/add technique outlined in the Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+                            LLVMValueRef src[4])
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef shuffles[4];
+   LLVMValueRef tmp[4];
+   LLVMValueRef sumtmp[2], shuftmp[2];
+
+   /* lower halves: tmp[0] = src0[0,1] src1[0,1], tmp[2] = same for src2/3 */
+   shuffles[0] = lp_build_const_int32(gallivm, 0);
+   shuffles[1] = lp_build_const_int32(gallivm, 1);
+   shuffles[2] = lp_build_const_int32(gallivm, 4);
+   shuffles[3] = lp_build_const_int32(gallivm, 5);
+   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+                                   LLVMConstVector(shuffles, 4), "");
+   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+                                   LLVMConstVector(shuffles, 4), "");
+
+   /* upper halves: tmp[1] = src0[2,3] src1[2,3], tmp[3] = same for src2/3 */
+   shuffles[0] = lp_build_const_int32(gallivm, 2);
+   shuffles[1] = lp_build_const_int32(gallivm, 3);
+   shuffles[2] = lp_build_const_int32(gallivm, 6);
+   shuffles[3] = lp_build_const_int32(gallivm, 7);
+   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+                                   LLVMConstVector(shuffles, 4), "");
+   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+                                   LLVMConstVector(shuffles, 4), "");
+
+   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+   /* even lanes of both partial sums: element 0+2 of each src */
+   shuffles[0] = lp_build_const_int32(gallivm, 0);
+   shuffles[1] = lp_build_const_int32(gallivm, 2);
+   shuffles[2] = lp_build_const_int32(gallivm, 4);
+   shuffles[3] = lp_build_const_int32(gallivm, 6);
+   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+                                       LLVMConstVector(shuffles, 4), "");
+   /* odd lanes of both partial sums: element 1+3 of each src */
+   shuffles[0] = lp_build_const_int32(gallivm, 1);
+   shuffles[1] = lp_build_const_int32(gallivm, 3);
+   shuffles[2] = lp_build_const_int32(gallivm, 5);
+   shuffles[3] = lp_build_const_int32(gallivm, 7);
+   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+                                       LLVMConstVector(shuffles, 4), "");
+
+   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
+
+
+/*
+ * Partially horizontally add 2-4 float vectors (num_vecs) with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+                       LLVMValueRef vectors[],
+                       unsigned num_vecs)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef ret_vec;
+   LLVMValueRef tmp[4];
+   const char *intrinsic = NULL;
+
+   assert(num_vecs >= 2 && num_vecs <= 4);
+   assert(bld->type.floating);
+
+   /* Only use this with at least 2 vectors, as it is sort of expensive
+    * (depending on cpu) and we always need two horizontal adds anyway,
+    * so for a single vector a shuffle/add approach might be better.
+    */
+
+   tmp[0] = vectors[0];
+   tmp[1] = vectors[1];
+   /* pad missing inputs with vectors[0]; those output lanes are don't-care */
+   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+       bld->type.length == 4) {
+      intrinsic = "llvm.x86.sse3.hadd.ps";
+   }
+   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+            bld->type.length == 8) {
+      intrinsic = "llvm.x86.avx.hadd.ps.256";
+   }
+   if (intrinsic) { /* two hadd levels: pairs first, then pairs of pairs */
+      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                       lp_build_vec_type(gallivm, bld->type),
+                                       tmp[0], tmp[1]);
+      if (num_vecs > 2) {
+         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                          lp_build_vec_type(gallivm, bld->type),
+                                          tmp[2], tmp[3]);
+      }
+      else {
+         tmp[1] = tmp[0];
+      }
+      return lp_build_intrinsic_binary(builder, intrinsic,
+                                       lp_build_vec_type(gallivm, bld->type),
+                                       tmp[0], tmp[1]);
+   }
+
+   if (bld->type.length == 4) {
+      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+   }
+   else { /* longer vectors: sum each group of 4 elements, then concatenate */
+      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+      unsigned j;
+      unsigned num_iter = bld->type.length / 4;
+      struct lp_type parttype = bld->type;
+      parttype.length = 4;
+      for (j = 0; j < num_iter; j++) {
+         LLVMValueRef partsrc[4];
+         unsigned i;
+         for (i = 0; i < 4; i++) {
+            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+         }
+         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+      }
+      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+   }
+   return ret_vec;
+}

 /**
 * Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
      if(bld->type.floating) {
 #if 0
         /*
-          * Power of two multiplication by directly manipulating the mantissa.
+          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
         return LLVMConstUDiv(a, b);
   }

-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
+   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+            (type.width == 8 || type.width == 16 || type.width == 32)) {
+      debug_printf("%s: inefficient code, should split vectors manually\n",
+                   __FUNCTION__);
+   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
 }
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
   else
   {
      /* signed int/norm/fixed point */
+      /* could use psign with ssse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
   return LLVMBuildSIToFP(builder, a, vec_type, "");
 }

+static boolean /* TRUE when lp_build_round_sse41() may be used for 'type' */
+sse41_rounding_available(const struct lp_type type)
+{
+   if ((util_cpu_caps.has_sse4_1 && /* SSE4.1: scalar or 128-bit vector */
+       (type.length == 1 || type.width*type.length == 128)) ||
+       (util_cpu_caps.has_avx && type.width*type.length == 256)) /* AVX: 256-bit */
+      return TRUE;

+   return FALSE;
+}

 enum lp_build_round_sse41_mode
 {
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
-      assert(type.width*type.length == 128);
+      if (type.width * type.length == 128) {
+         switch(type.width) {
+         case 32:
+            intrinsic = "llvm.x86.sse41.round.ps";
+            break;
+         case 64:
+            intrinsic = "llvm.x86.sse41.round.pd";
+            break;
+         default:
+            assert(0);
+            return bld->undef;
+         }
+      }
+      else {
+         assert(type.width * type.length == 256);
+         assert(util_cpu_caps.has_avx);

-      switch(type.width) {
-      case 32:
-         intrinsic = "llvm.x86.sse41.round.ps";
-         break;
-      case 64:
-         intrinsic = "llvm.x86.sse41.round.pd";
-         break;
-      default:
-         assert(0);
-         return bld->undef;
+         switch(type.width) {
+         case 32:
+            intrinsic = "llvm.x86.avx.round.ps.256";
+            break;
+         case 64:
+            intrinsic = "llvm.x86.avx.round.pd.256";
+            break;
+         default:
+            assert(0);
+            return bld->undef;
+         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                                     ret_type, arg);
   }
   else {
-      assert(type.width*type.length == 128);
-
-      intrinsic = "llvm.x86.sse2.cvtps2dq";
+      if (type.width* type.length == 128) {
+         intrinsic = "llvm.x86.sse2.cvtps2dq";
+      }
+      else {
+         assert(type.width*type.length == 256);
+         assert(util_cpu_caps.has_avx);

+         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
@@ -1268,6 +1553,34 @@ lp_build_fract(struct lp_build_context *bld,
 }


+/**
+ * Clamp 'fract' to the largest representable value below 1.0, so that a very
+ * small negative 'a' can never produce a fractional part of exactly 1.0.
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+   LLVMValueRef max;
+
+   /* 1 - 2^-(mantissa+1): the largest value below 1.0 for this float type */
+   max = lp_build_const_vec(bld->gallivm, bld->type,
+                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+   return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Fractional part of 'a' as computed by lp_build_fract(), passed through
+ * clamp_fract() so that the result is always strictly smaller than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   return clamp_fract(bld, lp_build_fract(bld, a));
+}
+
+
 /**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,

   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse2 &&
-       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+   if ((util_cpu_caps.has_sse2 &&
+       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
-   else if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
-      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
-   }
-   else {
-      res = a;
-
-      if (type.sign) {
+   res = a;
+   if (type.sign) {
+      if (sse41_rounding_available(type)) {
+         res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+      }
+      else {
         /* Take the sign bit and add it to 1 constant */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
 void
 lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
   assert(type.floating);
   assert(lp_check_value(type, a));

-   if (util_cpu_caps.has_sse4_1 &&
-       (type.length == 1 || type.width*type.length == 128)) {
+   if (sse41_rounding_available(type)) {
      /*
       * floor() is easier.
       */
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
 }


+/**
+ * Same as lp_build_ifloor_fract(), except that *out_fpart is afterwards
+ * clamped with clamp_fract() so it is always strictly smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+                           LLVMValueRef a,
+                           LLVMValueRef *out_ipart,
+                           LLVMValueRef *out_fpart)
+{
+   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+   *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
 LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
@@ -1518,11 +1842,15 @@ lp_build_sqrt(struct lp_build_context *bld,

   assert(lp_check_value(type, a));

-   /* TODO: optimize the constant case */
   /* TODO: optimize the constant case */

   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+   if (type.length == 1) {
+      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+   }
+   else {
+      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+   }

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
 }
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
    * - it doesn't even get the reciprocate of 1.0 exactly
    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, a case
-    *   depedent
+    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
    * particular uses that require less workarounds.
    */

-   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
+      const char *intrinsic = NULL;

-      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+      if (type.length == 4) {
+         intrinsic = "llvm.x86.sse.rcp.ps";
+      }
+      else {
+         intrinsic = "llvm.x86.avx.rcp.ps.256";
+      }
+
+      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,

   assert(type.floating);

-   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;
+      const char *intrinsic = NULL;
+
+      if (type.length == 4) {
+         intrinsic = "llvm.x86.sse.rsqrt.ps";
+      }
+      else {
+         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+      }
+
+      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

-      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rsqrt_refine(bld, a, res);
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld,
             LLVMValueRef b);

 LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
-                    LLVMValueRef a);
+lp_build_horizontal_add(struct lp_build_context *bld,
+                        LLVMValueRef a);
+
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+                       LLVMValueRef vectors[],
+                       unsigned num_vecs);

 LLVMValueRef
 lp_build_sub(struct lp_build_context *bld,
@@ -156,6 +161,10 @@ LLVMValueRef
 lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a);

+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
 LLVMValueRef
 lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a);
@@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart);

+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+                           LLVMValueRef a,
+                           LLVMValueRef *out_ipart,
+                           LLVMValueRef *out_fpart);
+
 LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a);
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -37,6 +37,7 @@

 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_half.h"

 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type)

   if(type.floating) {
      switch(type.width) {
+      case 16:
+         return 10;
      case 32:
         return 23;
      case 64:
-         return 53;
+         return 52;
      default:
         assert(0);
         return 0;
@@ -136,6 +139,8 @@ lp_const_min(struct lp_type type)

   if (type.floating) {
      switch(type.width) {
+      case 16:
+         return -65504;
      case 32:
         return -FLT_MAX;
      case 64:
@@ -169,6 +174,8 @@ lp_const_max(struct lp_type type)

   if (type.floating) {
      switch(type.width) {
+      case 16:
+         return 65504;
      case 32:
         return FLT_MAX;
      case 64:
@@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type)
 {
   if (type.floating) {
      switch(type.width) {
+      case 16:
+         return 2E-10;
      case 32:
         return FLT_EPSILON;
      case 64:
@@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type)

   elem_type = lp_build_elem_type(gallivm, type);

-   if(type.floating)
+   if(type.floating && type.width == 16)
+      elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0);
+   else if(type.floating)
      elems[0] = LLVMConstReal(elem_type, 1.0);
   else if(type.fixed)
      elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
@@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm,
   LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type);
   LLVMValueRef elem;

-   if(type.floating) {
+   if(type.floating && type.width == 16) {
+      elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0);
+   } else if(type.floating) {
      elem = LLVMConstReal(elem_type, val);
   }
   else {
@@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm,
   if(swizzle == NULL)
      swizzle = default_swizzle;

-   if(type.floating) {
-      elems[swizzle[0]] = LLVMConstReal(elem_type, r);
-      elems[swizzle[1]] = LLVMConstReal(elem_type, g);
-      elems[swizzle[2]] = LLVMConstReal(elem_type, b);
-      elems[swizzle[3]] = LLVMConstReal(elem_type, a);
-   }
-   else {
-      double dscale = lp_const_scale(type);
-
-      elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0);
-      elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0);
-      elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0);
-      elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0);
-   }
+   elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r);
+   elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g);
+   elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b);
+   elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a);

   for(i = 4; i < type.length; ++i)
      elems[i] = elems[i % 4];
@@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm,
 /**
 * Build a callable function pointer.
 *
- * We this casts instead of LLVMAddGlobalMapping()
+ * We use function pointer constants instead of LLVMAddGlobalMapping()
 * to work around a bug in LLVM 2.6, and for efficiency/simplicity.
 */
 LLVMValueRef
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -70,6 +70,66 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_conv.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Converts half-floats stored as int16 to float32, using integer ops plus
+ * one float multiply. With the F16C extension this could be done by a single
+ * vcvtph2ps instruction [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
+ * @param gallivm       state providing the LLVM builder
+ * @param src_type      <vector> type of int16
+ * @param src           value to convert
+ * @return              the converted value, bitcast to the float32 vector type
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ */
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+                                      struct lp_type src_type,
+                                      LLVMValueRef src)
+{
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);
+
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
+
+   /* Constants */
+   LLVMValueRef i32_13          = lp_build_const_int_vec(gallivm, i32_type, 13);
+   LLVMValueRef i32_16          = lp_build_const_int_vec(gallivm, i32_type, 16);
+   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
+   LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
+   LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+   LLVMValueRef f32_magic       = LLVMBuildBitCast(builder,
+                                                   lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
+                                                   float_vec_type, "");
+
+   /* Convert int16 vector to int32 vector by zero ext */
+   LLVMValueRef h             = LLVMBuildZExt(builder, src, int_vec_type, "");
+
+   /* Exponent / mantissa bits, shifted up into the float32 field positions */
+   LLVMValueRef expmant       = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
+   LLVMValueRef shifted       = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
+
+   /* Exponent adjust: multiply by 2^112 to rebias the exponent from 15 to 127 */
+   LLVMValueRef scaled        = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
+
+   /* Make sure Inf/NaN survive: values above 0x7bff get an all-ones exponent */
+   LLVMValueRef b_wasinfnan   = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
+   LLVMValueRef infnanexp     = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
+
+   /* Sign bit: h ^ expmant leaves only bit 15, which is moved up to bit 31 */
+   LLVMValueRef justsign      = LLVMBuildXor(builder, h, expmant, "");
+   LLVMValueRef sign          = LLVMBuildShl(builder, justsign, i32_16, "");
+
+   /* Combine result */
+   LLVMValueRef sign_inf      = LLVMBuildOr(builder, sign, infnanexp, "");
+   LLVMValueRef final         = LLVMBuildOr(builder, scaled, sign_inf, "");
+
+   /* Cast from int32 vector to float32 vector */
+   return LLVMBuildBitCast(builder, final, float_vec_type, "");
+}


 /**
@@ -334,64 +394,113 @@ lp_build_conv(struct gallivm_state *gallivm,
       dst_type.width    == 8 &&
       dst_type.length   == 16 &&

+       4 * num_dsts      == num_srcs &&
+
       util_cpu_caps.has_sse2)
   {
-      int i;
+      struct lp_build_context bld;
+      struct lp_type int16_type = dst_type;
+      struct lp_type int32_type = dst_type;
+      LLVMValueRef const_255f;
+      unsigned i, j;

-      for (i = 0; i < num_dsts; i++, src += 4) {
-         struct lp_type int16_type = dst_type;
-         struct lp_type int32_type = dst_type;
+      lp_build_context_init(&bld, gallivm, src_type);
+
+      int16_type.width *= 2;
+      int16_type.length /= 2;
+      int16_type.sign = 1;
+
+      int32_type.width *= 4;
+      int32_type.length /= 4;
+      int32_type.sign = 1;
+
+      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;
-         LLVMValueRef src_int0;
-         LLVMValueRef src_int1;
-         LLVMValueRef src_int2;
-         LLVMValueRef src_int3;
-         LLVMTypeRef int32_vec_type;
-         LLVMTypeRef src_vec_type;
-         LLVMValueRef const_255f;
-         LLVMValueRef a, b, c, d;

-         int16_type.width *= 2;
-         int16_type.length /= 2;
-         int16_type.sign = 1;
+         for (j = 0; j < 4; ++j) {
+            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
+            tmp[j] = lp_build_iround(&bld, tmp[j]);
+         }

-         int32_type.width *= 4;
-         int32_type.length /= 4;
-         int32_type.sign = 1;
+         /* relying on clamping behavior of sse2 intrinsics here */
+         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
+         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+      }

-         src_vec_type   = lp_build_vec_type(gallivm, src_type);
-         int32_vec_type = lp_build_vec_type(gallivm, int32_type);
+      return; 
+   }

-         const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+   /* Special case 2x8f --> 1x16ub
+    */
+   else if (src_type.floating == 1 &&
+      src_type.fixed    == 0 &&
+      src_type.sign     == 1 &&
+      src_type.norm     == 0 &&
+      src_type.width    == 32 &&
+      src_type.length   == 8 &&
+
+      dst_type.floating == 0 &&
+      dst_type.fixed    == 0 &&
+      dst_type.sign     == 0 &&
+      dst_type.norm     == 1 &&
+      dst_type.width    == 8 &&
+      dst_type.length   == 16 &&
+
+      2 * num_dsts      == num_srcs &&
+
+      util_cpu_caps.has_avx) {
+
+      struct lp_build_context bld;
+      struct lp_type int16_type = dst_type;
+      struct lp_type int32_type = dst_type;
+      LLVMValueRef const_255f;
+      unsigned i;
+
+      lp_build_context_init(&bld, gallivm, src_type);
+
+      int16_type.width *= 2;
+      int16_type.length /= 2;
+      int16_type.sign = 1;
+
+      int32_type.width *= 4;
+      int32_type.length /= 4;
+      int32_type.sign = 1;
+
+      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+      for (i = 0; i < num_dsts; ++i, src += 2) {
+         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");
-         c = LLVMBuildFMul(builder, src[2], const_255f, "");
-         d = LLVMBuildFMul(builder, src[3], const_255f, "");

-         {
-            struct lp_build_context bld;
+         a = lp_build_iround(&bld, a);
+         b = lp_build_iround(&bld, b);

-            bld.gallivm = gallivm;
-            bld.type = src_type;
-            bld.vec_type = src_vec_type;
-            bld.int_elem_type = lp_build_elem_type(gallivm, int32_type);
-            bld.int_vec_type = int32_vec_type;
-            bld.undef = lp_build_undef(gallivm, src_type);
-            bld.zero = lp_build_zero(gallivm, src_type);
-            bld.one = lp_build_one(gallivm, src_type);
+         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
+         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
+         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

-            src_int0 = lp_build_iround(&bld, a);
-            src_int1 = lp_build_iround(&bld, b);
-            src_int2 = lp_build_iround(&bld, c);
-            src_int3 = lp_build_iround(&bld, d);
-         }
         /* relying on clamping behavior of sse2 intrinsics here */
-         lo = lp_build_pack2(gallivm, int32_type, int16_type, src_int0, src_int1);
-         hi = lp_build_pack2(gallivm, int32_type, int16_type, src_int2, src_int3);
+         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
+         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
-      return; 
+      return;
+   }
+
+   /* Pre convert half-floats to floats
+    */
+   else if (src_type.floating && src_type.width == 16)
+   {
+      for(i = 0; i < num_tmps; ++i)
+         tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);
+
+      tmp_type.width = 32;
   }

   /*
@@ -603,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm,
 * This will convert the integer masks that match the given types.
 *
 * The mask values should 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
+ * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
@@ -614,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
 {
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not loose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
@@ -640,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
    * Truncate or expand bit width
    */

-   if(src_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
-   }
-   else if(src_type.width < dst_type.width) {
-      assert(num_srcs == 1);
-      lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts);
-   }
-   else {
-      assert(num_srcs == num_dsts);
-      memcpy(dst, src, num_dsts * sizeof *dst);
-   }
+   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
 }
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -42,6 +42,10 @@

 struct lp_type;

+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+                       struct lp_type src_type,
+                       LLVMValueRef src);

 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -35,10 +35,8 @@

 #if HAVE_LLVM >= 0x0300
 #include <llvm/Support/TargetRegistry.h>
-#include <llvm/Support/TargetSelect.h>
 #else /* HAVE_LLVM < 0x0300 */
 #include <llvm/Target/TargetRegistry.h>
-#include <llvm/Target/TargetSelect.h>
 #endif /* HAVE_LLVM < 0x0300 */

 #if HAVE_LLVM >= 0x0209
@@ -183,7 +181,7 @@ lp_disassemble(const void* func)
   /*
    * Limit disassembly to this extent
    */
-   const uint64_t extent = 0x10000;
+   const uint64_t extent = 96 * 1024;

   uint64_t max_pc = 0;

@@ -200,24 +198,6 @@ lp_disassemble(const void* func)
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget(Triple, Error);

-#if HAVE_LLVM >= 0x0208
-   InitializeNativeTargetAsmPrinter();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   LLVMInitializeX86AsmPrinter();
-#elif defined(PIPE_ARCH_ARM)
-   LLVMInitializeARMAsmPrinter();
-#elif defined(PIPE_ARCH_PPC)
-   LLVMInitializePowerPCAsmPrinter();
-#endif
-
-#if HAVE_LLVM >= 0x0301
-   InitializeNativeTargetDisassembler();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   LLVMInitializeX86Disassembler();
-#elif defined(PIPE_ARCH_ARM)
-   LLVMInitializeARMDisassembler();
-#endif
-
 #if HAVE_LLVM >= 0x0300
   OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple));
 #else
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask)

   value = lp_build_mask_value(mask);

+   /*
+    * XXX this doesn't quite generate the most efficient code possible, if
+    * the masks are vectors which have all bits set to the same value
+    * in each element.
+    * movmskps/pmovmskb would be more efficient to get the required value
+    * into ordinary reg (certainly with 8 floats).
+    * Not sure if llvm could figure that out on its own.
+    */
+
   /* cond = (mask == 0) */
   cond = LLVMBuildICmp(builder,
                        LLVMIntEQ,
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                        LLVMValueRef i,
                        LLVMValueRef j);

+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+                        const struct util_format_description *format_desc,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset);
+

 /*
 * SoA
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
      return lp_build_format_swizzle_aos(format_desc, &bld, res);
   }

+   /* If all channels are of same type and we are not using half-floats */
+   if (util_format_is_array(format_desc)) {
+      return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+   }
+
   /*
    * YUV / subsampled formats
    */
@@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
      return res;
   }

-
   /*
    * Fallback to util_format_description::fetch_rgba_float().
    */
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_const.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_pack.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "pipe/p_state.h"
+
+/**
+ * Fetch a texel of an array format (every channel described by channel[0])
+ * as one vector load, then convert it to dst_type and apply the format swizzle.
+ * \param format_desc   describes format of the image we're fetching from
+ * \param dst_type      output type
+ * \param base_ptr      address of the pixel block (or the texel if uncompressed)
+ * \param offset        ptr offset
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+                              const struct util_format_description *format_desc,
+                              struct lp_type dst_type,
+                              LLVMValueRef base_ptr,
+                              LLVMValueRef offset)
+{
+   struct lp_build_context bld;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef src_elem_type, src_vec_type;
+   LLVMValueRef ptr, res = NULL;
+   struct lp_type src_type;
+
+   memset(&src_type, 0, sizeof src_type);
+   src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+   src_type.fixed    = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+   src_type.sign     = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+   src_type.norm     = format_desc->channel[0].normalized;
+   src_type.width    = format_desc->channel[0].size;
+   src_type.length   = format_desc->nr_channels;
+
+   assert(src_type.length <= dst_type.length);
+
+   src_elem_type = lp_build_elem_type(gallivm, src_type);
+   src_vec_type  = lp_build_vec_type(gallivm,  src_type);
+
+   /* Read whole vector from memory, unaligned (res is always NULL here) */
+   if (!res) {
+      ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+      ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
+      res = LLVMBuildLoad(builder, ptr, "");
+      lp_set_load_alignment(res, src_type.width / 8);
+   }
+
+   /* Truncate doubles to float */
+   if (src_type.floating && src_type.width == 64) {
+      src_type.width = 32;
+      src_vec_type  = lp_build_vec_type(gallivm,  src_type);
+
+      res = LLVMBuildFPTrunc(builder, res, src_vec_type, "");
+   }
+
+   /* Expand to correct length */
+   if (src_type.length < dst_type.length) {
+      res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length);
+      src_type.length = dst_type.length;
+   }
+
+   /* Convert to correct format (in place: res is both source and destination) */
+   lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1);
+
+   /* Swizzle it */
+   lp_build_context_init(&bld, gallivm, dst_type);
+   return lp_build_format_swizzle_aos(format_desc, &bld, res);
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
    */

   if (util_format_fits_8unorm(format_desc) &&
-       type.floating && type.width == 32 && type.length == 4) {
+       type.floating && type.width == 32 &&
+       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
-   if (util_cpu_caps.has_sse2 && n == 4) {
+   if (util_cpu_caps.has_sse2 && n > 1) {
      LLVMValueRef sel, tmp, tmp2;
      struct lp_build_context bld32;

@@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
-   if (util_cpu_caps.has_sse2 && n == 4) {
+   if (util_cpu_caps.has_sse2 && n > 1) {
      LLVMValueRef sel, tmp;
      struct lp_build_context bld32;

--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,15 +26,44 @@
 **************************************************************************/


+#include "pipe/p_config.h"
 #include "pipe/p_compiler.h"
 #include "util/u_cpu_detect.h"
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_simple_list.h"
+#include "lp_bld.h"
 #include "lp_bld_debug.h"
+#include "lp_bld_misc.h"
 #include "lp_bld_init.h"

+#include <llvm-c/Analysis.h>
 #include <llvm-c/Transforms/Scalar.h>
+#include <llvm-c/BitWriter.h>
+
+
+/**
+ * AVX is supported in:
+ * - standard JIT from LLVM 3.2 onwards
+ * - MC-JIT from LLVM 3.1
+ *   - MC-JIT supports limited OSes (MacOSX and Linux)
+ * - standard JIT in LLVM 3.1, with backports
+ */
+#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE))
+#  define USE_MCJIT 1
+#  define HAVE_AVX 1
+#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT))
+#  define USE_MCJIT 0
+#  define HAVE_AVX 1
+#else
+#  define USE_MCJIT 0
+#  define HAVE_AVX 0
+#endif
+
+
+#if USE_MCJIT
+void LLVMLinkInMCJIT();
+#endif


 #ifdef DEBUG
@@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags,

 static boolean gallivm_initialized = FALSE;

+unsigned lp_native_vector_width;
+

 /*
 * Optimization values are:
@@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level {
 };


+#if HAVE_LLVM <= 0x0206
 /**
- * LLVM 2.6 permits only one ExecutionEngine to be created.  This is it.
- */
-static LLVMExecutionEngineRef GlobalEngine = NULL;
-
-/**
- * Same gallivm state shared by all contexts.
+ * LLVM 2.6 permits only one ExecutionEngine to be created.  So use the
+ * same gallivm state everywhere.
 */
 static struct gallivm_state *GlobalGallivm = NULL;
-
-
-
-
-extern void
-lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
-
-extern void
-lp_set_target_options(void);
-
+#endif


 /**
@@ -111,6 +130,7 @@ static boolean
 create_pass_manager(struct gallivm_state *gallivm)
 {
   assert(!gallivm->passmgr);
+   assert(gallivm->target);

   gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider);
   if (!gallivm->passmgr)
@@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm)
                               &mod, &error);
 #endif

+   if (gallivm->passmgr) {
+      LLVMDisposePassManager(gallivm->passmgr);
+   }
+
 #if 0
   /* XXX this seems to crash with all versions of LLVM */
   if (gallivm->provider)
      LLVMDisposeModuleProvider(gallivm->provider);
 #endif

-   if (gallivm->passmgr)
-      LLVMDisposePassManager(gallivm->passmgr);
-
-#if HAVE_LLVM >= 0x207
-   if (gallivm->module)
-      LLVMDisposeModule(gallivm->module);
-#endif
-
-#if 0
-   /* Don't free the exec engine, it's a global/singleton */
-   if (gallivm->engine)
+   if (HAVE_LLVM >= 0x207 && gallivm->engine) {
+      /* This will already destroy any associated module */
      LLVMDisposeExecutionEngine(gallivm->engine);
-#endif
+   } else {
+      LLVMDisposeModule(gallivm->module);
+   }

-#if 0
+#if !USE_MCJIT
   /* Don't free the TargetData, it's owned by the exec engine */
-   LLVMDisposeTargetData(gallivm->target);
+#else
+   if (gallivm->target) {
+      LLVMDisposeTargetData(gallivm->target);
+   }
 #endif

+   /* Never free the LLVM context.
+    */
+#if 0
   if (gallivm->context)
      LLVMContextDispose(gallivm->context);
+#endif

   if (gallivm->builder)
      LLVMDisposeBuilder(gallivm->builder);
@@ -215,6 +239,91 @@ free_gallivm_state(struct gallivm_state *gallivm)
 }


+/**
+ * Create gallivm->engine (MC-JIT when USE_MCJIT, else the standard JIT) and,
+ * for the standard JIT, take gallivm->target from the engine.
+ * \return  TRUE for success, FALSE for failure
+ */
+static boolean
+init_gallivm_engine(struct gallivm_state *gallivm)
+{
+   if (1) {
+      /* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */
+      enum LLVM_CodeGenOpt_Level optlevel;
+      char *error = NULL;
+      int ret;
+
+      optlevel = (gallivm_debug & GALLIVM_DEBUG_NO_OPT) ? None : Default;
+
+#if USE_MCJIT
+      ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine,
+                                                      gallivm->module,
+                                                      (unsigned) optlevel,
+                                                      &error);
+#else
+      ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider,
+                                  (unsigned) optlevel, &error);
+#endif
+      if (ret) {
+         _debug_printf("%s\n", error);
+         LLVMDisposeMessage(error);
+         goto fail;
+      }
+
+#if defined(DEBUG) || defined(PROFILE)
+      lp_register_oprofile_jit_event_listener(gallivm->engine);
+#endif
+   }
+
+   LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new
+
+#if !USE_MCJIT
+   gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
+   if (!gallivm->target)
+      goto fail;
+#else
+   if (0) {
+       /*
+        * Dump the data layout strings.
+        */
+
+       LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine);
+       char *data_layout;
+       char *engine_data_layout;
+
+       data_layout = LLVMCopyStringRepOfTargetData(gallivm->target);
+       engine_data_layout = LLVMCopyStringRepOfTargetData(target);
+
+       if (1) {
+          debug_printf("module target data = %s\n", data_layout);
+          debug_printf("engine target data = %s\n", engine_data_layout);
+       }
+       /* These strings are LLVM messages: release with LLVMDisposeMessage */
+       LLVMDisposeMessage(data_layout);
+       LLVMDisposeMessage(engine_data_layout);
+   }
+#endif
+
+   return TRUE;
+
+fail:
+   return FALSE;
+}
+
+
+/**
+ * Singleton
+ *
+ * We must never free LLVM contexts, because LLVM has several global caches
+ * which point to or are derived from objects owned by the context, causing
+ * false memory leaks and false cache hits when these objects are destroyed.
+ *
+ * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM
+ * context per thread, and put them in a pool when threads are destroyed.
+ */
+static LLVMContextRef gallivm_context = NULL;
+
+
 /**
 * Allocate gallivm LLVM objects.
 * \return  TRUE for success, FALSE for failure
@@ -228,7 +337,10 @@ init_gallivm_state(struct gallivm_state *gallivm)

   lp_build_init();

-   gallivm->context = LLVMContextCreate();
+   if (!gallivm_context) {
+      gallivm_context = LLVMContextCreate();
+   }
+   gallivm->context = gallivm_context;
   if (!gallivm->context)
      goto fail;

@@ -242,45 +354,58 @@ init_gallivm_state(struct gallivm_state *gallivm)
   if (!gallivm->provider)
      goto fail;

-   if (!GlobalEngine) {
-      /* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */
-      enum LLVM_CodeGenOpt_Level optlevel;
-      char *error = NULL;
-
-      if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
-         optlevel = None;
-      }
-      else {
-         optlevel = Default;
-      }
-
-      if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider,
-                                (unsigned) optlevel, &error)) {
-         _debug_printf("%s\n", error);
-         LLVMDisposeMessage(error);
-         goto fail;
-      }
-
-#if defined(DEBUG) || defined(PROFILE)
-      lp_register_oprofile_jit_event_listener(GlobalEngine);
-#endif
-   }
-
-   gallivm->engine = GlobalEngine;
-
-   LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new
-
-   gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
-   if (!gallivm->target)
-      goto fail;
-
-   if (!create_pass_manager(gallivm))
-      goto fail;
-
   gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
   if (!gallivm->builder)
      goto fail;

+   /* FIXME: MC-JIT only allows compiling one module at a time, and it must be
+    * complete when MC-JIT is created. So defer the MC-JIT engine creation for
+    * now.
+    */
+#if !USE_MCJIT
+   if (!init_gallivm_engine(gallivm)) {
+      goto fail;
+   }
+#else
+   /*
+    * MC-JIT engine compiles the module immediately on creation, so we can't
+    * obtain the target data from it.  Instead we create a target data layout
+    * from a string.
+    *
+    * The produced layout strings are not precisely the same, but should make
+    * no difference for the kind of optimization passes we run.
+    *
+    * For reference this is the layout string on x64:
+    *
+    *   e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64
+    *
+    * See also:
+    * - http://llvm.org/docs/LangRef.html#datalayout
+    */
+
+   {
+      const unsigned pointer_size = 8 * sizeof(void *); /* pointer width in bits */
+      char layout[512];
+      util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u",
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+                    'e', // little endian
+#else
+                    'E', // big endian
+#endif
+                    pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment
+                    pointer_size, // aggregate preferred alignment
+                    pointer_size, pointer_size); // stack objects abi alignment, preferred alignment
+
+      gallivm->target = LLVMCreateTargetData(layout);
+      if (!gallivm->target) {
+         goto fail; /* same failure path as every other error in this function */
+      }
+   }
+#endif
+
+   if (!create_pass_manager(gallivm))
+      goto fail;
+
   return TRUE;

 fail:
@@ -289,103 +414,6 @@ fail:
 }


-struct callback
-{
-   garbage_collect_callback_func func;
-   void *cb_data;
-   struct callback *prev, *next;
-};
-
-
-/** list of all garbage collector callbacks */
-static struct callback callback_list = {NULL, NULL, NULL, NULL};
-
-
-/**
- * Register a function with gallivm which will be called when we
- * do garbage collection.
- */
-void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
-                                            void *cb_data)
-{
-   struct callback *cb;
-
-   if (!callback_list.prev) {
-      make_empty_list(&callback_list);
-   }
-
-   /* see if already in list */
-   foreach(cb, &callback_list) {
-      if (cb->func == func && cb->cb_data == cb_data)
-         return;
-   }
-
-   /* add to list */
-   cb = CALLOC_STRUCT(callback);
-   if (cb) {
-      cb->func = func;
-      cb->cb_data = cb_data;
-      insert_at_head(&callback_list, cb);
-   }
-}
-
-
-/**
- * Remove a callback.
- */
-void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
-                                          void *cb_data)
-{
-   struct callback *cb;
-
-   /* search list */
-   foreach(cb, &callback_list) {
-      if (cb->func == func && cb->cb_data == cb_data) {
-         /* found, remove it */
-         remove_from_list(cb);
-         FREE(cb);
-         return;
-      }
-   }
-}
-
-
-/**
- * Call the callback functions (which are typically in the
- * draw module and llvmpipe driver.
- */
-static void
-call_garbage_collector_callbacks(void)
-{
-   struct callback *cb;
-   foreach(cb, &callback_list) {
-      cb->func(cb->cb_data);
-   }
-}
-
-
-
-/**
- * Other gallium components using gallivm should call this periodically
- * to let us do garbage collection (or at least try to free memory
- * accumulated by the LLVM libraries).
- */
-void
-gallivm_garbage_collect(struct gallivm_state *gallivm)
-{
-   if (gallivm->context) {
-      if (gallivm_debug & GALLIVM_DEBUG_GC)
-         debug_printf("***** Doing LLVM garbage collection\n");
-
-      call_garbage_collector_callbacks();
-      free_gallivm_state(gallivm);
-      init_gallivm_state(gallivm);
-   }
-}
-
-
 void
 lp_build_init(void)
 {
@@ -398,12 +426,27 @@ lp_build_init(void)

   lp_set_target_options();

-   LLVMInitializeNativeTarget();
-
+#if USE_MCJIT
+   LLVMLinkInMCJIT();
+#else
   LLVMLinkInJIT();
+#endif

   util_cpu_detect();
+
+   if (HAVE_AVX &&
+       util_cpu_caps.has_avx) {
+      lp_native_vector_width = 256;
+   } else {
+      /* Leave it at 128, even when no SIMD extensions are available.
+       * Really needs to be a multiple of 128 so can fit 4 floats.
+       */
+      lp_native_vector_width = 128;
+   }
 
+   lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH",
+                                                 lp_native_vector_width);
+
   gallivm_initialized = TRUE;

 #if 0
@@ -423,16 +466,27 @@ lp_build_init(void)
 struct gallivm_state *
 gallivm_create(void)
 {
-   if (!GlobalGallivm) {
-      GlobalGallivm = CALLOC_STRUCT(gallivm_state);
-      if (GlobalGallivm) {
-         if (!init_gallivm_state(GlobalGallivm)) {
-            FREE(GlobalGallivm);
-            GlobalGallivm = NULL;
-         }
+   struct gallivm_state *gallivm;
+
+#if HAVE_LLVM <= 0x206
+   if (GlobalGallivm) {
+      return GlobalGallivm;
+   }
+#endif
+
+   gallivm = CALLOC_STRUCT(gallivm_state);
+   if (gallivm) {
+      if (!init_gallivm_state(gallivm)) {
+         FREE(gallivm);
+         gallivm = NULL;
      }
   }
-   return GlobalGallivm;
+
+#if HAVE_LLVM <= 0x206
+   GlobalGallivm = gallivm;
+#endif
+
+   return gallivm;
 }


@@ -442,6 +496,132 @@ gallivm_create(void)
 void
 gallivm_destroy(struct gallivm_state *gallivm)
 {
+#if HAVE_LLVM <= 0x0206
   /* No-op: don't destroy the singleton */
   (void) gallivm;
+#else
+   free_gallivm_state(gallivm);
+   FREE(gallivm);
+#endif
+}
+
+
+/**
+ * Run the function pass manager (gallivm->passmgr) over a function.
+ */
+static void
+gallivm_optimize_function(struct gallivm_state *gallivm,
+                          LLVMValueRef func)
+{
+   if (0) {
+      debug_printf("optimizing %s...\n", LLVMGetValueName(func));
+   }
+
+   assert(gallivm->passmgr);
+
+   /* Apply optimizations to LLVM IR */
+   LLVMRunFunctionPassManager(gallivm->passmgr, func);
+
+   if (0) {
+      if (gallivm_debug & GALLIVM_DEBUG_IR) {
+         /* Print the LLVM IR to stderr */
+         lp_debug_dump_value(func);
+         debug_printf("\n");
+      }
+   }
+}
+
+
+/**
+ * Verify a function's IR (DEBUG builds only), then optimize it.
+ */
+void
+gallivm_verify_function(struct gallivm_state *gallivm,
+                        LLVMValueRef func)
+{
+   /* Verify the LLVM IR.  If invalid, dump and abort */
+#ifdef DEBUG
+   if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
+      lp_debug_dump_value(func);
+      assert(0);
+      return;
+   }
+#endif
+
+   gallivm_optimize_function(gallivm, func);
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      /* Print the optimized LLVM IR to stderr */
+      lp_debug_dump_value(func);
+      debug_printf("\n");
+   }
+}
+
+
+void
+gallivm_compile_module(struct gallivm_state *gallivm)
+{
+#if HAVE_LLVM > 0x206
+   assert(!gallivm->compiled);
+#endif
+
+   /* Dump byte code to a file */
+   if (0) {
+      LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
+      debug_printf("llvmpipe.bc written\n");
+      debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n");
+   }
+
+#if USE_MCJIT
+   assert(!gallivm->engine);
+   if (!init_gallivm_engine(gallivm)) {
+      assert(0);
+   }
+#endif
+   assert(gallivm->engine);
+
+   ++gallivm->compiled;
+}
+
+
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+                     LLVMValueRef func)
+{
+   void *code;
+   func_pointer jit_func;
+
+   assert(gallivm->compiled);
+   assert(gallivm->engine);
+
+   code = LLVMGetPointerToGlobal(gallivm->engine, func);
+   assert(code);
+   jit_func = pointer_to_func(code);
+
+   if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      lp_disassemble(code);
+   }
+
+   /* Free the function body to save memory */
+   lp_func_delete_body(func);
+
+   return jit_func;
+}
+
+
+/**
+ * Free the function (and its machine code); compiled out when USE_MCJIT.
+ */
+void
+gallivm_free_function(struct gallivm_state *gallivm,
+                      LLVMValueRef func,
+                      const void *code)
+{
+#if !USE_MCJIT
+   if (code) {
+      LLVMFreeMachineCodeForFunction(gallivm->engine, func);
+   }
+
+   LLVMDeleteFunction(func);
+#endif
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -31,6 +31,7 @@


 #include "pipe/p_compiler.h"
+#include "util/u_pointer.h" // for func_pointer
 #include "lp_bld.h"
 #include <llvm-c/ExecutionEngine.h>

@@ -44,6 +45,7 @@ struct gallivm_state
   LLVMPassManagerRef passmgr;
   LLVMContextRef context;
   LLVMBuilderRef builder;
+   unsigned compiled;
 };


@@ -51,25 +53,6 @@ void
 lp_build_init(void);


-extern void
-lp_func_delete_body(LLVMValueRef func);
-
-
-void
-gallivm_garbage_collect(struct gallivm_state *gallivm);
-
-
-typedef void (*garbage_collect_callback_func)(void *cb_data);
-
-void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
-                                            void *cb_data);
-
-void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
-                                          void *cb_data);
-
-
 struct gallivm_state *
 gallivm_create(void);

@@ -77,9 +60,21 @@ void
 gallivm_destroy(struct gallivm_state *gallivm);


-extern LLVMValueRef
-lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
-                       const char *Name);
+void
+gallivm_verify_function(struct gallivm_state *gallivm,
+                        LLVMValueRef func);
+
+void
+gallivm_compile_module(struct gallivm_state *gallivm);
+
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+                     LLVMValueRef func);
+
+void
+gallivm_free_function(struct gallivm_state *gallivm,
+                      LLVMValueRef func,
+                      const void * code);

 void
 lp_set_load_alignment(LLVMValueRef Inst,
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -48,6 +48,8 @@

 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_type.h"
+#include "lp_bld_pack.h"


 LLVMValueRef
@@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
 }


+/**
+ * Call intrinsic with arguments adapted to intrinsic vector length.
+ *
+ * Split vectors which are too large for the hw, or expand them if they
+ * are too small, so a caller calling a function which might use intrinsics
+ * doesn't need to do splitting/expansion on its own.
+ * This only supports intrinsics where src and dst types match.
+ */
+LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+                                    const char *name,
+                                    struct lp_type src_type,
+                                    unsigned intr_size,
+                                    LLVMValueRef a,
+                                    LLVMValueRef b)
+{
+   unsigned i;
+   struct lp_type intrin_type = src_type;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+   LLVMValueRef anative, bnative;
+   unsigned intrin_length = intr_size / src_type.width;
+
+   intrin_type.length = intrin_length;
+
+   if (intrin_length > src_type.length) {
+      LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef constvec, tmp;
+
+      for (i = 0; i < src_type.length; i++) {
+         elems[i] = lp_build_const_int32(gallivm, i);
+      }
+      for (; i < intrin_length; i++) {
+         elems[i] = i32undef;
+      }
+      if (src_type.length == 1) {
+         LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
+         a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
+         b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
+      }
+      constvec = LLVMConstVector(elems, intrin_length);
+      anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
+      bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
+      tmp = lp_build_intrinsic_binary(builder, name,
+                                      lp_build_vec_type(gallivm, intrin_type),
+                                      anative, bnative);
+      if (src_type.length > 1) {
+         constvec = LLVMConstVector(elems, src_type.length);
+         return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
+      }
+      else {
+         return LLVMBuildExtractElement(builder, tmp, elems[0], "");
+      }
+   }
+   else if (intrin_length < src_type.length) {
+      unsigned num_vec = src_type.length / intrin_length;
+      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+
+      /* don't support arbitrary size here as this is so yuck */
+      if (src_type.length % intrin_length) {
+         /* FIXME: This is something which should be supported
+          * but there doesn't seem to be any need for it currently
+          * so crash and burn.
+          */
+         debug_printf("%s: should handle arbitrary vector size\n",
+                      __FUNCTION__);
+         assert(0);
+         return NULL;
+      }
+
+      for (i = 0; i < num_vec; i++) {
+         anative = lp_build_extract_range(gallivm, a, i*intrin_length,
+                                        intrin_length);
+         bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
+                                        intrin_length);
+         tmp[i] = lp_build_intrinsic_binary(builder, name,
+                                            lp_build_vec_type(gallivm, intrin_type),
+                                            anative, bnative);
+      }
+      return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
+   }
+   else {
+      return lp_build_intrinsic_binary(builder, name,
+                                       lp_build_vec_type(gallivm, src_type),
+                                       a, b);
+   }
+}
+
+
 LLVMValueRef
 lp_build_intrinsic_map(struct gallivm_state *gallivm,
                       const char *name,
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -77,6 +77,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
                          LLVMValueRef b);


+LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+                                    const char *name,
+                                    struct lp_type src_type,
+                                    unsigned intr_size,
+                                    LLVMValueRef a,
+                                    LLVMValueRef b);
+
+
 LLVMValueRef
 lp_build_intrinsic_map(struct gallivm_state *gallivm,
                       const char *name,
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -1,6 +1,6 @@
 /**************************************************************************
 *
- * Copyright 2010 VMware, Inc.
+ * Copyright 2010-2012 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -29,6 +29,13 @@
 #ifndef LP_BLD_LIMITS_H_
 #define LP_BLD_LIMITS_H_

+
+#include <limits.h>
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+
+
 /*
 * TGSI translation limits.
 *
@@ -57,4 +64,53 @@
 */
 #define LP_MAX_TGSI_LOOP_ITERATIONS 65535

+
+/**
+ * Some of these limits are actually infinite (i.e., only limited by available
+ * memory), however advertising INT_MAX would cause some test programs to
+ * actually try to allocate the maximum and run out of memory and crash.  So
+ * stick with something reasonable here.
+ */
+static INLINE int
+gallivm_get_shader_param(enum pipe_shader_cap param)
+{
+   switch(param) {
+   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+      return 1 * 1024 * 1024;
+   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+      return LP_MAX_TGSI_NESTING;
+   case PIPE_SHADER_CAP_MAX_INPUTS:
+      return PIPE_MAX_SHADER_INPUTS;
+   case PIPE_SHADER_CAP_MAX_CONSTS:
+      return 16 * 2024; /* NOTE(review): 2024 may be a typo for 1024 -- confirm */
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      return PIPE_MAX_CONSTANT_BUFFERS;
+   case PIPE_SHADER_CAP_MAX_TEMPS:
+      return LP_MAX_TGSI_TEMPS;
+   case PIPE_SHADER_CAP_MAX_ADDRS:
+      return LP_MAX_TGSI_ADDRS;
+   case PIPE_SHADER_CAP_MAX_PREDS:
+      return LP_MAX_TGSI_PREDS;
+   case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      return 1;
+   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+      return 1;
+   case PIPE_SHADER_CAP_SUBROUTINES:
+      return 1;
+   case PIPE_SHADER_CAP_INTEGERS:
+      return 1;
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+      return PIPE_MAX_SAMPLERS;
+   default:
+      return 0; /* any cap not listed above is reported as 0 */
+   }
+}
+
+
 #endif /* LP_BLD_LIMITS_H_ */
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -52,8 +52,8 @@
 *
 *    select <4 x i1> %C, %A, %B
 *
- * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
- * supported on any backend.
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
+ * supported on some backends (x86) starting with llvm 3.1.
 *
 * Expanding the boolean vector to full SIMD register width, as in
 *
@@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld,
      }
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
-   else if (util_cpu_caps.has_sse4_1 &&
-            type.width * type.length == 128 &&
+   else if (((util_cpu_caps.has_sse4_1 &&
+              type.width * type.length == 128) ||
+             (util_cpu_caps.has_avx &&
+              type.width * type.length == 256 && type.width >= 32)) &&
            !LLVMIsConstant(a) &&
            !LLVMIsConstant(b) &&
            !LLVMIsConstant(mask)) {
@@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld,
      LLVMTypeRef arg_type;
      LLVMValueRef args[3];

-      if (type.floating &&
-          type.width == 64) {
+      /*
+       *  There's only float blend in AVX but can just cast i32/i64
+       *  to float.
+       */
+      if (type.width * type.length == 256) {
+         if (type.width == 64) {
+           intrinsic = "llvm.x86.avx.blendv.pd.256";
+           arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
+         }
+         else {
+            intrinsic = "llvm.x86.avx.blendv.ps.256";
+            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+         }
+      }
+      else if (type.floating &&
+               type.width == 64) {
         intrinsic = "llvm.x86.sse41.blendvpd";
         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
      } else if (type.floating &&
@@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld,
      return lp_build_select(bld, mask_vec, a, b);
   }
 }
+
+
+/**
+ * Return (scalar-cast)val != 0, using only its low width*real_length bits.
+ */
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+                        unsigned real_length,
+                        LLVMValueRef val)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   LLVMTypeRef scalar_type; /* integer as wide as the whole bld->type vector */
+   LLVMTypeRef true_type; /* integer covering only real_length elements */
+
+   assert(real_length <= bld->type.length);
+
+   true_type = LLVMIntTypeInContext(bld->gallivm->context,
+                                    bld->type.width * real_length);
+   scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
+                                      bld->type.width * bld->type.length);
+   val = LLVMBuildBitCast(builder, val, scalar_type, "");
+   /*
+    * We're always using native types so we can use intrinsics.
+    * However, if we don't do per-element calculations, we must ensure
+    * the excess elements aren't used since they may contain garbage.
+    */
+   if (real_length < bld->type.length) {
+      val = LLVMBuildTrunc(builder, val, true_type, "");
+   }
+   return LLVMBuildICmp(builder, LLVMIntNE,
+                        val, LLVMConstNull(true_type), "");
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld,
                    LLVMValueRef b);


+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+                        unsigned real_length,
+                        LLVMValueRef val);
+
 #endif /* !LP_BLD_LOGIC_H */
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -26,6 +26,12 @@
 **************************************************************************/


+/**
+ * The purpose of this module is to expose LLVM functionality not available
+ * through the C bindings.
+ */
+
+
 #ifndef __STDC_LIMIT_MACROS
 #define __STDC_LIMIT_MACROS
 #endif
@@ -41,11 +47,24 @@
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/ExecutionEngine/JITEventListener.h>
+#if HAVE_LLVM >= 0x0301
+#include <llvm/ADT/Triple.h>
+#include <llvm/ExecutionEngine/JITMemoryManager.h>
+#endif
 #include <llvm/Support/CommandLine.h>
 #include <llvm/Support/PrettyStackTrace.h>

+#if HAVE_LLVM >= 0x0300
+#include <llvm/Support/TargetSelect.h>
+#else /* HAVE_LLVM < 0x0300 */
+#include <llvm/Target/TargetSelect.h>
+#endif /* HAVE_LLVM < 0x0300 */
+
 #include "pipe/p_config.h"
 #include "util/u_debug.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_misc.h"


 /**
@@ -99,6 +118,9 @@ lp_set_target_options(void)

 #if defined(DEBUG) || defined(PROFILE)
   llvm::NoFramePointerElim = true;
+#if HAVE_LLVM >= 0x0208
+   llvm::NoFramePointerElimNonLeaf = true;
+#endif
 #endif

   llvm::NoExcessFPPrecision = false;
@@ -146,6 +168,30 @@ lp_set_target_options(void)
    * shared object where the gallium driver resides.
    */
   llvm::DisablePrettyStackTrace = true;
+
+   // If we have a native target, initialize it to ensure it is linked in and
+   // usable by the JIT.
+   llvm::InitializeNativeTarget();
+
+#if HAVE_LLVM >= 0x0208
+   llvm::InitializeNativeTargetAsmPrinter();
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   LLVMInitializeX86AsmPrinter();
+#elif defined(PIPE_ARCH_ARM)
+   LLVMInitializeARMAsmPrinter();
+#elif defined(PIPE_ARCH_PPC)
+   LLVMInitializePowerPCAsmPrinter();
+#endif
+
+#if HAVE_LLVM >= 0x0207
+#  if HAVE_LLVM >= 0x0301
+   llvm::InitializeNativeTargetDisassembler();
+#  elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   LLVMInitializeX86Disassembler();
+#  elif defined(PIPE_ARCH_ARM)
+   LLVMInitializeARMDisassembler();
+#  endif
+#endif
 }


@@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
   return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
 }

+
 extern "C"
 void
 lp_set_load_alignment(LLVMValueRef Inst,
@@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst,
 {
   llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align);
 }
+
+
+#if HAVE_LLVM >= 0x301
+
+/**
+ * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling AVX
+ * feature where available.  Returns 0 and stores the engine in *OutJIT on
+ * success; returns 1 and stores a strdup()'ed message in *OutError otherwise.
+ * See also:
+ * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+ * - llvm/tools/lli/lli.cpp
+ * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results
+ */
+extern "C"
+LLVMBool
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+                                          LLVMModuleRef M,
+                                          unsigned OptLevel,
+                                          char **OutError)
+{
+   using namespace llvm;
+
+   std::string Error;
+   EngineBuilder builder(unwrap(M));
+   builder.setEngineKind(EngineKind::JIT)
+          .setErrorStr(&Error)
+          .setOptLevel((CodeGenOpt::Level)OptLevel);
+
+   builder.setUseMCJIT(true);
+
+   llvm::SmallVector<std::string, 1> MAttrs;
+   if (util_cpu_caps.has_avx) {
+      /*
+       * AVX feature is not automatically detected from CPUID by the X86 target
+       * yet, because the old (yet default) JIT engine is not capable of
+       * emitting the opcodes.  But as we're using MCJIT here, it is safe to
+       * set this attribute.
+       */
+      MAttrs.push_back("+avx");
+      builder.setMAttrs(MAttrs);
+   }
+   builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager());
+
+   ExecutionEngine *JIT;
+#if 0
+   JIT = builder.create();
+#else
+   /*
+    * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833
+    */
+   StringRef MArch = "";
+   StringRef MCPU = "";
+   Triple TT(unwrap(M)->getTargetTriple());
+   JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs));
+#endif
+   if (JIT) {
+      *OutJIT = wrap(JIT);
+      return 0;
+   }
+   *OutError = strdup(Error.c_str()); /* NOTE(review): presumably freed by the caller -- confirm */
+   return 1;
+}
+
+#endif /* HAVE_LLVM >= 0x301 */
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+extern void
+lp_set_target_options(void);
+
+
+extern void
+lp_func_delete_body(LLVMValueRef func);
+
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+                       const char *Name);
+
+extern int
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+                                          LLVMModuleRef M,
+                                          unsigned OptLevel,
+                                          char **OutError);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -69,6 +69,7 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_cpu_detect.h"
+#include "util/u_memory.h"

 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -76,6 +77,7 @@
 #include "lp_bld_intr.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_pack.h"
+#include "lp_bld_swizzle.h"


 /**
@@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
   return LLVMConstVector(elems, n);
 }

+/**
+ * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
+ * See comment above lp_build_interleave2_half for more details.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
+                                   unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2); /* only 0 and 1 are accepted */
+
+   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
+      if (i == (n / 2)) /* reached the second half of the result */
+         j += n / 4; /* skip the next n/4 source indices */
+
+      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j); /* lane j of the first shuffle operand */
+      elems[i + 1] = lp_build_const_int32(gallivm, n + j); /* lane j of the second shuffle operand */
+   }
+
+   return LLVMConstVector(elems, n);
+}

 /**
 * Build shuffle vectors that match PACKxx instructions.
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
   return LLVMConstVector(elems, n);
 }

+/**
+ * Return a vector with elements src[start:start+size] (a scalar if size is 1).
+ * Most useful for getting half the values out of a 256bit sized vector,
+ * otherwise may cause data rearrangement to happen.
+ */
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+                       LLVMValueRef src,
+                       unsigned start,
+                       unsigned size)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(size <= Elements(elems));
+
+   for (i = 0; i < size; ++i)
+      elems[i] = lp_build_const_int32(gallivm, i + start); /* source lane for result lane i */
+
+   if (size == 1) { /* single element: extract it directly instead of shuffling */
+      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
+   }
+   else {
+      return LLVMBuildShuffleVector(gallivm->builder, src, src,
+                                    LLVMConstVector(elems, size), "");
+   }
+}
+
+/**
+ * Concatenates several (must be a power of 2) vectors (of same type)
+ * into a larger one.
+ * Most useful for building up a 256bit sized vector out of two 128bit ones.
+ */
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+                LLVMValueRef src[],
+                struct lp_type src_type,
+                unsigned num_vectors)
+{
+   unsigned new_length, i;
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2]; /* NOTE(review): receives num_vectors entries; the asserts below do not bound that when src_type.length is 1 -- confirm */
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+   assert(src_type.length * num_vectors <= Elements(shuffles));
+   assert(util_is_power_of_two(num_vectors));
+
+   new_length = src_type.length;
+
+   for (i = 0; i < num_vectors; i++)
+      tmp[i] = src[i];
+
+   while (num_vectors > 1) { /* each pass joins adjacent pairs, halving the vector count */
+      num_vectors >>= 1;
+      new_length <<= 1; /* a joined vector is twice as long as its inputs */
+      for (i = 0; i < new_length; i++) {
+         shuffles[i] = lp_build_const_int32(gallivm, i); /* identity indices across both operands */
+      }
+      for (i = 0; i < num_vectors; i++) {
+         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
+                                         LLVMConstVector(shuffles, new_length), "");
+      }
+   }
+
+   return tmp[0];
+}

 /**
 * Interleave vector elements.
@@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm,
   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
 }

+/**
+ * Interleave vector elements but with 256 bit,
+ * treats it as interleave with 2 concatenated 128 bit vectors.
+ *
+ * This differs from lp_build_interleave2 as that function would do the following (for lo):
+ * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
+ *
+ * For 256 bit vectors, lo_hi == 0 gives interleave-lo and lo_hi == 1 interleave-hi.
+ * An example interleave 8x float with 8x float on AVX 256bit unpack:
+ *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
+ *
+ * Equivalent to interleaving 2x 128 bit vectors
+ *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
+ *
+ * So interleave-lo would result in:
+ *   a0 b0 a1 b1 a4 b4 a5 b5
+ *
+ * And interleave-hi would result in:
+ *   a2 b2 a3 b3 a6 b6 a7 b7
+ */
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi)
+{
+   if (type.length * type.width == 256) {
+      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
+      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
+   } else { /* any other total width falls back to the plain interleave */
+      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
+   }
+}

 /**
 * Double the bit width.
@@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm,
 * Non-interleaved pack.
 *
 * This will move values as
- *
- *   lo =   __ l0 __ l1 __ l2 __..  __ ln
- *   hi =   __ h0 __ h1 __ h2 __..  __ hn
+ *         (LSB)                     (MSB)
+ *   lo =   l0 __ l1 __ l2 __..  __ ln __
+ *   hi =   h0 __ h1 __ h2 __..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits the values are represented, not the
@@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
               LLVMValueRef hi)
 {
   LLVMBuilderRef builder = gallivm->builder;
-#if HAVE_LLVM < 0x0207
-   LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type);
-#endif
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
+   struct lp_type intr_type = dst_type;
+
+#if HAVE_LLVM < 0x0207
+   intr_type = src_type;
+#endif

   assert(!src_type.floating);
   assert(!dst_type.floating);
@@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm,
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
-   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
+      const char *intrinsic = NULL;
+
      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
-#if HAVE_LLVM >= 0x0207
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
-#else
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-#endif
+            intrinsic = "llvm.x86.sse2.packssdw.128";
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
-               return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
-            }
-            else {
-               /* use generic shuffle below */
-               res = NULL;
+               intrinsic = "llvm.x86.sse41.packusdw";
+#if HAVE_LLVM < 0x0207
+               /* llvm < 2.7 has inconsistent signatures except for packusdw */
+               intr_type = dst_type;
+#endif
            }
         }
         break;
-
      case 16:
-         if(dst_type.sign)
-#if HAVE_LLVM >= 0x0207
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
-#else
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-#endif
-         else
-#if HAVE_LLVM >= 0x0207
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
-#else
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-#endif
-         break;
-
-      default:
-         assert(0);
-         return LLVMGetUndef(dst_vec_type);
+         if (dst_type.sign) {
+            intrinsic = "llvm.x86.sse2.packsswb.128";
+         }
+         else {
+            intrinsic = "llvm.x86.sse2.packuswb.128";
+         }
         break;
+      /* default uses generic shuffle below */
      }
+      if (intrinsic) {
+         if (src_type.width * src_type.length == 128) {
+            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
+            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
+            if (dst_vec_type != intr_vec_type) {
+               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+            }
+         }
+         else {
+            int num_split = src_type.width * src_type.length / 128;
+            int i;
+            int nlen = 128 / src_type.width;
+            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
+            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
+            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
+            LLVMValueRef tmplo, tmphi;
+            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
+            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

-      if (res) {
-         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
+
+            for (i = 0; i < num_split / 2; i++) {
+               tmplo = lp_build_extract_range(gallivm,
+                                              lo, i*nlen*2, nlen);
+               tmphi = lp_build_extract_range(gallivm,
+                                              lo, i*nlen*2 + nlen, nlen);
+               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                     nintr_vec_type, tmplo, tmphi);
+               if (ndst_vec_type != nintr_vec_type) {
+                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
+               }
+            }
+            for (i = 0; i < num_split / 2; i++) {
+               tmplo = lp_build_extract_range(gallivm,
+                                              hi, i*nlen*2, nlen);
+               tmphi = lp_build_extract_range(gallivm,
+                                              hi, i*nlen*2 + nlen, nlen);
+               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                                 nintr_vec_type,
+                                                                 tmplo, tmphi);
+               if (ndst_vec_type != nintr_vec_type) {
+                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
+                                                           ndst_vec_type, "");
+               }
+            }
+            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
+         }
         return res;
      }
   }
@@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm,
   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
-      src_type.width * src_type.length == 128 &&
-      src_type.sign)
+      src_type.width * src_type.length >= 128 &&
+      src_type.sign &&
+      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
@@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm,
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

-
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

@@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm,
        /*
         * Register width remains constant -- use vector packing intrinsics
         */
-
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
-         /*
-          * Do it element-wise.
-          */
+         if (src_type.width / dst_type.width > num_srcs) {
+            /*
+            * First change src vectors size (with shuffle) so they have the
+            * same size as the destination vector, then pack normally.
+            * Note: cannot use cast/extract because llvm generates atrocious code.
+            */
+            unsigned size_ratio = (src_type.width * src_type.length) /
+                                  (dst_type.length * dst_type.width);
+            unsigned new_length = src_type.length / size_ratio;

-         assert(src_type.length == dst_type.length);
-         tmp[0] = lp_build_undef(gallivm, dst_type);
-         for (i = 0; i < dst_type.length; ++i) {
-            LLVMValueRef index = lp_build_const_int32(gallivm, i);
-            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
-            val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), "");
-            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+            for (i = 0; i < size_ratio * num_srcs; i++) {
+               unsigned start_index = (i % size_ratio) * new_length;
+               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
+                                               start_index, new_length);
+            }
+            num_srcs *= size_ratio;
+            src_type.length = new_length;
+            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
+         }
+         else {
+            /*
+             * Truncate bit width but expand vector size - first pack
+             * then expand simply because this should be more AVX-friendly
+             * for the cases we probably hit.
+             */
+            unsigned size_ratio = (dst_type.width * dst_type.length) /
+                                  (src_type.length * src_type.width);
+            unsigned num_pack_srcs = num_srcs / size_ratio;
+            dst_type.length = dst_type.length / size_ratio;
+
+            for (i = 0; i < size_ratio; i++) {
+               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
+                                      &src[i*num_pack_srcs], num_pack_srcs);
+            }
+            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
@@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm,
         /*
          * Do it element-wise.
          */
+         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

-         assert(src_type.length == dst_type.length);
-         tmp[0] = lp_build_undef(gallivm, dst_type);
-         for (i = 0; i < dst_type.length; ++i) {
-            LLVMValueRef index = lp_build_const_int32(gallivm, i);
-            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+         for (i = 0; i < num_dsts; i++) {
+            tmp[i] = lp_build_undef(gallivm, dst_type);
+         }
+
+         for (i = 0; i < src_type.length; ++i) {
+            unsigned j = i / dst_type.length;
+            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
+            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
-            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
@@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm,
 }


+/**
+ * Expands src vector from src_type.length to dst_length; the added lanes
+ * are undef (a scalar src is broadcast instead). Equal lengths return src. */
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+                       LLVMValueRef src,
+                       struct lp_type src_type,
+                       unsigned dst_length)
+{
+   LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(dst_length <= Elements(elems));
+   assert(dst_length >= src_type.length); /* equal is valid: handled just below */
+
+   if (src_type.length == dst_length)
+      return src;
+
+   /* If it's a single scalar type, no need to reinvent the wheel */
+   if (src_type.length == 1) {
+      return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src);
+   }
+
+   /* All elements from src vector */
+   for (i = 0; i < src_type.length; ++i)
+      elems[i] = lp_build_const_int32(gallivm, i);
+
+   /* Undef fill remaining space (index src_type.length is lane 0 of undef) */
+   for (i = src_type.length; i < dst_length; ++i)
+      elems[i] = lp_build_const_int32(gallivm, src_type.length);
+
+   /* Combine the two vectors */
+   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -44,6 +44,12 @@

 struct lp_type;

+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi);

 LLVMValueRef
 lp_build_interleave2(struct gallivm_state *gallivm,
@@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts);

+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+                       LLVMValueRef src,
+                       unsigned start,
+                       unsigned size);
+
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+                LLVMValueRef src[],
+                struct lp_type src_type,
+                unsigned num_vectors);

 LLVMValueRef
 lp_build_packs2(struct gallivm_state *gallivm,
@@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm,
                LLVMValueRef *dst, unsigned num_dsts);


+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+                    LLVMValueRef src,
+                    struct lp_type src_type,
+                    unsigned dst_length);
+
 #endif /* !LP_BLD_PACK_H */
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -34,6 +34,107 @@
 #include "lp_bld_init.h"
 #include "lp_bld_const.h"
 #include "lp_bld_printf.h"
+#include "lp_bld_type.h"
+
+
+/**
+ * Emit a call to debug_printf(args[0], args[1], ..., args[argcount - 1]).
+ * args[0] must be an i8* format string; float entries of args are replaced
+ * in place by their extension to double.
+ */
+static LLVMValueRef
+lp_build_print_args(struct gallivm_state* gallivm,
+                    int argcount,
+                    LLVMValueRef* args)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMContextRef context = gallivm->context;
+   LLVMTypeRef double_type = LLVMDoubleTypeInContext(context);
+   LLVMTypeRef callee_type;
+   LLVMValueRef callee;
+   int n;
+
+   assert(args);
+   assert(argcount > 0);
+   assert(LLVMTypeOf(args[0]) == LLVMPointerType(LLVMInt8TypeInContext(context), 0));
+
+   /* A variadic callee reads floats as doubles, so widen them up front */
+   for (n = 1; n < argcount; n++)
+      if (LLVMGetTypeKind(LLVMTypeOf(args[n])) == LLVMFloatTypeKind)
+         args[n] = LLVMBuildFPExt(builder, args[n], double_type, "");
+
+   /* Turn debug_printf's address into an IR value typed as i32 (...)* */
+   callee_type = LLVMPointerType(LLVMFunctionType(LLVMInt32TypeInContext(context), NULL, 0, 1), 0);
+   callee = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)debug_printf));
+   callee = LLVMBuildBitCast(builder, callee, callee_type, "debug_printf");
+   return LLVMBuildCall(builder, callee, args, argcount, "");
+}
+
+
+/**
+ * Emit IR printing msg followed by every element of value on one line.
+ * value is a float, double or integer scalar, or a vector of those; 8-bit
+ * integers print as %u, other integers as %i, floating point as %.9g.
+ * Returns whatever lp_build_print_args returns for the emitted call.
+ */
+LLVMValueRef
+lp_build_print_value(struct gallivm_state *gallivm,
+                     const char *msg,
+                     LLVMValueRef value)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef int_type = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef type_ref = LLVMTypeOf(value);
+   LLVMTypeKind type_kind = LLVMGetTypeKind(type_ref);
+   LLVMValueRef params[2 + LP_MAX_VECTOR_LENGTH];
+   char type_fmt[6] = " %x";
+   char format[2 + 5 * LP_MAX_VECTOR_LENGTH + 2] = "%s";
+   const int is_vector = type_kind == LLVMVectorTypeKind;
+   int narrow_int = 0;
+   unsigned length = 1;
+   unsigned i;
+
+   if (is_vector) {
+      length = LLVMGetVectorSize(type_ref);
+      type_ref = LLVMGetElementType(type_ref);
+      type_kind = LLVMGetTypeKind(type_ref);
+   }
+
+   if (type_kind == LLVMFloatTypeKind || type_kind == LLVMDoubleTypeKind) {
+      type_fmt[2] = '.';
+      type_fmt[3] = '9';
+      type_fmt[4] = 'g';
+      type_fmt[5] = '\0';
+   } else if (type_kind == LLVMIntegerTypeKind) {
+      type_fmt[2] = LLVMGetIntTypeWidth(type_ref) == 8 ? 'u' : 'i';
+      narrow_int = LLVMGetIntTypeWidth(type_ref) < 32;
+   } else {
+      /* Unsupported type */
+      assert(0);
+   }
+
+   /* Create format string and arguments */
+   assert(strlen(format) + strlen(type_fmt) * length + 2 <= sizeof format);
+
+   params[1] = lp_build_const_string(gallivm, msg);
+   for (i = 0; i < length; ++i) {
+      LLVMValueRef elem = value;
+      /* Key on the type, not on length, so <1 x T> is unpacked as well */
+      if (is_vector)
+         elem = LLVMBuildExtractElement(builder, value, lp_build_const_int32(gallivm, i), "");
+      /* IR varargs are not promoted implicitly: widen sub-int integers */
+      if (narrow_int)
+         elem = type_fmt[2] == 'u' ? LLVMBuildZExt(builder, elem, int_type, "")
+                                   : LLVMBuildSExt(builder, elem, int_type, "");
+      util_strncat(format, type_fmt, sizeof(format) - strlen(format) - 1);
+      params[2 + i] = elem;
+   }
+
+   util_strncat(format, "\n", sizeof(format) - strlen(format) - 1);
+
+   params[0] = lp_build_const_string(gallivm, format);
+   return lp_build_print_args(gallivm, 2 + length, params);
+}


 static int
@@ -48,137 +149,46 @@ lp_get_printf_arg_count(const char *fmt)
         continue;
      switch (*p) {
         case '\0':
-	    continue;
+       continue;
         case '%':
-	    p++;
-	    continue;
-	 case '.':
-	    if (p[1] == '*' && p[2] == 's') {
-	       count += 2;
-	       p += 3;
+       p++;
+       continue;
+    case '.':
+       if (p[1] == '*' && p[2] == 's') {
+          count += 2;
+          p += 3;
               continue;
-	    }
-	    /* fallthrough */
-	 default:
-	    count ++;
+       }
+       /* fallthrough */
+    default:
+       count ++;
      }
   }
   return count;
 }

+
 /**
- * lp_build_printf.
- *
- * Build printf call in LLVM IR. The output goes to stdout.
- * The additional variable arguments need to have type
- * LLVMValueRef.
+ * Generate LLVM IR for a C-style printf; every variadic argument must be an LLVMValueRef.
 */
 LLVMValueRef
-lp_build_printf(struct gallivm_state *gallivm, const char *fmt, ...)
+lp_build_printf(struct gallivm_state *gallivm,
+                const char *fmt, ...)
 {
-   va_list arglist;
-   int i = 0;
-   int argcount = lp_get_printf_arg_count(fmt);
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMContextRef context = gallivm->context;
   LLVMValueRef params[50];
-   LLVMValueRef fmtarg = lp_build_const_string(gallivm, fmt);
-   LLVMTypeRef printf_type;
-   LLVMValueRef func_printf;
+   va_list arglist;
+   int argcount;
+   int i;

+   argcount = lp_get_printf_arg_count(fmt);
   assert(Elements(params) >= argcount + 1);

-   printf_type = LLVMFunctionType(LLVMIntTypeInContext(context, 32), NULL, 0, 1);
-
-   func_printf = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)debug_printf));
-
-   func_printf = LLVMBuildBitCast(gallivm->builder, func_printf,
-                                  LLVMPointerType(printf_type, 0),
-                                  "debug_printf");
-
-   params[0] = fmtarg;
-
   va_start(arglist, fmt);
   for (i = 1; i <= argcount; i++) {
-      LLVMValueRef val = va_arg(arglist, LLVMValueRef);
-      LLVMTypeRef type = LLVMTypeOf(val);
-      /* printf wants doubles, so lets convert so that
-       * we can actually print them */
-      if (LLVMGetTypeKind(type) == LLVMFloatTypeKind)
-         val = LLVMBuildFPExt(builder, val, LLVMDoubleTypeInContext(context), "");
-      params[i] = val;
+      params[i] = va_arg(arglist, LLVMValueRef);
   }
   va_end(arglist);

-   return LLVMBuildCall(builder, func_printf, params, argcount + 1, "");
-}
-
-
-
-/**
- * Print a float[4] vector.
- */
-LLVMValueRef
-lp_build_print_vec4(struct gallivm_state *gallivm,
-                    const char *msg, LLVMValueRef vec)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   char format[1000];
-   LLVMValueRef x, y, z, w;
-
-   x = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 0), "");
-   y = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 1), "");
-   z = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 2), "");
-   w = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 3), "");
-
-   util_snprintf(format, sizeof(format), "%s %%f %%f %%f %%f\n", msg);
-   return lp_build_printf(gallivm, format, x, y, z, w);
-}
-
-
-/**
- * Print a intt[4] vector.
- */
-LLVMValueRef
-lp_build_print_ivec4(struct gallivm_state *gallivm,
-                    const char *msg, LLVMValueRef vec)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   char format[1000];
-   LLVMValueRef x, y, z, w;
-
-   x = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 0), "");
-   y = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 1), "");
-   z = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 2), "");
-   w = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, 3), "");
-
-   util_snprintf(format, sizeof(format), "%s %%i %%i %%i %%i\n", msg);
-   return lp_build_printf(gallivm, format, x, y, z, w);
-}
-
-
-/**
- * Print a uint8[16] vector.
- */
-LLVMValueRef
-lp_build_print_uvec16(struct gallivm_state *gallivm,
-                    const char *msg, LLVMValueRef vec)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   char format[1000];
-   LLVMValueRef args[16];
-   int i;
-
-   for (i = 0; i < 16; ++i) {
-      args[i] = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(gallivm, i), "");
-   }
-
-   util_snprintf(format, sizeof(format), "%s %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u %%u\n", msg);
-
-   return lp_build_printf(
-            gallivm, format,
-            args[ 0], args[ 1], args[ 2], args[ 3],
-            args[ 4], args[ 5], args[ 6], args[ 7],
-            args[ 8], args[ 9], args[10], args[11],
-            args[12], args[13], args[14], args[15]);
+   params[0] = lp_build_const_string(gallivm, fmt);
+   return lp_build_print_args(gallivm, argcount + 1, params);
 }
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
@@ -39,16 +39,9 @@ lp_build_printf(struct gallivm_state *gallivm,
                const char *fmt, ...);

 LLVMValueRef
-lp_build_print_vec4(struct gallivm_state *gallivm,
-                    const char *msg, LLVMValueRef vec);
-
-LLVMValueRef
-lp_build_print_ivec4(struct gallivm_state *gallivm,
-                     const char *msg, LLVMValueRef vec);
-
-LLVMValueRef
-lp_build_print_uvec16(struct gallivm_state *gallivm,
-                     const char *msg, LLVMValueRef vec);
+lp_build_print_value(struct gallivm_state *gallivm,
+                     const char *msg,
+                     LLVMValueRef value);

 #endif

--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -77,34 +77,82 @@ lp_build_ddy(struct lp_build_context *bld,
   return lp_build_sub(bld, a_bottom, a_top);
 }

-
+/*
+ * To be able to handle multiple quads at once in texture sampling and
+ * do lod calculations per quad, it is necessary to get the per-quad
+ * derivatives into the lp_build_rho function.
+ * For 8-wide vectors the packed derivative values for 3 coords would
+ * look like this; this scales to an arbitrary (multiple of 4) vector size:
+ * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____
+ * The second vector will be unused for 1d and 2d textures.
+ */
 LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
-                    LLVMValueRef a)
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+                                 LLVMValueRef a)
 {
-   LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef idx_left  = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
-   LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT);
-   LLVMValueRef a_left  = LLVMBuildExtractElement(builder, a, idx_left, "left");
-   LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right");
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef vec1, vec2;
+
+   /* same packing as _twocoord for one coord: result is swizzle2(a) - swizzle1(a) */
+
+   /*
+    * XXX could make swizzle1 a noop swizzle by using right top/bottom
+    * pair for ddy
+    */
+   static const unsigned char swizzle1[] = {   /* subtrahend: top-left twice */
+      LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT,
+      LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   };
+   static const unsigned char swizzle2[] = {   /* minuend: right (ddx), bottom (ddy) */
+      LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT,
+      LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   };
+
+   vec1 = lp_build_swizzle_aos(bld, a, swizzle1);
+   vec2 = lp_build_swizzle_aos(bld, a, swizzle2);
+
   if (bld->type.floating)
-      return LLVMBuildFSub(builder, a_right, a_left, "ddx");
+      return LLVMBuildFSub(builder, vec2, vec1, "ddxddy");
   else
-      return LLVMBuildSub(builder, a_right, a_left, "ddx");
+      return LLVMBuildSub(builder, vec2, vec1, "ddxddy");
 }


 LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
-                    LLVMValueRef a)
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+                                 LLVMValueRef a, LLVMValueRef b)
 {
-   LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef idx_top    = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
-   LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT);
-   LLVMValueRef a_top    = LLVMBuildExtractElement(builder, a, idx_top, "top");
-   LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom");
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   /* filled per element (4 entries per quad), so size for a whole vector */
+   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH], shuffles2[LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef vec1, vec2;
+   unsigned length, num_quads, i;
+
+   /* XXX: do hsub version */
+   length = bld->type.length;
+   num_quads = length / 4;
+   for (i = 0; i < num_quads; i++) {
+      unsigned s1 = 4 * i;
+      unsigned s2 = 4 * i + length;
+      shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+      shuffles1[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+      shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+      shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+      shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1);
+      shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1);
+      shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2);
+      shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2);
+   }
+   /* assuming a, b are bld->type vectors: vec1 = {a_tl, a_tl, b_tl, b_tl} per quad, */
+   /* vec2 = {a_tr, a_bl, b_tr, b_bl}; vec2 - vec1 packs ddx, ddy of a, then of b */
+   vec1 = LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles1, length), "");
+   vec2 = LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles2, length), "");
   if (bld->type.floating)
-      return LLVMBuildFSub(builder, a_bottom, a_top, "ddy");
+      return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy");
   else
-      return LLVMBuildSub(builder, a_bottom, a_top, "ddy");
+      return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
 }
+
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld,


 /*
- * Scalar derivatives.
- *
- * Same as getting the first value of above.
+ * Packed derivatives (one derivative for each direction per quad)
 */
+LLVMValueRef
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+                                 LLVMValueRef a, LLVMValueRef b);

 LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
-                    LLVMValueRef a);
-
-
-LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
-                    LLVMValueRef a);
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+                                 LLVMValueRef a);


 #endif /* LP_BLD_QUAD_H_ */
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -44,6 +44,8 @@
 #include "lp_bld_sample.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_type.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"


 /*
@@ -175,67 +177,92 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,

 /**
 * Generate code to compute coordinate gradient (rho).
- * \param ddx  partial derivatives of (s, t, r, q) with respect to X
- * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
- * XXX: The resulting rho is scalar, so we ignore all but the first element of
- * derivatives that are passed by the shader.
+ * The resulting rho is scalar per quad.
 */
 static LLVMValueRef
 lp_build_rho(struct lp_build_sample_context *bld,
             unsigned unit,
-             const LLVMValueRef ddx[4],
-             const LLVMValueRef ddy[4])
+             const struct lp_derivatives *derivs)
 {
+   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   const LLVMValueRef *ddx_ddy = derivs->ddx_ddy;
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
-   LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
-   LLVMValueRef rho_x, rho_y;
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
+   LLVMValueRef abs_ddx_ddy[2];
+   unsigned length = coord_bld->type.length;
+   unsigned num_quads = length / 4;
+   unsigned i;
+   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+   LLVMValueRef rho_xvec, rho_yvec;

-   dsdx = ddx[0];
-   dsdy = ddy[0];
-
-   if (dims <= 1) {
-      rho_x = dsdx;
-      rho_y = dsdy;
+   abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+   if (dims > 2) {
+      abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
   }
   else {
-      rho_x = float_size_bld->undef;
-      rho_y = float_size_bld->undef;
-
-      rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, "");
-      rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, "");
-
-      dtdx = ddx[1];
-      dtdy = ddy[1];
-
-      rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, "");
-      rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, "");
-
-      if (dims >= 3) {
-         drdx = ddx[2];
-         drdy = ddy[2];
-
-         rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, "");
-         rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, "");
-      }
+      abs_ddx_ddy[1] = NULL;
   }

-   rho_x = lp_build_abs(float_size_bld, rho_x);
-   rho_y = lp_build_abs(float_size_bld, rho_y);
+   if (dims == 1) {
+      static const unsigned char swizzle1[] = {
+         0, LP_BLD_SWIZZLE_DONTCARE,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      static const unsigned char swizzle2[] = {
+         1, LP_BLD_SWIZZLE_DONTCARE,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+      rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+   }
+   else if (dims == 2) {
+      static const unsigned char swizzle1[] = {
+         0, 2,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      static const unsigned char swizzle2[] = {
+         1, 3,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+      rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+   }
+   else {
+      LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
+      assert(dims == 3);
+      for (i = 0; i < num_quads; i++) {
+         shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
+         shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
+         shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
+         shuffles1[4*i + 3] = i32undef;
+         shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
+         shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
+         shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1);
+         shuffles2[4*i + 3] = i32undef;
+      }
+      rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+                                        LLVMConstVector(shuffles1, length), "");
+      rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+                                        LLVMConstVector(shuffles2, length), "");
+   }

-   rho_vec = lp_build_max(float_size_bld, rho_x, rho_y);
+   rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, unit);
@@ -243,22 +270,77 @@ lp_build_rho(struct lp_build_sample_context *bld,
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

-   rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
+   if (bld->coord_type.length > 4) {
+      /* expand size to each quad */
+      if (dims > 1) {
+         /* could use some broadcast_vector helper for this? */
+         int num_quads = bld->coord_type.length / 4;
+         LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
+         for (i = 0; i < num_quads; i++) {
+            src[i] = float_size;
+         }
+         float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
+      }
+      else {
+         float_size = lp_build_broadcast_scalar(coord_bld, float_size);
+      }
+      rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

-   if (dims <= 1) {
-      rho = rho_vec;
+      if (dims <= 1) {
+         rho = rho_vec;
+      }
+      else {
+         if (dims >= 2) {
+            static const unsigned char swizzle1[] = {
+               0, LP_BLD_SWIZZLE_DONTCARE,
+               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+            };
+            static const unsigned char swizzle2[] = {
+               1, LP_BLD_SWIZZLE_DONTCARE,
+               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+            };
+            LLVMValueRef rho_s, rho_t, rho_r;
+
+            rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+            rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
+
+            rho = lp_build_max(coord_bld, rho_s, rho_t);
+
+            if (dims >= 3) {
+               static const unsigned char swizzle3[] = {
+                  2, LP_BLD_SWIZZLE_DONTCARE,
+                  LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+               };
+               rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
+               rho = lp_build_max(coord_bld, rho, rho_r);
+            }
+         }
+      }
+      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                      perquadf_bld->type, rho);
   }
   else {
-      if (dims >= 2) {
-         LLVMValueRef rho_s, rho_t, rho_r;
+      if (dims <= 1) {
+         rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+      }
+      rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

-         rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
-         rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
+      if (dims <= 1) {
+         rho = rho_vec;
+      }
+      else {
+         if (dims >= 2) {
+            LLVMValueRef rho_s, rho_t, rho_r;

-         rho = lp_build_max(float_bld, rho_s, rho_t);
-         if (dims >= 3) {
-            rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
-            rho = lp_build_max(float_bld, rho, rho_r);
+            rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+            rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
+
+            rho = lp_build_max(float_bld, rho_s, rho_t);
+
+            if (dims >= 3) {
+               rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
+               rho = lp_build_max(float_bld, rho, rho_r);
+            }
         }
      }
   }
@@ -396,22 +478,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld,

 /**
 * Generate code to compute texture level of detail (lambda).
- * \param ddx  partial derivatives of (s, t, r, q) with respect to X
- * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 * \param width  scalar int texture width
 * \param height  scalar int texture height
 * \param depth  scalar int texture depth
 *
- * XXX: The resulting lod is scalar, so ignore all but the first element of
- * derivatives, lod_bias, etc that are passed by the shader.
+ * The resulting lod is scalar per quad, so only the first value per quad
+ * passed in from lod_bias, explicit_lod is used.
 */
 void
 lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned unit,
-                      const LLVMValueRef ddx[4],
-                      const LLVMValueRef ddy[4],
+                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
@@ -420,11 +500,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,

 {
   LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
   LLVMValueRef lod;

-   *out_lod_ipart = bld->int_bld.zero;
-   *out_lod_fpart = bld->float_bld.zero;
+   *out_lod_ipart = bld->perquadi_bld.zero;
+   *out_lod_fpart = perquadf_bld->zero;

   if (bld->static_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
@@ -433,21 +513,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
      LLVMValueRef min_lod =
         bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);

-      lod = min_lod;
+      lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
   }
   else {
-      LLVMValueRef sampler_lod_bias =
-         bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
-      LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0);
-
      if (explicit_lod) {
-         lod = LLVMBuildExtractElement(builder, explicit_lod,
-                                       index0, "");
+         lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+                                         perquadf_bld->type, explicit_lod);
      }
      else {
         LLVMValueRef rho;

-         rho = lp_build_rho(bld, unit, ddx, ddy);
+         rho = lp_build_rho(bld, unit, derivs);

         /*
          * Compute lod = log2(rho)
@@ -465,66 +541,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-               *out_lod_ipart = lp_build_ilog2(float_bld, rho);
-               *out_lod_fpart = bld->float_bld.zero;
+               *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
+               *out_lod_fpart = perquadf_bld->zero;
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
-               lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+               lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               return;
            }
         }

         if (0) {
-            lod = lp_build_log2(float_bld, rho);
+            lod = lp_build_log2(perquadf_bld, rho);
         }
         else {
-            lod = lp_build_fast_log2(float_bld, rho);
+            lod = lp_build_fast_log2(perquadf_bld, rho);
         }

         /* add shader lod bias */
         if (lod_bias) {
-            lod_bias = LLVMBuildExtractElement(builder, lod_bias,
-                                               index0, "");
+            lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+                  perquadf_bld->type, lod_bias);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
-      if (bld->static_state->lod_bias_non_zero)
+      if (bld->static_state->lod_bias_non_zero) {
+         LLVMValueRef sampler_lod_bias =
+            bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
+         sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
-
+      }

      /* clamp lod */
      if (bld->static_state->apply_max_lod) {
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit);
+         max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);

-         lod = lp_build_min(float_bld, lod, max_lod);
+         lod = lp_build_min(perquadf_bld, lod, max_lod);
      }
      if (bld->static_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
+         min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);

-         lod = lp_build_max(float_bld, lod, min_lod);
+         lod = lp_build_max(perquadf_bld, lod, min_lod);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
-         lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
+         lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
-         lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
+         lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
-      *out_lod_ipart = lp_build_iround(float_bld, lod);
+      *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");
@@ -536,8 +618,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
 /**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
 * mipmap level index.
- * Note: this is all scalar code.
- * \param lod  scalar float texture level of detail
+ * Note: this is all scalar per quad code.
+ * \param lod_ipart  int texture level of detail
 * \param level_out  returns integer 
 */
 void
@@ -546,26 +628,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out)
 {
-   struct lp_build_context *int_bld = &bld->int_bld;
+   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
   LLVMValueRef first_level, last_level, level;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, unit);
+   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);

-   /* convert float lod to integer */
-   level = lp_build_add(int_bld, lod_ipart, first_level);
+   level = lp_build_add(perquadi_bld, lod_ipart, first_level);

   /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(int_bld, level, first_level, last_level);
+   *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
 }


 /**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes.  Later, we'll sample from those
- * two mipmap levels and interpolate between them.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
+ * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
+ * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
 void
 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
@@ -576,20 +659,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           LLVMValueRef *level1_out)
 {
   LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context *int_bld = &bld->int_bld;
-   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, unit);
-
-   *level0_out = lp_build_add(int_bld, lod_ipart, first_level);
-   *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, unit);
+   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+
+   *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
+   *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -597,6 +681,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
    * ends in the process.
    */

+   /*
+    * This code (vector select in particular) only works with llvm 3.1
+    * (if there's more than one quad, with x86 backend). Might consider
+    * converting to our lp_bld_logic helpers.
+    */
+#if HAVE_LLVM < 0x0301
+   assert(perquadi_bld->type.length == 1);
+#endif
+
   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
@@ -609,7 +702,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
-                                      float_bld->zero, *lod_fpart_inout, "");
+                                      perquadf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -623,7 +716,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
-                                      float_bld->zero, *lod_fpart_inout, "");
+                                      perquadf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "sampler%u_miplevel0", unit);
   lp_build_name(*level1_out, "sampler%u_miplevel1", unit);
@@ -651,15 +744,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
 }


-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
-                                int level)
-{
-   LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level);
-   return lp_build_get_mipmap_level(bld, lvl);
-}
-
-
 /**
 * Codegen equivalent for u_minify().
 * Return max(1, base_size >> level);
@@ -748,8 +832,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
 *                    bld->int_size_type or bld->float_size_type)
 * @param coord_type  type of the texture size vector (either
 *                    bld->int_coord_type or bld->coord_type)
- * @param int_size    vector with the integer texture size (width, height,
- *                    depth)
+ * @param size        vector with the texture size (width, height, depth)
 */
 void
 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
@@ -788,7 +871,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
 /**
 * Unnormalize coords.
 *
- * @param int_size  vector with the integer texture size (width, height, depth)
+ * @param flt_size  vector with the float texture size (width, height, depth)
 */
 void
 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
@@ -823,7 +906,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,

 /** Helper used by lp_build_cube_lookup() */
 static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+   /* ima = +0.5 / abs(coord); */
+   LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
+   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+   LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
+   return ima;
+}
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
 {
   /* ima = -0.5 / abs(coord); */
   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
@@ -832,9 +926,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
   return ima;
 }

-
 /**
 * Helper used by lp_build_cube_lookup()
+ * FIXME: the sign here can also be 0.
+ * Arithmetically this could definitely make a difference. Either
+ * fix the comment or use other (simpler) sign function, not sure
+ * which one it should be.
 * \param sign  scalar +1 or -1
 * \param coord  float vector
 * \param ima  float vector
@@ -898,58 +995,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                     LLVMValueRef *face_s,
                     LLVMValueRef *face_t)
 {
-   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
+   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef rx, ry, rz;
-   LLVMValueRef arx, ary, arz;
-   LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25);
-   LLVMValueRef arx_ge_ary, arx_ge_arz;
-   LLVMValueRef ary_ge_arx, ary_ge_arz;
-   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-
-   assert(bld->coord_bld.type.length == 4);
+   LLVMValueRef tmp[4], rxyz, arxyz;

   /*
    * Use the average of the four pixel's texcoords to choose the face.
+    * As a slight simplification, just calculate the sum and skip the scaling.
    */
-   rx = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, s));
-   ry = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, t));
-   rz = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, r));
+   tmp[0] = s;
+   tmp[1] = t;
+   tmp[2] = r;
+   rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
+   arxyz = lp_build_abs(&bld->coord_bld, rxyz);

-   arx = lp_build_abs(float_bld, rx);
-   ary = lp_build_abs(float_bld, ry);
-   arz = lp_build_abs(float_bld, rz);
+   if (coord_bld->type.length > 4) {
+      struct lp_build_context *cint_bld = &bld->int_coord_bld;
+      struct lp_type intctype = cint_bld->type;
+      LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
+      LLVMValueRef arxs, arys, arzs;
+      LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
+      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+      LLVMValueRef ryneg, rzneg;
+      LLVMValueRef ma, ima;
+      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+                                                     1 << (intctype.width - 1));
+      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+                                                      intctype.width -1);
+      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
+      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
+      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);

-   /*
-    * Compare sign/magnitude of rx,ry,rz to determine face
-    */
-   arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, "");
-   arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, "");
-   ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, "");
-   ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, "");
+      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);

-   arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, "");
-   ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, "");
+      rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
+      ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
+      rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
+      ryneg = LLVMBuildXor(builder, ry, signmask, "");
+      rzneg = LLVMBuildXor(builder, rz, signmask, "");

-   {
+      /* the sign bit comes from the averaged vector (per quad),
+       * as does the decision which face to use */
+      signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
+      signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
+
+      arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
+      arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
+      arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
+
+      /*
+       * select x if x >= y else select y
+       * select previous result if max(x,y) >= z else select z
+       */
+      arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
+      maxarxsarys = lp_build_max(coord_bld, arxs, arys);
+      arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);
+
+      /*
+       * compute all possible new s/t coords
+       * snewx = signrx * -rz;
+       * tnewx = -ry;
+       * snewy = rx;
+       * tnewy = signry * rz;
+       * snewz = signrz * rx;
+       * tnewz = -ry;
+       */
+      signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+      snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
+      tnewx = ryneg;
+
+      signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+      snewy = rx;
+      tnewy = LLVMBuildXor(builder, signrys, rz, "");
+
+      signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+      snewz = LLVMBuildXor(builder, signrzs, rx, "");
+      tnewz = ryneg;
+
+      /* XXX on x86 unclear if we should cast the values back to float
+       * or not - on some cpus (nehalem) pblendvb has twice the throughput
+       * of blendvps though on others there just might be domain
+       * transition penalties when using it (this depends on what llvm
+       * will choose for the bit ops above so there appears no "right way",
+       * but given the boatload of selects let's just use the int type).
+       *
+       * Unfortunately we also need the sign bit of the summed coords.
+       */
+      *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
+      *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
+      ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
+      *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
+      sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);
+
+      *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
+      *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
+      ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
+      *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
+      sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);
+
+      *face_s = LLVMBuildBitCast(builder, *face_s,
+                               lp_build_vec_type(gallivm, coord_bld->type), "");
+      *face_t = LLVMBuildBitCast(builder, *face_t,
+                               lp_build_vec_type(gallivm, coord_bld->type), "");
+
+      /* add +1 for neg face */
+      /* XXX with AVX probably want to use another select here -
+       * as long as we ensure vblendvps gets used we can actually
+       * skip the comparison and just use sign as a "mask" directly.
+       */
+      sign = LLVMBuildLShr(builder, sign, signshift, "");
+      *face = LLVMBuildOr(builder, *face, sign, "face");
+
+      ima = lp_build_cube_imapos(coord_bld, ma);
+
+      *face_s = lp_build_mul(coord_bld, *face_s, ima);
+      *face_s = lp_build_add(coord_bld, *face_s, posHalf);
+      *face_t = lp_build_mul(coord_bld, *face_t, ima);
+      *face_t = lp_build_add(coord_bld, *face_t, posHalf);
+   }
+
+   else {
      struct lp_build_if_state if_ctx;
      LLVMValueRef face_s_var;
      LLVMValueRef face_t_var;
      LLVMValueRef face_var;
+      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+      LLVMValueRef shuffles[4];
+      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
+      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
+      struct lp_build_context *float_bld = &bld->float_bld;

-      face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var");
-      face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var");
-      face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var");
+      assert(bld->coord_bld.type.length == 4);

-      lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz);
+      shuffles[0] = lp_build_const_int32(gallivm, 0);
+      shuffles[1] = lp_build_const_int32(gallivm, 1);
+      shuffles[2] = lp_build_const_int32(gallivm, 0);
+      shuffles[3] = lp_build_const_int32(gallivm, 1);
+      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+      shuffles[0] = lp_build_const_int32(gallivm, 1);
+      shuffles[1] = lp_build_const_int32(gallivm, 0);
+      shuffles[2] = lp_build_const_int32(gallivm, 2);
+      shuffles[3] = lp_build_const_int32(gallivm, 2);
+      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
+
+      shuffles[0] = lp_build_const_int32(gallivm, 0);
+      shuffles[1] = lp_build_const_int32(gallivm, 1);
+      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+                                            LLVMConstVector(shuffles, 2), "");
+      shuffles[0] = lp_build_const_int32(gallivm, 2);
+      shuffles[1] = lp_build_const_int32(gallivm, 3);
+      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+                                            LLVMConstVector(shuffles, 2), "");
+      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
+
+      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+                                               lp_build_const_int32(gallivm, 0), "");
+      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
+                                               lp_build_const_int32(gallivm, 0), "");
+      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+                                               lp_build_const_int32(gallivm, 1), "");
+      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
+                                               lp_build_const_int32(gallivm, 0), "");
+      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
+      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
+      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
+
+      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
      {
         /* +/- X face */
-         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
-         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+         LLVMValueRef sign, ima;
+         rx = LLVMBuildExtractElement(builder, rxyz,
+                                      lp_build_const_int32(gallivm, 0), "");
+         /* sign comes from the summed s coords (element 0 of rxyz) */
+         sign = lp_build_sgn(float_bld, rx);
+         ima = lp_build_cube_imaneg(coord_bld, s);
         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
         *face = lp_build_cube_face(bld, rx,
@@ -963,11 +1188,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
      {
         struct lp_build_if_state if_ctx2;

-         lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz);
+         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
         {
+            LLVMValueRef sign, ima;
            /* +/- Y face */
-            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
-            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+            ry = LLVMBuildExtractElement(builder, rxyz,
+                                         lp_build_const_int32(gallivm, 1), "");
+            sign = lp_build_sgn(float_bld, ry);
+            ima = lp_build_cube_imaneg(coord_bld, t);
            *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
            *face = lp_build_cube_face(bld, ry,
@@ -980,8 +1208,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
         lp_build_else(&if_ctx2);
         {
            /* +/- Z face */
-            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
-            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+            LLVMValueRef sign, ima;
+            rz = LLVMBuildExtractElement(builder, rxyz,
+                                         lp_build_const_int32(gallivm, 2), "");
+            sign = lp_build_sgn(float_bld, rz);
+            ima = lp_build_cube_imaneg(coord_bld, r);
            *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
            *face = lp_build_cube_face(bld, rz,
@@ -999,6 +1230,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
      *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
      *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
      *face   = LLVMBuildLoad(builder, face_var, "face");
+      *face   = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
   }
 }

--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -51,6 +51,15 @@ struct lp_type;
 struct lp_build_context;


+/**
+ * Helper struct holding all derivatives needed for sampling
+ */
+struct lp_derivatives
+{
+   LLVMValueRef ddx_ddy[2];
+};
+
+
 /**
 * Sampler static state.
 *
@@ -86,6 +95,10 @@ struct lp_sampler_static_state
   unsigned lod_bias_non_zero:1;
   unsigned apply_min_lod:1;  /**< min_lod > 0 ? */
   unsigned apply_max_lod:1;  /**< max_lod < last_level ? */
+
+   /* Hacks */
+   unsigned force_nearest_s:1;
+   unsigned force_nearest_t:1;
 };


@@ -188,6 +201,9 @@ struct lp_build_sample_context
   /* See texture_dims() */
   unsigned dims;

+   /** SIMD vector width */
+   unsigned vector_width;
+
   /** regular scalar float type */
   struct lp_type float_type;
   struct lp_build_context float_bld;
@@ -195,7 +211,7 @@ struct lp_build_sample_context
   /** float vector type */
   struct lp_build_context float_vec_bld;

-   /** regular scalar float type */
+   /** regular scalar int type */
   struct lp_type int_type;
   struct lp_build_context int_bld;

@@ -219,10 +235,15 @@ struct lp_build_sample_context
   struct lp_type texel_type;
   struct lp_build_context texel_bld;

+   /** Float per-quad type */
+   struct lp_type perquadf_type;
+   struct lp_build_context perquadf_bld;
+
+   /** Int per-quad type */
+   struct lp_type perquadi_type;
+   struct lp_build_context perquadi_bld;
+
   /* Common dynamic state values */
-   LLVMValueRef width;
-   LLVMValueRef height;
-   LLVMValueRef depth;
   LLVMValueRef row_stride_array;
   LLVMValueRef img_stride_array;
   LLVMValueRef data_array;
@@ -301,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 void
 lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned unit,
-                      const LLVMValueRef ddx[4],
-                      const LLVMValueRef ddy[4],
+                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
@@ -327,10 +347,6 @@ LLVMValueRef
 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level);

-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
-                                int level);
-

 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
@@ -398,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                    unsigned unit,
                    unsigned num_coords,
                    const LLVMValueRef *coords,
-                    const LLVMValueRef *ddx,
-                    const LLVMValueRef *ddy,
+                    const struct lp_derivatives *derivs,
                    LLVMValueRef lod_bias,
                    LLVMValueRef explicit_lod,
                    LLVMValueRef texel_out[4]);

+
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+                                  LLVMValueRef coord_f,
+                                  LLVMValueRef length_i,
+                                  LLVMValueRef length_f,
+                                  LLVMValueRef *coord0_i,
+                                  LLVMValueRef *weight_f);
+
+
 void
 lp_build_size_query_soa(struct gallivm_state *gallivm,
                        const struct lp_sampler_static_state *static_state,
                        struct lp_sampler_dynamic_state *dynamic_state,
+                        struct lp_type int_type,
                        unsigned unit,
                        LLVMValueRef explicit_lod,
                        LLVMValueRef *sizes_out);

 void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm, 
+                    struct lp_type type,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
                    LLVMValueRef texel_out[4]);


--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
-                    const LLVMValueRef *ddx,
-                    const LLVMValueRef *ddy,
-                    LLVMValueRef lod_bias, /* optional */
-                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef lod_ipart,
+                    LLVMValueRef lod_fpart,
+                    LLVMValueRef ilevel0,
+                    LLVMValueRef ilevel1,
                    LLVMValueRef texel_out[4]);


--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -41,6 +41,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
+#include "util/u_cpu_detect.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -57,6 +58,7 @@
 #include "lp_bld_sample_aos.h"
 #include "lp_bld_struct.h"
 #include "lp_bld_quad.h"
+#include "lp_bld_pack.h"


 /**
@@ -220,6 +222,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
 }


+/**
+ * Helper to compute the first coord and the weight for
+ * linear wrap repeat npot textures
+ */
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+                                  LLVMValueRef coord_f,
+                                  LLVMValueRef length_i,
+                                  LLVMValueRef length_f,
+                                  LLVMValueRef *coord0_i,
+                                  LLVMValueRef *weight_f)
+{
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
+                                                int_coord_bld->one);
+   LLVMValueRef mask;
+   /* wrap with normalized floats is just fract */
+   coord_f = lp_build_fract(coord_bld, coord_f);
+   /* mul by size and subtract 0.5 */
+   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
+   coord_f = lp_build_sub(coord_bld, coord_f, half);
+   /*
+    * we avoided the 0.5/length division before the repeat wrap,
+    * now need to fix up edge cases with selects
+    */
+   /* convert to int, compute lerp weight */
+   lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
+   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
+                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
+   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
+}
+
+
 /**
 * Build LLVM code for texture wrap mode for linear filtering.
 * \param x0_out  returns first integer texcoord
@@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
-      /* mul by size and subtract 0.5 */
-      coord = lp_build_mul(coord_bld, coord, length_f);
-      coord = lp_build_sub(coord_bld, coord, half);
-      /* convert to int, compute lerp weight */
-      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
-      /* repeat wrap */
      if (is_pot) {
+         /* mul by size and subtract 0.5 */
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_sub(coord_bld, coord, half);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         /* repeat wrap */
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
         coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
      }
      else {
-         /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
         LLVMValueRef mask;
-         coord0 = LLVMBuildAdd(builder, coord0, bias, "");
-         coord0 = LLVMBuildURem(builder, coord0, length, "");
-         mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+         lp_build_coord_repeat_npot_linear(bld, coord,
+                                           length, length_f,
+                                           &coord0, &weight);
+         mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
         coord1 = LLVMBuildAnd(builder,
-                              lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
-                              mask, "");
+                               lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+                               mask, "");
      }
      break;

@@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
   
   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
-      coord = lp_build_mul(coord_bld, coord, length_f);
-      icoord = lp_build_ifloor(coord_bld, coord);
-      if (is_pot)
+      if (is_pot) {
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         icoord = lp_build_ifloor(coord_bld, coord);
         icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
+      }
      else {
-         /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
-         icoord = LLVMBuildAdd(builder, icoord, bias, "");
-         icoord = LLVMBuildURem(builder, icoord, length, "");
+          /* take fraction, unnormalize */
+          coord = lp_build_fract_safe(coord_bld, coord);
+          coord = lp_build_mul(coord_bld, coord, length_f);
+          icoord = lp_build_itrunc(coord_bld, coord);
      }
      break;

@@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
+      /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */
      {
         LLVMValueRef min, max;

@@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
+      unsigned num_quads = bld->coord_bld.type.length / 4;

      /* need_lerp = lod_fpart > 0 */
-      need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
-                                lod_fpart,
-                                bld->float_bld.zero,
-                                "need_lerp");
+      if (num_quads == 1) {
+         need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
+                                   lod_fpart, bld->perquadf_bld.zero,
+                                   "need_lerp");
+      }
+      else {
+         /*
+          * We'll do mip filtering if any of the quads need it.
+          * It might be better to split the vectors here and only fetch/filter
+          * quads which need it.
+          */
+         /*
+          * We unfortunately need to clamp lod_fpart here since we can get
+          * negative values which would screw up filtering if not all
+          * lod_fpart values have same sign.
+          */
+         lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
+                                  bld->perquadf_bld.zero);
+         need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+                                      PIPE_FUNC_GREATER,
+                                      lod_fpart, bld->perquadf_bld.zero);
+         need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
+     }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
@@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,

         /* interpolate samples from the two mipmap levels */

-         lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart);
+         lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+                                                           bld->perquadf_bld.type,
+                                                           bld->texel_bld.type,
+                                                           lod_fpart);

         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
   }
 }

-
-
 /**
- * General texture sampling codegen.
- * This function handles texture sampling for all texture targets (1D,
- * 2D, 3D, cube) and all filtering modes.
+ * Calculate cube face, lod, mip levels.
 */
 static void
-lp_build_sample_general(struct lp_build_sample_context *bld,
-                        unsigned unit,
-                        LLVMValueRef s,
-                        LLVMValueRef t,
-                        LLVMValueRef r,
-                        const LLVMValueRef *ddx,
-                        const LLVMValueRef *ddy,
-                        LLVMValueRef lod_bias, /* optional */
-                        LLVMValueRef explicit_lod, /* optional */
-                        LLVMValueRef *colors_out)
+lp_build_sample_common(struct lp_build_sample_context *bld,
+                       unsigned unit,
+                       LLVMValueRef *s,
+                       LLVMValueRef *t,
+                       LLVMValueRef *r,
+                       const struct lp_derivatives *derivs,
+                       LLVMValueRef lod_bias, /* optional */
+                       LLVMValueRef explicit_lod, /* optional */
+                       LLVMValueRef *lod_ipart,
+                       LLVMValueRef *lod_fpart,
+                       LLVMValueRef *ilevel0,
+                       LLVMValueRef *ilevel1)
 {
-   struct lp_build_context *int_bld = &bld->int_bld;
-   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned mip_filter = bld->static_state->min_mip_filter;
   const unsigned min_filter = bld->static_state->min_img_filter;
   const unsigned mag_filter = bld->static_state->mag_img_filter;
-   LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
-   LLVMValueRef ilevel0, ilevel1 = NULL;
-   LLVMValueRef face_ddx[4], face_ddy[4];
-   LLVMValueRef texels[4];
   LLVMValueRef first_level;
-   LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
-   unsigned chan;
+   struct lp_derivatives face_derivs;

   /*
   printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
@@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    */
   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
      LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
-      s = face_s; /* vec */
-      t = face_t; /* vec */
+      lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t);
+      *s = face_s; /* vec */
+      *t = face_t; /* vec */
      /* use 'r' to indicate cube face */
-      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+      *r = face; /* vec */

      /* recompute ddx, ddy using the new (s,t) face texcoords */
-      face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
-      face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
-      face_ddx[2] = NULL;
-      face_ddx[3] = NULL;
-      face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
-      face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
-      face_ddy[2] = NULL;
-      face_ddy[3] = NULL;
-      ddx = face_ddx;
-      ddy = face_ddy;
+      face_derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t);
+      face_derivs.ddx_ddy[1] = NULL;
+      derivs = &face_derivs;
   }

   /*
@@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
      /* Need to compute lod either to choose mipmap levels or to
       * distinguish between minification/magnification with one mipmap level.
       */
-      lp_build_lod_selector(bld, unit, ddx, ddy,
+      lp_build_lod_selector(bld, unit, derivs,
                            lod_bias, explicit_lod,
                            mip_filter,
-                            &lod_ipart, &lod_fpart);
+                            lod_ipart, lod_fpart);
   } else {
-      lod_ipart = i32t_zero;
+      *lod_ipart = bld->perquadi_bld.zero;
   }

   /*
@@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
          * We should be able to set ilevel0 = const(0) but that causes
          * bad x86 code to be emitted.
+          * XXX should probably disable that on other llvm versions.
          */
-         assert(lod_ipart);
-         lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+         assert(*lod_ipart);
+         lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
      }
      else {
         first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                       bld->gallivm, unit);
-         ilevel0 = first_level;
+         first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+         *ilevel0 = first_level;
      }
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
-      assert(lod_ipart);
-      lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+      assert(*lod_ipart);
+      lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
-      assert(lod_ipart);
-      assert(lod_fpart);
+      assert(*lod_ipart);
+      assert(*lod_fpart);
      lp_build_linear_mip_levels(bld, unit,
-                                 lod_ipart, &lod_fpart,
-                                 &ilevel0, &ilevel1);
+                                 *lod_ipart, lod_fpart,
+                                 ilevel0, ilevel1);
      break;
   }
+}
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+                        unsigned unit,
+                        LLVMValueRef s,
+                        LLVMValueRef t,
+                        LLVMValueRef r,
+                        LLVMValueRef lod_ipart,
+                        LLVMValueRef lod_fpart,
+                        LLVMValueRef ilevel0,
+                        LLVMValueRef ilevel1,
+                        LLVMValueRef *colors_out)
+{
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   const unsigned min_filter = bld->static_state->min_img_filter;
+   const unsigned mag_filter = bld->static_state->mag_img_filter;
+   LLVMValueRef texels[4];
+   unsigned chan;

   /*
    * Get/interpolate texture colors.
@@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
   }

   if (min_filter == mag_filter) {
-      /* no need to distinquish between minification and magnification */
+      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld, unit,
                             min_filter, mip_filter,
                             s, t, r,
@@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
 * For debugging.
 */
 void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+                    struct lp_type type,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
                    LLVMValueRef texel_out[4])
 {
   LLVMValueRef one = lp_build_one(gallivm, type);
@@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
 * 'texel' will return a vector of four LLVMValueRefs corresponding to
 * R, G, B, A.
 * \param type  vector float type to use for coords, etc.
- * \param ddx  partial derivatives of (s,t,r,q) with respect to x
- * \param ddy  partial derivatives of (s,t,r,q) with respect to y
+ * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
 */
 void
 lp_build_sample_soa(struct gallivm_state *gallivm,
@@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                    unsigned unit,
                    unsigned num_coords,
                    const LLVMValueRef *coords,
-                    const LLVMValueRef ddx[4],
-                    const LLVMValueRef ddy[4],
+                    const struct lp_derivatives *derivs,
                    LLVMValueRef lod_bias, /* optional */
                    LLVMValueRef explicit_lod, /* optional */
                    LLVMValueRef texel_out[4])
@@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
   struct lp_build_sample_context bld;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tex_width, tex_height, tex_depth;
   LLVMValueRef s;
   LLVMValueRef t;
   LLVMValueRef r;
-   struct lp_type float_vec_type;

   if (0) {
      enum pipe_format fmt = static_state->format;
@@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
   bld.format_desc = util_format_description(static_state->format);
   bld.dims = dims;

+   bld.vector_width = lp_type_width(type);
+
   bld.float_type = lp_type_float(32);
   bld.int_type = lp_type_int(32);
   bld.coord_type = type;
@@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
   bld.float_size_type.length = dims > 1 ? 4 : 1;
   bld.int_size_type = lp_int_type(bld.float_size_type);
   bld.texel_type = type;
-
-   float_vec_type = lp_type_float_vec(32);
+   bld.perquadf_type = type;
+   /* we want native vector size to be able to use our intrinsics */
+   bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+   bld.perquadi_type = lp_int_type(bld.perquadf_type);

   lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
-   lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type);
+   lp_build_context_init(&bld.float_vec_bld, gallivm, type);
   lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
   lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
   lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
   lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
   lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
   lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
+   lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
+   lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);

   /* Get the dynamic state */
-   bld.width = dynamic_state->width(dynamic_state, gallivm, unit);
-   bld.height = dynamic_state->height(dynamic_state, gallivm, unit);
-   bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit);
+   tex_width = dynamic_state->width(dynamic_state, gallivm, unit);
+   tex_height = dynamic_state->height(dynamic_state, gallivm, unit);
+   tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit);
   bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit);
   bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit);
   bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit);
@@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm,

   /* width, height, depth as single int vector */
   if (dims <= 1) {
-      bld.int_size = bld.width;
+      bld.int_size = tex_width;
   }
   else {
      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
-                                            bld.width, LLVMConstInt(i32t, 0, 0), "");
+                                            tex_width, LLVMConstInt(i32t, 0, 0), "");
      if (dims >= 2) {
         bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
-                                               bld.height, LLVMConstInt(i32t, 1, 0), "");
+                                               tex_height, LLVMConstInt(i32t, 1, 0), "");
         if (dims >= 3) {
            bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
-                                                  bld.depth, LLVMConstInt(i32t, 2, 0), "");
+                                                  tex_depth, LLVMConstInt(i32t, 2, 0), "");
         }
      }
   }

   if (0) {
      /* For debug: no-op texture sampling */
-      lp_build_sample_nop(gallivm, bld.texel_type, texel_out);
-   }
-   else if (util_format_fits_8unorm(bld.format_desc) &&
-            lp_is_simple_wrap_mode(static_state->wrap_s) &&
-            lp_is_simple_wrap_mode(static_state->wrap_t)) {
-      /* do sampling/filtering with fixed pt arithmetic */
-      lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
-                          lod_bias, explicit_lod,
+      lp_build_sample_nop(gallivm,
+                          bld.texel_type,
+                          num_coords,
+                          coords,
                          texel_out);
   }
-
   else {
+      LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
+      LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
+      unsigned num_quads = type.length / 4;
+      const unsigned mip_filter = bld.static_state->min_mip_filter;
+      boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
+                        lp_is_simple_wrap_mode(static_state->wrap_s) &&
+                        lp_is_simple_wrap_mode(static_state->wrap_t);
+
      if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
-          util_format_fits_8unorm(bld.format_desc)) {
+          !use_aos && util_format_fits_8unorm(bld.format_desc)) {
         debug_printf("%s: using floating point linear filtering for %s\n",
                      __FUNCTION__, bld.format_desc->short_name);
         debug_printf("  min_img %d  mag_img %d  mip %d  wraps %d  wrapt %d\n",
@@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                      static_state->wrap_t);
      }

-      lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
-                              lod_bias, explicit_lod,
-                              texel_out);
+      lp_build_sample_common(&bld, unit,
+                             &s, &t, &r,
+                             derivs, lod_bias, explicit_lod,
+                             &lod_ipart, &lod_fpart,
+                             &ilevel0, &ilevel1);
+
+      /*
+       * we only try 8-wide sampling with soa as it appears to
+       * be a loss with aos with AVX.
+       */
+      if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+                             !use_aos)) {
+
+         if (num_quads > 1) {
+            LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+            /* These parameters are the same for all quads */
+            lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+            ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+         }
+         if (use_aos) {
+            /* do sampling/filtering with fixed pt arithmetic */
+            lp_build_sample_aos(&bld, unit,
+                                s, t, r,
+                                lod_ipart, lod_fpart,
+                                ilevel0, ilevel1,
+                                texel_out);
+         }
+
+         else {
+            lp_build_sample_general(&bld, unit,
+                                    s, t, r,
+                                    lod_ipart, lod_fpart,
+                                    ilevel0, ilevel1,
+                                    texel_out);
+         }
+      }
+      else {
+         struct lp_build_if_state if_ctx;
+         LLVMValueRef notsame_levels, notsame;
+         LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+         LLVMValueRef texels[4];
+         LLVMValueRef texelout[4];
+         unsigned j;
+
+         texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr");
+         texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg");
+         texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb");
+         texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa");
+
+         /* only emit the runtime branch if we MAY split (soa); aos always splits */
+         if (!use_aos) {
+            notsame = lp_build_extract_broadcast(gallivm,
+                                                 bld.perquadi_bld.type,
+                                                 bld.perquadi_bld.type,
+                                                 ilevel0, index0);
+            notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame);
+            notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads,
+                                                     notsame);
+            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+               notsame = lp_build_extract_broadcast(gallivm,
+                                                    bld.perquadi_bld.type,
+                                                    bld.perquadi_bld.type,
+                                                    ilevel1, index0);
+               notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame);
+               notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame);
+               notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, "");
+            }
+            lp_build_if(&if_ctx, gallivm, notsame_levels);
+         }
+
+         {
+            struct lp_build_sample_context bld4;
+            struct lp_type type4 = type;
+            unsigned i;
+            LLVMValueRef texelout4[4];
+            LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
+
+            type4.length = 4;
+
+            /* Setup our build context */
+            memset(&bld4, 0, sizeof bld4);
+            bld4.gallivm = bld.gallivm;
+            bld4.static_state = bld.static_state;
+            bld4.dynamic_state = bld.dynamic_state;
+            bld4.format_desc = bld.format_desc;
+            bld4.dims = bld.dims;
+            bld4.row_stride_array = bld.row_stride_array;
+            bld4.img_stride_array = bld.img_stride_array;
+            bld4.data_array = bld.data_array;
+            bld4.int_size = bld.int_size;
+
+            bld4.vector_width = lp_type_width(type4);
+
+            bld4.float_type = lp_type_float(32);
+            bld4.int_type = lp_type_int(32);
+            bld4.coord_type = type4;
+            bld4.int_coord_type = lp_int_type(type4);
+            bld4.float_size_type = lp_type_float(32);
+            bld4.float_size_type.length = dims > 1 ? 4 : 1;
+            bld4.int_size_type = lp_int_type(bld4.float_size_type);
+            bld4.texel_type = type4;
+            bld4.perquadf_type = type4;
+            /* a 4-wide vector holds exactly one quad, so per-quad values are scalar */
+            bld4.perquadf_type.length = 1;
+            bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+
+            lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
+            lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
+            lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
+            lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
+            lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
+            lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
+            lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
+            lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
+            lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
+            lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+
+            for (i = 0; i < num_quads; i++) {
+               LLVMValueRef s4, t4, r4;
+               LLVMValueRef lod_iparts, lod_fparts = NULL;
+               LLVMValueRef ilevel0s, ilevel1s = NULL;
+               LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+
+               s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
+               t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
+               r4 = lp_build_extract_range(gallivm, r, 4*i, 4);
+               lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
+               ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+               if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+                  ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
+                  lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+               }
+
+               if (use_aos) {
+                  /* do sampling/filtering with fixed pt arithmetic */
+                  lp_build_sample_aos(&bld4, unit,
+                                      s4, t4, r4,
+                                      lod_iparts, lod_fparts,
+                                      ilevel0s, ilevel1s,
+                                      texelout4);
+               }
+
+               else {
+                  lp_build_sample_general(&bld4, unit,
+                                          s4, t4, r4,
+                                          lod_iparts, lod_fparts,
+                                          ilevel0s, ilevel1s,
+                                          texelout4);
+               }
+               for (j = 0; j < 4; j++) {
+                  texelouttmp[j][i] = texelout4[j];
+               }
+            }
+            for (j = 0; j < 4; j++) {
+               texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
+               LLVMBuildStore(builder, texelout[j], texels[j]);
+            }
+         }
+         if (!use_aos) {
+            LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL;
+
+            lp_build_else(&if_ctx);
+
+            /* These parameters are the same for all quads */
+            lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+            ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+               ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, "");
+            }
+
+            if (use_aos) {
+               /* do sampling/filtering with fixed pt arithmetic */
+               lp_build_sample_aos(&bld, unit,
+                                   s, t, r,
+                                   lod_iparts, lod_fpart,
+                                   ilevel0s, ilevel1s,
+                                   texelout);
+            }
+
+            else {
+               lp_build_sample_general(&bld, unit,
+                                       s, t, r,
+                                       lod_iparts, lod_fpart,
+                                       ilevel0s, ilevel1s,
+                                       texelout);
+            }
+            for (j = 0; j < 4; j++) {
+               LLVMBuildStore(builder, texelout[j], texels[j]);
+            }
+
+            lp_build_endif(&if_ctx);
+         }
+
+         for (j = 0; j < 4; j++) {
+            texel_out[j] = LLVMBuildLoad(builder, texels[j], "");
+         }
+      }
   }

   lp_build_sample_compare(&bld, r, texel_out);
@@ -1283,6 +1559,7 @@ void
 lp_build_size_query_soa(struct gallivm_state *gallivm,
                        const struct lp_sampler_static_state *static_state,
                        struct lp_sampler_dynamic_state *dynamic_state,
+                        struct lp_type int_type,
                        unsigned unit,
                        LLVMValueRef explicit_lod,
                        LLVMValueRef *sizes_out)
@@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
      return;
   }

-   lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32));
+   assert(!int_type.floating);
+
+   lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));

   if (explicit_lod) {
      LLVMValueRef first_level;
@@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
   size = lp_build_minify(&bld_int_vec, size, lod);

   for (i=0; i < dims; i++) {
-      sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type,
+      sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type,
                                                size,
                                                lp_build_const_int32(gallivm, i));
   }
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -40,6 +40,7 @@
 #include "lp_bld_init.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"


 LLVMValueRef
@@ -47,14 +48,34 @@ lp_build_broadcast(struct gallivm_state *gallivm,
                   LLVMTypeRef vec_type,
                   LLVMValueRef scalar)
 {
-   const unsigned n = LLVMGetVectorSize(vec_type);
   LLVMValueRef res;
-   unsigned i;

-   res = LLVMGetUndef(vec_type);
-   for(i = 0; i < n; ++i) {
-      LLVMValueRef index = lp_build_const_int32(gallivm, i);
-      res = LLVMBuildInsertElement(gallivm->builder, res, scalar, index, "");
+   if (LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind) {
+      /* scalar */
+      assert(vec_type == LLVMTypeOf(scalar));
+      res = scalar;
+   } else {
+      LLVMBuilderRef builder = gallivm->builder;
+      const unsigned length = LLVMGetVectorSize(vec_type);
+      LLVMValueRef undef = LLVMGetUndef(vec_type);
+      LLVMTypeRef i32_type = LLVMInt32TypeInContext(gallivm->context);
+
+      assert(LLVMGetElementType(vec_type) == LLVMTypeOf(scalar));
+
+      if (HAVE_LLVM >= 0x207) {
+         /* The shuffle vector is always made of int32 elements */
+         LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
+         res = LLVMBuildInsertElement(builder, undef, scalar, LLVMConstNull(i32_type), "");
+         res = LLVMBuildShuffleVector(builder, res, undef, LLVMConstNull(i32_vec_type), "");
+      } else {
+         /* XXX: The above path provokes a bug in LLVM 2.6 */
+         unsigned i;
+         res = undef;
+         for(i = 0; i < length; ++i) {
+            LLVMValueRef index = lp_build_const_int32(gallivm, i);
+            res = LLVMBuildInsertElement(builder, res, scalar, index, "");
+         }
+      }
   }

   return res;
@@ -68,42 +89,14 @@ LLVMValueRef
 lp_build_broadcast_scalar(struct lp_build_context *bld,
                          LLVMValueRef scalar)
 {
-   LLVMBuilderRef builder = bld->gallivm->builder;
-   const struct lp_type type = bld->type;
+   assert(lp_check_elem_type(bld->type, LLVMTypeOf(scalar)));

-   assert(lp_check_elem_type(type, LLVMTypeOf(scalar)));
-
-   if (type.length == 1) {
-      return scalar;
-   }
-   else {
-      LLVMValueRef res;
-
-#if HAVE_LLVM >= 0x207
-      /* The shuffle vector is always made of int32 elements */
-      struct lp_type i32_vec_type = lp_type_int_vec(32);
-      i32_vec_type.length = type.length;
-
-      res = LLVMBuildInsertElement(builder, bld->undef, scalar,
-                                   lp_build_const_int32(bld->gallivm, 0), "");
-      res = LLVMBuildShuffleVector(builder, res, bld->undef,
-                                   lp_build_const_int_vec(bld->gallivm, i32_vec_type, 0), "");
-#else
-      /* XXX: The above path provokes a bug in LLVM 2.6 */
-      unsigned i;
-      res = bld->undef;
-      for(i = 0; i < type.length; ++i) {
-         LLVMValueRef index = lp_build_const_int32(bld->gallivm, i);
-         res = LLVMBuildInsertElement(builder, res, scalar, index, "");
-      }
-#endif
-      return res;
-   }
+   return lp_build_broadcast(bld->gallivm, bld->vec_type, scalar);
 }


 /**
- * Combined extract and broadcast (or a mere shuffle when the two types match)
+ * Combined extract and broadcast (mere shuffle in most cases)
 */
 LLVMValueRef
 lp_build_extract_broadcast(struct gallivm_state *gallivm,
@@ -140,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
      }
   }
   else {
-      if (dst_type.length == src_type.length) {
+      if (dst_type.length > 1) {
         /*
-          * Special shuffle of the same size.
+          * shuffle - result can be of different length.
          */

         LLVMValueRef shuffle;
@@ -150,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
                                      LLVMVectorType(i32t, dst_type.length),
                                      index);
         res = LLVMBuildShuffleVector(gallivm->builder, vector,
-                                      LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)),
+                                      LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
                                      shuffle, "");
      }
      else {
-         LLVMValueRef scalar;
-         scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
-         if (dst_type.length == 1) {
-            /*
-             * Trivial extract scalar from vector.
-             */
-
-            res = scalar;
-         }
-         else {
-            /*
-             * General case of different sized vectors.
-             */
-
-            res = lp_build_broadcast(gallivm,
-                                     lp_build_vec_type(gallivm, dst_type),
-                                     vector);
-         }
+         /*
+          * Trivial extract scalar from vector.
+          */
+          res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
      }
   }

@@ -298,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
         return bld->zero;
      case PIPE_SWIZZLE_ONE:
         return bld->one;
+      case LP_BLD_SWIZZLE_DONTCARE:
+         return bld->undef;
      default:
         assert(0);
         return bld->undef;
@@ -327,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
            case PIPE_SWIZZLE_BLUE:
            case PIPE_SWIZZLE_ALPHA:
               shuffle = j + swizzles[i];
+               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
               break;
            case PIPE_SWIZZLE_ZERO:
               shuffle = type.length + 0;
+               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
               if (!aux[0]) {
                  aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
               }
               break;
            case PIPE_SWIZZLE_ONE:
               shuffle = type.length + 1;
+               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
               if (!aux[1]) {
                  aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
               }
               break;
+            case LP_BLD_SWIZZLE_DONTCARE:
+               shuffles[j + i] = LLVMGetUndef(i32t);
+               break;
            }
-            shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
         }
      }

@@ -516,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,

   lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
 }
+
+
+/**
+ * Transpose four vectors between AOS and SOA layout (two interleave passes).
+ *
+ * @param single_type_lp   type of pixels (type of each src and dst vector)
+ * @param src              the 4 * n pixel input
+ * @param dst              the 4 * n pixel output
+ */
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+                       struct lp_type single_type_lp,
+                       const LLVMValueRef src[4],
+                       LLVMValueRef dst[4])
+{
+   struct lp_type double_type_lp = single_type_lp;
+   LLVMTypeRef single_type;
+   LLVMTypeRef double_type;
+   LLVMValueRef t0, t1, t2, t3;
+
+   double_type_lp.length >>= 1;   /* half as many elements ... */
+   double_type_lp.width  <<= 1;   /* ... each twice as wide: same total size */
+
+   double_type = lp_build_vec_type(gallivm, double_type_lp);
+   single_type = lp_build_vec_type(gallivm, single_type_lp);
+
+   /* Interleave x, y, z, w -> xy and zw (pairs 0/1 and 2/3, both halves) */
+   t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0);
+   t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0);
+   t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1);
+   t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1);
+
+   /* Cast to double width type so each xy / zw pair moves as one element */
+   t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
+   t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
+   t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
+   t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
+
+   /* Interleave xy, zw -> xyzw */
+   dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
+   dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
+   dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
+   dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
+
+   /* Cast back to original single width type */
+   dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
+   dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
+   dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
+   dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
+}
+
+
+/**
+ * Pack the first element of every 4-element aos group of src into the
+ * leading lanes; lanes up to dst_type.length are padded with undef.
+ * i.e. x1 _ _ _ x2 _ _ _ will become x1 x2 _ _
+ */
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+                          struct lp_type src_type,
+                          struct lp_type dst_type,
+                          const LLVMValueRef src)
+{
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef undef = LLVMGetUndef(i32t);
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+   unsigned num_src = src_type.length / 4;   /* one scalar per 4 src lanes */
+   unsigned num_dst = dst_type.length;
+   unsigned i;
+
+   assert(num_src <= num_dst);   /* all packed scalars must fit in dst */
+
+   for (i = 0; i < num_src; i++) {
+      shuffles[i] = LLVMConstInt(i32t, i * 4, 0);   /* lane 0 of group i */
+   }
+   for (i = num_src; i < num_dst; i++) {
+      shuffles[i] = undef;   /* padding lanes: don't care */
+   }
+
+   if (num_dst == 1) {   /* single lane: return a scalar, not a vector */
+      return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
+   }
+   else {
+      return LLVMBuildShuffleVector(gallivm->builder, src, src,
+                                    LLVMConstVector(shuffles, num_dst), "");
+   }
+}
+
+
+/**
+ * Unpack and broadcast packed aos values consisting of only the
+ * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
+ */
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+                                      struct lp_type src_type,
+                                      struct lp_type dst_type,
+                                      const LLVMValueRef src)
+{
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+   unsigned num_dst = dst_type.length;
+   unsigned num_src = dst_type.length / 4;   /* scalars consumed: one per 4 dst lanes */
+   unsigned i;
+
+   assert(num_dst / 4 <= src_type.length);   /* src must hold that many scalars */
+
+   for (i = 0; i < num_src; i++) {   /* replicate src lane i into 4 dst lanes */
+      shuffles[i*4] = LLVMConstInt(i32t, i, 0);
+      shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
+      shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
+      shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
+   }
+
+   if (num_src == 1) {   /* only one scalar: delegate to the extract+broadcast helper */
+      return lp_build_extract_broadcast(gallivm, src_type, dst_type,
+                                        src, shuffles[0]);
+   }
+   else {
+      return LLVMBuildShuffleVector(gallivm->builder, src, src,
+                                    LLVMConstVector(shuffles, num_dst), "");
+   }
+}
+
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -44,6 +44,9 @@ struct lp_type;
 struct lp_build_context;


+#define LP_BLD_SWIZZLE_DONTCARE 0xFF
+
+
 LLVMValueRef
 lp_build_broadcast(struct gallivm_state *gallivm,
                   LLVMTypeRef vec_type,
@@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
                             const unsigned char swizzles[4]);


+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+                       struct lp_type type,
+                       const LLVMValueRef src[4],
+                       LLVMValueRef dst[4]);
+
+
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+                          struct lp_type src_type,
+                          struct lp_type dst_type,
+                          const LLVMValueRef src);
+
+
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+                                      struct lp_type src_type,
+                                      struct lp_type dst_type,
+                                      const LLVMValueRef src);
+
+
 #endif /* !LP_BLD_SWIZZLE_H */
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -60,6 +60,7 @@ struct tgsi_token;
 struct tgsi_shader_info;
 struct lp_build_mask_context;
 struct gallivm_state;
+struct lp_derivatives;


 enum lp_build_tex_modifier {
@@ -174,8 +175,7 @@ struct lp_build_sampler_soa
                        unsigned unit,
                        unsigned num_coords,
                        const LLVMValueRef *coords,
-                        const LLVMValueRef *ddx,
-                        const LLVMValueRef *ddy,
+                        const struct lp_derivatives *derivs,
                        LLVMValueRef lod_bias, /* optional */
                        LLVMValueRef explicit_lod, /* optional */
                        LLVMValueRef *texel);
@@ -183,6 +183,7 @@ struct lp_build_sampler_soa
   void
   (*emit_size_query)( const struct lp_build_sampler_soa *sampler,
                       struct gallivm_state *gallivm,
+                       struct lp_type type,
                       unsigned unit,
                       LLVMValueRef explicit_lod, /* optional */
                       LLVMValueRef *sizes_out);
@@ -197,8 +198,7 @@ struct lp_build_sampler_aos
                        unsigned target, /* TGSI_TEXTURE_* */
                        unsigned unit,
                        LLVMValueRef coords,
-                        LLVMValueRef ddx,
-                        LLVMValueRef ddy,
+                        const struct lp_derivatives derivs,
                        enum lp_build_tex_modifier modifier);
 };

--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -693,7 +693,7 @@ u2f_emit(
 {
   emit_data->output[emit_data->chan] = LLVMBuildUIToFP(bld_base->base.gallivm->builder,
 							emit_data->args[0],
-							bld_base->uint_bld.vec_type, "");
+							bld_base->base.vec_type, "");
 }

 static void
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -56,6 +56,7 @@
 #include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
 #include "lp_bld_debug.h"
+#include "lp_bld_sample.h"


 /**
@@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
   LLVMValueRef coords;
   LLVMValueRef ddx;
   LLVMValueRef ddy;
+   struct lp_derivatives derivs;

   if (!bld->sampler) {
      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,

   coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL);

-   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+   if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
      ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL);
      ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL);
      unit = inst->Src[3].Register.Index;
@@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
      ddy = lp_build_ddy( &bld->bld_base.base, coords );
 #else
      /* TODO */
-      ddx = bld->bld_base.base.one;
-      ddy = bld->bld_base.base.one;
+      derivs.ddx_ddy[0] = bld->bld_base.base.one;
+      derivs.ddx_ddy[1] = bld->bld_base.base.one;
 #endif
      unit = inst->Src[1].Register.Index;
   }
@@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
   return bld->sampler->emit_fetch_texel(bld->sampler,
                                         &bld->bld_base.base,
                                         target, unit,
-                                         coords, ddx, ddy,
+                                         coords, derivs,
                                         modifier);
 }

--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -62,6 +62,7 @@
 #include "lp_bld_limits.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_printf.h"
+#include "lp_bld_sample.h"


 static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
@@ -763,7 +764,7 @@ emit_fetch_temporary(
   else {
      LLVMValueRef temp_ptr;
      if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
-         LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+         LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0);
         LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
                                                     swizzle);
         temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, "");
@@ -1068,7 +1069,7 @@ emit_store_chan(
         switch (dtype) {
         case TGSI_TYPE_UNSIGNED:
         case TGSI_TYPE_SIGNED: {
-            LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+            LLVMTypeRef itype = bld_base->int_bld.vec_type;
            LLVMTypeRef ivtype = LLVMPointerType(itype, 0);
            LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
                                                        chan_index);
@@ -1141,13 +1142,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
          LLVMValueRef *texel)
 {
   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
   unsigned unit;
   LLVMValueRef lod_bias, explicit_lod;
   LLVMValueRef oow = NULL;
   LLVMValueRef coords[3];
-   LLVMValueRef ddx[3];
-   LLVMValueRef ddy[3];
+   struct lp_derivatives derivs;
   unsigned num_coords;
+   unsigned dims;
   unsigned i;

   if (!bld->sampler) {
@@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
      return;
   }

+   derivs.ddx_ddy[0] = bld->bld_base.base.undef;
+   derivs.ddx_ddy[1] = bld->bld_base.base.undef;
+
   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      num_coords = 1;
+      dims = 1;
      break;
   case TGSI_TEXTURE_1D_ARRAY:
+      num_coords = 2;
+      dims = 1;
+      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      num_coords = 2;
+      dims = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      num_coords = 3;
+      dims = 1;
+      break;
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_2D_ARRAY:
-   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      num_coords = 3;
+      dims = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+      num_coords = 3;
+      dims = 3;
      break;
   case TGSI_TEXTURE_SHADOW2D_ARRAY:
      num_coords = 4;
+      dims = 2;
      break;
   default:
      assert(0);
@@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
   }

   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
-      LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
-      for (i = 0; i < num_coords; i++) {
-         LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i );
-         LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i );
-         ddx[i] = LLVMBuildExtractElement(builder, src1, index0, "");
-         ddy[i] = LLVMBuildExtractElement(builder, src2, index0, "");
+      LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef ddxdyonec[3];
+      unsigned length = bld->bld_base.base.type.length;
+      unsigned num_quads = length / 4;
+      unsigned dim;
+      unsigned quad;
+
+      for (dim = 0; dim < dims; ++dim) {
+         LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim );
+         LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim );
+         for (quad = 0; quad < num_quads; ++quad) {
+            unsigned s1 = 4*quad;
+            unsigned s2 = 4*quad + length;
+            shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+            shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2);
+            shuffles[4*quad + 2] = i32undef;
+            shuffles[4*quad + 3] = i32undef;
+         }
+         ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy,
+                                               LLVMConstVector(shuffles, length), "");
+      }
+      if (dims == 1) {
+         derivs.ddx_ddy[0] = ddxdyonec[0];
+      }
+      else if (dims >= 2) {
+         for (quad = 0; quad < num_quads; ++quad) {
+            unsigned s1 = 4*quad;
+            unsigned s2 = 4*quad + length;
+            shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+            shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1);
+            shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2);
+            shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1);
+         }
+         derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1],
+                                                  LLVMConstVector(shuffles, length), "");
+         if (dims == 3) {
+            derivs.ddx_ddy[1] = ddxdyonec[2];
+         }
      }
      unit = inst->Src[3].Register.Index;
   }  else {
-      for (i = 0; i < num_coords; i++) {
-         ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] );
-         ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] );
+      if (dims == 1) {
+         derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]);
+      }
+      else if (dims >= 2) {
+         derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base,
+                                                            coords[0], coords[1]);
+         if (dims == 3) {
+            derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]);
+         }
      }
      unit = inst->Src[1].Register.Index;
   }
-   for (i = num_coords; i < 3; i++) {
-      ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
-      ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
-   }

   bld->sampler->emit_fetch_texel(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  bld->bld_base.base.type,
                                  unit, num_coords, coords,
-                                  ddx, ddy,
+                                  &derivs,
                                  lod_bias, explicit_lod,
                                  texel);
 }
@@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld,

   bld->sampler->emit_size_query(bld->sampler,
                                 bld->bld_base.base.gallivm,
+                                 bld->bld_base.int_bld.type,
                                 inst->Src[1].Register.Index,
                                 explicit_lod,
                                 sizes_out);
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
 {
   if (type.floating) {
      switch(type.width) {
+      case 16:
+         return LLVMIntTypeInContext(gallivm->context, 16);
+         break;
      case 32:
         return LLVMFloatTypeInContext(gallivm->context);
         break;
@@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)

   if (type.floating) {
      switch(type.width) {
+      case 16:
+         if(elem_kind != LLVMIntegerTypeKind)
+            return FALSE;
+         break;
      case 32:
         if(elem_kind != LLVMFloatTypeKind)
            return FALSE;
@@ -167,27 +174,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type)
 }


-/**
- * Build int32[4] vector type
- */
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm)
-{
-   struct lp_type t;
-   LLVMTypeRef type;
-
-   memset(&t, 0, sizeof(t));
-   t.floating = FALSE; /* floating point values */
-   t.sign = TRUE;      /* values are signed */
-   t.norm = FALSE;     /* values are not limited to [0,1] or [-1,1] */
-   t.width = 32;       /* 32-bit int */
-   t.length = 4;       /* 4 elements per vector */
-
-   type = lp_build_int_elem_type(gallivm, t);
-   return LLVMVectorType(type, t.length);
-}
-
-
 /**
 * Create element of vector type
 */
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -40,21 +40,35 @@
 #include "pipe/p_compiler.h"
 #include "gallivm/lp_bld.h"

-
+/**
+ * Native SIMD architecture width available at runtime.
+ *
+ * Using this width should give the best performance,
+ * and it determines the necessary alignment of vector variables.
+ */
+extern unsigned lp_native_vector_width;

 /**
- * Native SIMD register width.
+ * Maximum supported vector width (not necessarily supported at run-time).
 *
- * 128 for all architectures we care about.
+ * Should only be used when lp_native_vector_width isn't available,
+ * i.e. sizing/alignment of non-malloced variables.
 */
-#define LP_NATIVE_VECTOR_WIDTH 128
+#define LP_MAX_VECTOR_WIDTH 256
+
+/**
+ * Minimum alignment, in bytes, for statically allocated vector variables.
+ *
+ * It should always be a literal constant equal to LP_MAX_VECTOR_WIDTH/8; an
+ * expression is non-portable.
+ */
+#define LP_MIN_VECTOR_ALIGN 32

 /**
 * Several functions can only cope with vectors of length up to this value.
 * You may need to increase that value if you want to represent bigger vectors.
 */
-#define LP_MAX_VECTOR_LENGTH 16
-
+#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8)

 /**
 * The LLVM type system can't conveniently express all the things we care about
@@ -151,6 +165,13 @@ struct lp_build_context
 };


+static INLINE unsigned
+lp_type_width(struct lp_type type)
+{
+   return type.width * type.length;   /* element width times element count = total vector width */
+}
+
+
 /** Create scalar float type */
 static INLINE struct lp_type
 lp_type_float(unsigned width)
@@ -169,7 +190,7 @@ lp_type_float(unsigned width)

 /** Create vector of float type */
 static INLINE struct lp_type
-lp_type_float_vec(unsigned width)
+lp_type_float_vec(unsigned width, unsigned total_width)
 {
   struct lp_type res_type;

@@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width)
   res_type.floating = TRUE;
   res_type.sign = TRUE;
   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+   res_type.length = total_width / width;

   return res_type;
 }
@@ -200,14 +221,14 @@ lp_type_int(unsigned width)

 /** Create vector int type */
 static INLINE struct lp_type
-lp_type_int_vec(unsigned width)
+lp_type_int_vec(unsigned width, unsigned total_width)
 {
   struct lp_type res_type;

   memset(&res_type, 0, sizeof res_type);
   res_type.sign = TRUE;
   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+   res_type.length = total_width / width;

   return res_type;
 }
@@ -229,34 +250,34 @@ lp_type_uint(unsigned width)

 /** Create vector uint type */
 static INLINE struct lp_type
-lp_type_uint_vec(unsigned width)
+lp_type_uint_vec(unsigned width, unsigned total_width)
 {
   struct lp_type res_type;

   memset(&res_type, 0, sizeof res_type);
   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+   res_type.length = total_width / width;

   return res_type;
 }


 static INLINE struct lp_type
-lp_type_unorm(unsigned width)
+lp_type_unorm(unsigned width, unsigned total_width)
 {
   struct lp_type res_type;

   memset(&res_type, 0, sizeof res_type);
   res_type.norm = TRUE;
   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+   res_type.length = total_width / width;

   return res_type;
 }


 static INLINE struct lp_type
-lp_type_fixed(unsigned width)
+lp_type_fixed(unsigned width, unsigned total_width)
 {
   struct lp_type res_type;

@@ -264,21 +285,21 @@ lp_type_fixed(unsigned width)
   res_type.sign = TRUE;
   res_type.fixed = TRUE;
   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+   res_type.length = total_width / width;

   return res_type;
 }


 static INLINE struct lp_type
-lp_type_ufixed(unsigned width)
+lp_type_ufixed(unsigned width, unsigned total_width)
 {
   struct lp_type res_type;

   memset(&res_type, 0, sizeof res_type);
   res_type.fixed = TRUE;
   res_type.width = width;
-   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+   res_type.length = total_width / width;

   return res_type;
 }
@@ -312,10 +333,6 @@ LLVMTypeRef
 lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);


-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm);
-
-
 static INLINE struct lp_type
 lp_float32_vec4_type(void)
 {
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.c
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.c
@@ -178,7 +178,8 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
   /* Blit the input to the output */
   util_blit_pixels(p->blitctx, in, 0, 0, 0,
                    w, h, 0, p->framebuffer.cbufs[0],
-                    0, 0, w, h, 0, PIPE_TEX_MIPFILTER_NEAREST);
+                    0, 0, w, h, 0, PIPE_TEX_MIPFILTER_NEAREST,
+                    TGSI_WRITEMASK_XYZW, 0);

   u_sampler_view_default_template(&v_tmp, in, in->format);
   arr[0] = p->pipe->create_sampler_view(p->pipe, in, &v_tmp);
--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -59,7 +59,8 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,

      util_blit_pixels(ppq->p->blitctx, in, 0, 0, 0,
                       w, h, 0, ppq->tmps[0],
-                       0, 0, w, h, 0, PIPE_TEX_MIPFILTER_NEAREST);
+                       0, 0, w, h, 0, PIPE_TEX_MIPFILTER_NEAREST,
+                       TGSI_WRITEMASK_XYZW, 0);

      in = ppq->tmp[0];
   }
--- a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
@@ -10,6 +10,8 @@
 * one or more debug driver: rbug, trace.
 */

+#ifdef DEBUG
+
 #ifdef GALLIUM_TRACE
 #include "trace/tr_public.h"
 #endif
@@ -26,9 +28,12 @@
 #include "noop/noop_public.h"
 #endif

+#endif /* DEBUG */
+
 static INLINE struct pipe_screen *
 debug_screen_wrap(struct pipe_screen *screen)
 {
+#ifdef DEBUG

 #if defined(GALLIUM_RBUG)
   screen = rbug_screen_create(screen);
@@ -46,6 +51,8 @@ debug_screen_wrap(struct pipe_screen *screen)
   screen = noop_screen_create(screen);
 #endif

+#endif /* DEBUG */
+
   return screen;
 }

--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1534,9 +1534,18 @@ static void emit_decls( struct ureg_program *ureg )
      }
   }

-   for (i = 0; i < ureg->nr_temps; i++) {
-      emit_decl( ureg, TGSI_FILE_TEMPORARY, i,
-                 util_bitmask_get(ureg->local_temps, i) );
+   if (ureg->nr_temps) {
+      if (util_bitmask_get_first_index(ureg->local_temps) ==  UTIL_BITMASK_INVALID_INDEX) {
+         emit_decl_range( ureg,
+                          TGSI_FILE_TEMPORARY,
+                          0, ureg->nr_temps );
+
+      } else {
+         for (i = 0; i < ureg->nr_temps; i++) {
+            emit_decl( ureg, TGSI_FILE_TEMPORARY, i,
+                       util_bitmask_get(ureg->local_temps, i) );
+         }
+      }
   }

   if (ureg->nr_addrs) {
@@ -1687,7 +1696,7 @@ struct ureg_program *ureg_create( unsigned processor )
 {
   struct ureg_program *ureg = CALLOC_STRUCT( ureg_program );
   if (ureg == NULL)
-      return NULL;
+      goto no_ureg;

   ureg->processor = processor;
   ureg->property_gs_input_prim = ~0;
@@ -1696,17 +1705,19 @@ struct ureg_program *ureg_create( unsigned processor )

   ureg->free_temps = util_bitmask_create();
   if (ureg->free_temps == NULL)
-      goto fail;
+      goto no_free_temps;

   ureg->local_temps = util_bitmask_create();
   if (ureg->local_temps == NULL)
-      goto fail;
+      goto no_local_temps;

   return ureg;

-fail:
-   FREE(ureg->free_temps);
+no_local_temps:
+   util_bitmask_destroy(ureg->free_temps);
+no_free_temps:
   FREE(ureg);
+no_ureg:
   return NULL;
 }

--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -1442,8 +1442,10 @@ static void translate_sse_release( struct translate *translate )
 {
   struct translate_sse *p = (struct translate_sse *)translate;

-   x86_release_func( &p->linear_func );
+   x86_release_func( &p->elt8_func );
+   x86_release_func( &p->elt16_func );
   x86_release_func( &p->elt_func );
+   x86_release_func( &p->linear_func );

   os_free_aligned(p);
 }
--- a/src/gallium/auxiliary/util/.gitignore
+++ b/src/gallium/auxiliary/util/.gitignore
@@ -1,3 +1,2 @@
 u_format_srgb.c
 u_format_table.c
-u_half.c
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -56,9 +56,11 @@ struct blit_state
   struct pipe_context *pipe;
   struct cso_context *cso;

-   struct pipe_blend_state blend;
-   struct pipe_depth_stencil_alpha_state depthstencil_keep;
-   struct pipe_depth_stencil_alpha_state depthstencil_write;
+   struct pipe_blend_state blend_write_color, blend_keep_color;
+   struct pipe_depth_stencil_alpha_state dsa_keep_depthstencil;
+   struct pipe_depth_stencil_alpha_state dsa_write_depthstencil;
+   struct pipe_depth_stencil_alpha_state dsa_write_depth;
+   struct pipe_depth_stencil_alpha_state dsa_write_stencil;
   struct pipe_rasterizer_state rasterizer;
   struct pipe_sampler_state sampler;
   struct pipe_viewport_state viewport;
@@ -66,13 +68,17 @@ struct blit_state
   enum pipe_texture_target internal_target;

   void *vs;
-   void *fs[TGSI_WRITEMASK_XYZW + 1];
-   void *fs_depth;
+   void *fs[PIPE_MAX_TEXTURE_TYPES][TGSI_WRITEMASK_XYZW + 1];
+   void *fs_depthstencil[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_depth[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_stencil[PIPE_MAX_TEXTURE_TYPES];

   struct pipe_resource *vbuf;  /**< quad vertices */
   unsigned vbuf_slot;

   float vertices[4][2][4];   /**< vertex/texcoords for quad */
+
+   boolean has_stencil_export;
 };


@@ -94,24 +100,28 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
   ctx->cso = cso;

   /* disabled blending/masking */
-   memset(&ctx->blend, 0, sizeof(ctx->blend));
-   ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
+   ctx->blend_write_color.rt[0].colormask = PIPE_MASK_RGBA;

-   /* no-op depth/stencil/alpha */
-   memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep));
-   memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write));
-   ctx->depthstencil_write.depth.enabled = 1;
-   ctx->depthstencil_write.depth.writemask = 1;
-   ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS;
+   /* depth stencil states */
+   ctx->dsa_write_depth.depth.enabled = 1;
+   ctx->dsa_write_depth.depth.writemask = 1;
+   ctx->dsa_write_depth.depth.func = PIPE_FUNC_ALWAYS;
+   ctx->dsa_write_stencil.stencil[0].enabled = 1;
+   ctx->dsa_write_stencil.stencil[0].func = PIPE_FUNC_ALWAYS;
+   ctx->dsa_write_stencil.stencil[0].fail_op = PIPE_STENCIL_OP_REPLACE;
+   ctx->dsa_write_stencil.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE;
+   ctx->dsa_write_stencil.stencil[0].zfail_op = PIPE_STENCIL_OP_REPLACE;
+   ctx->dsa_write_stencil.stencil[0].valuemask = 0xff;
+   ctx->dsa_write_stencil.stencil[0].writemask = 0xff;
+   ctx->dsa_write_depthstencil.depth = ctx->dsa_write_depth.depth;
+   ctx->dsa_write_depthstencil.stencil[0] = ctx->dsa_write_stencil.stencil[0];

   /* rasterizer */
-   memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
   ctx->rasterizer.cull_face = PIPE_FACE_NONE;
   ctx->rasterizer.gl_rasterization_rules = 1;
   ctx->rasterizer.depth_clip = 1;

   /* samplers */
-   memset(&ctx->sampler, 0, sizeof(ctx->sampler));
   ctx->sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   ctx->sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   ctx->sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
@@ -120,7 +130,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
   ctx->sampler.mag_img_filter = 0; /* set later */

   /* vertex elements state */
-   memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
   for (i = 0; i < 2; i++) {
      ctx->velem[i].src_offset = i * 4 * sizeof(float);
      ctx->velem[i].instance_divisor = 0;
@@ -142,6 +151,9 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
   else
      ctx->internal_target = PIPE_TEXTURE_RECT;

+   ctx->has_stencil_export =
+      pipe->screen->get_param(pipe->screen, PIPE_CAP_SHADER_STENCIL_EXPORT);
+
   return ctx;
 }

@@ -153,17 +165,29 @@ void
 util_destroy_blit(struct blit_state *ctx)
 {
   struct pipe_context *pipe = ctx->pipe;
-   unsigned i;
+   unsigned i, j;

   if (ctx->vs)
      pipe->delete_vs_state(pipe, ctx->vs);

-   for (i = 0; i < Elements(ctx->fs); i++)
-      if (ctx->fs[i])
-         pipe->delete_fs_state(pipe, ctx->fs[i]);
+   for (i = 0; i < Elements(ctx->fs); i++) {
+      for (j = 0; j < Elements(ctx->fs[i]); j++) {
+         if (ctx->fs[i][j])
+            pipe->delete_fs_state(pipe, ctx->fs[i][j]);
+      }
+   }

-   if (ctx->fs_depth)
-      pipe->delete_fs_state(pipe, ctx->fs_depth);
+   for (i = 0; i < PIPE_MAX_TEXTURE_TYPES; i++) {
+      if (ctx->fs_depthstencil[i]) {
+         pipe->delete_fs_state(pipe, ctx->fs_depthstencil[i]);
+      }
+      if (ctx->fs_depth[i]) {
+         pipe->delete_fs_state(pipe, ctx->fs_depth[i]);
+      }
+      if (ctx->fs_stencil[i]) {
+         pipe->delete_fs_state(pipe, ctx->fs_stencil[i]);
+      }
+   }

   pipe_resource_reference(&ctx->vbuf, NULL);

@@ -175,30 +199,76 @@ util_destroy_blit(struct blit_state *ctx)
 * Helper function to set the fragment shaders.
 */
 static INLINE void
-set_fragment_shader(struct blit_state *ctx, uint writemask)
+set_fragment_shader(struct blit_state *ctx, uint writemask,
+                    enum pipe_texture_target pipe_tex)
 {
-   if (!ctx->fs[writemask])
-      ctx->fs[writemask] =
-         util_make_fragment_tex_shader_writemask(ctx->pipe, TGSI_TEXTURE_2D,
+   if (!ctx->fs[pipe_tex][writemask]) {
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex);
+
+      ctx->fs[pipe_tex][writemask] =
+         util_make_fragment_tex_shader_writemask(ctx->pipe, tgsi_tex,
                                                 TGSI_INTERPOLATE_LINEAR,
                                                 writemask);
+   }

-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][writemask]);
 }


 /**
- * Helper function to set the depthwrite shader.
+ * Helper function to set the shader which writes depth and stencil.
 */
 static INLINE void
-set_depth_fragment_shader(struct blit_state *ctx)
+set_depthstencil_fragment_shader(struct blit_state *ctx,
+                                 enum pipe_texture_target pipe_tex)
 {
-   if (!ctx->fs_depth)
-      ctx->fs_depth =
-         util_make_fragment_tex_shader_writedepth(ctx->pipe, TGSI_TEXTURE_2D,
-                                                  TGSI_INTERPOLATE_LINEAR);
+   if (!ctx->fs_depthstencil[pipe_tex]) {
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex);

-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth);
+      ctx->fs_depthstencil[pipe_tex] =
+         util_make_fragment_tex_shader_writedepthstencil(ctx->pipe, tgsi_tex,
+                                                  TGSI_INTERPOLATE_LINEAR);
+   }
+
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depthstencil[pipe_tex]);
+}
+
+
+/**
+ * Helper function to set the shader which writes depth.
+ */
+static INLINE void
+set_depth_fragment_shader(struct blit_state *ctx,
+                          enum pipe_texture_target pipe_tex)
+{
+   if (!ctx->fs_depth[pipe_tex]) {
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex);
+
+      ctx->fs_depth[pipe_tex] =
+         util_make_fragment_tex_shader_writedepth(ctx->pipe, tgsi_tex,
+                                                  TGSI_INTERPOLATE_LINEAR);
+   }
+
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth[pipe_tex]);
+}
+
+
+/**
+ * Helper function to set the shader which writes stencil.
+ */
+static INLINE void
+set_stencil_fragment_shader(struct blit_state *ctx,
+                            enum pipe_texture_target pipe_tex)
+{
+   if (!ctx->fs_stencil[pipe_tex]) {
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex);
+
+      ctx->fs_stencil[pipe_tex] =
+         util_make_fragment_tex_shader_writestencil(ctx->pipe, tgsi_tex,
+                                                    TGSI_INTERPOLATE_LINEAR);
+   }
+
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs_stencil[pipe_tex]);
 }


@@ -350,20 +420,19 @@ formats_compatible(enum pipe_format src_format,
 * \param writemask  controls which channels in the dest surface are sourced
 *                   from the src surface.  Disabled channels are sourced
 *                   from (0,0,0,1).
- * XXX need some control over blitting stencil.
 */
 void
-util_blit_pixels_writemask(struct blit_state *ctx,
-                           struct pipe_resource *src_tex,
-                           unsigned src_level,
-                           int srcX0, int srcY0,
-                           int srcX1, int srcY1,
-                           int srcZ0,
-                           struct pipe_surface *dst,
-                           int dstX0, int dstY0,
-                           int dstX1, int dstY1,
-                           float z, uint filter,
-                           uint writemask)
+util_blit_pixels(struct blit_state *ctx,
+                 struct pipe_resource *src_tex,
+                 unsigned src_level,
+                 int srcX0, int srcY0,
+                 int srcX1, int srcY1,
+                 int srcZ0,
+                 struct pipe_surface *dst,
+                 int dstX0, int dstY0,
+                 int dstX1, int dstY1,
+                 float z, uint filter,
+                 uint writemask, uint zs_writemask)
 {
   struct pipe_context *pipe = ctx->pipe;
   struct pipe_screen *screen = pipe->screen;
@@ -375,9 +444,12 @@ util_blit_pixels_writemask(struct blit_state *ctx,
   const int srcW = abs(srcX1 - srcX0);
   const int srcH = abs(srcY1 - srcY0);
   unsigned offset;
-   boolean overlap, dst_is_depth;
+   boolean overlap;
   float s0, t0, s1, t1;
   boolean normalized;
+   boolean is_stencil, is_depth, blit_depth, blit_stencil;
+   const struct util_format_description *src_desc =
+         util_format_description(src_tex->format);

   assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
          filter == PIPE_TEX_MIPFILTER_LINEAR);
@@ -394,12 +466,24 @@ util_blit_pixels_writemask(struct blit_state *ctx,
   src_format = util_format_linear(src_tex->format);
   dst_format = util_format_linear(dst->format);

+   /* See whether we will blit depth or stencil. */
+   is_depth = util_format_has_depth(src_desc);
+   is_stencil = util_format_has_stencil(src_desc);
+
+   blit_depth = is_depth && (zs_writemask & BLIT_WRITEMASK_Z);
+   blit_stencil = is_stencil && (zs_writemask & BLIT_WRITEMASK_STENCIL);
+
+   assert((writemask && !zs_writemask && !is_depth && !is_stencil) ||
+          (!writemask && (blit_depth || blit_stencil)));
+
   /*
    * Check for simple case:  no format conversion, no flipping, no stretching,
    * no overlapping.
    * Filter mode should not matter since there's no stretching.
    */
   if (formats_compatible(src_format, dst_format) &&
+       is_stencil == blit_stencil &&
+       is_depth == blit_depth &&
       srcX0 < srcX1 &&
       dstX0 < dstX1 &&
       srcY0 < srcY1 &&
@@ -422,6 +506,17 @@ util_blit_pixels_writemask(struct blit_state *ctx,
      return;
   }

+   /* It's a mistake to call this function with a stencil format and
+    * without shader stencil export. We don't do software fallbacks here.
+    * Ignore stencil and only copy depth.
+    */
+   if (blit_stencil && !ctx->has_stencil_export) {
+      blit_stencil = FALSE;
+
+      if (!blit_depth)
+         return;
+   }
+
   if (dst_format == dst->format) {
      dst_surface = dst;
   } else {
@@ -430,20 +525,11 @@ util_blit_pixels_writemask(struct blit_state *ctx,
      dst_surface = pipe->create_surface(pipe, dst->texture, &templ);
   }

-   /* Create a temporary texture when src and dest alias or when src
-    * is anything other than a 2d texture.
-    * XXX should just use appropriate shader to access 1d / 3d slice / cube face,
-    * much like the u_blitter code does (should be pretty trivial).
-    * 
-    * This can still be improved upon.
+   /* Create a temporary texture when src and dest alias.
    */
-   if ((src_tex == dst_surface->texture &&
+   if (src_tex == dst_surface->texture &&
       dst_surface->u.tex.level == src_level &&
-       dst_surface->u.tex.first_layer == srcZ0) ||
-       (src_tex->target != PIPE_TEXTURE_2D &&
-       src_tex->target != PIPE_TEXTURE_2D &&
-       src_tex->target != PIPE_TEXTURE_RECT))
-   {
+       dst_surface->u.tex.first_layer == srcZ0) {
      /* Make a temporary texture which contains a copy of the source pixels.
       * Then we'll sample from the temporary texture.
       */
@@ -509,6 +595,11 @@ util_blit_pixels_writemask(struct blit_state *ctx,
      }

      u_sampler_view_default_template(&sv_templ, tex, tex->format);
+      if (!blit_depth && blit_stencil) {
+         /* set a stencil-only format, e.g. Z24S8 --> X24S8 */
+         sv_templ.format = util_format_stencil_only(tex->format);
+         assert(sv_templ.format != PIPE_FORMAT_NONE);
+      }
      sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ);

      if (!sampler_view) {
@@ -520,6 +611,11 @@ util_blit_pixels_writemask(struct blit_state *ctx,
   else {
      /* Directly sample from the source resource/texture */
      u_sampler_view_default_template(&sv_templ, src_tex, src_format);
+      if (!blit_depth && blit_stencil) {
+         /* set a stencil-only format, e.g. Z24S8 --> X24S8 */
+         sv_templ.format = util_format_stencil_only(src_format);
+         assert(sv_templ.format != PIPE_FORMAT_NONE);
+      }
      sampler_view = pipe->create_sampler_view(pipe, src_tex, &sv_templ);

      if (!sampler_view) {
@@ -540,15 +636,14 @@ util_blit_pixels_writemask(struct blit_state *ctx,
      }
   }

-   dst_is_depth = util_format_is_depth_or_stencil(dst_format);
-
-   assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target,
-                                      sampler_view->texture->nr_samples,
-                                      PIPE_BIND_SAMPLER_VIEW));
+   assert(screen->is_format_supported(screen, sampler_view->format,
+                     ctx->internal_target, sampler_view->texture->nr_samples,
+                     PIPE_BIND_SAMPLER_VIEW));
   assert(screen->is_format_supported(screen, dst_format, ctx->internal_target,
-                                      dst_surface->texture->nr_samples,
-                                      dst_is_depth ? PIPE_BIND_DEPTH_STENCIL :
-                                                     PIPE_BIND_RENDER_TARGET));
+                     dst_surface->texture->nr_samples,
+                     is_depth || is_stencil ? PIPE_BIND_DEPTH_STENCIL :
+                                              PIPE_BIND_RENDER_TARGET));
+
   /* save state (restored below) */
   cso_save_blend(ctx->cso);
   cso_save_depth_stencil_alpha(ctx->cso);
@@ -565,23 +660,76 @@ util_blit_pixels_writemask(struct blit_state *ctx,
   cso_save_vertex_buffers(ctx->cso);

   /* set misc state we care about */
-   cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso,
-                               dst_is_depth ? &ctx->depthstencil_write :
-                                              &ctx->depthstencil_keep);
+   if (writemask)
+      cso_set_blend(ctx->cso, &ctx->blend_write_color);
+   else
+      cso_set_blend(ctx->cso, &ctx->blend_keep_color);
+
   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
   cso_set_stream_outputs(ctx->cso, 0, NULL, 0);

-   /* sampler */
+   /* default sampler state */
   ctx->sampler.normalized_coords = normalized;
   ctx->sampler.min_img_filter = filter;
   ctx->sampler.mag_img_filter = filter;
   ctx->sampler.min_lod = src_level;
   ctx->sampler.max_lod = src_level;
-   cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+
+   /* Depth stencil state, fragment shader and sampler setup depending on what
+    * we blit.
+    */
+   if (blit_depth && blit_stencil) {
+      cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+      /* don't filter stencil */
+      ctx->sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+      ctx->sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+      cso_single_sampler(ctx->cso, 1, &ctx->sampler);
+
+      cso_set_depth_stencil_alpha(ctx->cso, &ctx->dsa_write_depthstencil);
+      set_depthstencil_fragment_shader(ctx, sampler_view->texture->target);
+   }
+   else if (blit_depth) {
+      cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+      cso_set_depth_stencil_alpha(ctx->cso, &ctx->dsa_write_depth);
+      set_depth_fragment_shader(ctx, sampler_view->texture->target);
+   }
+   else if (blit_stencil) {
+      /* don't filter stencil */
+      ctx->sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+      ctx->sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+      cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+
+      cso_set_depth_stencil_alpha(ctx->cso, &ctx->dsa_write_stencil);
+      set_stencil_fragment_shader(ctx, sampler_view->texture->target);
+   }
+   else { /* color */
+      cso_single_sampler(ctx->cso, 0, &ctx->sampler);
+      cso_set_depth_stencil_alpha(ctx->cso, &ctx->dsa_keep_depthstencil);
+      set_fragment_shader(ctx, writemask, sampler_view->texture->target);
+   }
   cso_single_sampler_done(ctx->cso);

+   /* textures */
+   if (blit_depth && blit_stencil) {
+      /* Set up two sampler views, one for depth and the other for stencil. */
+      struct pipe_sampler_view templ;
+      struct pipe_sampler_view *views[2];
+
+      templ = *sampler_view;
+      templ.format = util_format_stencil_only(templ.format);
+      assert(templ.format != PIPE_FORMAT_NONE);
+
+      views[0] = sampler_view;
+      views[1] = pipe->create_sampler_view(pipe, views[0]->texture, &templ);
+      cso_set_fragment_sampler_views(ctx->cso, 2, views);
+
+      pipe_sampler_view_reference(&views[1], NULL);
+   }
+   else {
+      cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
+   }
+
   /* viewport */
   ctx->viewport.scale[0] = 0.5f * dst_surface->width;
   ctx->viewport.scale[1] = 0.5f * dst_surface->height;
@@ -593,15 +741,6 @@ util_blit_pixels_writemask(struct blit_state *ctx,
   ctx->viewport.translate[3] = 0.0f;
   cso_set_viewport(ctx->cso, &ctx->viewport);

-   /* texture */
-   cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
-
-   /* shaders */
-   if (dst_is_depth) {
-      set_depth_fragment_shader(ctx);
-   } else {
-      set_fragment_shader(ctx, writemask);
-   }
   set_vertex_shader(ctx);
   cso_set_geometry_shader_handle(ctx->cso, NULL);

@@ -609,7 +748,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
   memset(&fb, 0, sizeof(fb));
   fb.width = dst_surface->width;
   fb.height = dst_surface->height;
-   if (dst_is_depth) {
+   if (blit_depth || blit_stencil) {
      fb.zsbuf = dst_surface;
   } else {
      fb.nr_cbufs = 1;
@@ -655,31 +794,6 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 }


-void
-util_blit_pixels(struct blit_state *ctx,
-                 struct pipe_resource *src_tex,
-                 unsigned src_level,
-                 int srcX0, int srcY0,
-                 int srcX1, int srcY1,
-                 int srcZ,
-                 struct pipe_surface *dst,
-                 int dstX0, int dstY0,
-                 int dstX1, int dstY1,
-                 float z, uint filter )
-{
-   util_blit_pixels_writemask( ctx, src_tex,
-                               src_level,
-                               srcX0, srcY0,
-                               srcX1, srcY1,
-                               srcZ,
-                               dst,
-                               dstX0, dstY0,
-                               dstX1, dstY1,
-                               z, filter,
-                               TGSI_WRITEMASK_XYZW );
-}
-
-
 /**
 * Copy pixel block from src texture to dst surface.
 * The sampler view's first_level field indicates the source
@@ -747,8 +861,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
   cso_save_vertex_buffers(ctx->cso);

   /* set misc state we care about */
-   cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep);
+   cso_set_blend(ctx->cso, &ctx->blend_write_color);
+   cso_set_depth_stencil_alpha(ctx->cso, &ctx->dsa_keep_depthstencil);
   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
   cso_set_stream_outputs(ctx->cso, 0, NULL, 0);
@@ -775,7 +889,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
   cso_set_fragment_sampler_views(ctx->cso, 1, &src_sampler_view);

   /* shaders */
-   set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW);
+   set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW,
+                       src_sampler_view->texture->target);
   set_vertex_shader(ctx);
   cso_set_geometry_shader_handle(ctx->cso, NULL);

--- a/src/gallium/auxiliary/util/u_blit.h
+++ b/src/gallium/auxiliary/util/u_blit.h
@@ -31,6 +31,8 @@


 #include "pipe/p_compiler.h"
+/* for TGSI_WRITEMASK_* specification in util_blit_pixels */
+#include "pipe/p_shader_tokens.h"


 #ifdef __cplusplus
@@ -44,6 +46,8 @@ struct pipe_resource;
 struct pipe_sampler_view;
 struct pipe_surface;

+#define BLIT_WRITEMASK_Z         1
+#define BLIT_WRITEMASK_STENCIL   2

 extern struct blit_state *
 util_create_blit(struct pipe_context *pipe, struct cso_context *cso);
@@ -61,20 +65,8 @@ util_blit_pixels(struct blit_state *ctx,
                 struct pipe_surface *dst,
                 int dstX0, int dstY0,
                 int dstX1, int dstY1,
-                 float z, uint filter);
-
-void
-util_blit_pixels_writemask(struct blit_state *ctx,
-                           struct pipe_resource *src_tex,
-                           unsigned src_level,
-                           int srcX0, int srcY0,
-                           int srcX1, int srcY1,
-                           int srcZ0,
-                           struct pipe_surface *dst,
-                           int dstX0, int dstY0,
-                           int dstX1, int dstY1,
-                           float z, uint filter,
-                           uint writemask);
+                 float z, uint filter,
+                 uint writemask, uint zs_writemask);

 extern void
 util_blit_pixels_tex(struct blit_state *ctx,
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -78,6 +78,8 @@ struct blitter_context_priv
   /* FS which outputs a depth from a texture,
      where the index is PIPE_TEXTURE_* to be sampled. */
   void *fs_texfetch_depth[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_texfetch_depthstencil[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_texfetch_stencil[PIPE_MAX_TEXTURE_TYPES];

   /* Blend state. */
   void *blend_write_color;   /**< blend state with writemask of RGBA */
@@ -112,6 +114,7 @@ struct blitter_context_priv
   boolean has_geometry_shader;
   boolean vertex_has_integers;
   boolean has_stream_out;
+   boolean has_stencil_export;
 };

 static void blitter_draw_rectangle(struct blitter_context *blitter,
@@ -163,6 +166,10 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
      pipe->screen->get_param(pipe->screen,
                              PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS) != 0;

+   ctx->has_stencil_export =
+         pipe->screen->get_param(pipe->screen,
+                                 PIPE_CAP_SHADER_STENCIL_EXPORT);
+
   /* blend state objects */
   memset(&blend, 0, sizeof(blend));
   ctx->blend_keep_color = pipe->create_blend_state(pipe, &blend);
@@ -314,6 +321,10 @@ void util_blitter_destroy(struct blitter_context *blitter)
         pipe->delete_fs_state(pipe, ctx->fs_texfetch_col[i]);
      if (ctx->fs_texfetch_depth[i])
         pipe->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]);
+      if (ctx->fs_texfetch_depthstencil[i])
+         pipe->delete_fs_state(pipe, ctx->fs_texfetch_depthstencil[i]);
+      if (ctx->fs_texfetch_stencil[i])
+         pipe->delete_fs_state(pipe, ctx->fs_texfetch_stencil[i]);
   }

   for (i = 0; i <= PIPE_MAX_COLOR_BUFS; i++) {
@@ -653,32 +664,6 @@ void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs,
   }
 }

-/** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x */
-static unsigned
-pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target)
-{
-   switch (pipe_tex_target) {
-   case PIPE_TEXTURE_1D:
-      return TGSI_TEXTURE_1D;
-   case PIPE_TEXTURE_2D:
-      return TGSI_TEXTURE_2D;
-   case PIPE_TEXTURE_RECT:
-      return TGSI_TEXTURE_RECT;
-   case PIPE_TEXTURE_3D:
-      return TGSI_TEXTURE_3D;
-   case PIPE_TEXTURE_CUBE:
-      return TGSI_TEXTURE_CUBE;
-   case PIPE_TEXTURE_1D_ARRAY:
-      return TGSI_TEXTURE_1D_ARRAY;
-   case PIPE_TEXTURE_2D_ARRAY:
-      return TGSI_TEXTURE_2D_ARRAY;
-   default:
-      assert(0 && "unexpected texture target");
-      return TGSI_TEXTURE_UNKNOWN;
-   }
-}
-
-
 static INLINE
 void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
                                  unsigned tex_target)
@@ -689,7 +674,7 @@ void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,

   /* Create the fragment shader on-demand. */
   if (!ctx->fs_texfetch_col[tex_target]) {
-      unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target);
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(tex_target);

      ctx->fs_texfetch_col[tex_target] =
        util_make_fragment_tex_shader(pipe, tgsi_tex, TGSI_INTERPOLATE_LINEAR);
@@ -708,7 +693,7 @@ void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,

   /* Create the fragment shader on-demand. */
   if (!ctx->fs_texfetch_depth[tex_target]) {
-      unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target);
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(tex_target);

      ctx->fs_texfetch_depth[tex_target] =
         util_make_fragment_tex_shader_writedepth(pipe, tgsi_tex,
@@ -718,6 +703,58 @@ void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
   return ctx->fs_texfetch_depth[tex_target];
 }

+static INLINE
+void *blitter_get_fs_texfetch_depthstencil(struct blitter_context_priv *ctx,
+                                           unsigned tex_target)
+{
+   struct pipe_context *pipe = ctx->base.pipe;
+
+   assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
+
+   /* Create the fragment shader on-demand. */
+   if (!ctx->fs_texfetch_depthstencil[tex_target]) {
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(tex_target);
+
+      ctx->fs_texfetch_depthstencil[tex_target] =
+         util_make_fragment_tex_shader_writedepthstencil(pipe, tgsi_tex,
+                                                  TGSI_INTERPOLATE_LINEAR);
+   }
+
+   return ctx->fs_texfetch_depthstencil[tex_target];
+}
+
+static INLINE
+void *blitter_get_fs_texfetch_stencil(struct blitter_context_priv *ctx,
+                                      unsigned tex_target)
+{
+   struct pipe_context *pipe = ctx->base.pipe;
+
+   assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
+
+   /* Create the fragment shader on-demand. */
+   if (!ctx->fs_texfetch_stencil[tex_target]) {
+      unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(tex_target);
+
+      ctx->fs_texfetch_stencil[tex_target] =
+         util_make_fragment_tex_shader_writestencil(pipe, tgsi_tex,
+                                                    TGSI_INTERPOLATE_LINEAR);
+   }
+
+   return ctx->fs_texfetch_stencil[tex_target];
+}
+
+static void blitter_set_common_draw_rect_state(struct blitter_context_priv *ctx)
+{
+   struct pipe_context *pipe = ctx->base.pipe;
+
+   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_vs_state(pipe, ctx->vs);
+   if (ctx->has_geometry_shader)
+      pipe->bind_gs_state(pipe, NULL);
+   if (ctx->has_stream_out)
+      pipe->set_stream_output_targets(pipe, 0, NULL, 0);
+}
+
 static void blitter_draw(struct blitter_context_priv *ctx,
                         unsigned x1, unsigned y1,
                         unsigned x2, unsigned y2,
@@ -803,7 +840,6 @@ static void util_blitter_clear_custom(struct blitter_context *blitter,
   sr.ref_value[0] = stencil & 0xff;
   pipe->set_stencil_ref(pipe, &sr);

-   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
   if (util_format_is_pure_sint(cbuf_format)) {
      pipe->bind_vertex_elements_state(pipe, ctx->velem_sint_state);
   } else if (util_format_is_pure_uint(cbuf_format)) {
@@ -812,10 +848,8 @@ static void util_blitter_clear_custom(struct blitter_context *blitter,
      pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
   }
   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, num_cbufs, int_format));
-   pipe->bind_vs_state(pipe, ctx->vs);
-   if (ctx->has_geometry_shader)
-      pipe->bind_gs_state(pipe, NULL);

+   blitter_set_common_draw_rect_state(ctx);
   blitter_set_dst_dimensions(ctx, width, height);
   blitter->draw_rectangle(blitter, 0, 0, width, height, depth,
                           UTIL_BLITTER_ATTRIB_COLOR, color);
@@ -883,7 +917,7 @@ void util_blitter_default_src_texture(struct pipe_sampler_view *src_templ,
    src_templ->u.tex.last_level = srclevel;
    src_templ->u.tex.first_layer = 0;
    src_templ->u.tex.last_layer =
-        src->target == PIPE_TEXTURE_3D ? src->depth0 - 1
+        src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, srclevel) - 1
                                       : src->array_size - 1;
    src_templ->swizzle_r = PIPE_SWIZZLE_RED;
    src_templ->swizzle_g = PIPE_SWIZZLE_GREEN;
@@ -907,6 +941,8 @@ void util_blitter_copy_texture(struct blitter_context *blitter,
   struct pipe_sampler_view src_templ, *src_view;
   unsigned bind;
   boolean is_stencil, is_depth;
+   const struct util_format_description *src_desc =
+         util_format_description(src->format);

   /* Give up if textures are not set. */
   assert(dst && src);
@@ -916,8 +952,8 @@ void util_blitter_copy_texture(struct blitter_context *blitter,
   assert(src->target < PIPE_MAX_TEXTURE_TYPES);

   /* Is this a ZS format? */
-   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
-   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
+   is_depth = util_format_has_depth(src_desc);
+   is_stencil = util_format_has_stencil(src_desc);

   if (is_depth || is_stencil)
      bind = PIPE_BIND_DEPTH_STENCIL;
@@ -926,7 +962,7 @@ void util_blitter_copy_texture(struct blitter_context *blitter,

   /* Check if we can sample from and render to the surfaces. */
   /* (assuming copying a stencil buffer is not possible) */
-   if ((!ignore_stencil && is_stencil) ||
+   if ((!ignore_stencil && is_stencil && !ctx->has_stencil_export) ||
       !screen->is_format_supported(screen, dst->format, dst->target,
                                    dst->nr_samples, bind) ||
       !screen->is_format_supported(screen, src->format, src->target,
@@ -967,6 +1003,21 @@ void util_blitter_copy_texture_view(struct blitter_context *blitter,
   enum pipe_texture_target src_target = src->texture->target;
   unsigned width = srcbox->width;
   unsigned height = srcbox->height;
+   boolean is_stencil, is_depth;
+   const struct util_format_description *src_desc =
+         util_format_description(src->format);
+
+   is_depth = util_format_has_depth(src_desc);
+   is_stencil = util_format_has_stencil(src_desc);
+
+   /* If you want a fallback for stencil copies,
+    * use util_blitter_copy_texture. */
+   if (is_stencil && !ctx->has_stencil_export) {
+      is_stencil = FALSE;
+
+      if (!is_depth)
+         return;
+   }

   /* Sanity checks. */
   if (dst->texture == src->texture &&
@@ -988,12 +1039,25 @@ void util_blitter_copy_texture_view(struct blitter_context *blitter,
   fb_state.width = dst->width;
   fb_state.height = dst->height;

-   if (util_format_is_depth_or_stencil(dst->format)) {
+   if (is_depth || is_stencil) {
      pipe->bind_blend_state(pipe, ctx->blend_keep_color);
-      pipe->bind_depth_stencil_alpha_state(pipe,
-                                           ctx->dsa_write_depth_keep_stencil);
-      pipe->bind_fs_state(pipe,
-            blitter_get_fs_texfetch_depth(ctx, src_target));
+
+      if (is_depth && is_stencil) {
+         pipe->bind_depth_stencil_alpha_state(pipe,
+                                              ctx->dsa_write_depth_stencil);
+         pipe->bind_fs_state(pipe,
+               blitter_get_fs_texfetch_depthstencil(ctx, src_target));
+      } else if (is_depth) {
+         pipe->bind_depth_stencil_alpha_state(pipe,
+                                              ctx->dsa_write_depth_keep_stencil);
+         pipe->bind_fs_state(pipe,
+               blitter_get_fs_texfetch_depth(ctx, src_target));
+      } else { /* is_stencil */
+         pipe->bind_depth_stencil_alpha_state(pipe,
+                                              ctx->dsa_keep_depth_write_stencil);
+         pipe->bind_fs_state(pipe,
+               blitter_get_fs_texfetch_stencil(ctx, src_target));
+      }

      fb_state.nr_cbufs = 0;
      fb_state.zsbuf = dst;
@@ -1008,16 +1072,32 @@ void util_blitter_copy_texture_view(struct blitter_context *blitter,
      fb_state.zsbuf = 0;
   }

-   /* Set rasterizer state, shaders, and textures. */
-   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
-   pipe->bind_vs_state(pipe, ctx->vs);
-   if (ctx->has_geometry_shader)
-      pipe->bind_gs_state(pipe, NULL);
-   pipe->bind_fragment_sampler_states(pipe, 1, &ctx->sampler_state);
+   if (is_depth && is_stencil) {
+      /* Set up two samplers, one for depth and the other one for stencil. */
+      struct pipe_sampler_view templ;
+      struct pipe_sampler_view *views[2];
+      void *samplers[2] = {ctx->sampler_state, ctx->sampler_state};
+
+      templ = *src;
+      templ.format = util_format_stencil_only(templ.format);
+      assert(templ.format != PIPE_FORMAT_NONE);
+
+      views[0] = src;
+      views[1] = pipe->create_sampler_view(pipe, src->texture, &templ);
+
+      pipe->set_fragment_sampler_views(pipe, 2, views);
+      pipe->bind_fragment_sampler_states(pipe, 2, samplers);
+
+      pipe_sampler_view_reference(&views[1], NULL);
+   } else {
+      pipe->set_fragment_sampler_views(pipe, 1, &src);
+      pipe->bind_fragment_sampler_states(pipe, 1, &ctx->sampler_state);
+   }
+
   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
-   pipe->set_fragment_sampler_views(pipe, 1, &src);
   pipe->set_framebuffer_state(pipe, &fb_state);

+   blitter_set_common_draw_rect_state(ctx);
   blitter_set_dst_dimensions(ctx, dst->width, dst->height);

   switch (src_target) {
@@ -1093,11 +1173,7 @@ void util_blitter_clear_render_target(struct blitter_context *blitter,
   /* bind states */
   pipe->bind_blend_state(pipe, ctx->blend_write_color);
   pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
-   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1, FALSE));
-   pipe->bind_vs_state(pipe, ctx->vs);
-   if (ctx->has_geometry_shader)
-      pipe->bind_gs_state(pipe, NULL);
   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);

   /* set a framebuffer state */
@@ -1108,6 +1184,7 @@ void util_blitter_clear_render_target(struct blitter_context *blitter,
   fb_state.zsbuf = 0;
   pipe->set_framebuffer_state(pipe, &fb_state);

+   blitter_set_common_draw_rect_state(ctx);
   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
   blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0,
                           UTIL_BLITTER_ATTRIB_COLOR, color);
@@ -1161,11 +1238,7 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
      /* hmm that should be illegal probably, or make it a no-op somewhere */
      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);

-   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0, FALSE));
-   pipe->bind_vs_state(pipe, ctx->vs);
-   if (ctx->has_geometry_shader)
-      pipe->bind_gs_state(pipe, NULL);
   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);

   /* set a framebuffer state */
@@ -1176,6 +1249,7 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
   fb_state.zsbuf = dstsurf;
   pipe->set_framebuffer_state(pipe, &fb_state);

+   blitter_set_common_draw_rect_state(ctx);
   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
   blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, depth,
                           UTIL_BLITTER_ATTRIB_NONE, NULL);
@@ -1209,12 +1283,7 @@ void util_blitter_custom_depth_stencil(struct blitter_context *blitter,
   /* bind states */
   pipe->bind_blend_state(pipe, ctx->blend_write_color);
   pipe->bind_depth_stencil_alpha_state(pipe, dsa_stage);
-
-   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0, FALSE));
-   pipe->bind_vs_state(pipe, ctx->vs);
-   if (ctx->has_geometry_shader)
-      pipe->bind_gs_state(pipe, NULL);
   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);

   /* set a framebuffer state */
@@ -1231,6 +1300,7 @@ void util_blitter_custom_depth_stencil(struct blitter_context *blitter,
   fb_state.zsbuf = zsurf;
   pipe->set_framebuffer_state(pipe, &fb_state);

+   blitter_set_common_draw_rect_state(ctx);
   blitter_set_dst_dimensions(ctx, zsurf->width, zsurf->height);
   blitter->draw_rectangle(blitter, 0, 0, zsurf->width, zsurf->height, depth,
                           UTIL_BLITTER_ATTRIB_NONE, NULL);
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -169,8 +169,8 @@ void util_blitter_clear_depth_custom(struct blitter_context *blitter,
 * The same holds for depth-stencil formats with the exception that stencil
 * cannot be copied unless you set ignore_stencil to FALSE. In that case,
 * a software fallback path is taken and both surfaces must be of the same
- * format.
- * XXX implement hw-accel stencil copy using shader stencil export.
+ * format. If the shader stencil export is supported, stencil copy is always
+ * accelerated.
 *
 * Use pipe_screen->is_format_supported to know your options.
 *
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -35,9 +35,16 @@
 #ifndef _UTIL_CPU_DETECT_H
 #define _UTIL_CPU_DETECT_H

+
 #include "pipe/p_compiler.h"
 #include "pipe/p_config.h"

+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
 struct util_cpu_caps {
   unsigned nr_cpus;

@@ -66,4 +73,9 @@ util_cpu_caps;
 void util_cpu_detect(void);


+#ifdef	__cplusplus
+}
+#endif
+
+
 #endif /* _UTIL_CPU_DETECT_H */
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -204,7 +204,7 @@ static boolean str_has_option(const char *str, const char *name)
       * we compare 'start' up to 'str-1' with 'name'. */

      while (1) {
-         if (!*str || !isalnum(*str)) {
+         if (!*str || !(isalnum(*str) || *str == '_')) {
            if (str-start == name_len &&
                !memcmp(start, name, name_len)) {
               return TRUE;
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -158,6 +158,38 @@ util_format_is_pure_uint(enum pipe_format format)
   return (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED && desc->channel[i].pure_integer) ? TRUE : FALSE;
 }

+boolean
+util_format_is_array(const struct util_format_description *desc)
+{
+   unsigned i;
+
+   /* Only plain RGB formats made of 1x1 blocks can be arrays. */
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      return FALSE;
+   if (desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB)
+      return FALSE;
+   if (desc->block.width != 1 || desc->block.height != 1)
+      return FALSE;
+
+   /*
+    * Every channel must keep its natural position (identity swizzle) and
+    * agree with channel 0 on type, normalization, integer-ness and size.
+    */
+   for (i = 0; i < desc->nr_channels; ++i) {
+      boolean same_as_first =
+         desc->channel[i].type == desc->channel[0].type &&
+         desc->channel[i].normalized == desc->channel[0].normalized &&
+         desc->channel[i].pure_integer == desc->channel[0].pure_integer &&
+         desc->channel[i].size == desc->channel[0].size;
+
+      if (desc->swizzle[i] != i || !same_as_first)
+         return FALSE;
+   }
+
+   /* No channel broke the pattern. */
+   return TRUE;
+}
+
 boolean
 util_format_is_luminance_alpha(enum pipe_format format)
 {
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -590,6 +590,13 @@ util_format_is_pure_sint(enum pipe_format format);
 boolean
 util_format_is_pure_uint(enum pipe_format format);

+/**
+ * Whether the format is a simple array format where all channels
+ * are of the same type and can be loaded from memory as a vector
+ */
+boolean
+util_format_is_array(const struct util_format_description *desc);
+
 /**
 * Check if the src format can be blitted to the destination format with
 * a simple memcpy.  For example, blitting from RGBA to RGBx is OK, but not
@@ -874,6 +881,35 @@ util_format_linear(enum pipe_format format)
   }
 }

+/**
+ * Map a format that carries stencil to its stencil-only counterpart.
+ * Depth-stencil formats map to the variant with the depth component
+ * masked out; stencil-only formats come back unchanged.
+ */
+static INLINE enum pipe_format
+util_format_stencil_only(enum pipe_format format)
+{
+   /* depth + stencil: mask out the depth component */
+   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT)
+      return PIPE_FORMAT_X24S8_UINT;
+   if (format == PIPE_FORMAT_S8_UINT_Z24_UNORM)
+      return PIPE_FORMAT_S8X24_UINT;
+   if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+      return PIPE_FORMAT_X32_S8X24_UINT;
+
+   /* already stencil-only: nothing to strip */
+   if (format == PIPE_FORMAT_X24S8_UINT ||
+       format == PIPE_FORMAT_S8X24_UINT ||
+       format == PIPE_FORMAT_X32_S8X24_UINT ||
+       format == PIPE_FORMAT_S8_UINT)
+      return format;
+
+   /* Any other format is not handled here: trip the assert and
+    * return PIPE_FORMAT_NONE. */
+   assert(0);
+   return PIPE_FORMAT_NONE;
+}
+
 /**
 * Return the number of components stored.
 * Formats with block size != 1x1 will always have 1 component (the block).
--- a/src/gallium/auxiliary/util/u_format_etc.c
+++ b/src/gallium/auxiliary/util/u_format_etc.c
@@ -13,30 +13,7 @@
 void
 util_format_etc1_rgb8_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
 {
-   const unsigned bw = 4, bh = 4, bs = 8, comps = 4;
-   struct etc1_block block;
-   unsigned x, y, i, j;
-
-   for (y = 0; y < height; y += bh) {
-      const uint8_t *src = src_row;
-
-      for (x = 0; x < width; x+= bw) {
-         etc1_parse_block(&block, src);
-
-         for (j = 0; j < bh; j++) {
-            uint8_t *dst = dst_row + (y + j) * dst_stride + x * comps;
-            for (i = 0; i < bw; i++) {
-               etc1_fetch_texel(&block, i, j, dst);
-               dst[3] = 255;
-               dst += comps;
-            }
-         }
-
-         src += bs;
-      }
-
-      src_row += src_stride;
-   }
+   etc1_unpack_rgba8888(dst_row, dst_stride, src_row, src_stride, width, height);
 }

 void
--- a/src/gallium/auxiliary/util/u_format_tests.c
+++ b/src/gallium/auxiliary/util/u_format_tests.c
@@ -26,6 +26,9 @@
 **************************************************************************/


+#include <float.h>
+
+#include "pipe/p_config.h"
 #include "u_memory.h"
 #include "u_format_tests.h"

@@ -63,6 +66,9 @@
       {{ 0,  0,  0,  0}, { 0,  0,  0,  0}, {0, 0, 0, 0}, {0, 0, 0, 0}}}


+#define NAN (0.0 / 0.0)
+#define INF (1.0 / 0.0)
+
 /**
 * Test cases.
 *
@@ -876,7 +882,39 @@ util_format_test_cases[] =
    * Half float formats
    */

-   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+   /* Minimum positive normal */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0400), UNPACKED_1x1( 6.10352E-5, 0.0, 0.0, 1.0)},
+
+   /* Max denormal */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x03FF), UNPACKED_1x1( 6.09756E-5, 0.0, 0.0, 1.0)},
+
+   /* Minimum positive denormal */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0001), UNPACKED_1x1( 5.96046E-8, 0.0, 0.0, 1.0)},
+
+   /* Min representable value */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfbff), UNPACKED_1x1(   -65504.0, 0.0, 0.0, 1.0)},
+
+   /* Max representable value */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7bff), UNPACKED_1x1(    65504.0, 0.0, 0.0, 1.0)},
+
+#if !defined(PIPE_CC_MSVC)
+
+   /* NaNs */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c01), UNPACKED_1x1(        NAN, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc01), UNPACKED_1x1(       -NAN, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1(        NAN, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(       -NAN, 0.0, 0.0, 1.0)},
+
+   /* Inf */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c00), UNPACKED_1x1(        INF, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc00), UNPACKED_1x1(       -INF, 0.0, 0.0, 1.0)},
+
+#endif
+
+   /* Zero, ignore sign */
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x8000), UNPACKED_1x1( -0.0, 0.0, 0.0, 1.0)},
+   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x0000), UNPACKED_1x1(  0.0, 0.0, 0.0, 1.0)},
+
   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x3c00), UNPACKED_1x1(  1.0, 0.0, 0.0, 1.0)},
   {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xbc00), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)},

--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -59,14 +59,17 @@ struct gen_mipmap_state
   struct pipe_context *pipe;
   struct cso_context *cso;

-   struct pipe_blend_state blend;
-   struct pipe_depth_stencil_alpha_state depthstencil;
+   struct pipe_blend_state blend_keep_color, blend_write_color;
+   struct pipe_depth_stencil_alpha_state dsa_keep_depth, dsa_write_depth;
   struct pipe_rasterizer_state rasterizer;
   struct pipe_sampler_state sampler;
   struct pipe_vertex_element velem[2];

   void *vs;
-   void *fs[TGSI_TEXTURE_COUNT]; /**< Not all are used, but simplifies code */
+
+   /** Not all are used, but simplifies code */
+   void *fs_color[TGSI_TEXTURE_COUNT];
+   void *fs_depth[TGSI_TEXTURE_COUNT];

   struct pipe_resource *vbuf;  /**< quad vertices */
   unsigned vbuf_slot;
@@ -1272,11 +1275,16 @@ util_create_gen_mipmap(struct pipe_context *pipe,
   ctx->cso = cso;

   /* disabled blending/masking */
-   memset(&ctx->blend, 0, sizeof(ctx->blend));
-   ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
+   memset(&ctx->blend_keep_color, 0, sizeof(ctx->blend_keep_color));
+   memset(&ctx->blend_write_color, 0, sizeof(ctx->blend_write_color));
+   ctx->blend_write_color.rt[0].colormask = PIPE_MASK_RGBA;

   /* no-op depth/stencil/alpha */
-   memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil));
+   memset(&ctx->dsa_keep_depth, 0, sizeof(ctx->dsa_keep_depth));
+   memset(&ctx->dsa_write_depth, 0, sizeof(ctx->dsa_write_depth));
+   ctx->dsa_write_depth.depth.enabled = 1;
+   ctx->dsa_write_depth.depth.func = PIPE_FUNC_ALWAYS;
+   ctx->dsa_write_depth.depth.writemask = 1;

   /* rasterizer */
   memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
@@ -1318,14 +1326,25 @@ util_create_gen_mipmap(struct pipe_context *pipe,
 * Helper function to set the fragment shaders.
 */
 static INLINE void
-set_fragment_shader(struct gen_mipmap_state *ctx, uint type)
+set_fragment_shader(struct gen_mipmap_state *ctx, uint type,
+                    boolean output_depth)
 {
-   if (!ctx->fs[type])
-      ctx->fs[type] =
-         util_make_fragment_tex_shader(ctx->pipe, type,
-                                       TGSI_INTERPOLATE_LINEAR);
+   if (output_depth) {
+      if (!ctx->fs_depth[type])
+         ctx->fs_depth[type] =
+            util_make_fragment_tex_shader_writedepth(ctx->pipe, type,
+                                                     TGSI_INTERPOLATE_LINEAR);

-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[type]);
+      cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth[type]);
+   }
+   else {
+      if (!ctx->fs_color[type])
+         ctx->fs_color[type] =
+            util_make_fragment_tex_shader(ctx->pipe, type,
+                                          TGSI_INTERPOLATE_LINEAR);
+
+      cso_set_fragment_shader_handle(ctx->cso, ctx->fs_color[type]);
+   }
 }


@@ -1464,9 +1483,13 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
   struct pipe_context *pipe = ctx->pipe;
   unsigned i;

-   for (i = 0; i < Elements(ctx->fs); i++)
-      if (ctx->fs[i])
-         pipe->delete_fs_state(pipe, ctx->fs[i]);
+   for (i = 0; i < Elements(ctx->fs_color); i++)
+      if (ctx->fs_color[i])
+         pipe->delete_fs_state(pipe, ctx->fs_color[i]);
+
+   for (i = 0; i < Elements(ctx->fs_depth); i++)
+      if (ctx->fs_depth[i])
+         pipe->delete_fs_state(pipe, ctx->fs_depth[i]);

   if (ctx->vs)
      pipe->delete_vs_state(pipe, ctx->vs);
@@ -1500,6 +1523,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
   uint dstLevel;
   uint offset;
   uint type;
+   boolean is_depth = util_format_is_depth_or_stencil(psv->format);

   /* The texture object should have room for the levels which we're
    * about to generate.
@@ -1538,7 +1562,9 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,

   /* check if we can render in the texture's format */
   if (!screen->is_format_supported(screen, psv->format, pt->target,
-                                    pt->nr_samples, PIPE_BIND_RENDER_TARGET)) {
+                                    pt->nr_samples,
+                                    is_depth ? PIPE_BIND_DEPTH_STENCIL :
+                                               PIPE_BIND_RENDER_TARGET)) {
      fallback_gen_mipmap(ctx, pt, face, baseLevel, lastLevel);
      return;
   }
@@ -1556,30 +1582,28 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
   cso_save_geometry_shader(ctx->cso);
   cso_save_viewport(ctx->cso);
   cso_save_vertex_elements(ctx->cso);
+   cso_save_vertex_buffers(ctx->cso);

   /* bind our state */
-   cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
+   cso_set_blend(ctx->cso, is_depth ? &ctx->blend_keep_color :
+                                      &ctx->blend_write_color);
+   cso_set_depth_stencil_alpha(ctx->cso, is_depth ? &ctx->dsa_write_depth :
+                                                    &ctx->dsa_keep_depth);
   cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
   cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
   cso_set_stream_outputs(ctx->cso, 0, NULL, 0);

-   set_fragment_shader(ctx, type);
+   set_fragment_shader(ctx, type, is_depth);
   set_vertex_shader(ctx);
   cso_set_geometry_shader_handle(ctx->cso, NULL);

   /* init framebuffer state */
   memset(&fb, 0, sizeof(fb));
-   fb.nr_cbufs = 1;

   /* set min/mag to same filter for faster sw speed */
   ctx->sampler.mag_img_filter = filter;
   ctx->sampler.min_img_filter = filter;

-   /*
-    * XXX for small mipmap levels, it may be faster to use the software
-    * fallback path...
-    */
   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
      const uint srcLevel = dstLevel - 1;
      struct pipe_viewport_state vp;
@@ -1608,7 +1632,9 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
            layer = face;

         memset(&surf_templ, 0, sizeof(surf_templ));
-         u_surface_default_template(&surf_templ, pt, PIPE_BIND_RENDER_TARGET);
+         u_surface_default_template(&surf_templ, pt,
+                                    is_depth ? PIPE_BIND_DEPTH_STENCIL :
+                                               PIPE_BIND_RENDER_TARGET);
         surf_templ.u.tex.level = dstLevel;
         surf_templ.u.tex.first_layer = layer;
         surf_templ.u.tex.last_layer = layer;
@@ -1617,7 +1643,14 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
         /*
          * Setup framebuffer / dest surface
          */
-         fb.cbufs[0] = surf;
+         if (is_depth) {
+            fb.nr_cbufs = 0;
+            fb.zsbuf = surf;
+         }
+         else {
+            fb.nr_cbufs = 1;
+            fb.cbufs[0] = surf;
+         }
         fb.width = u_minify(pt->width0, dstLevel);
         fb.height = u_minify(pt->height0, dstLevel);
         cso_set_framebuffer(ctx->cso, &fb);
@@ -1679,4 +1712,5 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
   cso_restore_viewport(ctx->cso);
   cso_restore_vertex_elements(ctx->cso);
   cso_restore_stream_outputs(ctx->cso);
+   cso_restore_vertex_buffers(ctx->cso);
 }
--- a/src/gallium/auxiliary/util/u_half.h
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -35,51 +35,84 @@
 extern "C" {
 #endif

-extern const uint32_t util_half_to_float_mantissa_table[2048];
-extern const uint32_t util_half_to_float_exponent_table[64];
-extern const uint32_t util_half_to_float_offset_table[64];
-extern const uint16_t util_float_to_half_base_table[512];
-extern const uint8_t util_float_to_half_shift_table[512];
-
 /*
- * Note that if the half float is a signaling NaN, the x87 FPU will turn
- * it into a quiet NaN immediately upon loading into a float.
+ * References for float <-> half conversions
 *
- * Additionally, denormals may be flushed to zero.
- *
- * To avoid this, use the floatui functions instead of the float ones
- * when just doing conversion rather than computation on the resulting
- * floats.
+ *  http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ *  https://gist.github.com/2156668
+ *  https://gist.github.com/2144712
 */

-static INLINE uint32_t
-util_half_to_floatui(uint16_t h)
-{
-   unsigned exp = h >> 10;
-   return util_half_to_float_mantissa_table[util_half_to_float_offset_table[exp] + (h & 0x3ff)] + util_half_to_float_exponent_table[exp];
-}
-
-static INLINE float
-util_half_to_float(uint16_t h)
-{
-   union fi r;
-   r.ui = util_half_to_floatui(h);
-   return r.f;
-}
-
-static INLINE uint16_t
-util_floatui_to_half(uint32_t v)
-{
-   unsigned signexp = v >> 23;
-   return util_float_to_half_base_table[signexp] + ((v & 0x007fffff) >> util_float_to_half_shift_table[signexp]);
-}
-
 static INLINE uint16_t
 util_float_to_half(float f)
 {
-   union fi i;
-   i.f = f;
-   return util_floatui_to_half(i.ui);
+   uint32_t sign_mask  = 0x80000000;
+   uint32_t round_mask = ~0xfff;
+   uint32_t f32inf = 0xff << 23;
+   uint32_t f16inf = 0x1f << 23;
+   uint32_t sign;
+   union fi magic;
+   union fi f32;
+   uint16_t f16;
+
+   magic.ui = 0xf << 23;
+
+   f32.f = f;
+
+   /* Sign */
+   sign = f32.ui & sign_mask;
+   f32.ui ^= sign;
+
+   if (f32.ui == f32inf) {
+      /* Inf */
+      f16 = 0x7c00;
+   } else if (f32.ui > f32inf) {
+      /* NaN */
+      f16 = 0x7e00;
+   } else {
+      /* Number */
+      f32.ui &= round_mask;
+      f32.f  *= magic.f;
+      f32.ui -= round_mask;
+
+      /* Clamp to infinity if overflowed */
+      if (f32.ui > f16inf)
+         f32.ui = f16inf;
+
+      f16 = f32.ui >> 13;
+   }
+
+   /* Sign */
+   f16 |= sign >> 16;
+
+   return f16;
+}
+
+/**
+ * Convert a 16-bit half-float bit pattern to a 32-bit float.
+ */
+static INLINE float
+util_half_to_float(uint16_t f16)
+{
+   union fi infnan, magic, f32;
+
+   infnan.f = 65536.0f;     /* first value past the finite half range */
+   magic.ui = 0xef << 23;   /* 2^112: rebias half exponent to float */
+
+   /* Exponent / Mantissa: park the low 15 bits just below the sign bit */
+   f32.ui = (f16 & 0x7fff) << 13;
+
+   /* Adjust: also renormalizes half denormals (needs float denormals) */
+   f32.f *= magic.f;
+
+   /* Inf / NaN: an all-ones half exponent scales to >= 65536 */
+   if (f32.f >= infnan.f)
+      f32.ui |= 0xff << 23;
+
+   /* Sign: widen first; shifting int 0x8000 left by 16 overflows int */
+   f32.ui |= (uint32_t)(f16 & 0x8000) << 16;
+
+   return f32.f;
 }

 #ifdef __cplusplus
--- a/Show More
+++ b/Show More