Compare commits


151 Commits

Author SHA1 Message Date
Rafael Antognolli
131e871385 i965/gen10: Use CS Stall instead of WriteImmediate.
Fixes: ca19ee33d7
Signed-off-by: Rafael Antognolli <rafael.antognolli@intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 12:02:34 -08:00
Rafael Antognolli
20578f81a6 anv/gen10: Emit CS stall and mark push constants dirty.
I got reviews and fixed the patches locally, but ended up merging the
ones that I sent originally to the list. This patch fixes those
mistakes.

Fixes: 78c125af39
Signed-off-by: Rafael Antognolli <rafael.antognolli@intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 11:59:17 -08:00
Rafael Antognolli
bcfd78e448 i965/gen10: Re-enable push constants.
The GPU hang caused by push constants is apparently fixed, so let's
enable them again.

Signed-off-by: Rafael Antognolli <rafael.antognolli@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 10:07:44 -08:00
Rafael Antognolli
78c125af39 anv/gen10: Ignore push constant packets during context restore.
Similar to the GL driver, ignore 3DSTATE_CONSTANT_* packets when doing a
context restore.

Signed-off-by: Rafael Antognolli <rafael.antognolli@intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 10:07:40 -08:00
Rafael Antognolli
ca19ee33d7 i965/gen10: Ignore push constant packets during context restore.
These packets were causing GPU hangs when the context was restored,
possibly because they were pointing to BOs that were already
unreferenced. So we tell the hardware to ignore such packets after the
batch buffer ends, since we know those BOs are not around anymore.

This change fixes GPU hangs on CNL. The (partial) solution to this
problem so far was to entirely disable push constants on this platform.

Signed-off-by: Rafael Antognolli <rafael.antognolli@intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 10:07:35 -08:00
Brian Paul
acaec6cdd9 mesa: silence MinGW 'may be used uninitialized' warning in get.c
The warning happens on line 2114 for the memcpy(data, p, size) call.
I'm not sure why that generates the warning but not the earlier use
of p in the code.

Reviewed-by: Neha Bhende <bhenden@vmware.com>
2018-01-26 10:44:05 -07:00
Eleni Maria Stea
8096b558a7 mesa: Fix function pointer initialization in state tracker
We assigned the function that gets the device uuid to the GetDriverUuid
function pointer and the function that gets the driver uuid to the
GetDeviceUuid function pointer inside the state tracker. Exchanged the
pointers.

cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Brian Paul <brianp@vmware.com>
2018-01-26 08:17:55 -07:00
Iago Toral Quiroga
d3ce493b34 anv/pipeline: remove the pipeline layout field from anv_pipeline
It no longer has any users.

Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 14:06:47 +01:00
Iago Toral Quiroga
75a4802060 anv/cmd_buffer: add the pipeline layout to the pipeline state
We need to access the pipeline layout to compute correct dynamic
offsets for dynamic UBO/SSBO descriptors when we emit draw commands.
Instead of taking it from the pipeline object, store the layout
in the command buffer pipeline state.

Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 14:06:47 +01:00
Iago Toral Quiroga
e1a49f974b anv/pipeline: don't take the layout from the pipeline to compile shaders
The Vulkan spec states that VkPipelineLayout objects must not be
destroyed while any command buffer that uses them is in the recording
state, but it permits them to be destroyed otherwise. This means that
applications are allowed to free pipeline layouts after command recording
is finished even if there are pipeline objects that still exist and were
created with these layouts.

There are two solutions to this: one is to use reference counting on
pipeline layout objects; the other is to avoid holding references to
pipeline layouts where they are not really needed.

This patch takes a step towards the second option by making the
pipeline shader compile code take the pipeline layout from the
provided VkGraphicsPipelineCreateInfo rather than from the pipeline
object.

A follow-up patch will remove any remaining uses of the layout field
so we can remove it from the pipeline object and avoid the need
for reference counting.

v2: Use ANV_FROM_HANDLE, remove unnecessary braces (Jason)

Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 14:06:46 +01:00
Iago Toral Quiroga
14f6275c92 anv/descriptor_set: add reference counting for descriptor set layouts
The spec states that descriptor set layouts can be destroyed almost
at any time:

   "VkDescriptorSetLayout objects may be accessed by commands that
    operate on descriptor sets allocated using that layout, and those
    descriptor sets must not be updated with vkUpdateDescriptorSets
    after the descriptor set layout has been destroyed. Otherwise,
    descriptor set layouts can be destroyed any time they are not in
    use by an API command."

v2: allocate off the device allocator with DEVICE scope (Jason)

Fixes the following work-in-progress CTS tests:
dEQP-VK.api.descriptor_set.descriptor_set_layout_lifetime.graphics
dEQP-VK.api.descriptor_set.descriptor_set_layout_lifetime.compute

Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 14:06:46 +01:00
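The pattern here is plain atomic reference counting; a minimal sketch with hypothetical names (per the v2 note, anv's real code allocates off the device allocator rather than malloc/free):

    #include <stdlib.h>
    #include <stdatomic.h>

    struct layout_sketch {                /* stand-in for anv_descriptor_set_layout */
       atomic_uint_fast32_t ref_cnt;      /* starts at 1 on create */
       /* ... binding data ... */
    };

    static void layout_ref(struct layout_sketch *l)
    {
       atomic_fetch_add(&l->ref_cnt, 1);  /* taken by each set using the layout */
    }

    static void layout_unref(struct layout_sketch *l)
    {
       /* vkDestroyDescriptorSetLayout only drops a reference; the data
        * survives until the last descriptor set stops using it. */
       if (atomic_fetch_sub(&l->ref_cnt, 1) == 1)
          free(l);
    }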
Samuel Pitoiset
e28233a527 ac/nir: set amdgpu.uniform and invariant.load for SSBOs
For descriptors.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:28 +01:00
Samuel Pitoiset
49b0a140a7 ac/nir: set amdgpu.uniform and invariant.load for UBOs
UBOs are constant buffers.

Cc: "18.0" <mesa-stable@lists.freedesktop.org>
Fixes: 41c36c45 ("amd/common: use ac_build_buffer_load() for emitting UBO loads")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tested-by: Alex Smith <asmith@feralinteractive.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:28 +01:00
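These two commits hinge on the same LLVM metadata; a sketch of the tagging through the LLVM C API (the helper name and call site are assumptions, ac_nir_to_llvm wires this up internally):

    #include <llvm-c/Core.h>

    static void mark_uniform_invariant(LLVMContextRef ctx,
                                       LLVMValueRef desc_ptr, LLVMValueRef load)
    {
       /* The descriptor pointer is dynamically uniform across the wave. */
       LLVMSetMetadata(desc_ptr,
                       LLVMGetMDKindIDInContext(ctx, "amdgpu.uniform", 14),
                       LLVMMDNodeInContext(ctx, NULL, 0));
       /* The loaded memory never changes while the shader runs. */
       LLVMSetMetadata(load,
                       LLVMGetMDKindIDInContext(ctx, "invariant.load", 14),
                       LLVMMDNodeInContext(ctx, NULL, 0));
    }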
Samuel Pitoiset
b453f38a47 ac/nir: set the noalias attribute on input pointers
This attribute is similar to the definition of restrict in
C99 and it might help LLVM.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:28 +01:00
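A sketch of attaching such a parameter attribute through the LLVM C API (the helper and its caller are illustrative, not ac's actual code):

    #include <string.h>
    #include <llvm-c/Core.h>

    static void add_noalias(LLVMContextRef ctx, LLVMValueRef fn, unsigned param)
    {
       /* Attribute index 0 is the return value; parameters are 1-based. */
       unsigned kind = LLVMGetEnumAttributeKindForName("noalias",
                                                       strlen("noalias"));
       LLVMAddAttributeAtIndex(fn, param + 1,
                               LLVMCreateEnumAttribute(ctx, kind, 0));
    }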
Samuel Pitoiset
310d17fcf1 ac: only load used channels when sampling buffer views
This reduces the number of dwords that are loaded
with buffer_load_format_xyzw. For example, when only one
channel is used, the driver will emit buffer_load_format_x instead.

Shader stats for DOW3 (with some local hacky scripts for SPIRV):

143 shaders in 143 tests
Totals:
SGPRS: 5344 -> 5352 (0.15 %)
VGPRS: 3476 -> 3452 (-0.69 %)
Spilled SGPRs: 30 -> 29 (-3.33 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 269860 -> 269808 (-0.02 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 1267 -> 1272 (0.39 %)
Wait states: 0 -> 0 (0.00 %)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-26 12:14:27 +01:00
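A sketch of the selection this implies (the function name is made up; ac_build_buffer_load_format does the real work, and this intrinsic family only comes in 1-, 2- and 4-channel variants, so 3 rounds up to 4):

    static const char *
    load_format_intrinsic(unsigned num_channels)
    {
       switch (num_channels) {
       case 1:  return "llvm.amdgcn.buffer.load.format.f32";
       case 2:  return "llvm.amdgcn.buffer.load.format.v2f32";
       default: return "llvm.amdgcn.buffer.load.format.v4f32";
       }
    }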
Samuel Pitoiset
51e14bc3c0 ac: pass the number of channels to ac_build_buffer_load_format()
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-26 12:14:27 +01:00
Samuel Pitoiset
d7c93b558a ac: add ac_build_buffer_load_common() helper
For both versions of llvm.amdgcn.buffer.load.{format}.*.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-26 12:14:27 +01:00
Samuel Pitoiset
6d07e443ba radv: fix RADV_DEBUG=syncshaders on GFX9
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:27 +01:00
Samuel Pitoiset
5391de1262 radv: fix a GPU hang with RADV_DEBUG=syncshaders
The GPU hangs when the driver forces a PS_PARTIAL_FLUSH after
a dispatch call (and vice versa for graphics). Something has
changed in the kernel driver because it used to work.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:27 +01:00
Samuel Pitoiset
b358e0e67f ac/shader: scan if fragment shaders write memory
It's better to do that in ac_shader_info.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:27 +01:00
Samuel Pitoiset
b9e2f78d6e ac/nir: only canonicalize 32-bit float min/max outputs on pre-GFX9
According to LLVM, only pre-GFX9 targets do not flush denorms
for fmin/fmax.

All dEQP-VK.glsl.builtin.precision.* still pass.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-26 12:14:27 +01:00
Jason Ekstrand
c8949e2498 anv/pipeline: Don't look at blend state unless we have an attachment
Without this, we may end up dereferencing blend before we check for
binding->index != UINT32_MAX.  However, Vulkan allows the blend state to
be NULL so long as you don't have any color attachments.  This fixes a
segfault when running The Talos Principle.

Fixes: 12f4e00b69
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Alex Smith <asmith@feralinteractive.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
2018-01-26 01:44:45 -08:00
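A sketch of the guard described, with assumed names (blend stands for pCreateInfo->pColorBlendState, which may legally be NULL when there are no color attachments):

    #include <stdbool.h>
    #include <stdint.h>
    #include <vulkan/vulkan.h>

    static bool
    writes_disabled(const VkPipelineColorBlendStateCreateInfo *blend,
                    uint32_t binding_index /* UINT32_MAX = unused */)
    {
       /* Test the binding index first; never dereference blend for an
        * unused binding, since blend may be NULL in that case. */
       if (binding_index == UINT32_MAX || blend == NULL)
          return true;
       return blend->pAttachments[binding_index].colorWriteMask == 0;
    }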
Maxin B. John
8116b9170b anv_icd.py: improve reproducible builds
Sort the output to ensure build reproducibility

Signed-off-by: Maxin B. John <maxin.john@intel.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Fixes: 0ab04ba979 ("anv: Use python to generate ICD json files")
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-26 01:37:45 -08:00
Ian Romanick
c7deeb71a8 nouveau: Remove no-op nvgl_logicop_func function
The values that this function returned were always the values passed
in.  The only thing that happened was either an assertion or undefined
results when an unknown value was passed in.  This doesn't seem that
useful.  Most of nouveau_gldefs.h could be removed in this manner.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
2018-01-26 11:21:46 +08:00
Ian Romanick
f5b9c2a6e3 i915: Silence unused parameter warnings
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c: In function ‘intel_alloc_window_storage’:
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:290:48: warning: unused parameter ‘ctx’ [-Wunused-parameter]
 intel_alloc_window_storage(struct gl_context * ctx, struct gl_renderbuffer *rb,
                                                ^~~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c: In function ‘intel_nop_alloc_storage’:
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:303:74: warning: unused parameter ‘rb’ [-Wunused-parameter]
 intel_nop_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *rb,
                                                                          ^~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:304:32: warning: unused parameter ‘internalFormat’ [-Wunused-parameter]
                         GLenum internalFormat, GLuint width, GLuint height)
                                ^~~~~~~~~~~~~~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:304:55: warning: unused parameter ‘width’ [-Wunused-parameter]
                         GLenum internalFormat, GLuint width, GLuint height)
                                                       ^~~~~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:304:69: warning: unused parameter ‘height’ [-Wunused-parameter]
                         GLenum internalFormat, GLuint width, GLuint height)
                                                                     ^~~~~~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c: In function ‘intel_bind_framebuffer’:
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:396:47: warning: unused parameter ‘fb’ [-Wunused-parameter]
                        struct gl_framebuffer *fb, struct gl_framebuffer *fbread)
                                               ^~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:396:74: warning: unused parameter ‘fbread’ [-Wunused-parameter]
                        struct gl_framebuffer *fb, struct gl_framebuffer *fbread)
                                                                          ^~~~~~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c: In function ‘intel_renderbuffer_update_wrapper’:
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:422:57: warning: unused parameter ‘intel’ [-Wunused-parameter]
 intel_renderbuffer_update_wrapper(struct intel_context *intel,
                                                         ^~~~~
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c: In function ‘intel_blit_framebuffer_with_blitter’:
../../SOURCE/master/src/mesa/drivers/dri/i915/intel_fbo.c:644:61: warning: unused parameter ‘filter’ [-Wunused-parameter]
                                     GLbitfield mask, GLenum filter)
                                                             ^~~~~~

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
2018-01-26 11:21:46 +08:00
Ian Romanick
39f875a6b7 i915: Make intelEmitCopyBlit static
And rename to emit_copy_blit.

v2: sed --in-place -e 's/color_logic_ops/gl_logicop_mode/g' $(grep -lr
color_logic_ops src/) suggested by Brian.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com> [v1]
2018-01-26 11:21:46 +08:00
Ian Romanick
9eed6bea6b i965: Make intelEmitCopyBlit static
And rename to emit_copy_blit.

v2: sed --in-place -e 's/color_logic_ops/gl_logicop_mode/g' $(grep -lr
color_logic_ops src/) suggested by Brian.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com> [v1]
2018-01-26 11:21:46 +08:00
Ian Romanick
4e9e964de6 i915: Use enum color_logic_ops for blits
v2: sed --in-place -e 's/color_logic_ops/gl_logicop_mode/g' $(grep -lr
color_logic_ops src/) suggested by Brian.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com> [v1]
2018-01-26 11:21:46 +08:00
Ian Romanick
21be331401 i965: Use enum color_logic_ops for blits
v2: sed --in-place -e 's/color_logic_ops/gl_logicop_mode/g' $(grep -lr
color_logic_ops src/) suggested by Brian.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com> [v1]
2018-01-26 11:21:46 +08:00
Ian Romanick
0aaa27f291 mesa: Pass the translated color logic op dd_function_table::LogicOpcode
And delete the resulting dead code.  This has only been compile-tested.

v2: sed --in-place -e 's/color_logic_ops/gl_logicop_mode/g' $(grep -lr
color_logic_ops src/) suggested by Brian.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-26 11:21:46 +08:00
Ian Romanick
cf0b26ec12 st/mesa: Use the translated color logic op from the context
And delete the resulting dead code.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-26 11:21:46 +08:00
Ian Romanick
0c69db895f i965: Use the translated color logic op from the context
And delete the resulting dead code.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
2018-01-26 11:21:46 +08:00
Ian Romanick
9c1f010f34 mesa: Also track a remapped version of the color logic op
With the exception of NVIDIA hardware, these are the values that all
hardware and Gallium want.  The remapping is currently implemented in at
least 6 places.  This starts the process of consolidating to a single
place.

v2: sed --in-place -e 's/color_logic_ops/gl_logicop_mode/g' $(grep -lr
color_logic_ops src/) suggested by Brian.  Added some comments about the
selection of bit patterns for gl_logicop_mode and the GLenums.
Suggested by Nicolai.  Folded the GLenum_to_color_logicop macro into its
only users.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Brian Paul <brianp@vmware.com> [v1]
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-26 11:21:46 +08:00
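The bit patterns referred to follow the classic encoding in which every logic op is its own 4-bit truth table; a sketch of the convention (member names are illustrative, Mesa spells them differently):

    /* Bit (2*src + dst) of each value is the op's result for that
     * src/dst bit combination, so hardware can consume it directly. */
    enum logicop_sketch {
       SKETCH_CLEAR = 0x0,   /* 0000: always 0  */
       SKETCH_XOR   = 0x6,   /* 0110: src ^ dst */
       SKETCH_AND   = 0x8,   /* 1000: src & dst */
       SKETCH_NOOP  = 0xa,   /* 1010: dst       */
       SKETCH_COPY  = 0xc,   /* 1100: src       */
       SKETCH_SET   = 0xf,   /* 1111: always 1  */
       /* the other ten ops fill in the remaining patterns */
    };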
Bas Nieuwenhuizen
5a3404d443 radeonsi: Export signalled sync file instead of -1.
-1 is considered an error for EGL_ANDROID_native_fence_sync, so
we need to actually create a sync file.

Fixes: f536f45250 "radeonsi: implement sync_file import/export"
Reviewed-by: Dave Airlie <airlied@redhat.com>
2018-01-26 01:26:53 +01:00
Jason Ekstrand
db682b8f0e i965/fs: Reset the register file to VGRF in lower_integer_multiplication
18fde36ced changed the way temporary
registers were allocated in lower_integer_multiplication so that we
allocate regs_written(inst) space and keep the stride of the original
destination register.  This was to ensure that any MUL which originally
followed the CHV/BXT integer multiply regioning restrictions would
continue to follow those restrictions even after lowering.  This works
fine except that I forgot to reset the register file to VGRF so, even
though they were assigned a number from alloc.allocate(), they had the
wrong register file.  This caused some GLES 3.0 CTS tests to start
failing on Sandy Bridge due to attempted reads from the MRF:

    ES3-CTS.functional.shaders.precision.int.highp_mul_fragment.snbm64
    ES3-CTS.functional.shaders.precision.int.mediump_mul_fragment.snbm64
    ES3-CTS.functional.shaders.precision.int.lowp_mul_fragment.snbm64
    ES3-CTS.functional.shaders.precision.uint.highp_mul_fragment.snbm64
    ES3-CTS.functional.shaders.precision.uint.mediump_mul_fragment.snbm64
    ES3-CTS.functional.shaders.precision.uint.lowp_mul_fragment.snbm64

This commit remedies the problem: instead of copying inst->dst and
overwriting nr, just make a new register and set the region to match
inst->dst.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103626
Fixes: 18fde36ced
Cc: "17.3" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2018-01-25 13:58:55 -08:00
Jason Ekstrand
af9d4ce480 vulkan: Update the XML and headers to 1.0.68
Acked-by: Dave Airlie <airlied@redhat.com>
Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Chad Versace <chadversary@chromium.org>
2018-01-25 13:30:05 -08:00
Dave Airlie
f4c534ef68 radv: don't enable tc compat for d32s8 + 4/8 samples (v1.1)
This seems to be broken, at least the cts tests fail.

This fixes:
dEQP-VK.renderpass.suballocation.multisample.d32_sfloat_s8_uint.samples_4
dEQP-VK.renderpass.suballocation.multisample.d32_sfloat_s8_uint.samples_8

2 samples seem to pass fine; amdvlk doesn't appear to enable TC here,
possibly for some other reasons.

This is most likely a hack.

v1.1: add a bit of explanation text. (Samuel)
Fixes: ad3d98da9 (radv: enable tc compatible htile for d32s8 also.)
Signed-off-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
2018-01-26 06:55:09 +10:00
Chuck Atkins
6ac5e851f1 configure.ac: add missing llvm dependencies to .pc files
v2: Only add as dependencies for gallium-osmesa and gallium-xlib

CC: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Chuck Atkins <chuck.atkins@kitware.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-25 14:54:08 -05:00
George Kyriazis
5d8f270d10 swr/rast: Optimize DumpToFile output size
Modify DumpToFile to only dump the function, not the entire module.
Reduces file sizes and speeds up the dumping.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 13:26:49 -06:00
George Kyriazis
dfe4dd48ec swr/rast: Updated copyright dates
on knob-related files.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 13:26:49 -06:00
George Kyriazis
36dbbf11a0 swr/rast: Move memory-related JIT functions
Move them to their own file (builder_mem.{h|cpp}).  Add builder_mem.cpp
to the build system.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 13:26:49 -06:00
George Kyriazis
94922dbe4b swr/rast: Add extra (optional) parameter in GATHERPS
Now also takes in an additional parameter (draw context) for future
expansion.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 13:26:49 -06:00
George Kyriazis
0b46c7b3b0 swr/rast: Better ExecCmd (i.e. system()) implementation
Hides console window creation during JIT linker execution in apps that
don't have a console.  Remove hooking of CreateProcessInternalA; the
MSFT implementation just turns around and calls CreateProcessInternalW,
which we do hook.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 13:26:49 -06:00
George Kyriazis
2d16b61bff swr/rast: Support USE_SIMD16_FRONTEND=0 for EarlyRast
Early Rasterization did not initially work with USE_SIMD16_FRONTEND=0.
Fix it so it works there, too.  Please note that the default setting
is USE_SIMD16_FRONTEND=1.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 13:26:49 -06:00
Brian Paul
123798eb44 mesa: whitespace fixes in attrib.c
Trivial.
2018-01-25 12:17:26 -07:00
Brian Paul
0e7aaaf5a5 mesa: whitespace fixes in varray.h
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
2018-01-25 12:17:26 -07:00
Brian Paul
ba01589c0c mesa: include mtypes.h in varray.h
We actually use some of the types from mtypes.h, so include it directly
instead of relying on including it indirectly via bufferobj.h.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
2018-01-25 12:17:26 -07:00
Brian Paul
e4504be6fc mesa: s/gl_vertex_attrib_array/gl_array_attributes/ in comments
The structure type was renamed some time ago, but some comments
were not updated.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
2018-01-25 12:17:26 -07:00
Brian Paul
6c724fb7c1 mesa: simplify _mesa_delete_list() a bit, add some assertions
All but two cases of the switch did the same n += InstSize[n[0].opcode]
instruction.  Just move it after the switch.

Add some sanity check assertions.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-25 12:17:26 -07:00
Brian Paul
c860171c63 st/mesa: expand glDrawPixels cache to handle multiple images
The newest version of WSI Fusion makes several glDrawPixels calls
per frame.  By caching more than one image, we get better performance
when panning/zooming the map.

v2: move pixel unpack param checking out of cache search loop, per Roland
v3: also move unpack->BufferObj check out of loop, per Roland.
2018-01-25 12:17:26 -07:00
Brian Paul
5092610f29 st/mesa: add some debug code in st_choose_format()
To aid in debugging gallium surface format selection issues.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-25 12:17:26 -07:00
Brian Paul
94610758a3 svga: s/Bool/SVGA3dBool/ in SVGA3dDevCapResult
And fix whitespace.  To sync up with in-house code.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
2018-01-25 11:56:33 -07:00
Emil Velikov
6aeef54644 configure.ac: correct driglx-direct help text
The default was toggled a while back, but the text wasn't updated.

Fixes: bd526ec9e1 ("configure: Always default to
--enable-driglx-direct")
Cc: Jon TURNEY <jon.turney@dronecode.org.uk>
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Daniel Stone <daniels@collabora.com>
2018-01-25 17:44:35 +00:00
Emil Velikov
7b744a494d swrast: remove non-applicable GLX_SWAP_COPY_OML comment
Noticed while skimming for GLX_ instances in the dri codebase.
Comment is completely off and has been in that state since day 1.

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2018-01-25 17:42:57 +00:00
Emil Velikov
3e3956d6ae mapi: remove duplicate GL typedefs
Remove the instances already available in gl.h or glext.h.
Sadly GLclampx is only available in GLES(1) so we need to keep that one.

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2018-01-25 17:42:50 +00:00
Emil Velikov
647f40298a mapi: remove non-applicable HAVE_DIX_CONFIG_H hunk
A seeming artefact from when the xserver build was diving directly into
mesa's tree.

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2018-01-25 17:42:48 +00:00
Emil Velikov
48e7bc6833 mapi: autotools: remove unused MAPI_FILES file list
The sole user was OpenVG, which was removed a couple of years ago.

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
2018-01-25 17:42:46 +00:00
Emil Velikov
785d9a4ed8 automake: st/mesa/tests: add st_tests_common.h to the tarball
Fixes: 6569b33b6e ("mesa/st/tests: unify MockCodeLine* classes")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-25 17:06:29 +00:00
Emil Velikov
0beaf7ad3e automake: mesa: include vbo_private.h in the tarball
Fixes: a7cfec3be0 ("vbo: move VBO-private types, prototypes, etc. into
new vbo_private.h header")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-25 17:06:29 +00:00
Emil Velikov
ac4437b20b automake: small cleanup after the meson.build inclusion
Namely, extend the EXTRA_DIST list instead of re-assigning it, and
bring back a file dropped by mistake.

Fixes: 436ed65d38 ("autotools: include meson build files in tarball")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Daniel Stone <daniels@collabora.com>
2018-01-25 17:06:29 +00:00
Emil Velikov
50265cd9ee automake: anv: ship anv_extensions_gen.py in the tarball
Fixes: dd088d4bec ("anv/extensions: Generate a header file with
extension tables")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-25 17:06:29 +00:00
Emil Velikov
265d36c890 automake: vc5: remove non-applicable v3dx_simulator.h
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-25 17:06:28 +00:00
Roland Scheidegger
4fe662c58f gallivm: fix crash with seamless cube filtering with different min/mag filter
We are not allowed to modify the incoming coords values, or things may
crash (as we may be inside an LLVM conditional and the values may be used
in another branch).
I recently broke this when fixing an issue with NaNs and seamless cube
map filtering, and it causes crashes when doing cubemap filtering
if the min and mag filters are different.
Add const to the pointers passed in to prevent this mishap in the future.

Fixes: a485ad0bcd ("gallivm: fix an issue with NaNs with seamless cube filtering")

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
2018-01-25 18:03:38 +01:00
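A sketch of the const-ification and the copy-before-mutate pattern it enforces (names and signature are illustrative, not gallivm's actual ones):

    #include <string.h>
    #include <llvm-c/Core.h>

    static void
    filter_seamless(const LLVMValueRef coords[3] /* read-only inputs */)
    {
       /* We may be inside an LLVM conditional and the caller's values can
        * still be read by another branch, so mutate only a local copy. */
       LLVMValueRef local[3];
       memcpy(local, coords, sizeof(local));
       /* ... seamless cube filtering math operates on local[] ... */
       (void)local;
    }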
Eric Engestrom
57223fb07a egl: keep extension list sorted, per comment at the top
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Adam Jackson <ajax@redhat.com>
2018-01-25 16:38:11 +00:00
George Kyriazis
0e879aad2f swr/rast: support llvm 3.9 type declarations
LLVM 3.9 was not taken into account in initial check-in.

Fixes: 01ab218bbc ("swr/rast: Initial work for debugging support.")
cc: mesa-stable@lists.freedesktop.org
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104749
Acked-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-01-25 08:22:52 -06:00
Samuel Pitoiset
e1331c9d61 ac/nir: add break statements in needs_view_index_sgpr()
The previous code is correct, but as the first case statement uses
a break, keep it consistent.

CID: 1428579
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-01-25 13:59:52 +01:00
Eric Engestrom
0663ae0aa1 loader: let compiler figure out the length of the string
Basically, turn the comment into code.

Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-25 11:40:25 +00:00
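The idiom in question, sketched with a made-up literal (the loader's actual string differs):

    #include <stdbool.h>
    #include <string.h>

    #define ID_PATH_TAG "pci-"   /* hypothetical prefix */

    static bool has_tag(const char *name)
    {
       /* sizeof on the literal is computed by the compiler (length + NUL),
        * so no hand-maintained length constant can drift out of sync. */
       return strncmp(name, ID_PATH_TAG, sizeof(ID_PATH_TAG) - 1) == 0;
    }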
Eric Engestrom
57b0ccd178 meson: simplify dri3 logic
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
2018-01-25 10:10:04 +00:00
Juan A. Suarez Romero
513c2263cb mesa: add missing RGB9_E5 format in _mesa_base_fbo_format
This fixes KHR-GL45.internalformat.renderbuffer.rgb9_e5.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-25 09:54:31 +01:00
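Presumably the fix is one more case in _mesa_base_fbo_format's internal-format switch; a guess at its shape (the real code likely also checks the shared-exponent extension first):

    case GL_RGB9_E5:
       return GL_RGB;   /* base format of the shared-exponent format */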
Jason Ekstrand
df13588d21 i965: Stop disabling aux during texture preparation
Previously, we were handling self-dependencies by marking the render
buffer and then passing disable_aux=true to prepare_texture so that it
would do a resolve.  This works but ends up doing too much resolving
in some cases.  Specifically, if we're doing something such as mipmap
generation, this would cause us to resolve all levels of the texture if
even one of them is overlapping.

Instead, this commit makes us wait until we process the framebuffer to
do these resolves and we only resolve the slices needed for rendering.
Doing this resolve puts them into the pass-through state, so even if we
texture using CCS_E, the CCS data will effectively be ignored and the
real surface contents read.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-24 19:05:36 -08:00
Jason Ekstrand
20f70ae385 i965/draw: Set NEW_AUX_STATE when draw aux changes
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104411
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104383
Fixes: ea0d2e98ec
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-24 19:05:36 -08:00
Jason Ekstrand
e52a9f18d6 i965: Replace draw_aux_buffer_disabled with draw_aux_usage
Instead of keeping an array of booleans, we now hang onto an array of
isl_aux_usage enums.  This means that the thing we are passing from
brw_draw.c to surface state setup is the thing that surface state setup
actually needs instead of an input to compute what it needs.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-24 19:05:36 -08:00
Jason Ekstrand
468ea3cc45 i965/surface_state: Drop brw_aux_surface_disabled
The only purpose of this function is to disable aux on texture surfaces
when the corresponding renderbuffer has aux disabled.  However, the act
of disabling aux on the renderbuffer will cause it to be resolved and
intel_miptree_texture_aux_usage will already check the resolved status
of a texture and return ISL_AUX_USAGE_NONE for it.  Even if we used CCS
for it, that wouldn't really be a problem because the CCS will be in the
pass-through state and so it would effectively be ignored.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-24 19:05:36 -08:00
Jason Ekstrand
d38ec24f53 i965/miptree: Add an aux_disabled parameter to render_aux_usage
Only one of the callers of intel_miptree_render_aux_usage actually took
brw->draw_aux_buffer_disabled into account.  This was causing us to
ignore draw_aux_buffer_disabled for the intel_miptree_prepare_render.
This isn't a problem because the draw_aux_buffer_disabled entry was set
during texture preparation and we already did the resolve at that time.
However, this also meant that the aux_usage we were passing to
brw_cache_flush_for_render and brw_render_cache_add_bo was wrong so our
automatic cache flushing around aux_usage changes wasn't happening.
This was causing GPU hangs in Oxenfree.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104711
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104411
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104383
Fixes: ea0d2e98ec
Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-24 19:05:36 -08:00
Jason Ekstrand
dfe0217905 i965/miptree: Take an aux_usage in prepare/finish_render
Both callers of intel_miptree_prepare/finish_render have to call
intel_miptree_render_aux_usage anyway for other reasons.  They may as
well pass the result in instead of us calling it again.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-24 19:05:36 -08:00
Jason Ekstrand
7d4007d58a aubinator: Multiply count by 4 to compute buffer sizes
The count field is in terms of dwords and not bytes.
2018-01-24 19:05:36 -08:00
Timothy Arceri
e776791432 st/glsl_to_nir: remove reallocation of sampler/image location
As far as I can tell this always just reassigns the same value.

Also, as we don't currently store UniformHash in the shader cache,
removing this will help with adding a shader cache to gallium
nir drivers.

Reviewed-by: Rob Clark <robdclark@gmail.com>
2018-01-25 13:27:22 +11:00
Jordan Justen
62b68d05e7 docs: add 18.1.0-devel release notes template
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
2018-01-24 17:10:58 -08:00
Jordan Justen
65c18b02fc mesa: bump version to 18.1.0-devel
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
2018-01-24 17:10:58 -08:00
Greg V
8fae5eddd9 meson: handle LLVM 'x.x.xgit-revision' versions
When LLVM is built inside a git repo (even one rooted far above, e.g.
/usr/ports/.git exists while LLVM is built in /usr/ports/devel/llvm50/work),
its version becomes something like 5.0.0git-f8ab206b2176.

New meson versions already handle this, but we support older versions too.

Fixes: 673dda8330 ("meson: build "radv" vulkan driver for radeon hardware")
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
2018-01-24 15:25:54 -08:00
Greg V
53f9131205 meson: fix getting cflags from pkg-config
get_pkgconfig_variable('cflags') always returns an empty list; it's a
function for getting *custom* variables.

Meson does not yet support asking for cflags, so explicitly invoke
pkg-config for now.

Fixes: 68076b8747 ("meson: build gallium vdpau state tracker")
Fixes: a817af8a89eb ("meson: build gallium xvmc state tracker")
Fixes: 1d36dc674d ("meson: build gallium omx state tracker")
Fixes: 5a785d51a6 ("meson: build gallium va state tracker")
Reviewed-by: Dylan Baker <dylan.c.baker@intel.com>
2018-01-24 15:25:54 -08:00
Greg V
c38c60a63c meson: fix BSD build
CC: 18.0 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
2018-01-24 15:25:54 -08:00
Greg V
7c8cfe2d59 meson: fix missing dependencies
Fixes: 66f97f6640 ("meson: build radeonsi")
Reviewed-by: Emil Velikov <emil.velikov@colalbora.com>
Reviewed-by: Dylan Baker <dylan.c.baker@intel.com>
2018-01-24 15:25:54 -08:00
Grazvydas Ignotas
0cc7370733 anv: correct a duplicate check in an assert
Looks like checking both sources was intended, instead of the first one
twice. Found with Coccinelle, coccinellery/xand/xand.cocci semantic patch.

Signed-off-by: Grazvydas Ignotas <notasas@gmail.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
2018-01-25 01:10:45 +02:00
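The xand pattern in miniature (fits() and the operand array are placeholders, not anv's actual code):

    /* before */ assert(fits(inst->src[0]) && fits(inst->src[0]));  /* src[1] never checked */
    /* after  */ assert(fits(inst->src[0]) && fits(inst->src[1]));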
Marc Dietrich
a2a1b0e75e meson: fix HAVE_LLVM version define in meson build
LLVM patch level is not included in HAVE_LLVM.

Fixes: e6418ab156 ("meson: build "radv" vulkan driver for radeon hardware")
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Dylan Baker <dylan.c.baker@intel.com>
Signed-off-by: Marc Dietrich <marvin24@gmx.de>
2018-01-24 14:04:20 -08:00
Dylan Baker
5781c3d1db meson: correctly set SYSCONFDIR for loading drirc
Fixes: d1992255bb ("meson: Add build Intel "anv" vulkan driver")
Reported-by: Marc Dietrich <marvin24@gmx.de>
Signed-off-by: Dylan Baker <dylan.c.baker@intel.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
2018-01-24 13:10:32 -08:00
Dave Airlie
d2414e64e4 radv: add multisample Z optimisation from amdvlk
This was just found while reading
src/core/hw/gfxip/gfx6/gfx6DepthStencilView.cpp for other stuff.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
2018-01-25 06:48:11 +10:00
Dave Airlie
298554541d radv: move spi_baryc_cntl to pipeline
We need to enable the pos float location 2 mode anytime we have
persample, not just when forced by the frag shader.

This fixes:
dEQP-VK.pipeline.multisample.min_sample_shading*

Fixes: 58c97a079 (radv: enable location at sample when persample is forced.)
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
2018-01-25 06:47:28 +10:00
Marek Olšák
125c0529f3 gallium/u_tests: add texture_barrier and FBFETCH tests
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-24 21:08:45 +01:00
Marek Olšák
022c5b22fe radeonsi: don't ignore pitch for imported textures
Cc: 17.2 17.3 <mesa-stable@lists.freedesktop.org>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2018-01-24 21:08:45 +01:00
Scott D Phillips
0b8d38bd48 meson: Fix define for USE_SSE41
Before, we were adding -DHAVE_SSE41, which isn't what the code is
looking for, so some uses of the sse4.1 code were always being
skipped.

v2: Don't add any compile check for the quite old -msse4.1 option (Dylan)

Fixes: 84486f6462 ("meson: Enable SSE4.1 optimizations")
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
2018-01-24 11:32:34 -08:00
Gert Wollny
8172b9ff48 mesa/st/glsl_to_tgsi: remove now unneeded assert.
With the implementation of the tracking of the registers used in reladdr,
asserting that a driver calling merge_register() uses the address register
is no longer needed.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:34:05 -07:00
Gert Wollny
f2040fbe48 mesa/st/tests: Add tests for lifetime tracking with indirect addressing
Add a code line type that accepts one layer of indirect addressing and
add tests to check that temporary register access used for indirect
addressing is accounted for in the lifetime estimation.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:34:00 -07:00
Gert Wollny
51c0cee267 mesa/st/glsl_to_tgsi: Add tracking of indirect addressing registers
So far indirect addressing was not tracked to estimate the temporary
life time, and it was not needed, because code to load the address
registers was always emitted, eliminating the reladdr* handles in the
past glsl-to-tgsi stages. Now, with Marek's patch allowing any 1D register
to be used for addressing on some hardware, this has changed, and
the tracking becomes necessary.

Because the registers give no direct indication of whether the reladdr* was
already loaded into an address register, the temporaries in reladdr* are
always tracked as reads. This may result in a slight over-estimation of the
lifetime in the cases when the load to the address register was emitted.

v2: no changes
v3: Use the debug_log variable instead of directly writing to std::cerr in
    debugging output.
v6: fix indention and typos

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> (v1)
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
517e34c62f mesa/st/tests: Add tests for improved tracking of temporaries
Additional tests are added that check the tracking of access to temporaries
in if-else branches.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
807e2539e5 mesa/st/glsl_to_tgsi: Add tracking of ifelse writes in register merging
Improve the life-time evaluation of temporary registers by also tracking
writes in both if and else branches and in up to 32 nested scopes.
As a result the estimated required register life-times can be further
reduced enabling more registers to be merged.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
8dda01ef5a mesa/st/tests: cleanup whitespace usage and correct some comments
Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
6569b33b6e mesa/st/tests: unify MockCodeLine* classes
 * Merge the classes MockCodeLine and MockCodelineWithSwizzle into
   one, and refactor tests accordingly.
 * Change memory allocations to use the ralloc* interface.

v2:
 * move the test classes into a convenience library
 * rename the Mock* classes to Fake* since they are not really
   Mocks
 * Base assertions of the correct number of src and dst registers in
   tests on what the operand actually expects
 * Fix number of destinations in one test

v6:
 * fix local includes using "..." instead of <...>

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
ad1990629e mesa/st/tests: Fix zero-byte allocation leaks
Don't allocate a zero-sized array when no texture offsets are given.

v5: correct spaces and empty lines

Reviewed-by: Brian Paul <brianp@vmware.com>(v4)
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> (v1)
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
ee48e3acb8 mesa/st/glsl_to_tgsi: Add some operators for glsl_to_tgsi related classes
Add the equal operator and the "<<" stream write operator for the
st_*_reg classes and the "<<" operator to the instruction class, and
make use of these operators in the debugging output.

v5: Fix empty lines

Reviewed-by: Brian Paul <brianp@vmware.com> (v4)
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Gert Wollny
6a3421078a mesa/program: Add missing file types to printout
Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Gert Wollny <gw.fossdev@gmail.com>
2018-01-24 10:23:00 -07:00
Brian Paul
365a48abdd vbo: fix incorrect min/max_index values in display list draw call
This fixes another regression from commit 8e4efdc895 ("vbo: optimize
some display list drawing").  The problem was the min_index, max_index
values passed to the vbo drawing function were not computed to compensate
for the biased prim::start values.

https://bugs.freedesktop.org/show_bug.cgi?id=104746
https://bugs.freedesktop.org/show_bug.cgi?id=104742
https://bugs.freedesktop.org/show_bug.cgi?id=104690
Tested-by: Clayton Craft <clayton.a.craft@intel.com>
Fixes: 8e4efdc895 ("vbo: optimize some display list drawing")
Reviewed-by: Emil Velikov <emil.velikov@collabora.co.uk>
2018-01-24 10:12:49 -07:00
Brian Paul
2123bd2805 vbo: whitespace/formatting fixes in vbo_split_inplace.c
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
6b0109cf39 vbo: whitespace/formatting fixes in vbo.h
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
b9280031a8 vbo/i965: move vbo_all_varyings_in_vbos() to brw_draw.c
It's only used in brw_draw_prims().

s/GLboolean/bool/, etc.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
a83f7e119c vbo: remove unused vbo_any_varyings_in_vbos() function
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
718f4251c5 vbo: remove unneeded #includes
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
f4376a0c2b vbo: remove vbo_context.h and change includes to use vbo.h instead
Now vbo.h is the public interface to the VBO module.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
aafb56a148 vbo: move remaining items from vbo_context.h to vbo.h
Non-VBO source files sometimes included vbo.h while others included
vbo_context.h.  We're moving all public types and functions to the former.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
a7cfec3be0 vbo: move VBO-private types, prototypes, etc. into new vbo_private.h header
Things which should not be used outside the VBO module.
More public/private clean-ups coming.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
d40fa42292 mesa: use new _vbo_install_exec_vtxfmt() function
Instead of reaching into the vbo_context object in vtxfmt.c

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
04a17ec327 nouveau: remove vbo_context() call
_vbo_DestroyContext() can be safely called even if there's no VBO
module.  Removes a dependency on the vbo_context() function.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
7b0ae96711 i965: use vbo_set_[indirect]_draw_func()
Instead of poking into the vbo_context object.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
3bbf8d9042 vbo: move vbo_sizeof_ib_type() into vbo_exec_array.c
It's only used in this one file.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
a152cb7492 mesa: move vbo_count_tessellated_primitives() to api_validate.c
It's only used in this file and has nothing VBO-specific about it.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
5d3e10fd27 mesa: update comment on gl_display_list
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
cffa82327d mesa: whitespace clean-ups in mtypes.h
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
b3a1aa94d9 mesa: remove unused MAT_INDEX_AMBIENT/DIFFUSE/SPECULAR constants
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
67dc551ba9 vbo: move DLIST_DANGLING_REFS from mtypes.h to vbo_save_api.c
It's only used in this file.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
cb7ef0df00 vbo: replace assert(0) with unreachable()
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
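For reference, the semantics of the swap in a switch default (Mesa's unreachable() comes from util/macros.h; the message is made up):

    default:
       /* assert(0) vanishes in release builds and the compiler still
        * treats the path as reachable; unreachable() asserts in debug
        * builds and tells the optimizer this point cannot be reached. */
       unreachable("unexpected display list opcode");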
Brian Paul
8b3cb7c651 vbo: fix, add comment in vbo_save.h
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Brian Paul
67ebde19d4 vbo: whitespace, formatting fixes in vbo_split.[ch]
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2018-01-24 10:12:49 -07:00
Topi Pohjolainen
ec4bb693a0 i965: Don't try to disable render aux buffers for compute
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104546
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
2018-01-24 10:54:08 +02:00
Jason Ekstrand
4064fe59e7 anv/cmd_buffer: Move gen7 index buffer state to graphics state
Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:46 -08:00
Jason Ekstrand
38ec78049f anv/cmd_buffer: Move num_workgroups to compute state
While we're here, make it an anv_address.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:44 -08:00
Jason Ekstrand
95ff232294 anv/cmd_buffer: Move dynamic state to graphics state
Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:43 -08:00
Jason Ekstrand
24caee8975 anv/cmd_buffer: Use a temporary variable for dynamic state
We were already doing this for some packets to keep the lines shorter.
We may as well just do it for all of them.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:40 -08:00
Jason Ekstrand
8bd5ec5b86 anv/cmd_buffer: Move vb_dirty bits into anv_cmd_graphics_state
Vertex buffers are entirely a graphics pipeline thing.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:39 -08:00
Jason Ekstrand
e85aaec148 anv/cmd_buffer: Move dirty bits into anv_cmd_*_state
Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:36 -08:00
Jason Ekstrand
97f96610c8 anv: Separate compute and graphics descriptor sets
The Vulkan spec says:

    "pipelineBindPoint is a VkPipelineBindPoint indicating whether the
    descriptors will be used by graphics pipelines or compute pipelines.
    There is a separate set of bind points for each of graphics and
    compute, so binding one does not disturb the other."

Up until now, we've been ignoring the pipeline bind point and had just
one bind point for everything.  This commit separates things out into
separate bind points.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102897
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:33 -08:00
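A sketch of the dispatch the split implies, with field names guessed from the substruct commit below:

    static struct anv_cmd_pipeline_state *
    state_for_bind_point(struct anv_cmd_buffer *cmd, VkPipelineBindPoint bp)
    {
       switch (bp) {
       case VK_PIPELINE_BIND_POINT_GRAPHICS:
          return &cmd->state.gfx.base;      /* graphics descriptors live here */
       case VK_PIPELINE_BIND_POINT_COMPUTE:
          return &cmd->state.compute.base;  /* compute gets its own set */
       default:
          unreachable("unsupported pipeline bind point");
       }
    }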
Jason Ekstrand
31b2144c83 anv/cmd_buffer: Use anv_descriptor_for_binding for samplers
Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:31 -08:00
Jason Ekstrand
b9e1ca16f8 anv/cmd_buffer: Add a helper for binding descriptor sets
This lets us unify some code between push descriptors and regular
descriptors.  It doesn't do much for us yet but it will.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:30 -08:00
Jason Ekstrand
90cceaa9dd anv/cmd_buffer: Refactor ensure_push_descriptor_set
It's now a function which returns the push descriptor set.  Since we set
the error on the command buffer, returning the error is a little
redundant.  Returning the descriptor set (or NULL on error) is more
convenient.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:28 -08:00
Jason Ekstrand
d5592e2fda anv: Remove semicolons from vk_error[f] definitions
With the semicolons, they can't be used in a function argument without
throwing syntax errors.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:27 -08:00
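The hazard is generic C macro hygiene; a simplified illustration (this is not anv's exact definition):

    #define vk_error(err) __vk_errorf((err), __FILE__, __LINE__);  /* note the ';' */

    /* Fine as a standalone statement, but in argument position the
     * expansion drops a stray semicolon inside the parentheses:
     *
     *    foo(vk_error(VK_ERROR_DEVICE_LOST));
     * => foo(__vk_errorf((VK_ERROR_DEVICE_LOST), "f.c", 10););  // syntax error
     */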
Jason Ekstrand
9af5379228 anv/cmd_buffer: Add substructs to anv_cmd_state for graphics and compute
Initially, these just contain the pipeline in a base struct.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:25 -08:00
Jason Ekstrand
ddc2d28548 anv/cmd_buffer: Use some pre-existing pipeline temporaries
There are several places where we'd already saved the pipeline off to a
temporary variable but, due to an artifact of history, weren't actually
using that temporary everywhere.  No functional change.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:24 -08:00
Jason Ekstrand
cd3feea745 anv/cmd_buffer: Rework anv_cmd_state_reset
This splits anv_cmd_state_reset into separate init and finish functions.
This lets us share init code with cmd_buffer_create.  This potentially
fixes subtle bugs where we may have missed some bit of state that needs
to get initialized on command buffer creation.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:22 -08:00
Jason Ekstrand
d6c9a89d13 anv/cmd_buffer: Get rid of the meta query workaround
Meta has been gone for a long time.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:20 -08:00
Jason Ekstrand
bc0a21e348 anv/cmd_state: Drop the scratch_size field
This is a legacy left-over from the mechanism we previously used to
handle scratch.  The new (and better) mechanism doesn't use this.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:19 -08:00
Jason Ekstrand
4b69ba3817 anv/pipeline: Don't assert on more than 32 samplers
This prevents an assert when running one unreleased Vulkan game.

Tested-by: Józef Kucia <joseph.kucia@gmail.com>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Cc: "18.0" <mesa-stable@lists.freedesktop.org>
2018-01-23 21:10:08 -08:00
Dave Airlie
766589d89a radv: fix sample_mask_in loading. (v3.1)
This is ported from radeonsi and fixes:
dEQP-VK.pipeline.multisample_shader_builtin.sample_mask.bit_*

v2: don't call this path for radeonsi, it does it in the epilog.
use the radeonsi code path.
v3: handle NULL pCreateInfo->pMultisampleState properly (Samuel)
v3.1: set ps_iter_samples default to 1 (Bas)

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Fixes: bdcbe7c76 (radv: add sample mask input support)
Signed-off-by: Dave Airlie <airlied@redhat.com>
2018-01-24 14:25:11 +10:00
Dave Airlie
c727ea9370 radv: don't use hw resolves for r16g16 norm formats.
radeonsi has a workaround for this, but it uses an R16A16 format,
which Vulkan doesn't have. We could probably come up with a
workaround, but for now just avoid hw resolves.

Fixes:
dEQP-VK.renderpass.suballocation.multisample.r16g16_*norm*

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Fixes: 2a04f5481d (radv/meta: select resolve paths)
Signed-off-by: Dave Airlie <airlied@redhat.com>
2018-01-24 09:01:12 +10:00
Dave Airlie
4df414bbd2 radv: don't use hw resolve for integer image formats
From reading AMDVLK, it appears to never use the hw resolve paths.

This patch takes after radeonsi, which doesn't use hw resolve
for integer formats, and does the same for radv.

This fixes:
dEQP-VK.renderpass.suballocation.multisample*uint tests.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Fixes: 2a04f5481d (radv/meta: select resolve paths)
Signed-off-by: Dave Airlie <airlied@redhat.com>
2018-01-24 08:53:18 +10:00
Dave Airlie
316d762186 radv: add fs_key meta format support to resolve passes.
Some of the hw resolve passes need the SPI color format set up
correctly.

This fixes lots of 16-bit and 32-bit format tests in
dEQP-VK.renderpass.suballocation.multisample*

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Fixes: f4e499ec79 "radv: add initial non-conformant radv vulkan driver"
Signed-off-by: Dave Airlie <airlied@redhat.com>
2018-01-24 08:50:51 +10:00
Grazvydas Ignotas
224fd17e1e winsys/svga: check correct member after create
.mob_fenced was already checked, probably a copy-paste bug.
Found by Coccinelle.

Signed-off-by: Grazvydas Ignotas <notasas@gmail.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
2018-01-23 11:04:07 -07:00
Grazvydas Ignotas
08085df313 svga: fix context alloc error handling
The 'cleanup' path dereferences 'svga' a lot; 'done' is a better choice.
Found by Coccinelle.

Signed-off-by: Grazvydas Ignotas <notasas@gmail.com>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
2018-01-23 11:04:07 -07:00
Christoph Haag
4b4d929c27 meson: remove lib prefix from libd3dadapter9.so
Fixes: 6b4c7047d5 ("meson: build gallium nine state_tracker")
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Dylan Baker <dylan.c.baker@intel.com>
2018-01-23 09:30:30 -08:00
Emil Velikov
3b6d232a5c docs: update calendar, 18.0.0-rc1 is out
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
2018-01-23 17:02:17 +00:00
Eric Engestrom
eee8dd7c33 radeon: remove left-over dead code
Fixes: 4e0d99a635 "r100: Use shared debug code"
Cc: Pauli Nieminen <suokkos@gmail.com>
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
2018-01-23 15:39:57 +00:00
Eric Engestrom
10f5e0dce2 docs: ask for backport nominations to cc: the author
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-01-23 15:39:57 +00:00
Marc Dietrich
911ca587f8 meson: fix some misspelled defines in meson.build
The following defines were misspelled:
- HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
- HAVE_FUNC_ATTRIBUTE_VISIBILITY

Signed-off-by: Marc Dietrich <marvin24@gmx.de>
Reviewed-by: Eric Engestrom <eric.engestrom@imgtec.com>
2018-01-23 15:39:57 +00:00
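
Such misspellings fail silently: the #ifdef is simply never true, so the
fallback branch is always compiled. An illustrative C sketch (a
simplified stand-in, not Mesa's actual macro definition):

/* If the build system spells the define wrong, this test never passes
 * and the visibility attribute is silently never applied. */
#ifdef HAVE_FUNC_ATTRIBUTE_VISIBILITY
#define PUBLIC __attribute__((visibility("default")))
#else
#define PUBLIC
#endif

PUBLIC int sketch_entry_point(void) { return 0; }
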
176 changed files with 5222 additions and 3795 deletions

View File

@@ -396,39 +396,9 @@ matrix:
- libexpat1-dev
- libx11-xcb-dev
- libelf-dev
- env:
- LABEL="macOS make"
- BUILD=make
- MAKEFLAGS="-j4"
- MAKE_CHECK_COMMAND="make check"
- DRI_LOADERS="--with-platforms=x11 --disable-egl"
os: osx
before_install:
- |
if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
HOMEBREW_NO_AUTO_UPDATE=1 brew install python3 ninja expat gettext
# Set PATH for homebrew pip3 installs
PATH="$HOME/Library/Python/3.6/bin:${PATH}"
# Set PKG_CONFIG_PATH for keg-only expat
PKG_CONFIG_PATH="/usr/local/opt/expat/lib/pkgconfig:${PKG_CONFIG_PATH}"
# Set PATH for keg-only gettext
PATH="/usr/local/opt/gettext/bin:${PATH}"
# Install xquartz for prereqs ...
XQUARTZ_VERSION="2.7.11"
wget -nv https://dl.bintray.com/xquartz/downloads/XQuartz-${XQUARTZ_VERSION}.dmg
hdiutil attach XQuartz-${XQUARTZ_VERSION}.dmg
sudo installer -pkg /Volumes/XQuartz-${XQUARTZ_VERSION}/XQuartz.pkg -target /
hdiutil detach /Volumes/XQuartz-${XQUARTZ_VERSION}
# ... and set paths
PATH="/opt/X11/bin:${PATH}"
PKG_CONFIG_PATH="/opt/X11/share/pkgconfig:/opt/X11/lib/pkgconfig:${PKG_CONFIG_PATH}"
ACLOCAL="aclocal -I /opt/X11/share/aclocal -I /usr/local/share/aclocal"
fi
install:
- pip2 install --user mako
- pip install --user mako
# Install the latest meson from pip, since the version in the ubuntu repos is
# often quite old.
@@ -449,64 +419,62 @@ install:
# Install dependencies where we require specific versions (or where
# disallowed by Travis CI's package whitelisting).
- |
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
wget $XORG_RELEASES/util/$XORGMACROS_VERSION.tar.bz2
tar -jxvf $XORGMACROS_VERSION.tar.bz2
(cd $XORGMACROS_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XORG_RELEASES/util/$XORGMACROS_VERSION.tar.bz2
- tar -jxvf $XORGMACROS_VERSION.tar.bz2
- (cd $XORGMACROS_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget $XORG_RELEASES/proto/$GLPROTO_VERSION.tar.bz2
tar -jxvf $GLPROTO_VERSION.tar.bz2
(cd $GLPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XORG_RELEASES/proto/$GLPROTO_VERSION.tar.bz2
- tar -jxvf $GLPROTO_VERSION.tar.bz2
- (cd $GLPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget $XORG_RELEASES/proto/$DRI2PROTO_VERSION.tar.bz2
tar -jxvf $DRI2PROTO_VERSION.tar.bz2
(cd $DRI2PROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XORG_RELEASES/proto/$DRI2PROTO_VERSION.tar.bz2
- tar -jxvf $DRI2PROTO_VERSION.tar.bz2
- (cd $DRI2PROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget $XCB_RELEASES/$XCBPROTO_VERSION.tar.bz2
tar -jxvf $XCBPROTO_VERSION.tar.bz2
(cd $XCBPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XCB_RELEASES/$XCBPROTO_VERSION.tar.bz2
- tar -jxvf $XCBPROTO_VERSION.tar.bz2
- (cd $XCBPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget $XCB_RELEASES/$LIBXCB_VERSION.tar.bz2
tar -jxvf $LIBXCB_VERSION.tar.bz2
(cd $LIBXCB_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XCB_RELEASES/$LIBXCB_VERSION.tar.bz2
- tar -jxvf $LIBXCB_VERSION.tar.bz2
- (cd $LIBXCB_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget $XORG_RELEASES/lib/$LIBPCIACCESS_VERSION.tar.bz2
tar -jxvf $LIBPCIACCESS_VERSION.tar.bz2
(cd $LIBPCIACCESS_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XORG_RELEASES/lib/$LIBPCIACCESS_VERSION.tar.bz2
- tar -jxvf $LIBPCIACCESS_VERSION.tar.bz2
- (cd $LIBPCIACCESS_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget http://dri.freedesktop.org/libdrm/$LIBDRM_VERSION.tar.bz2
tar -jxvf $LIBDRM_VERSION.tar.bz2
(cd $LIBDRM_VERSION && ./configure --prefix=$HOME/prefix --enable-vc4 --enable-freedreno --enable-etnaviv-experimental-api && make install)
- wget http://dri.freedesktop.org/libdrm/$LIBDRM_VERSION.tar.bz2
- tar -jxvf $LIBDRM_VERSION.tar.bz2
- (cd $LIBDRM_VERSION && ./configure --prefix=$HOME/prefix --enable-vc4 --enable-freedreno --enable-etnaviv-experimental-api && make install)
wget $XORG_RELEASES/lib/$LIBXSHMFENCE_VERSION.tar.bz2
tar -jxvf $LIBXSHMFENCE_VERSION.tar.bz2
(cd $LIBXSHMFENCE_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $XORG_RELEASES/lib/$LIBXSHMFENCE_VERSION.tar.bz2
- tar -jxvf $LIBXSHMFENCE_VERSION.tar.bz2
- (cd $LIBXSHMFENCE_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget http://people.freedesktop.org/~aplattner/vdpau/$LIBVDPAU_VERSION.tar.bz2
tar -jxvf $LIBVDPAU_VERSION.tar.bz2
(cd $LIBVDPAU_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget http://people.freedesktop.org/~aplattner/vdpau/$LIBVDPAU_VERSION.tar.bz2
- tar -jxvf $LIBVDPAU_VERSION.tar.bz2
- (cd $LIBVDPAU_VERSION && ./configure --prefix=$HOME/prefix && make install)
wget http://www.freedesktop.org/software/vaapi/releases/libva/$LIBVA_VERSION.tar.bz2
tar -jxvf $LIBVA_VERSION.tar.bz2
(cd $LIBVA_VERSION && ./configure --prefix=$HOME/prefix --disable-wayland --disable-dummy-driver && make install)
- wget http://www.freedesktop.org/software/vaapi/releases/libva/$LIBVA_VERSION.tar.bz2
- tar -jxvf $LIBVA_VERSION.tar.bz2
- (cd $LIBVA_VERSION && ./configure --prefix=$HOME/prefix --disable-wayland --disable-dummy-driver && make install)
wget $WAYLAND_RELEASES/$LIBWAYLAND_VERSION.tar.xz
tar -axvf $LIBWAYLAND_VERSION.tar.xz
(cd $LIBWAYLAND_VERSION && ./configure --prefix=$HOME/prefix --enable-libraries --without-host-scanner --disable-documentation --disable-dtd-validation && make install)
- wget $WAYLAND_RELEASES/$LIBWAYLAND_VERSION.tar.xz
- tar -axvf $LIBWAYLAND_VERSION.tar.xz
- (cd $LIBWAYLAND_VERSION && ./configure --prefix=$HOME/prefix --enable-libraries --without-host-scanner --disable-documentation --disable-dtd-validation && make install)
wget $WAYLAND_RELEASES/$WAYLAND_PROTOCOLS_VERSION.tar.xz
tar -axvf $WAYLAND_PROTOCOLS_VERSION.tar.xz
(cd $WAYLAND_PROTOCOLS_VERSION && ./configure --prefix=$HOME/prefix && make install)
- wget $WAYLAND_RELEASES/$WAYLAND_PROTOCOLS_VERSION.tar.xz
- tar -axvf $WAYLAND_PROTOCOLS_VERSION.tar.xz
- (cd $WAYLAND_PROTOCOLS_VERSION && ./configure --prefix=$HOME/prefix && make install)
# Meson requires ninja >= 1.6, but trusty has 1.3.x
wget https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-linux.zip
unzip ninja-linux.zip
mv ninja $HOME/prefix/bin/
# Meson requires ninja >= 1.6, but trusty has 1.3.x
- wget https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-linux.zip;
- unzip ninja-linux.zip
- mv ninja $HOME/prefix/bin/
# Generate this header since one is missing on the Travis instance
mkdir -p linux
printf "%s\n" \
# Generate the header since one is missing on the Travis instance
- mkdir -p linux
- printf "%s\n" \
"#ifndef _LINUX_MEMFD_H" \
"#define _LINUX_MEMFD_H" \
"" \
@@ -517,7 +485,6 @@ install:
"#define MFD_ALLOW_SEALING 0x0002U" \
"" \
"#endif /* _LINUX_MEMFD_H */" > linux/memfd.h
fi
script:
- if test "x$BUILD" = xmake; then

View File

@@ -1 +1 @@
18.0.0-rc4
18.1.0-devel

View File

@@ -1,6 +0,0 @@
# fixes: The following commits were applied without the "cherry-picked from" tag
50265cd9ee4caffee853700bdcd75b92eedc0e7b automake: anv: ship anv_extensions_gen.py in the tarball
ac4437b20b87c7285b89466f05b51518ae616873 automake: small cleanup after the meson.build inclusion
# stable: The KHX extension is disabled all together in the stable branches.
bee9270853c34aa8e4b3d19a125608ee67c87b86 radv: Don't expose VK_KHX_multiview on android.

View File

@@ -685,19 +685,6 @@ AC_LINK_IFELSE(
LDFLAGS=$save_LDFLAGS
AM_CONDITIONAL(HAVE_LD_DYNAMIC_LIST, test "$have_ld_dynamic_list" = "yes")
dnl
dnl OSX linker does not support build-id
dnl
case "$host_os" in
darwin*)
LD_BUILD_ID=""
;;
*)
LD_BUILD_ID="-Wl,--build-id=sha1"
;;
esac
AC_SUBST([LD_BUILD_ID])
dnl
dnl compatibility symlinks
dnl
@@ -1283,10 +1270,10 @@ AC_ARG_ENABLE([xa],
[enable_xa=no])
AC_ARG_ENABLE([gbm],
[AS_HELP_STRING([--enable-gbm],
[enable gbm library @<:@default=yes except cygwin and macOS@:>@])],
[enable gbm library @<:@default=yes except cygwin@:>@])],
[enable_gbm="$enableval"],
[case "$host_os" in
cygwin* | darwin*)
cygwin*)
enable_gbm=no
;;
*)

View File

@@ -58,13 +58,7 @@ if you'd like to nominate a patch in the next stable release.
<td>Final planned release for the 17.3 series</td>
</tr>
<tr>
<td rowspan="7">18.0</td>
<td>2018-01-19</td>
<td>18.0.0-rc1</td>
<td>Emil Velikov</td>
<td></td>
</tr>
<tr>
<td rowspan="6">18.0</td>
<td>2018-01-26</td>
<td>18.0.0-rc2</td>
<td>Emil Velikov</td>

docs/relnotes/18.1.0.html Normal file
View File

@@ -0,0 +1,64 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Mesa Release Notes</title>
<link rel="stylesheet" type="text/css" href="../mesa.css">
</head>
<body>
<div class="header">
<h1>The Mesa 3D Graphics Library</h1>
</div>
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 18.1.0 Release Notes / TBD</h1>
<p>
Mesa 18.1.0 is a new development release. People who are concerned
with stability and reliability should stick with a previous release or
wait for Mesa 18.1.1.
</p>
<p>
Mesa 18.1.0 implements the OpenGL 4.5 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.5. OpenGL
4.5 is <strong>only</strong> available if requested at context creation
because compatibility contexts are not supported.
</p>
<h2>SHA256 checksums</h2>
<pre>
TBD.
</pre>
<h2>New features</h2>
<p>
Note: some of the new features are only available with certain drivers.
</p>
<ul>
TBD
</ul>
<h2>Bug fixes</h2>
<ul>
TBD
</ul>
<h2>Changes</h2>
<ul>
TBD
</ul>
</div>
</body>
</html>

View File

@@ -246,6 +246,10 @@ release.
Note: resending patch identical to one on mesa-dev@ or one that differs only
by the extra mesa-stable@ tag is <strong>not</strong> recommended.
</p>
<p>
If you are not the author of the original patch, please Cc: them in your
nomination request.
</p>
<h3 id="thetag">The stable tag</h3>

View File

@@ -6,7 +6,7 @@ extern "C" {
#endif
/*
** Copyright (c) 2015-2017 The Khronos Group Inc.
** Copyright (c) 2015-2018 The Khronos Group Inc.
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ extern "C" {
#define VK_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3ff)
#define VK_VERSION_PATCH(version) ((uint32_t)(version) & 0xfff)
// Version of this file
#define VK_HEADER_VERSION 66
#define VK_HEADER_VERSION 68
#define VK_NULL_HANDLE 0
@@ -304,6 +304,8 @@ typedef enum VkStructureType {
VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV = 1000098000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DISCARD_RECTANGLE_PROPERTIES_EXT = 1000099000,
VK_STRUCTURE_TYPE_PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT = 1000099001,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT = 1000101000,
VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT = 1000101001,
VK_STRUCTURE_TYPE_HDR_METADATA_EXT = 1000105000,
VK_STRUCTURE_TYPE_SHARED_PRESENT_SURFACE_CAPABILITIES_KHR = 1000111000,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO_KHR = 1000112000,
@@ -5240,12 +5242,12 @@ typedef enum VkDebugReportObjectTypeEXT {
VK_DEBUG_REPORT_OBJECT_TYPE_DISPLAY_MODE_KHR_EXT = 30,
VK_DEBUG_REPORT_OBJECT_TYPE_OBJECT_TABLE_NVX_EXT = 31,
VK_DEBUG_REPORT_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX_EXT = 32,
VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT = 33,
VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT_EXT = 33,
VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR_EXT = 1000085000,
VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION_KHR_EXT = 1000156000,
VK_DEBUG_REPORT_OBJECT_TYPE_BEGIN_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT,
VK_DEBUG_REPORT_OBJECT_TYPE_END_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT,
VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1),
VK_DEBUG_REPORT_OBJECT_TYPE_END_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT_EXT,
VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1),
VK_DEBUG_REPORT_OBJECT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF
} VkDebugReportObjectTypeEXT;
@@ -6532,6 +6534,47 @@ VKAPI_ATTR void VKAPI_CALL vkCmdSetDiscardRectangleEXT(
const VkRect2D* pDiscardRectangles);
#endif
#define VK_EXT_conservative_rasterization 1
#define VK_EXT_CONSERVATIVE_RASTERIZATION_SPEC_VERSION 1
#define VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME "VK_EXT_conservative_rasterization"
typedef enum VkConservativeRasterizationModeEXT {
VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT = 0,
VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT = 1,
VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT = 2,
VK_CONSERVATIVE_RASTERIZATION_MODE_BEGIN_RANGE_EXT = VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT,
VK_CONSERVATIVE_RASTERIZATION_MODE_END_RANGE_EXT = VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT,
VK_CONSERVATIVE_RASTERIZATION_MODE_RANGE_SIZE_EXT = (VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT - VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT + 1),
VK_CONSERVATIVE_RASTERIZATION_MODE_MAX_ENUM_EXT = 0x7FFFFFFF
} VkConservativeRasterizationModeEXT;
typedef VkFlags VkPipelineRasterizationConservativeStateCreateFlagsEXT;
typedef struct VkPhysicalDeviceConservativeRasterizationPropertiesEXT {
VkStructureType sType;
void* pNext;
float primitiveOverestimationSize;
float maxExtraPrimitiveOverestimationSize;
float extraPrimitiveOverestimationSizeGranularity;
VkBool32 primitiveUnderestimation;
VkBool32 conservativePointAndLineRasterization;
VkBool32 degenerateTrianglesRasterized;
VkBool32 degenerateLinesRasterized;
VkBool32 fullyCoveredFragmentShaderInputVariable;
VkBool32 conservativeRasterizationPostDepthCoverage;
} VkPhysicalDeviceConservativeRasterizationPropertiesEXT;
typedef struct VkPipelineRasterizationConservativeStateCreateInfoEXT {
VkStructureType sType;
const void* pNext;
VkPipelineRasterizationConservativeStateCreateFlagsEXT flags;
VkConservativeRasterizationModeEXT conservativeRasterizationMode;
float extraPrimitiveOverestimationSize;
} VkPipelineRasterizationConservativeStateCreateInfoEXT;
#define VK_EXT_swapchain_colorspace 1
#define VK_EXT_SWAPCHAIN_COLOR_SPACE_SPEC_VERSION 3
#define VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME "VK_EXT_swapchain_colorspace"
@@ -6861,6 +6904,7 @@ VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkValidationCacheEXT)
#define VK_EXT_VALIDATION_CACHE_SPEC_VERSION 1
#define VK_EXT_VALIDATION_CACHE_EXTENSION_NAME "VK_EXT_validation_cache"
#define VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT_EXT
typedef enum VkValidationCacheHeaderVersionEXT {

View File

@@ -57,10 +57,6 @@ dri_drivers_path = get_option('dri-drivers-path')
if dri_drivers_path == ''
dri_drivers_path = join_paths(get_option('libdir'), 'dri')
endif
dri_search_path = get_option('dri-search-path')
if dri_search_path == ''
dri_search_path = join_paths(get_option('prefix'), dri_drivers_path)
endif
with_gles1 = get_option('gles1')
with_gles2 = get_option('gles2')
@@ -357,15 +353,9 @@ endif
with_dri2 = (with_dri or with_any_vk) and with_dri_platform == 'drm'
with_dri3 = get_option('dri3')
if with_dri3 == 'auto'
if system_has_kms_drm and with_dri2
with_dri3 = true
else
with_dri3 = false
endif
elif with_dri3 == 'true'
with_dri3 = true
with_dri3 = system_has_kms_drm and with_dri2
else
with_dri3 = false
with_dri3 = with_dri3 == 'true'
endif
if with_any_vk and (with_platform_x11 and not with_dri3)
@@ -1010,23 +1000,15 @@ if with_gallium_opencl
# TODO: optional modules
endif
if with_amd_vk
_llvm_version = '>= 4.0.0'
elif with_gallium_opencl or with_gallium_swr or with_gallium_r600 or with_gallium_radeonsi
_llvm_version = '>= 3.9.0'
else
_llvm_version = '>= 3.3.0'
endif
_llvm = get_option('llvm')
if _llvm == 'auto'
dep_llvm = dependency(
'llvm', version : _llvm_version, modules : llvm_modules,
'llvm', version : '>= 3.9.0', modules : llvm_modules,
required : with_amd_vk or with_gallium_radeonsi or with_gallium_swr or with_gallium_opencl,
)
with_llvm = dep_llvm.found()
elif _llvm == 'true'
dep_llvm = dependency('llvm', version : _llvm_version, modules : llvm_modules)
dep_llvm = dependency('llvm', version : '>= 3.9.0', modules : llvm_modules)
with_llvm = true
else
dep_llvm = []
@@ -1045,7 +1027,7 @@ if with_llvm
_llvm_patch = _llvm_patch.split('g')[0]
endif
pre_args += [
'-DHAVE_LLVM=0x0@0@@1@@2@'.format(_llvm_version[0], _llvm_version[1], _llvm_patch),
'-DHAVE_LLVM=0x0@0@0@1@'.format(_llvm_version[0], _llvm_version[1]),
'-DMESA_LLVM_VERSION_PATCH=@0@'.format(_llvm_patch),
]
elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr
@@ -1231,10 +1213,8 @@ inc_include = include_directories('include')
gl_priv_reqs = [
'x11', 'xext', 'xdamage >= 1.1', 'xfixes', 'x11-xcb', 'xcb',
'xcb-glx >= 1.8.1']
if dep_libdrm.found()
gl_priv_reqs += 'libdrm >= 2.4.75'
endif
'xcb-glx >= 1.8.1', 'libdrm >= 2.4.75',
]
if dep_xxf86vm != [] and dep_xxf86vm.found()
gl_priv_reqs += 'xxf86vm'
endif

View File

@@ -41,13 +41,7 @@ option(
'dri-drivers-path',
type : 'string',
value : '',
description : 'Location to install dri drivers. Default: $libdir/dri.'
)
option(
'dri-search-path',
type : 'string',
value : '',
description : 'Locations to search for dri drivers, passed as colon separated list. Default: dri-drivers-path.'
description : 'Location of dri drivers. Default: $libdir/dri.'
)
option(
'gallium-drivers',

View File

@@ -923,6 +923,43 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
AC_FUNC_ATTR_LEGACY);
}
static LLVMValueRef
ac_build_buffer_load_common(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
LLVMValueRef vindex,
LLVMValueRef voffset,
unsigned num_channels,
bool glc,
bool slc,
bool can_speculate,
bool use_format)
{
LLVMValueRef args[] = {
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
voffset,
LLVMConstInt(ctx->i1, glc, 0),
LLVMConstInt(ctx->i1, slc, 0)
};
unsigned func = CLAMP(num_channels, 1, 3) - 1;
LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
const char *type_names[] = {"f32", "v2f32", "v4f32"};
char name[256];
if (use_format) {
snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
type_names[func]);
} else {
snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
type_names[func]);
}
return ac_build_intrinsic(ctx, name, types[func], args,
ARRAY_SIZE(args),
ac_get_load_intr_attribs(can_speculate));
}
LLVMValueRef
ac_build_buffer_load(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
@@ -967,47 +1004,21 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
return ac_build_gather_values(ctx, result, num_channels);
}
unsigned func = CLAMP(num_channels, 1, 3) - 1;
LLVMValueRef args[] = {
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
offset,
LLVMConstInt(ctx->i1, glc, 0),
LLVMConstInt(ctx->i1, slc, 0)
};
LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
ctx->v4f32};
const char *type_names[] = {"f32", "v2f32", "v4f32"};
char name[256];
snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
type_names[func]);
return ac_build_intrinsic(ctx, name, types[func], args,
ARRAY_SIZE(args),
ac_get_load_intr_attribs(can_speculate));
return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
num_channels, glc, slc,
can_speculate, false);
}
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
LLVMValueRef vindex,
LLVMValueRef voffset,
unsigned num_channels,
bool can_speculate)
{
LLVMValueRef args [] = {
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
vindex,
voffset,
ctx->i1false, /* glc */
ctx->i1false, /* slc */
};
return ac_build_intrinsic(ctx,
"llvm.amdgcn.buffer.load.format.v4f32",
ctx->v4f32, args, ARRAY_SIZE(args),
ac_get_load_intr_attribs(can_speculate));
return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
num_channels, false, false,
can_speculate, true);
}
/**

View File

@@ -214,6 +214,7 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
LLVMValueRef vindex,
LLVMValueRef voffset,
unsigned num_channels,
bool can_speculate);
LLVMValueRef

View File

@@ -326,6 +326,7 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
if (args->array_params_mask & (1 << i)) {
LLVMValueRef P = LLVMGetParam(main_function, i);
ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_BYVAL);
ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
ac_add_attr_dereferenceable(P, UINT64_MAX);
}
else {
@@ -555,10 +556,12 @@ static bool needs_view_index_sgpr(struct nir_to_llvm_context *ctx,
case MESA_SHADER_TESS_EVAL:
if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index))
return true;
break;
case MESA_SHADER_GEOMETRY:
case MESA_SHADER_TESS_CTRL:
if (ctx->shader_info->info.needs_multiview_view_index)
return true;
break;
default:
break;
}
@@ -1911,18 +1914,24 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
case nir_op_fmax:
result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum",
ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
if (instr->dest.dest.ssa.bit_size == 32)
if (ctx->ac.chip_class < GFX9 &&
instr->dest.dest.ssa.bit_size == 32) {
/* Only pre-GFX9 chips do not flush denorms. */
result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
ac_to_float_type(&ctx->ac, def_type),
result);
}
break;
case nir_op_fmin:
result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum",
ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
if (instr->dest.dest.ssa.bit_size == 32)
if (ctx->ac.chip_class < GFX9 &&
instr->dest.dest.ssa.bit_size == 32) {
/* Only pre-GFX9 chips do not flush denorms. */
result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
ac_to_float_type(&ctx->ac, def_type),
result);
}
break;
case nir_op_ffma:
result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd",
@@ -2307,10 +2316,13 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
struct ac_image_args *args)
{
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
return ac_build_buffer_load_format(&ctx->ac,
args->resource,
args->addr,
ctx->ac.i32_0,
util_last_bit(mask),
true);
}
@@ -4549,11 +4561,14 @@ static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
LLVMValueRef buffer_ptr, bool write)
{
struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
LLVMValueRef result;
if (write && ctx->stage == MESA_SHADER_FRAGMENT)
ctx->shader_info->fs.writes_memory = true;
LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
return LLVMBuildLoad(ctx->builder, buffer_ptr, "");
result = LLVMBuildLoad(ctx->builder, buffer_ptr, "");
LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
return result;
}
static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
@@ -4589,9 +4604,6 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
assert(base_index < layout->binding_count);
if (write && ctx->stage == MESA_SHADER_FRAGMENT)
ctx->shader_info->fs.writes_memory = true;
switch (desc_type) {
case AC_DESC_IMAGE:
type = ctx->ac.v8i32;
@@ -5365,7 +5377,7 @@ handle_vs_input_decl(struct nir_to_llvm_context *ctx,
input = ac_build_buffer_load_format(&ctx->ac, t_list,
buffer_index,
ctx->ac.i32_0,
true);
4, true);
for (unsigned chan = 0; chan < 4; chan++) {
LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);

View File

@@ -179,7 +179,6 @@ struct ac_shader_variant_info {
bool writes_stencil;
bool writes_sample_mask;
bool early_fragment_test;
bool writes_memory;
bool prim_id_input;
bool layer_input;
} fs;

View File

@@ -31,7 +31,7 @@ static void mark_sampler_desc(const nir_variable *var,
}
static void
gather_intrinsic_info(const nir_intrinsic_instr *instr,
gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
struct ac_shader_info *info)
{
switch (instr->intrinsic) {
@@ -104,15 +104,43 @@ gather_intrinsic_info(const nir_intrinsic_instr *instr,
dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
info->ps.uses_input_attachments = true;
mark_sampler_desc(instr->variables[0]->var, info);
if (nir_intrinsic_image_store ||
nir_intrinsic_image_atomic_add ||
nir_intrinsic_image_atomic_min ||
nir_intrinsic_image_atomic_max ||
nir_intrinsic_image_atomic_and ||
nir_intrinsic_image_atomic_or ||
nir_intrinsic_image_atomic_xor ||
nir_intrinsic_image_atomic_exchange ||
nir_intrinsic_image_atomic_comp_swap) {
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
}
break;
}
case nir_intrinsic_store_ssbo:
case nir_intrinsic_ssbo_atomic_add:
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_ssbo_atomic_umin:
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_ssbo_atomic_umax:
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_ssbo_atomic_or:
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_ssbo_atomic_comp_swap:
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
break;
default:
break;
}
}
static void
gather_tex_info(const nir_tex_instr *instr, struct ac_shader_info *info)
gather_tex_info(const nir_shader *nir, const nir_tex_instr *instr,
struct ac_shader_info *info)
{
if (instr->sampler)
mark_sampler_desc(instr->sampler->var, info);
@@ -121,15 +149,16 @@ gather_tex_info(const nir_tex_instr *instr, struct ac_shader_info *info)
}
static void
gather_info_block(const nir_block *block, struct ac_shader_info *info)
gather_info_block(const nir_shader *nir, const nir_block *block,
struct ac_shader_info *info)
{
nir_foreach_instr(instr, block) {
switch (instr->type) {
case nir_instr_type_intrinsic:
gather_intrinsic_info(nir_instr_as_intrinsic(instr), info);
gather_intrinsic_info(nir, nir_instr_as_intrinsic(instr), info);
break;
case nir_instr_type_tex:
gather_tex_info(nir_instr_as_tex(instr), info);
gather_tex_info(nir, nir_instr_as_tex(instr), info);
break;
default:
break;
@@ -165,6 +194,6 @@ ac_nir_shader_info_pass(const struct nir_shader *nir,
gather_info_input_decl(nir, variable, info);
nir_foreach_block(block, func->impl) {
gather_info_block(block, info);
gather_info_block(nir, block, info);
}
}

View File

@@ -42,6 +42,7 @@ struct ac_shader_info {
bool force_persample;
bool needs_sample_positions;
bool uses_input_attachments;
bool writes_memory;
} ps;
struct {
bool uses_grid_size;

View File

@@ -429,18 +429,26 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
}
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer)
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
enum radv_cmd_flush_bits flags)
{
if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
enum radv_cmd_flush_bits flags;
uint32_t *ptr = NULL;
uint64_t va = 0;
/* Force wait for graphics/compute engines to be idle. */
flags = RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
si_cs_emit_cache_flush(cmd_buffer->cs,
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
va = radv_buffer_get_va(cmd_buffer->gfx9_fence_bo) +
cmd_buffer->gfx9_fence_offset;
ptr = &cmd_buffer->gfx9_fence_idx;
}
/* Force wait for graphics or compute engines to be idle. */
si_cs_emit_cache_flush(cmd_buffer->cs, false,
cmd_buffer->device->physical_device->rad_info.chip_class,
NULL, 0,
ptr, va,
radv_cmd_buffer_uses_mec(cmd_buffer),
flags);
}
@@ -3501,7 +3509,7 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
}
assert(cmd_buffer->cs->cdw <= cdw_max);
radv_cmd_buffer_after_draw(cmd_buffer);
radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
}
void radv_CmdDraw(
@@ -3821,7 +3829,7 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
radv_emit_dispatch_packets(cmd_buffer, info);
}
radv_cmd_buffer_after_draw(cmd_buffer);
radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
}
void radv_CmdDispatch(

View File

@@ -1771,6 +1771,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
if (i == 0) {
si_cs_emit_cache_flush(cs,
false,
queue->device->physical_device->rad_info.chip_class,
NULL, 0,
queue->queue_family_index == RING_COMPUTE &&
@@ -1782,6 +1783,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
RADV_CMD_FLAG_INV_GLOBAL_L2);
} else if (i == 1) {
si_cs_emit_cache_flush(cs,
false,
queue->device->physical_device->rad_info.chip_class,
NULL, 0,
queue->queue_family_index == RING_COMPUTE &&
@@ -1994,32 +1996,6 @@ VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
return ret;
}
/* Signals fence as soon as all the work currently put on queue is done. */
static VkResult radv_signal_fence(struct radv_queue *queue,
struct radv_fence *fence)
{
int ret;
VkResult result;
struct radv_winsys_sem_info sem_info;
result = radv_alloc_sem_info(&sem_info, 0, NULL, 0, NULL,
radv_fence_to_handle(fence));
if (result != VK_SUCCESS)
return result;
ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
1, NULL, NULL, &sem_info,
false, fence->fence);
radv_free_sem_info(&sem_info);
/* TODO: find a better error */
if (ret)
return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
return VK_SUCCESS;
}
VkResult radv_QueueSubmit(
VkQueue _queue,
uint32_t submitCount,
@@ -2148,7 +2124,18 @@ VkResult radv_QueueSubmit(
if (fence) {
if (!fence_emitted) {
radv_signal_fence(queue, fence);
struct radv_winsys_sem_info sem_info;
result = radv_alloc_sem_info(&sem_info, 0, NULL, 0, NULL,
_fence);
if (result != VK_SUCCESS)
return result;
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
1, NULL, NULL, &sem_info,
false, base_fence);
radv_free_sem_info(&sem_info);
}
fence->submitted = true;
}
@@ -2669,11 +2656,8 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
}
if (fence) {
if (!fence_emitted) {
radv_signal_fence(queue, fence);
}
fence->submitted = true;
if (fence && !fence_emitted) {
fence->signalled = true;
}
return VK_SUCCESS;

View File

@@ -81,7 +81,7 @@ EXTENSIONS = [
Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'),
Extension('VK_KHR_xcb_surface', 6, 'VK_USE_PLATFORM_XCB_KHR'),
Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'),
Extension('VK_KHX_multiview', 1, False),
Extension('VK_KHX_multiview', 1, True),
Extension('VK_EXT_debug_report', 9, True),
Extension('VK_EXT_discard_rectangles', 1, True),
Extension('VK_EXT_external_memory_dma_buf', 1, True),

View File

@@ -1069,55 +1069,10 @@ radv_image_view_init(struct radv_image_view *iview,
}
if (iview->vk_format != image->vk_format) {
unsigned view_bw = vk_format_get_blockwidth(iview->vk_format);
unsigned view_bh = vk_format_get_blockheight(iview->vk_format);
unsigned img_bw = vk_format_get_blockwidth(image->vk_format);
unsigned img_bh = vk_format_get_blockheight(image->vk_format);
iview->extent.width = round_up_u32(iview->extent.width * view_bw, img_bw);
iview->extent.height = round_up_u32(iview->extent.height * view_bh, img_bh);
/* Comment ported from amdvlk -
* If we have the following image:
* Uncompressed pixels Compressed block sizes (4x4)
* mip0: 22 x 22 6 x 6
* mip1: 11 x 11 3 x 3
* mip2: 5 x 5 2 x 2
* mip3: 2 x 2 1 x 1
* mip4: 1 x 1 1 x 1
*
* On GFX9 the descriptor is always programmed with the WIDTH and HEIGHT of the base level and the HW is
* calculating the degradation of the block sizes down the mip-chain as follows (straight-up
* divide-by-two integer math):
* mip0: 6x6
* mip1: 3x3
* mip2: 1x1
* mip3: 1x1
*
* This means that mip2 will be missing texels.
*
* Fix this by calculating the base mip's width and height, then convert that, and round it
* back up to get the level 0 size.
* Clamp the converted size between the original values, and next power of two, which
* means we don't oversize the image.
*/
if (device->physical_device->rad_info.chip_class >= GFX9 &&
vk_format_is_compressed(image->vk_format) &&
!vk_format_is_compressed(iview->vk_format)) {
unsigned rounded_img_w = util_next_power_of_two(iview->extent.width);
unsigned rounded_img_h = util_next_power_of_two(iview->extent.height);
unsigned lvl_width = radv_minify(image->info.width , range->baseMipLevel);
unsigned lvl_height = radv_minify(image->info.height, range->baseMipLevel);
lvl_width = round_up_u32(lvl_width * view_bw, img_bw);
lvl_height = round_up_u32(lvl_height * view_bh, img_bh);
lvl_width <<= range->baseMipLevel;
lvl_height <<= range->baseMipLevel;
iview->extent.width = CLAMP(lvl_width, iview->extent.width, rounded_img_w);
iview->extent.height = CLAMP(lvl_height, iview->extent.height, rounded_img_h);
}
iview->extent.width = round_up_u32(iview->extent.width * vk_format_get_blockwidth(iview->vk_format),
vk_format_get_blockwidth(image->vk_format));
iview->extent.height = round_up_u32(iview->extent.height * vk_format_get_blockheight(iview->vk_format),
vk_format_get_blockheight(image->vk_format));
}
iview->base_layer = range->baseArrayLayer;

View File

@@ -714,6 +714,9 @@ radv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline,
S_028800_Z_WRITE_ENABLE(vkds->depthWriteEnable ? 1 : 0) |
S_028800_ZFUNC(vkds->depthCompareOp) |
S_028800_DEPTH_BOUNDS_ENABLE(vkds->depthBoundsTestEnable ? 1 : 0);
/* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */
ds->db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(attachment->samples > 2);
}
if (has_stencil_attachment && vkds->stencilTestEnable) {
@@ -2494,7 +2497,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
unsigned z_order;
pipeline->graphics.db_shader_control = 0;
if (ps->info.fs.early_fragment_test || !ps->info.fs.writes_memory)
if (ps->info.fs.early_fragment_test || !ps->info.info.ps.writes_memory)
z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
else
z_order = V_02880C_LATE_Z;
@@ -2506,8 +2509,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
S_02880C_MASK_EXPORT_ENABLE(ps->info.fs.writes_sample_mask) |
S_02880C_Z_ORDER(z_order) |
S_02880C_DEPTH_BEFORE_SHADER(ps->info.fs.early_fragment_test) |
S_02880C_EXEC_ON_HIER_FAIL(ps->info.fs.writes_memory) |
S_02880C_EXEC_ON_NOOP(ps->info.fs.writes_memory);
S_02880C_EXEC_ON_HIER_FAIL(ps->info.info.ps.writes_memory) |
S_02880C_EXEC_ON_NOOP(ps->info.info.ps.writes_memory);
if (pipeline->device->physical_device->has_rbplus)
pipeline->graphics.db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);

View File

@@ -1021,6 +1021,7 @@ void si_emit_wait_fence(struct radeon_winsys_cs *cs,
uint64_t va, uint32_t ref,
uint32_t mask);
void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
bool predicated,
enum chip_class chip_class,
uint32_t *fence_ptr, uint64_t va,
bool is_mec,

View File

@@ -917,6 +917,7 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs,
void
si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
bool predicated,
enum chip_class chip_class,
uint32_t *flush_cnt,
uint64_t flush_va,
@@ -947,7 +948,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
/* Necessary for DCC */
if (chip_class >= VI) {
si_cs_emit_write_event_eop(cs,
false,
predicated,
chip_class,
is_mec,
V_028A90_FLUSH_AND_INV_CB_DATA_TS,
@@ -961,12 +962,12 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
}
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
}
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
}
@@ -979,7 +980,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
}
if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
}
@@ -1036,14 +1037,14 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
assert(flush_cnt);
uint32_t old_fence = (*flush_cnt)++;
si_cs_emit_write_event_eop(cs, false, chip_class, false, cb_db_event, tc_flags, 1,
si_cs_emit_write_event_eop(cs, predicated, chip_class, false, cb_db_event, tc_flags, 1,
flush_va, old_fence, *flush_cnt);
si_emit_wait_fence(cs, false, flush_va, *flush_cnt, 0xffffffff);
si_emit_wait_fence(cs, predicated, flush_va, *flush_cnt, 0xffffffff);
}
/* VGT state sync */
if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, predicated));
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
}
@@ -1056,13 +1057,13 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
RADV_CMD_FLAG_INV_GLOBAL_L2 |
RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
!is_mec) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, predicated));
radeon_emit(cs, 0);
}
if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
(chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9,
si_emit_acquire_mem(cs, is_mec, predicated, chip_class >= GFX9,
cp_coher_cntl |
S_0085F0_TC_ACTION_ENA(1) |
S_0085F0_TCL1_ACTION_ENA(1) |
@@ -1076,7 +1077,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
*
* WB doesn't work without NC.
*/
si_emit_acquire_mem(cs, is_mec, false,
si_emit_acquire_mem(cs, is_mec, predicated,
chip_class >= GFX9,
cp_coher_cntl |
S_0301F0_TC_WB_ACTION_ENA(1) |
@@ -1085,7 +1086,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
}
if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
si_emit_acquire_mem(cs, is_mec,
false, chip_class >= GFX9,
predicated, chip_class >= GFX9,
cp_coher_cntl |
S_0085F0_TCL1_ACTION_ENA(1));
cp_coher_cntl = 0;
@@ -1096,7 +1097,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
* Therefore, it should be last. Done in PFP.
*/
if (cp_coher_cntl)
si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9, cp_coher_cntl);
si_emit_acquire_mem(cs, is_mec, predicated, chip_class >= GFX9, cp_coher_cntl);
}
void
@@ -1126,6 +1127,7 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
ptr = &cmd_buffer->gfx9_fence_idx;
}
si_cs_emit_cache_flush(cmd_buffer->cs,
cmd_buffer->state.predicating,
cmd_buffer->device->physical_device->rad_info.chip_class,
ptr, va,
radv_cmd_buffer_uses_mec(cmd_buffer),

View File

@@ -585,7 +585,6 @@ union packed_tex_data {
unsigned component:2;
unsigned has_texture_deref:1;
unsigned has_sampler_deref:1;
unsigned unused:10; /* Mark unused for valgrind. */
} u;
};

View File

@@ -115,6 +115,7 @@ struct _egl_extensions
EGLBoolean KHR_config_attribs;
EGLBoolean KHR_context_flush_control;
EGLBoolean KHR_create_context;
EGLBoolean KHR_create_context_no_error;
EGLBoolean KHR_fence_sync;
EGLBoolean KHR_get_all_proc_addresses;
EGLBoolean KHR_gl_colorspace;
@@ -130,7 +131,6 @@ struct _egl_extensions
EGLBoolean KHR_reusable_sync;
EGLBoolean KHR_surfaceless_context;
EGLBoolean KHR_wait_sync;
EGLBoolean KHR_create_context_no_error;
EGLBoolean MESA_drm_image;
EGLBoolean MESA_image_dma_buf_export;

View File

@@ -160,7 +160,7 @@ libegl = shared_library(
c_args : [
c_vis_args,
c_args_for_egl,
'-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path),
'-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_driver_dir),
'-D_EGL_BUILT_IN_DRIVER_DRI2',
'-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_@0@'.format(egl_native_platform.to_upper()),
],

View File

@@ -33,7 +33,6 @@
#include "state_tracker/drm_driver.h"
#include "pipe/p_screen.h"
#include "util/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
@@ -74,7 +73,7 @@ renderonly_create_kms_dumb_buffer_for_resource(struct pipe_resource *rsc,
struct drm_mode_create_dumb create_dumb = {
.width = rsc->width0,
.height = rsc->height0,
.bpp = util_format_get_blocksizebits(rsc->format),
.bpp = 32,
};
struct drm_mode_destroy_dumb destroy_dumb = { };

View File

@@ -228,7 +228,7 @@ util_probe_rect_rgba_multi(struct pipe_context *ctx, struct pipe_resource *tex,
expected[e*4], expected[e*4+1],
expected[e*4+2], expected[e*4+3]);
printf("Got: %.3f, %.3f, %.3f, %.3f\n",
probe[0], probe[1], probe[2], probe[2]);
probe[0], probe[1], probe[2], probe[3]);
pass = false;
goto done;
}
@@ -592,6 +592,113 @@ test_sync_file_fences(struct pipe_context *ctx)
util_report_result(pass);
}
static void
test_texture_barrier(struct pipe_context *ctx, bool use_fbfetch)
{
struct cso_context *cso;
struct pipe_resource *cb;
void *fs, *vs;
struct pipe_sampler_view *view = NULL;
const char *text;
if (!ctx->screen->get_param(ctx->screen, PIPE_CAP_TEXTURE_BARRIER)) {
util_report_result_helper(SKIP, "%s: %s", __func__,
use_fbfetch ? "FBFETCH" : "sampler");
return;
}
if (use_fbfetch &&
!ctx->screen->get_param(ctx->screen, PIPE_CAP_TGSI_FS_FBFETCH)) {
util_report_result_helper(SKIP, "%s: %s", __func__,
use_fbfetch ? "FBFETCH" : "sampler");
return;
}
cso = cso_create_context(ctx, 0);
cb = util_create_texture2d(ctx->screen, 256, 256,
PIPE_FORMAT_R8G8B8A8_UNORM);
util_set_common_states_and_clear(cso, ctx, cb);
if (use_fbfetch) {
/* Fragment shader. */
text = "FRAG\n"
"DCL OUT[0], COLOR[0]\n"
"DCL TEMP[0]\n"
"IMM[0] FLT32 { 0.1, 0.2, 0.3, 0.4}\n"
"FBFETCH TEMP[0], OUT[0]\n"
"ADD OUT[0], TEMP[0], IMM[0]\n"
"END\n";
} else {
struct pipe_sampler_view templ = {{0}};
templ.format = cb->format;
templ.target = cb->target;
templ.swizzle_r = PIPE_SWIZZLE_X;
templ.swizzle_g = PIPE_SWIZZLE_Y;
templ.swizzle_b = PIPE_SWIZZLE_Z;
templ.swizzle_a = PIPE_SWIZZLE_W;
view = ctx->create_sampler_view(ctx, cb, &templ);
ctx->set_sampler_views(ctx, PIPE_SHADER_FRAGMENT, 0, 1, &view);
/* Fragment shader. */
text = "FRAG\n"
"DCL SV[0], POSITION\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL OUT[0], COLOR[0]\n"
"DCL TEMP[0]\n"
"IMM[0] FLT32 { 0.1, 0.2, 0.3, 0.4}\n"
"IMM[1] INT32 { 0, 0, 0, 0}\n"
"F2I TEMP[0].xy, SV[0].xyyy\n"
"MOV TEMP[0].z, IMM[1].xxxx\n"
"TXF TEMP[0], TEMP[0].xyzz, SAMP[0], 2D\n"
"ADD OUT[0], TEMP[0], IMM[0]\n"
"END\n";
}
struct tgsi_token tokens[1000];
struct pipe_shader_state state;
if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
assert(0);
util_report_result_helper(FAIL, "%s: %s", __func__,
use_fbfetch ? "FBFETCH" : "sampler");
return;
}
pipe_shader_state_from_tgsi(&state, tokens);
#if 0
tgsi_dump(state.tokens, 0);
#endif
fs = ctx->create_fs_state(ctx, &state);
cso_set_fragment_shader_handle(cso, fs);
/* Vertex shader. */
vs = util_set_passthrough_vertex_shader(cso, ctx, false);
for (int i = 0; i < 2; i++) {
ctx->texture_barrier(ctx,
use_fbfetch ? PIPE_TEXTURE_BARRIER_FRAMEBUFFER :
PIPE_TEXTURE_BARRIER_SAMPLER);
util_draw_fullscreen_quad(cso);
}
/* Probe pixels. */
static const float expected[] = {0.3, 0.5, 0.7, 0.9};
bool pass = util_probe_rect_rgba(ctx, cb, 0, 0,
cb->width0, cb->height0, expected);
/* Cleanup. */
cso_destroy_context(cso);
ctx->delete_vs_state(ctx, vs);
ctx->delete_fs_state(ctx, fs);
pipe_sampler_view_reference(&view, NULL);
pipe_resource_reference(&cb, NULL);
util_report_result_helper(pass, "%s: %s", __func__,
use_fbfetch ? "FBFETCH" : "sampler");
}
/**
* Run all tests. This should be run with a clean context after
* context_create.
@@ -607,6 +714,8 @@ util_run_tests(struct pipe_screen *screen)
null_sampler_view(ctx, TGSI_TEXTURE_BUFFER);
util_test_constant_buffer(ctx, NULL);
test_sync_file_fences(ctx);
test_texture_barrier(ctx, false);
test_texture_barrier(ctx, true);
ctx->destroy(ctx);

View File

@@ -766,7 +766,7 @@ static void compute_emit_cs(struct r600_context *rctx,
} else {
uint32_t rat_mask;
rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
rat_mask = ((1ULL << (((unsigned)rctx->cb_misc_state.nr_image_rats + rctx->cb_misc_state.nr_buffer_rats) * 4)) - 1);
radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
rat_mask);
}

View File

@@ -1998,31 +1998,13 @@ static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600
pa_su_poly_offset_db_fmt_cntl);
}
uint32_t evergreen_construct_rat_mask(struct r600_context *rctx, struct r600_cb_misc_state *a,
unsigned nr_cbufs)
{
unsigned base_mask = 0;
unsigned dirty_mask = a->image_rat_enabled_mask;
while (dirty_mask) {
unsigned idx = u_bit_scan(&dirty_mask);
base_mask |= (0xf << (idx * 4));
}
unsigned offset = util_last_bit(a->image_rat_enabled_mask);
dirty_mask = a->buffer_rat_enabled_mask;
while (dirty_mask) {
unsigned idx = u_bit_scan(&dirty_mask);
base_mask |= (0xf << (idx + offset) * 4);
}
return base_mask << (nr_cbufs * 4);
}
static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
{
struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
unsigned fb_colormask = (1ULL << ((unsigned)a->nr_cbufs * 4)) - 1;
unsigned ps_colormask = (1ULL << ((unsigned)a->nr_ps_color_outputs * 4)) - 1;
unsigned rat_colormask = evergreen_construct_rat_mask(rctx, a, a->nr_cbufs);
unsigned rat_colormask = ((1ULL << ((unsigned)(a->nr_image_rats + a->nr_buffer_rats) * 4)) - 1) << (a->nr_cbufs * 4);
radeon_set_context_reg_seq(cs, R_028238_CB_TARGET_MASK, 2);
radeon_emit(cs, (a->blend_colormask & fb_colormask) | rat_colormask); /* R_028238_CB_TARGET_MASK */
/* This must match the used export instructions exactly.
@@ -4050,9 +4032,8 @@ static void evergreen_set_shader_buffers(struct pipe_context *ctx,
if (old_mask != istate->enabled_mask)
r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
/* construct the target mask */
if (rctx->cb_misc_state.buffer_rat_enabled_mask != istate->enabled_mask) {
rctx->cb_misc_state.buffer_rat_enabled_mask = istate->enabled_mask;
if (rctx->cb_misc_state.nr_buffer_rats != util_bitcount(istate->enabled_mask)) {
rctx->cb_misc_state.nr_buffer_rats = util_bitcount(istate->enabled_mask);
r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
}
@@ -4227,8 +4208,8 @@ static void evergreen_set_shader_images(struct pipe_context *ctx,
if (old_mask != istate->enabled_mask)
r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
if (rctx->cb_misc_state.image_rat_enabled_mask != istate->enabled_mask) {
rctx->cb_misc_state.image_rat_enabled_mask = istate->enabled_mask;
if (rctx->cb_misc_state.nr_image_rats != util_bitcount(istate->enabled_mask)) {
rctx->cb_misc_state.nr_image_rats = util_bitcount(istate->enabled_mask);
r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
}

View File

@@ -152,8 +152,8 @@ struct r600_cb_misc_state {
unsigned blend_colormask; /* 8*4 bits for 8 RGBA colorbuffers */
unsigned nr_cbufs;
unsigned nr_ps_color_outputs;
unsigned image_rat_enabled_mask;
unsigned buffer_rat_enabled_mask;
unsigned nr_image_rats;
unsigned nr_buffer_rats;
bool multiwrite;
bool dual_src_blend;
};
@@ -700,9 +700,6 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx,
struct r600_surface *surf);
void evergreen_update_db_shader_control(struct r600_context * rctx);
bool evergreen_adjust_gprs(struct r600_context *rctx);
uint32_t evergreen_construct_rat_mask(struct r600_context *rctx, struct r600_cb_misc_state *a,
unsigned nr_cbufs);
/* r600_blit.c */
void r600_init_blit_functions(struct r600_context *rctx);
void r600_decompress_depth_textures(struct r600_context *rctx,

View File

@@ -665,7 +665,6 @@ public:
return false;
switch (hw_chip) {
case HW_CHIP_HEMLOCK:
case HW_CHIP_CYPRESS:
case HW_CHIP_JUNIPER:
return false;

View File

@@ -208,25 +208,8 @@ void bc_finalizer::finalize_if(region_node* r) {
r->push_front(if_jump);
r->push_back(if_pop);
/* the depart/repeat 1 is actually part of the "else" code.
* if it's a depart for an outer loop region it will want to
* insert a LOOP_BREAK or LOOP_CONTINUE in here, so we need
* to emit the else clause.
*/
bool has_else = n_if->next;
if (repdep1->is_depart()) {
depart_node *dep1 = static_cast<depart_node*>(repdep1);
if (dep1->target != r && dep1->target->is_loop())
has_else = true;
}
if (repdep1->is_repeat()) {
repeat_node *rep1 = static_cast<repeat_node*>(repdep1);
if (rep1->target != r && rep1->target->is_loop())
has_else = true;
}
if (has_else) {
cf_node *nelse = sh.create_cf(CF_OP_ELSE);
n_if->insert_after(nelse);

View File

@@ -610,7 +610,7 @@ void si_llvm_load_input_vs(
input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
vertex_index, voffset,
true);
4, true);
}
/* Break up the vec4 into individual components */

View File

@@ -1826,7 +1826,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
emit_data->args[0],
emit_data->args[2],
emit_data->args[1],
true);
4, true);
return;
}

View File

@@ -448,10 +448,10 @@ typedef enum {
SVGADX_DXFMT_MULTISAMPLE_8 )
typedef union {
Bool b;
SVGA3dBool b;
uint32 u;
int32 i;
float f;
int32 i;
float f;
} SVGA3dDevCapResult;
#endif /* _SVGA3D_DEVCAPS_H_ */

View File

@@ -134,7 +134,7 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga = CALLOC_STRUCT(svga_context);
if (!svga)
goto cleanup;
goto done;
LIST_INITHEAD(&svga->dirty_buffers);

View File

@@ -140,6 +140,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/builder.cpp \
rasterizer/jitter/builder.h \
rasterizer/jitter/builder_math.h \
rasterizer/jitter/builder_mem.cpp \
rasterizer/jitter/builder_mem.h \
rasterizer/jitter/builder_misc.cpp \
rasterizer/jitter/builder_misc.h \
rasterizer/jitter/fetch_jit.cpp \

View File

@@ -1,4 +1,4 @@
# Copyright © 2017-2018 Intel Corporation
# Copyright © 2017 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -68,6 +68,8 @@ files_swr_mesa = files(
'rasterizer/jitter/builder.cpp',
'rasterizer/jitter/builder.h',
'rasterizer/jitter/builder_math.h',
'rasterizer/jitter/builder_mem.cpp',
'rasterizer/jitter/builder_mem.h',
'rasterizer/jitter/builder_misc.cpp',
'rasterizer/jitter/builder_misc.h',
'rasterizer/jitter/fetch_jit.cpp',
@@ -149,22 +151,7 @@ files_swr_arch = files(
swr_context_files = files('swr_context.h')
swr_state_files = files('rasterizer/core/state.h')
swr_event_proto_files = files('rasterizer/archrast/events.proto')
swr_gen_backend_files = files('rasterizer/codegen/templates/gen_backend.cpp')
swr_gen_rasterizer_files = files('rasterizer/codegen/templates/gen_rasterizer.cpp')
swr_gen_header_init_files = files('rasterizer/codegen/templates/gen_header_init.hpp')
swr_gen_llvm_ir_macros_py = files('rasterizer/codegen/gen_llvm_ir_macros.py')
swr_gen_backends_py = files('rasterizer/codegen/gen_backends.py')
swr_gen_builder_depends = files(
'rasterizer/codegen/templates/gen_builder.hpp',
'rasterizer/codegen/gen_common.py'
)
subdir('rasterizer/jitter')
subdir('rasterizer/codegen')
subdir('rasterizer/core/backends')
swr_incs = include_directories(
'rasterizer/codegen', 'rasterizer/core', 'rasterizer/jitter',
@@ -193,7 +180,7 @@ if with_swr_arches.contains('avx')
swr_arch_defines += '-DHAVE_SWR_AVX'
swr_arch_libs += shared_library(
'swrAVX',
[files_swr_common, files_swr_arch],
files_swr_common,
cpp_args : [swr_cpp_args, swr_avx_args, '-DKNOB_ARCH=KNOB_ARCH_AVX'],
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
@@ -225,7 +212,7 @@ if with_swr_arches.contains('avx2')
swr_arch_defines += '-DHAVE_SWR_AVX2'
swr_arch_libs += shared_library(
'swrAVX2',
[files_swr_common, files_swr_arch],
files_swr_common,
cpp_args : [swr_cpp_args, swr_avx2_args, '-DKNOB_ARCH=KNOB_ARCH_AVX2'],
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
@@ -249,7 +236,7 @@ if with_swr_arches.contains('knl')
swr_arch_defines += '-DHAVE_SWR_KNL'
swr_arch_libs += shared_library(
'swrKNL',
[files_swr_common, files_swr_arch],
files_swr_common,
cpp_args : [
swr_cpp_args, swr_knl_args, '-DKNOB_ARCH=KNOB_ARCH_AVX512',
'-DKNOB_ARCH_KNIGHTS',
@@ -276,7 +263,7 @@ if with_swr_arches.contains('skx')
swr_arch_defines += '-DHAVE_SWR_SKX'
swr_arch_libs += shared_library(
'swrSKX',
[files_swr_common, files_swr_arch],
files_swr_common,
cpp_args : [swr_cpp_args, swr_skx_args, '-DKNOB_ARCH=KNOB_ARCH_AVX512'],
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],

View File

@@ -1,4 +1,4 @@
# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),

View File

@@ -1,4 +1,4 @@
# Copyright © 2017-2018 Intel Corporation
# Copyright © 2017 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -40,6 +40,40 @@ gen_knobs_h = custom_target(
),
)
gen_builder_hpp = custom_target(
'gen_builder.hpp',
input : [
'gen_llvm_ir_macros.py',
join_paths(
dep_llvm.get_configtool_variable('includedir'), 'llvm', 'IR',
'IRBuilder.h'
)
],
output : 'gen_builder.hpp',
command : [
prog_python2, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@',
'--gen_h', '--output-dir', meson.current_build_dir()
],
depend_files : files(
'templates/gen_builder.hpp',
'gen_common.py',
),
build_by_default : true,
)
gen_builder_x86_hpp = custom_target(
'gen_builder_x86.hpp',
input : 'gen_llvm_ir_macros.py',
output : 'gen_builder_x86.hpp',
command : [
prog_python2, '@INPUT0@', '--gen_x86_h', '--output', '@OUTPUT@',
'--output-dir', meson.current_build_dir()
],
depend_files : files(
'templates/gen_builder.hpp',
'gen_common.py',
),
)
# The generators above this are needed individually, while the below generators
# are all inputs to the same lib, so they don't need unique names.
@@ -80,3 +114,45 @@ foreach x : [['gen_ar_event.hpp', '--gen_event_hpp'],
)
endforeach
files_swr_common += custom_target(
'gen_backend_pixel',
input : 'gen_backends.py',
output : [
'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp',
'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp',
'gen_BackendPixelRate.hpp',
],
command : [
prog_python2, '@INPUT@',
'--outdir', meson.current_build_dir(),
'--dim', '5', '2', '3', '2', '2', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : files(
'templates/gen_backend.cpp',
'templates/gen_header_init.hpp',
),
)
files_swr_common += custom_target(
'gen_backend_raster',
input : 'gen_backends.py',
output : [
'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp',
'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp',
'gen_rasterizer.hpp',
],
command : [
prog_python2, '@INPUT@',
'--outdir', meson.current_build_dir(),
'--rast',
'--dim', '5', '2', '2', '3', '5', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : files(
'templates/gen_rasterizer.cpp',
'templates/gen_header_init.hpp',
),
)

View File

@@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2015-2017 Intel Corporation. All Rights Reserved.
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

View File

@@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2015-2017 Intel Corporation. All Rights Reserved.
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

View File

@@ -23,6 +23,7 @@
#include "common/os.h"
#include <vector>
#include <array>
#include <sstream>
#if defined(_WIN32)
@@ -151,3 +152,160 @@ void SWR_API CreateDirectoryPath(const std::string& path)
}
#endif // Unix
}
/// Execute Command (block until finished)
/// @returns process exit value
int SWR_API ExecCmd(
const std::string& cmd, ///< (In) Command line string
const char* pOptEnvStrings, ///< (Optional In) Environment block for new process
std::string* pOptStdOut, ///< (Optional Out) Standard Output text
std::string* pOptStdErr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn) ///< (Optional In) Standard Input text
{
int rvalue = -1;
#if defined(_WIN32)
struct WinPipe
{
HANDLE hRead;
HANDLE hWrite;
};
std::array<WinPipe, 3> hPipes = {};
SECURITY_ATTRIBUTES saAttr = { sizeof(SECURITY_ATTRIBUTES) };
saAttr.bInheritHandle = TRUE; // Pipe handles are inherited by the child process.
saAttr.lpSecurityDescriptor = NULL;
{
bool bFail = false;
for (WinPipe& p : hPipes)
{
if (!CreatePipe(&p.hRead, &p.hWrite, &saAttr, 0))
{
bFail = true;
}
}
if (bFail)
{
for (WinPipe& p : hPipes)
{
CloseHandle(p.hRead);
CloseHandle(p.hWrite);
}
return rvalue;
}
}
STARTUPINFOA StartupInfo{};
StartupInfo.cb = sizeof(STARTUPINFOA);
StartupInfo.dwFlags = STARTF_USESTDHANDLES;
StartupInfo.dwFlags |= STARTF_USESHOWWINDOW;
StartupInfo.wShowWindow = SW_HIDE;
if (pOptStdIn)
{
StartupInfo.hStdInput = hPipes[0].hRead;
}
StartupInfo.hStdOutput = hPipes[1].hWrite;
StartupInfo.hStdError = hPipes[2].hWrite;
PROCESS_INFORMATION procInfo{};
// CreateProcess can modify the string
std::string local_cmd = cmd;
BOOL ProcessValue = CreateProcessA(
NULL,
(LPSTR)local_cmd.c_str(),
NULL,
NULL,
TRUE,
0,
(LPVOID)pOptEnvStrings,
NULL,
&StartupInfo,
&procInfo);
if (ProcessValue && procInfo.hProcess)
{
auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr)
{
char buf[1024];
DWORD dwRead = 0;
DWORD dwAvail = 0;
while (true)
{
if (!::PeekNamedPipe(hPipe, NULL, 0, NULL, &dwAvail, NULL))
{
break;
}
if (!dwAvail) // no data available, stop reading
{
break;
}
if (!::ReadFile(hPipe, buf, std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)), &dwRead, NULL) || !dwRead)
{
// error; the child process might have ended
break;
}
buf[dwRead] = 0;
if (pOutStr)
{
(*pOutStr) += buf;
}
}
};
bool bProcessEnded = false;
size_t bytesWritten = 0;
do
{
if (pOptStdIn && (pOptStdIn->size() > bytesWritten))
{
DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten;
if (!::WriteFile(
hPipes[0].hWrite,
pOptStdIn->data() + bytesWritten,
bytesToWrite, &bytesToWrite, nullptr))
{
// Failed to write to pipe
break;
}
bytesWritten += bytesToWrite;
}
// Give up a timeslice (50ms) so we don't spin at 100% CPU.
bProcessEnded = (WaitForSingleObject(procInfo.hProcess, 50) == WAIT_OBJECT_0);
ReadFromPipe(hPipes[1].hRead, pOptStdOut);
ReadFromPipe(hPipes[2].hRead, pOptStdErr);
}
while (!bProcessEnded);
DWORD exitVal = 0;
if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
{
exitVal = 1;
}
CloseHandle(procInfo.hProcess);
CloseHandle(procInfo.hThread);
rvalue = exitVal;
}
for (WinPipe& p : hPipes)
{
CloseHandle(p.hRead);
CloseHandle(p.hWrite);
}
#else
// Non-Windows implementation
#endif
return rvalue;
}
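A minimal usage sketch for the new helper, assuming a Windows build of SWR with common/os.h available; the command string is only an example. ExecCmd blocks until the child exits and returns its exit code, or -1 on failure:
#include "common/os.h"
#include <iostream>
#include <string>
int main()
{
std::string out, err;
// Capture stdout/stderr of a hypothetical child process.
int rc = ExecCmd("cmd.exe /c echo hello", nullptr, &out, &err);
std::cout << "exit=" << rc << " stdout=" << out << " stderr=" << err;
return 0;
}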

View File

@@ -280,4 +280,13 @@ typedef MEGABYTE GIGABYTE[1024];
void SWR_API SetCurrentThreadName(const char* pThreadName);
void SWR_API CreateDirectoryPath(const std::string& path);
/// Execute Command (block until finished)
/// @returns process exit value
int SWR_API ExecCmd(
const std::string& cmd, ///< (In) Command line string
const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text
std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
#endif//__SWR_OS_H__

View File

@@ -1,57 +0,0 @@
# Copyright © 2017-2018 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
files_swr_common += custom_target(
'gen_backend_pixel',
input : swr_gen_backends_py,
output : [
'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp',
'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp',
'gen_BackendPixelRate.hpp',
],
command : [
prog_python2, '@INPUT@',
'--outdir', '@OUTDIR@',
'--dim', '5', '2', '3', '2', '2', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : [ swr_gen_backend_files, swr_gen_header_init_files ],
)
files_swr_common += custom_target(
'gen_backend_raster',
input : swr_gen_backends_py,
output : [
'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp',
'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp',
'gen_rasterizer.hpp',
],
command : [
prog_python2, '@INPUT@',
'--outdir', '@OUTDIR@',
'--rast',
'--dim', '5', '2', '2', '3', '5', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : [ swr_gen_rasterizer_files, swr_gen_header_init_files ],
)

View File

@@ -1032,31 +1032,31 @@ static void GeometryShaderStage(
simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
// Gather data from the SVG if provided.
simdscalari vViewportIdx = SIMD16::setzero_si();
simdscalari vRtIdx = SIMD16::setzero_si();
SIMD8::Vec4 svgAttrib[4];
simdscalari vViewportIdx = SIMD::setzero_si();
simdscalari vRtIdx = SIMD::setzero_si();
SIMD::Vec4 svgAttrib[4];
if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
{
tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
}
if (state.backendState.readViewportArrayIndex)
{
vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
// OOB VPAI indices => forced to zero.
vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
tessPa.viewportArrayActive = true;
vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
gsPa.viewportArrayActive = true;
}
if (state.backendState.readRenderTargetArrayIndex)
{
vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
tessPa.rtArrayActive = true;
vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
gsPa.rtArrayActive = true;
}
pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
@@ -1437,9 +1437,9 @@ static void TessellationStages(
}
#else
// Gather data from the SVG if provided.
simdscalari vViewportIdx = SIMD16::setzero_si();
simdscalari vRtIdx = SIMD16::setzero_si();
SIMD8::Vec4 svgAttrib[4];
simdscalari vViewportIdx = SIMD::setzero_si();
simdscalari vRtIdx = SIMD::setzero_si();
SIMD::Vec4 svgAttrib[4];
if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
{
@@ -1448,18 +1448,18 @@ static void TessellationStages(
if (state.backendState.readViewportArrayIndex)
{
vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
// OOB VPAI indices => forced to zero.
vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
tessPa.viewportArrayActive = true;
}
if (state.backendState.readRenderTargetArrayIndex)
{
vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
tessPa.rtArrayActive = true;
}
pfnClipFunc(pDC, tessPa, workerId, prim,
@@ -2053,30 +2053,30 @@ void ProcessDraw(
SWR_ASSERT(pDC->pState->pfnProcessPrims);
// Gather data from the SVG if provided.
simdscalari vViewportIdx = SIMD16::setzero_si();
simdscalari vRtIdx = SIMD16::setzero_si();
SIMD8::Vec4 svgAttrib[4];
simdscalari vViewportIdx = SIMD::setzero_si();
simdscalari vRtIdx = SIMD::setzero_si();
SIMD::Vec4 svgAttrib[4];
if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
{
tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
}
if (state.backendState.readViewportArrayIndex)
{
vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
// OOB VPAI indices => forced to zero.
vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
tessPa.viewportArrayActive = true;
vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
pa.viewportArrayActive = true;
}
if (state.backendState.readRenderTargetArrayIndex)
{
vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
tessPa.rtArrayActive = true;
vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
pa.rtArrayActive = true;
}
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,

View File

@@ -421,8 +421,7 @@ void JitManager::DumpToFile(Function *f, const char *fileName)
sprintf(fName, "%s.%s.ll", funcName, fileName);
#endif
raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None);
Module* pModule = f->getParent();
pModule->print(fd, nullptr);
f->print(fd, nullptr);
#if defined(_WIN32)
sprintf(fName, "%s\\cfg.%s.%s.dot", outDir.c_str(), funcName, fileName);
@@ -599,44 +598,12 @@ JitCache::JitCache()
}
}
#if defined(_WIN32)
int ExecUnhookedProcess(const char* pCmdLine)
int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr)
{
static const char *g_pEnv = "RASTY_DISABLE_HOOK=1\0";
STARTUPINFOA StartupInfo{};
StartupInfo.cb = sizeof(STARTUPINFOA);
PROCESS_INFORMATION procInfo{};
BOOL ProcessValue = CreateProcessA(
NULL,
(LPSTR)pCmdLine,
NULL,
NULL,
TRUE,
0,
(LPVOID)g_pEnv,
NULL,
&StartupInfo,
&procInfo);
if (ProcessValue && procInfo.hProcess)
{
WaitForSingleObject(procInfo.hProcess, INFINITE);
DWORD exitVal = 0;
if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
{
exitVal = 1;
}
CloseHandle(procInfo.hProcess);
return exitVal;
}
return -1;
return ExecCmd(CmdLine, g_pEnv, pStdOut, pStdErr);
}
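Worth noting: g_pEnv is a Windows ANSI environment block, where each "NAME=value" entry is NUL-terminated and the block ends with a second NUL; the string literal's implicit terminator supplies that final NUL. An illustrative call (the compiler command line is hypothetical, not from the sources):
std::string out, err;
// Runs the command with RASTY_DISABLE_HOOK=1 in its environment and
// blocks until it exits, capturing both output streams.
int rc = ExecUnhookedProcess("cl.exe /nologo example_shader.cpp", &out, &err);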
#endif
#if defined(_WIN64) && defined(ENABLE_JIT_DEBUG) && defined(JIT_BASE_DIR)
EXTERN_C IMAGE_DOS_HEADER __ImageBase;

View File

@@ -38,6 +38,8 @@ namespace SwrJit
struct Builder
{
Builder(JitManager *pJitMgr);
virtual ~Builder() {}
IRBuilder<> *IRB() { return mpIRBuilder; };
JitManager *JM() { return mpJitMgr; }
@@ -92,5 +94,6 @@ namespace SwrJit
#include "gen_builder_x86.hpp"
#include "builder_misc.h"
#include "builder_math.h"
#include "builder_mem.h"
};
}

View File

@@ -0,0 +1,816 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"
#include <cstdarg>
namespace SwrJit
{
Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(i);
return GEPA(ptr, indices);
}
Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(C(i));
return GEPA(ptr, indices);
}
Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(i);
return IN_BOUNDS_GEP(ptr, indices);
}
Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(C(i));
return IN_BOUNDS_GEP(ptr, indices);
}
LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(C(i));
return LOAD(GEPA(basePtr, valIndices), name);
}
LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(i);
return LOAD(GEPA(basePtr, valIndices), name);
}
StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(C(i));
return STORE(val, GEPA(basePtr, valIndices));
}
StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(i);
return STORE(val, GEPA(basePtr, valIndices));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate an i32 masked load operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with float masked load
/// @param src - base address pointer for the load
/// @param vMask - SIMD wide mask that controls whether to access memory or load 0
Value *Builder::MASKLOADD(Value* src, Value* mask)
{
Value* vResult;
// use the avx2 maskload instruction if available
if (JM()->mArch.AVX2())
{
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
vResult = CALL(func, { src,mask });
}
else
{
// maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
vResult = BITCAST(CALL(func, { src,mask }), VectorType::get(mInt32Ty, mVWidth));
}
return vResult;
}
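A scalar model of the semantics both paths implement, assuming 8 lanes of i32 (illustrative sketch, not part of the change): lanes whose mask element has the sign bit set read memory, and all other lanes produce 0.
#include <cstdint>
static void maskloadd_model(int32_t out[8], const int32_t* src,
                            const int32_t mask[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = (mask[i] < 0) ? src[i] : 0; // MSB set => load, else zero
}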
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, Value *pDrawContext)
{
Value *vGather;
// use avx2 gather instruction if available
if (JM()->mArch.AVX2())
{
// force mask to <N x float>, required by vgather
Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
}
else
{
Value* pStack = STACKSAVE();
// store vSrc on the stack. this way we can select between a valid load address and the vSrc address
Value* vSrcPtr = ALLOCA(vSrc->getType());
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_F();
Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices, vScaleVec);
for (uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets, C(i));
// byte pointer to component
Value *loadAddress = GEP(pBase, offset);
loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
Value *selMask = VEXTRACT(vMask, C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress);
vGather = VINSERT(vGather, val, C(i));
}
STACKRESTORE(pStack);
}
return vGather;
}
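For reference, a plain-C++ model of what the emulation path computes, assuming 8 float lanes (names are illustrative): each enabled lane loads a float at pBase + idx[i] * scale, and masked-off lanes keep the vSrc value.
#include <cstdint>
#include <cstring>
static void gatherps_emulated(float dst[8], const float src[8],
                              const uint8_t* pBase, const int32_t idx[8],
                              const bool mask[8], uint8_t scale)
{
    for (int i = 0; i < 8; ++i)
    {
        if (mask[i])
            std::memcpy(&dst[i], pBase + idx[i] * scale, sizeof(float)); // gather
        else
            dst[i] = src[i]; // masked-off lane falls back to the src value
    }
}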
Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
Value *vGather = VUNDEF_F_16();
// use AVX512F gather instruction if available
if (JM()->mArch.AVX512F())
{
// force mask to <N-bit Integer>, required by vgather2
Value *mask = BITCAST(vMask, mInt16Ty);
vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
}
else
{
Value *src0 = EXTRACT_16(vSrc, 0);
Value *src1 = EXTRACT_16(vSrc, 1);
Value *indices0 = EXTRACT_16(vIndices, 0);
Value *indices1 = EXTRACT_16(vIndices, 1);
Value *mask0 = EXTRACT_16(vMask, 0);
Value *mask1 = EXTRACT_16(vMask, 1);
Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
vGather = JOIN_16(gather0, gather1);
}
return vGather;
}
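The fallback path above is just a split/join: the 16-wide operation is two independent 8-wide gathers. In terms of the previous sketch (illustrative):
static void gatherps16_emulated(float dst[16], const float src[16],
                                const uint8_t* pBase, const int32_t idx[16],
                                const bool mask[16], uint8_t scale)
{
    // EXTRACT_16 / JOIN_16 analogue: handle the low and high halves separately.
    gatherps_emulated(dst,     src,     pBase, idx,     mask,     scale);
    gatherps_emulated(dst + 8, src + 8, pBase, idx + 8, mask + 8, scale);
}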
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if (JM()->mArch.AVX2())
{
vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
}
else
{
Value* pStack = STACKSAVE();
// store vSrc on the stack. this way we can select between a valid load address and the vSrc address
Value* vSrcPtr = ALLOCA(vSrc->getType());
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_I();
Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices, vScaleVec);
for (uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets, C(i));
// byte pointer to component
Value *loadAddress = GEP(pBase, offset);
loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
Value *selMask = VEXTRACT(vMask, C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress, C(0));
vGather = VINSERT(vGather, val, C(i));
}
STACKRESTORE(pStack);
}
return vGather;
}
Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
Value *vGather = VUNDEF_I_16();
// use AVX512F gather instruction if available
if (JM()->mArch.AVX512F())
{
// force mask to <N-bit Integer>, required by vgather2
Value *mask = BITCAST(vMask, mInt16Ty);
vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
}
else
{
Value *src0 = EXTRACT_16(vSrc, 0);
Value *src1 = EXTRACT_16(vSrc, 1);
Value *indices0 = EXTRACT_16(vIndices, 0);
Value *indices1 = EXTRACT_16(vIndices, 1);
Value *mask0 = EXTRACT_16(vMask, 0);
Value *mask1 = EXTRACT_16(vMask, 1);
Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
vGather = JOIN_16(gather0, gather1);
}
return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if (JM()->mArch.AVX2())
{
vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
}
else
{
Value* pStack = STACKSAVE();
// store vSrc on the stack. this way we can select between a valid load address and the vSrc address
Value* vSrcPtr = ALLOCA(vSrc->getType());
STORE(vSrc, vSrcPtr);
vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
Value *vOffsets = MUL(vIndices, vScaleVec);
for (uint32_t i = 0; i < mVWidth / 2; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets, C(i));
// byte pointer to component
Value *loadAddress = GEP(pBase, offset);
loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
Value *selMask = VEXTRACT(vMask, C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress);
vGather = VINSERT(vGather, val, C(i));
}
STACKRESTORE(pStack);
}
return vGather;
}
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
const SWR_FORMAT_INFO &info = GetFormatInfo(format);
if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
{
GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
}
else
{
GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
}
}
void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
switch (info.bpp / info.numComps)
{
case 16:
{
Value* vGatherResult[2];
// TODO: vGatherMaskedVal
Value* vGatherMaskedVal = VIMMED1((float)0);
// always have at least one component out of x or y to fetch
vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
// if we also have z or w components to fetch
if (info.numComps > 2)
{
// offset base to the next components(zw) in the vertex to gather
pSrcBase = GEP(pSrcBase, C((char)4));
vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
else
{
vGatherResult[1] = vGatherMaskedVal;
}
// Shuffle gathered components into place, each row is a component
Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
}
break;
case 32:
{
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
}
for (uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// Gather a SIMD of components
vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
}
}
break;
default:
SWR_INVALID("Invalid float format");
break;
}
}
void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
switch (info.bpp / info.numComps)
{
case 8:
{
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
}
break;
case 16:
{
Value* vGatherResult[2];
// TODO: vGatherMaskedVal
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
// always have at least one component out of x or y to fetch
vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
// if we also have z or w components to fetch
if (info.numComps > 2)
{
// offset base to the next components(zw) in the vertex to gather
pSrcBase = GEP(pSrcBase, C((char)4));
vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
else
{
vGatherResult[1] = vGatherMaskedVal;
}
// Shuffle gathered components into place, each row is a component
Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
}
break;
case 32:
{
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
}
for (uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// Gather a SIMD of components
vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
}
}
break;
default:
SWR_INVALID("unsupported format");
break;
}
}
void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
// cast types
Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// input could either be float or int vector; do shuffle work in int
vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
if (bPackedOutput)
{
Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
// after pshufb: group components together in each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
// do the same for zw components
Value* vi128ZW = nullptr;
if (info.numComps > 2)
{
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
for (uint32_t i = 0; i < 4; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// todo: fix for packed
Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
if (i >= info.numComps)
{
// set the default component val
vGatherOutput[swizzleIndex] = vGatherMaskedVal;
continue;
}
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
// extract packed component 128 bit lanes
vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
}
}
else
{
// pshufb masks for each component
Value* vConstMask[2];
// x/z shuffle mask
vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
// y/w shuffle mask
vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
// shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
}
for (uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// select correct constMask for x/z or y/w pshufb
uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
uint32_t selectedGather = (i < 2) ? 0 : 1;
vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
// after pshufb mask for x channel; z uses the same shuffle from the second gather
// 256i - 0 1 2 3 4 5 6 7
// xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
}
}
}
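The shuffles above lean on PSHUFB's per-128-bit-lane behavior. A scalar model for a 256-bit vector (illustrative sketch): within each 16-byte lane, a negative mask byte yields 0, and otherwise the low four mask bits index into that same lane.
#include <cstdint>
static void pshufb_model(uint8_t out[32], const uint8_t src[32],
                         const int8_t mask[32])
{
    for (int lane = 0; lane < 2; ++lane) // two independent 128-bit lanes
    {
        const uint8_t* s = src + 16 * lane;
        for (int i = 0; i < 16; ++i)
        {
            int8_t m = mask[16 * lane + i];
            out[16 * lane + i] = (m < 0) ? 0 : s[m & 15];
        }
    }
}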
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
// cast types
Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
if (bPackedOutput)
{
Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
// after pshufb: group components together in each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
// do the same for zw components
Value* vi128ZW = nullptr;
if (info.numComps > 2)
{
vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
}
// sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
for (uint32_t i = 0; i < 4; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// todo: fix for packed
Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
if (i >= info.numComps)
{
// set the default component val
vGatherOutput[swizzleIndex] = vGatherMaskedVal;
continue;
}
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
// sign extend
vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
}
}
// else zero extend
else {
// shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
}
for (uint32_t i = 0; i < info.numComps; i++) {
uint32_t swizzleIndex = info.swizzle[i];
// pshufb masks for each component
Value* vConstMask;
switch (i)
{
case 0:
// x shuffle mask
vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
break;
case 1:
// y shuffle mask
vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
break;
case 2:
// z shuffle mask
vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
break;
case 3:
// w shuffle mask
vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
break;
default:
vConstMask = nullptr;
break;
}
vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
// after pshufb for x channel
// 256i - 0 1 2 3 4 5 6 7
// x000 x000 x000 x000 x000 x000 x000 x000
}
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief emulates a scatter operation.
/// @param pDst - pointer to destination
/// @param vSrc - vector of src data to scatter
/// @param vOffsets - vector of byte offsets from pDst
/// @param vMask - mask of valid lanes
void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
/* Scatter algorithm
while(Index = BitScanForward(mask))
srcElem = srcVector[Index]
offsetElem = offsetVector[Index]
*(pDst + offsetElem) = srcElem
mask &= ~(1 << Index)
*/
BasicBlock* pCurBB = IRB()->GetInsertBlock();
Function* pFunc = pCurBB->getParent();
Type* pSrcTy = vSrc->getType()->getVectorElementType();
// Store vectors on stack
if (pScatterStackSrc == nullptr)
{
// Save off stack allocations and reuse per scatter. Significantly reduces stack
// requirements for shaders with a lot of scatters.
pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
}
Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
Value* pOffsetsArrayPtr = pScatterStackOffsets;
STORE(vSrc, pSrcArrayPtr);
STORE(vOffsets, pOffsetsArrayPtr);
// Cast to pointers for random access
pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
// Get cttz function
Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
// Setup loop basic block
BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
// compute first set bit
Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
Value* pIsUndef = ICMP_EQ(pIndex, C(32));
// Split current block
BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
// Remove unconditional jump created by splitBasicBlock
pCurBB->getTerminator()->eraseFromParent();
// Add terminator to end of original block
IRB()->SetInsertPoint(pCurBB);
// Add conditional branch
COND_BR(pIsUndef, pPostLoop, pLoop);
// Add loop basic block contents
IRB()->SetInsertPoint(pLoop);
PHINode* pIndexPhi = PHI(mInt32Ty, 2);
PHINode* pMaskPhi = PHI(mInt32Ty, 2);
pIndexPhi->addIncoming(pIndex, pCurBB);
pMaskPhi->addIncoming(pMask, pCurBB);
// Extract elements for this index
Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
// GEP to this offset in dst
Value* pCurDst = GEP(pDst, pOffsetElem);
pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
STORE(pSrcElem, pCurDst);
// Update the mask
Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
// Terminator
Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
pIsUndef = ICMP_EQ(pNewIndex, C(32));
COND_BR(pIsUndef, pPostLoop, pLoop);
// Update phi edges
pIndexPhi->addIncoming(pNewIndex, pLoop);
pMaskPhi->addIncoming(pNewMask, pLoop);
// Move builder to beginning of post loop
IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
}
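A runnable scalar model of the loop the IR builds, assuming 8 float lanes and a GCC/Clang builtin for the cttz step (illustrative, not part of the change):
#include <cstdint>
#include <cstring>
static void scatterps_model(uint8_t* pDst, const float src[8],
                            const int32_t offsets[8], uint32_t mask)
{
    while (mask != 0)
    {
        int index = __builtin_ctz(mask); // first set bit, as CALL(pfnCttz, ...)
        std::memcpy(pDst + offsets[index], &src[index], sizeof(float));
        mask &= ~(1u << index);          // retire the handled lane
    }
}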
//////////////////////////////////////////////////////////////////////////
/// @brief save/restore stack, providing ability to push/pop the stack and
/// reduce overall stack requirements for temporary stack use
Value* Builder::STACKSAVE()
{
Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
return CALLA(pfnStackSave);
}
void Builder::STACKRESTORE(Value* pSaved)
{
Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
}
}

View File

@@ -0,0 +1,73 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.h
*
* @brief miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#pragma once
Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
Value *MASKLOADD(Value* src, Value* mask);
void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, Value *pDrawContext = nullptr);
Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
Value* STACKSAVE();
void STACKRESTORE(Value* pSaved);
// Static stack allocations for scatter operations
Value* pScatterStackSrc{ nullptr };
Value* pScatterStackOffsets{ nullptr };
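For orientation, the initializer-list overloads declared above are shorthand for a GEP followed by the memory operation; their definitions appear in the new builder source earlier in this diff:
// LOAD(pBase, {0, 2})        expands to  LOAD(GEPA(pBase, { C(0), C(2) }))
// STORE(val, pBase, {0, 2})  expands to  STORE(val, GEPA(pBase, { C(0), C(2) }))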

View File

@@ -303,70 +303,6 @@ namespace SwrJit
return pValConst->getSExtValue();
}
Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(i);
return GEPA(ptr, indices);
}
Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(C(i));
return GEPA(ptr, indices);
}
Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(i);
return IN_BOUNDS_GEP(ptr, indices);
}
Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
std::vector<Value*> indices;
for (auto i : indexList)
indices.push_back(C(i));
return IN_BOUNDS_GEP(ptr, indices);
}
LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(C(i));
return LOAD(GEPA(basePtr, valIndices), name);
}
LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(i);
return LOAD(GEPA(basePtr, valIndices), name);
}
StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(C(i));
return STORE(val, GEPA(basePtr, valIndices));
}
StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
{
std::vector<Value*> valIndices;
for (auto i : indices)
valIndices.push_back(i);
return STORE(val, GEPA(basePtr, valIndices));
}
CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
{
std::vector<Value*> args;
@@ -418,34 +354,6 @@ namespace SwrJit
return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate an i32 masked load operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with float masked load
/// @param src - base address pointer for the load
/// @param vMask - SIMD wide mask that controls whether to access memory load 0
Value *Builder::MASKLOADD(Value* src,Value* mask)
{
Value* vResult;
// use avx2 gather instruction is available
if(JM()->mArch.AVX2())
{
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
vResult = CALL(func,{src,mask});
}
else
{
// maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
#else
mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
#endif
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
}
return vResult;
}
//////////////////////////////////////////////////////////////////////////
/// @brief insert a JIT call to CallPrint
/// - outputs formatted string to both stdout and VS output window
@@ -581,222 +489,6 @@ namespace SwrJit
return PRINT(printStr, {});
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
Value *vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
// force mask to <N x float>, required by vgather
Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
}
else
{
Value* pStack = STACKSAVE();
// store vSrc on the stack. this way we can select between a valid load address and the vSrc address
Value* vSrcPtr = ALLOCA(vSrc->getType());
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_F();
Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices,vScaleVec);
for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets,C(i));
// byte pointer to component
Value *loadAddress = GEP(pBase,offset);
loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
Value *selMask = VEXTRACT(vMask,C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress);
vGather = VINSERT(vGather,val,C(i));
}
STACKRESTORE(pStack);
}
return vGather;
}
Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
Value *vGather = VUNDEF_F_16();
// use AVX512F gather instruction if available
if (JM()->mArch.AVX512F())
{
// force mask to <N-bit Integer>, required by vgather2
Value *mask = BITCAST(vMask, mInt16Ty);
vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
}
else
{
Value *src0 = EXTRACT_16(vSrc, 0);
Value *src1 = EXTRACT_16(vSrc, 1);
Value *indices0 = EXTRACT_16(vIndices, 0);
Value *indices1 = EXTRACT_16(vIndices, 1);
Value *mask0 = EXTRACT_16(vMask, 0);
Value *mask1 = EXTRACT_16(vMask, 1);
Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
vGather = JOIN_16(gather0, gather1);
}
return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
}
else
{
Value* pStack = STACKSAVE();
// store vSrc on the stack. this way we can select between a valid load address and the vSrc address
Value* vSrcPtr = ALLOCA(vSrc->getType());
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_I();
Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices, vScaleVec);
for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets, C(i));
// byte pointer to component
Value *loadAddress = GEP(pBase, offset);
loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
Value *selMask = VEXTRACT(vMask, C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress, C(0));
vGather = VINSERT(vGather, val, C(i));
}
STACKRESTORE(pStack);
}
return vGather;
}
Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
Value *vGather = VUNDEF_I_16();
// use AVX512F gather instruction if available
if (JM()->mArch.AVX512F())
{
// force mask to <N-bit Integer>, required by vgather2
Value *mask = BITCAST(vMask, mInt16Ty);
vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
}
else
{
Value *src0 = EXTRACT_16(vSrc, 0);
Value *src1 = EXTRACT_16(vSrc, 1);
Value *indices0 = EXTRACT_16(vIndices, 0);
Value *indices1 = EXTRACT_16(vIndices, 1);
Value *mask0 = EXTRACT_16(vMask, 0);
Value *mask1 = EXTRACT_16(vMask, 1);
Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
vGather = JOIN_16(gather0, gather1);
}
return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
}
else
{
Value* pStack = STACKSAVE();
// store vSrc on the stack. this way we can select between a valid load address and the vSrc address
Value* vSrcPtr = ALLOCA(vSrc->getType());
STORE(vSrc, vSrcPtr);
vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
Value *vOffsets = MUL(vIndices,vScaleVec);
for(uint32_t i = 0; i < mVWidth/2; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets,C(i));
// byte pointer to component
Value *loadAddress = GEP(pBase,offset);
loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
Value *selMask = VEXTRACT(vMask,C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress);
vGather = VINSERT(vGather,val,C(i));
}
STACKRESTORE(pStack);
}
return vGather;
}
Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
{
if (imm == 0)
@@ -1064,360 +756,6 @@ namespace SwrJit
return SELECT(cmp, a, b);
}
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
const SWR_FORMAT_INFO &info = GetFormatInfo(format);
if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
{
GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
}
else
{
GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
}
}
void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
switch(info.bpp / info.numComps)
{
case 16:
{
Value* vGatherResult[2];
// TODO: vGatherMaskedVal
Value* vGatherMaskedVal = VIMMED1((float)0);
// always have at least one component out of x or y to fetch
vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
// if we have at least one component out of x or y to fetch
if(info.numComps > 2)
{
// offset base to the next components(zw) in the vertex to gather
pSrcBase = GEP(pSrcBase, C((char)4));
vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
else
{
vGatherResult[1] = vGatherMaskedVal;
}
// Shuffle gathered components into place, each row is a component
Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
}
break;
case 32:
{
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
}
for(uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// Gather a SIMD of components
vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
}
}
break;
default:
SWR_INVALID("Invalid float format");
break;
}
}
void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
switch (info.bpp / info.numComps)
{
case 8:
{
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
}
break;
case 16:
{
Value* vGatherResult[2];
// TODO: vGatherMaskedVal
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
// always have at least one component out of x or y to fetch
vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
// if we have at least one component out of x or y to fetch
if(info.numComps > 2)
{
// offset base to the next components(zw) in the vertex to gather
pSrcBase = GEP(pSrcBase, C((char)4));
vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
else
{
vGatherResult[1] = vGatherMaskedVal;
}
// Shuffle gathered components into place, each row is a component
Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
}
break;
case 32:
{
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
}
for(uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// Gather a SIMD of components
vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
}
}
break;
default:
SWR_INVALID("unsupported format");
break;
}
}
void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
// cast types
Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// input could either be float or int vector; do shuffle work in int
vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
if(bPackedOutput)
{
Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
// after pshufb: group components together in each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
// do the same for zw components
Value* vi128ZW = nullptr;
if(info.numComps > 2)
{
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
}
for(uint32_t i = 0; i < 4; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// todo: fix for packed
Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
if(i >= info.numComps)
{
// set the default component val
vGatherOutput[swizzleIndex] = vGatherMaskedVal;
continue;
}
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
// extract packed component 128 bit lanes
vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
}
}
else
{
// pshufb masks for each component
Value* vConstMask[2];
// x/z shuffle mask
vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
// y/w shuffle mask
vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
// shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
}
for(uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// select correct constMask for x/z or y/w pshufb
uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use the first gather result, else use the second (zw)
uint32_t selectedGather = (i < 2) ? 0 : 1;
vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
// after the pshufb for the x channel; z uses the same shuffle on the second gather
// 256i - 0 1 2 3 4 5 6 7
// xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
}
}
}
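Both paths above lean on the byte-select semantics of AVX2 vpshufb. Per 128-bit lane those semantics reduce to the following scalar model (a sketch for illustration; pshufb_lane_model is not SWR code):

// PSHUFB within one 128-bit lane: output byte i takes input byte mask[i],
// and a negative mask byte (high bit set) produces 0. So the x/z mask
// {0, 1, -1, -1, 4, 5, ...} moves each 16-bit component into the low word
// of its 32-bit lane, zero-filling the high word.
static void pshufb_lane_model(const uint8_t in[16], const int8_t mask[16], uint8_t out[16])
{
    for (int i = 0; i < 16; ++i)
        out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0F];
}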
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
// cast types
Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
if(bPackedOutput)
{
Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
// after pshufb: group components together in each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
// do the same for zw components
Value* vi128ZW = nullptr;
if(info.numComps > 2)
{
vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
}
// shuffle enabled components into place; components beyond numComps get the format default
for(uint32_t i = 0; i < 4; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// todo: fix for packed
Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
if(i >= info.numComps)
{
// set the default component val
vGatherOutput[swizzleIndex] = vGatherMaskedVal;
continue;
}
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
// if x or y, use vi128XY permute result, else use vi128ZW
Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
// extract packed component 128 bit lanes
vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
}
}
// else zero extend
else
{
// shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
// apply defaults
for (uint32_t i = 0; i < 4; ++i)
{
vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
}
for(uint32_t i = 0; i < info.numComps; i++)
{
uint32_t swizzleIndex = info.swizzle[i];
// pshufb masks for each component
Value* vConstMask;
switch(i)
{
case 0:
// x shuffle mask
vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
break;
case 1:
// y shuffle mask
vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
break;
case 2:
// z shuffle mask
vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
break;
case 3:
// w shuffle mask
vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
break;
default:
vConstMask = nullptr;
break;
}
vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
// after pshufb for x channel
// 256i - 0 1 2 3 4 5 6 7
// x000 x000 x000 x000 x000 x000 x000 x000
}
}
}
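The cross-lane step in the packed paths uses vpermd, which unlike vpshufb can move dwords between the two 128-bit halves. A scalar model (illustrative sketch, not SWR code):

// PERMD (vpermd): each output dword selects any input dword across the full
// 256-bit register using the low 3 bits of its index. With the indices
// {0, 4, 0, 0, 1, 5, 0, 0} used above, the x and y dwords from both
// 128-bit halves are packed into the low 64 bits of each output half;
// the repeated 0 indices are don't-care filler.
static void permd_model(const uint32_t in[8], const uint32_t idx[8], uint32_t out[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = in[idx[i] & 7];
}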
// Helper function to create an alloca in the entry block of a function
Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
{
@@ -1439,105 +777,6 @@ namespace SwrJit
return pAlloca;
}
//////////////////////////////////////////////////////////////////////////
/// @brief emulates a scatter operation.
/// @param pDst - pointer to destination
/// @param vSrc - vector of src data to scatter
/// @param vOffsets - vector of byte offsets from pDst
/// @param vMask - mask of valid lanes
void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
/* Scatter algorithm
while(Index = BitScanForward(mask))
srcElem = srcVector[Index]
offsetElem = offsetVector[Index]
*(pDst + offsetElem) = srcElem
Update mask (mask &= ~(1 << Index))
*/
BasicBlock* pCurBB = IRB()->GetInsertBlock();
Function* pFunc = pCurBB->getParent();
Type* pSrcTy = vSrc->getType()->getVectorElementType();
// Store vectors on stack
if (pScatterStackSrc == nullptr)
{
// Save off stack allocations and reuse per scatter. Significantly reduces stack
// requirements for shaders with a lot of scatters.
pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
}
Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
Value* pOffsetsArrayPtr = pScatterStackOffsets;
STORE(vSrc, pSrcArrayPtr);
STORE(vOffsets, pOffsetsArrayPtr);
// Cast to pointers for random access
pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
// Get cttz function
Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
// Setup loop basic block
BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
// compute first set bit
Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
Value* pIsUndef = ICMP_EQ(pIndex, C(32));
// Split current block
BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
// Remove unconditional jump created by splitBasicBlock
pCurBB->getTerminator()->eraseFromParent();
// Add terminator to end of original block
IRB()->SetInsertPoint(pCurBB);
// Add conditional branch
COND_BR(pIsUndef, pPostLoop, pLoop);
// Add loop basic block contents
IRB()->SetInsertPoint(pLoop);
PHINode* pIndexPhi = PHI(mInt32Ty, 2);
PHINode* pMaskPhi = PHI(mInt32Ty, 2);
pIndexPhi->addIncoming(pIndex, pCurBB);
pMaskPhi->addIncoming(pMask, pCurBB);
// Extract elements for this index
Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
// GEP to this offset in dst
Value* pCurDst = GEP(pDst, pOffsetElem);
pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
STORE(pSrcElem, pCurDst);
// Update the mask
Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
// Terminator
Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
pIsUndef = ICMP_EQ(pNewIndex, C(32));
COND_BR(pIsUndef, pPostLoop, pLoop);
// Update phi edges
pIndexPhi->addIncoming(pNewIndex, pLoop);
pMaskPhi->addIncoming(pNewMask, pLoop);
// Move builder to beginning of post loop
IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
}
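The loop the builder emits is the classic mask-peeling scatter; in scalar form it behaves like this (a sketch; scatterps_model and the fixed 8-lane width are illustrative assumptions):

// Scalar equivalent of the emitted Scatter_Loop: peel the lowest set mask
// bit per iteration and store that lane; cttz of a zero mask returns 32,
// which is the pIsUndef exit condition above.
static void scatterps_model(uint8_t* pDst, const float vSrc[8],
                            const int32_t vOffsets[8], uint32_t mask)
{
    while (mask != 0)
    {
        unsigned index = __builtin_ctz(mask);   // BitScanForward
        *(float*)(pDst + vOffsets[index]) = vSrc[index];
        mask &= ~(1u << index);                 // clear the handled lane
    }
}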
Value* Builder::VABSPS(Value* a)
{
Value* asInt = BITCAST(a, mSimdInt32Ty);
@@ -1575,21 +814,6 @@ namespace SwrJit
return result;
}
//////////////////////////////////////////////////////////////////////////
/// @brief save/restore stack, providing ability to push/pop the stack and
/// reduce overall stack requirements for temporary stack use
Value* Builder::STACKSAVE()
{
Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
return CALLA(pfnStackSave);
}
void Builder::STACKRESTORE(Value* pSaved)
{
Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
}
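The intended pairing looks like this (a sketch assuming the caller emits some temporary stack allocation between the two calls; the ALLOCA placeholder is hypothetical, not a specific builder method):

// Sketch: temporaries created between STACKSAVE and STACKRESTORE are
// released when the stack pointer is restored, so emitting this pattern
// in a loop does not grow the JITted function's frame.
Value* pSaved = STACKSAVE();
Value* pTmp = ALLOCA(mSimdFP32Ty);   // hypothetical temporary allocation
// ... emit code that spills through pTmp ...
STACKRESTORE(pSaved);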
Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
{
Value* vOut;
@@ -1707,7 +931,6 @@ namespace SwrJit
}
}
uint32_t Builder::GetTypeSize(Type* pType)
{
if (pType->isStructTy())

View File

@@ -90,22 +90,12 @@ Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
uint32_t IMMED(Value* i);
int32_t S_IMMED(Value* i);
Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args, const llvm::Twine& name = "");
CallInst *CALL(Value *Callee) { return CALLA(Callee); }
CallInst *CALL(Value *Callee, Value* arg);
CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); }
Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); }
Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); }
@@ -129,30 +119,6 @@ Value *VMASK_16(Value *mask);
Value *EXTRACT_16(Value *x, uint32_t imm);
Value *JOIN_16(Value *a, Value *b);
Value *MASKLOADD(Value* src, Value* mask);
void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
Value *PSHUFB(Value* a, Value* b);
Value *PMOVSXBD(Value* a);
Value *PMOVSXWD(Value* a);
@@ -180,8 +146,6 @@ Value *FCLAMP(Value* src, float low, float high);
CallInst *PRINT(const std::string &printStr);
CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
Value* STACKSAVE();
void STACKRESTORE(Value* pSaved);
Value* POPCNT(Value* a);
@@ -199,9 +163,4 @@ void RDTSC_STOP(Value* pBucketMgr, Value* pId);
Value* CreateEntryAlloca(Function* pFunc, Type* pType);
Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize);
// Static stack allocations for scatter operations
Value* pScatterStackSrc{ nullptr };
Value* pScatterStackOffsets{ nullptr };
uint32_t GetTypeSize(Type* pType);

View File

@@ -55,9 +55,12 @@ enum ConversionType
//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public Builder
struct FetchJit :
public Builder
{
FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
FetchJit(JitManager* pJitMgr) :
Builder(pJitMgr)
{}
Function* Create(const FETCH_COMPILE_STATE& fetchState);
@@ -1361,7 +1364,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// But, we know that elements must be aligned for FETCH. :)
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
Value *vShiftedOffsets = LSHR(vOffsets, 1);
vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2, mpPrivateContext);
}
else
{

View File

@@ -1,50 +0,0 @@
# Copyright © 2017-2018 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
gen_builder_hpp = custom_target(
'gen_builder.hpp',
input : [
swr_gen_llvm_ir_macros_py,
join_paths(
dep_llvm.get_configtool_variable('includedir'), 'llvm', 'IR',
'IRBuilder.h'
)
],
output : 'gen_builder.hpp',
command : [
prog_python2, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@',
'--gen_h', '--output-dir', '@OUTDIR@'
],
depend_files : swr_gen_builder_depends,
build_by_default : true,
)
gen_builder_x86_hpp = custom_target(
'gen_builder_x86.hpp',
input : '../codegen/gen_llvm_ir_macros.py',
output : 'gen_builder_x86.hpp',
command : [
prog_python2, '@INPUT0@', '--gen_x86_h', '--output', '@OUTPUT@',
'--output-dir', '@OUTDIR@'
],
depend_files : swr_gen_builder_depends,
)

View File

@@ -76,6 +76,7 @@ virgl_tgsi_transform_instruction(struct tgsi_transform_context *ctx,
for (unsigned i = 0; i < inst->Instruction.NumSrcRegs; i++) {
if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT &&
inst->Src[i].Register.Dimension &&
!inst->Src[i].Register.Indirect &&
inst->Src[i].Dimension.Index == 0)
inst->Src[i].Register.Dimension = 0;
}

View File

@@ -670,10 +670,11 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
ws->info.gart_page_size,
RADEON_DOMAIN_GTT,
RADEON_FLAG_NO_INTERPROCESS_SHARING |
RADEON_FLAG_READ_ONLY |
(ring_type == RING_GFX ||
ring_type == RING_COMPUTE ||
ring_type == RING_DMA ?
RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC : 0));
RADEON_FLAG_GTT_WC : 0));
if (!pb)
return false;

View File

@@ -215,9 +215,6 @@ static void surf_drm_to_winsys(struct radeon_drm_winsys *ws,
}
set_micro_tile_mode(surf_ws, &ws->info);
surf_ws->is_displayable = surf_ws->is_linear ||
surf_ws->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
surf_ws->micro_tile_mode == RADEON_MICRO_MODE_ROTATED;
}
static int radeon_winsys_surface_init(struct radeon_winsys *rws,

View File

@@ -150,7 +150,7 @@ vmw_mob_pools_init(struct vmw_winsys_screen *vws)
vws->pools.mob_shader_slab_fenced =
simple_fenced_bufmgr_create(vws->pools.mob_shader_slab,
vws->fence_ops);
if(!vws->pools.mob_fenced)
if(!vws->pools.mob_shader_slab_fenced)
goto out_no_mob_shader_slab_fenced;
return TRUE;

View File

@@ -38,7 +38,7 @@ incs_gbm = [
if with_dri2
files_gbm += files('backends/dri/gbm_dri.c', 'backends/dri/gbm_driint.h')
deps_gbm += dep_libdrm # TODO: pthread-stubs
args_gbm += '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path)
args_gbm += '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_driver_dir)
endif
if with_platform_wayland
deps_gbm += dep_wayland_server

View File

@@ -41,6 +41,7 @@
#include "main/glheader.h"
#include "glapi.h"
#include "glapitable.h"
#include "main/dispatch.h"
#include "apple_glx.h"
#include "apple_xgl_api.h"
@@ -60,11 +61,12 @@ static void _apple_glapi_create_table(void) {
assert(__applegl_api);
memcpy(__applegl_api, __ogl_framework_api, sizeof(struct _glapi_table));
_glapi_table_patch(__applegl_api, "ReadPixels", __applegl_glReadPixels);
_glapi_table_patch(__applegl_api, "CopyPixels", __applegl_glCopyPixels);
_glapi_table_patch(__applegl_api, "CopyColorTable", __applegl_glCopyColorTable);
_glapi_table_patch(__applegl_api, "DrawBuffers", __applegl_glDrawBuffer);
_glapi_table_patch(__applegl_api, "Viewport", __applegl_glViewport);
SET_ReadPixels(__applegl_api, __applegl_glReadPixels);
SET_CopyPixels(__applegl_api, __applegl_glCopyPixels);
SET_CopyColorTable(__applegl_api, __applegl_glCopyColorTable);
SET_DrawBuffer(__applegl_api, __applegl_glDrawBuffer);
SET_DrawBuffers(__applegl_api, __applegl_glDrawBuffers);
SET_Viewport(__applegl_api, __applegl_glViewport);
}
void apple_glapi_set_dispatch(void) {

View File

@@ -32,7 +32,6 @@
#include <stdlib.h>
#include <assert.h>
#include <GL/gl.h>
#include <util/debug.h>
/* <rdar://problem/6953344> */
#define glTexImage1D glTexImage1D_OSX

View File

@@ -43,7 +43,6 @@
#ifdef GLX_USE_APPLEGL
#include "apple/apple_glx_context.h"
#include "apple/apple_glx.h"
#include "util/debug.h"
#else
#include <sys/time.h>
#ifdef XF86VIDMODE

View File

@@ -18,9 +18,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
if with_dri_platform == 'windows'
subdir('windows')
endif
subdir('windows')
files_libglx = files(
'clientattrib.c',
@@ -113,6 +111,7 @@ elif with_dri_platform == 'windows'
extra_ld_args_libgl = '-Wl,--disable-stdcall-fixup'
endif
dri_driver_dir = join_paths(get_option('prefix'), dri_drivers_path)
if not with_glvnd
gl_lib_name = 'GL'
gl_lib_version = '1.2.0'
@@ -129,8 +128,7 @@ else
endif
gl_lib_cargs = [
'-D_REENTRANT',
'-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path),
'-D_REENTRANT', '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_driver_dir),
]
if dep_xxf86vm != [] and dep_xxf86vm.found()

View File

@@ -75,18 +75,6 @@ indirect_create_context_attribs(struct glx_screen *base,
return indirect_create_context(base, config_base, shareList, 0);
}
#ifdef GLX_USE_APPLEGL
#warning Indirect GLX tests are not built
extern "C" struct glx_context *
applegl_create_context(struct glx_screen *base,
struct glx_config *config_base,
struct glx_context *shareList,
int renderType)
{
return indirect_create_context(base, config_base, shareList, renderType);
}
#endif
/* This is necessary so that we don't have to link with glxcurrent.c
* which would require us to link with X libraries and what not.
*/

View File

@@ -705,8 +705,6 @@ void __indirect_glFramebufferTextureLayer(void) { }
}
/*@}*/
#ifndef GLX_USE_APPLEGL
class IndirectAPI : public ::testing::Test {
public:
virtual void SetUp();
@@ -1520,5 +1518,3 @@ TEST_F(IndirectAPI, EXT_texture_array)
{
EXPECT_EQ((_glapi_proc) __indirect_glFramebufferTextureLayer, table[_glapi_get_proc_offset("glFramebufferTextureLayerEXT")]);
}
#endif

View File

@@ -390,7 +390,7 @@ get_gen_batch_bo(void *user_data, uint64_t address)
return (struct gen_batch_decode_bo) {
.addr = sections[s].gtt_offset,
.map = sections[s].data,
.size = sections[s].count,
.size = sections[s].count * 4,
};
}
}

View File

@@ -547,6 +547,14 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.descriptors_dirty |=
set_layout->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
}
/* Pipeline layout objects are required to live at least while any command
* buffers that use them are in recording state. We need to grab a reference
* to the pipeline layout being bound here so we can compute correct dynamic
* offsets for VK_DESCRIPTOR_TYPE_*_DYNAMIC in dynamic_offset_for_binding()
* when we record draw commands that come after this.
*/
pipe_state->layout = layout;
}
void anv_CmdBindDescriptorSets(
@@ -913,8 +921,7 @@ void anv_CmdPushDescriptorSetKHR(
assert(_set < MAX_SETS);
const struct anv_descriptor_set_layout *set_layout =
layout->set[_set].layout;
struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
struct anv_push_descriptor_set *push_set =
anv_cmd_buffer_get_push_descriptor_set(cmd_buffer,
@@ -1006,8 +1013,7 @@ void anv_CmdPushDescriptorSetWithTemplateKHR(
assert(_set < MAX_PUSH_DESCRIPTORS);
const struct anv_descriptor_set_layout *set_layout =
layout->set[_set].layout;
struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
struct anv_push_descriptor_set *push_set =
anv_cmd_buffer_get_push_descriptor_set(cmd_buffer,

View File

@@ -57,16 +57,21 @@ VkResult anv_CreateDescriptorSetLayout(
struct anv_descriptor_set_binding_layout *bindings;
struct anv_sampler **samplers;
/* We need to allocate descriptor set layouts off the device allocator
* with DEVICE scope because they are reference counted and may not be
* destroyed when vkDestroyDescriptorSetLayout is called.
*/
ANV_MULTIALLOC(ma);
anv_multialloc_add(&ma, &set_layout, 1);
anv_multialloc_add(&ma, &bindings, max_binding + 1);
anv_multialloc_add(&ma, &samplers, immutable_sampler_count);
if (!anv_multialloc_alloc2(&ma, &device->alloc, pAllocator,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
if (!anv_multialloc_alloc(&ma, &device->alloc,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
memset(set_layout, 0, sizeof(*set_layout));
set_layout->ref_cnt = 1;
set_layout->binding_count = max_binding + 1;
for (uint32_t b = 0; b <= max_binding; b++) {
@@ -204,7 +209,7 @@ void anv_DestroyDescriptorSetLayout(
if (!set_layout)
return;
vk_free2(&device->alloc, pAllocator, set_layout);
anv_descriptor_set_layout_unref(device, set_layout);
}
static void
@@ -246,6 +251,7 @@ VkResult anv_CreatePipelineLayout(
ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout,
pCreateInfo->pSetLayouts[set]);
layout->set[set].layout = set_layout;
anv_descriptor_set_layout_ref(set_layout);
layout->set[set].dynamic_offset_start = dynamic_offset_count;
for (uint32_t b = 0; b < set_layout->binding_count; b++) {
@@ -290,6 +296,9 @@ void anv_DestroyPipelineLayout(
if (!pipeline_layout)
return;
for (uint32_t i = 0; i < pipeline_layout->num_sets; i++)
anv_descriptor_set_layout_unref(device, pipeline_layout->set[i].layout);
vk_free2(&device->alloc, pAllocator, pipeline_layout);
}
@@ -423,7 +432,7 @@ struct surface_state_free_list_entry {
VkResult
anv_descriptor_set_create(struct anv_device *device,
struct anv_descriptor_pool *pool,
const struct anv_descriptor_set_layout *layout,
struct anv_descriptor_set_layout *layout,
struct anv_descriptor_set **out_set)
{
struct anv_descriptor_set *set;
@@ -455,8 +464,10 @@ anv_descriptor_set_create(struct anv_device *device,
}
}
set->size = size;
set->layout = layout;
anv_descriptor_set_layout_ref(layout);
set->size = size;
set->buffer_views =
(struct anv_buffer_view *) &set->descriptors[layout->size];
set->buffer_count = layout->buffer_count;
@@ -512,6 +523,8 @@ anv_descriptor_set_destroy(struct anv_device *device,
struct anv_descriptor_pool *pool,
struct anv_descriptor_set *set)
{
anv_descriptor_set_layout_unref(device, set->layout);
/* Put the buffer view surface state back on the free list. */
for (uint32_t b = 0; b < set->buffer_count; b++) {
struct surface_state_free_list_entry *entry =

View File

@@ -83,7 +83,7 @@ EXTENSIONS = [
Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'),
Extension('VK_KHR_xcb_surface', 6, 'VK_USE_PLATFORM_XCB_KHR'),
Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'),
Extension('VK_KHX_multiview', 1, False),
Extension('VK_KHX_multiview', 1, True),
Extension('VK_EXT_debug_report', 8, True),
Extension('VK_EXT_external_memory_dma_buf', 1, True),
]

View File

@@ -38,9 +38,10 @@ void anv_nir_lower_push_constants(nir_shader *shader);
bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask);
bool anv_nir_lower_ycbcr_textures(nir_shader *shader,
struct anv_pipeline *pipeline);
struct anv_pipeline_layout *layout);
void anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
struct anv_pipeline_layout *layout,
nir_shader *shader,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map);

View File

@@ -147,7 +147,7 @@ lower_res_reindex_intrinsic(nir_intrinsic_instr *intrin,
* array elements are sequential. A resource_reindex just turns into an
* add of the two indices.
*/
assert(intrin->src[0].is_ssa && intrin->src[0].is_ssa);
assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa);
nir_ssa_def *new_index = nir_iadd(b, intrin->src[0].ssa,
intrin->src[1].ssa);
@@ -326,11 +326,11 @@ setup_vec4_uniform_value(uint32_t *params, uint32_t offset, unsigned n)
void
anv_nir_apply_pipeline_layout(struct anv_pipeline *pipeline,
struct anv_pipeline_layout *layout,
nir_shader *shader,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map)
{
struct anv_pipeline_layout *layout = pipeline->layout;
gl_shader_stage stage = shader->info.stage;
struct apply_pipeline_layout_state state = {

View File

@@ -316,13 +316,13 @@ swizzle_channel(struct isl_swizzle swizzle, unsigned channel)
}
static bool
try_lower_tex_ycbcr(struct anv_pipeline *pipeline,
try_lower_tex_ycbcr(struct anv_pipeline_layout *layout,
nir_builder *builder,
nir_tex_instr *tex)
{
nir_variable *var = tex->texture->var;
const struct anv_descriptor_set_layout *set_layout =
pipeline->layout->set[var->data.descriptor_set].layout;
layout->set[var->data.descriptor_set].layout;
const struct anv_descriptor_set_binding_layout *binding =
&set_layout->binding[var->data.binding];
@@ -440,7 +440,8 @@ try_lower_tex_ycbcr(struct anv_pipeline *pipeline,
}
bool
anv_nir_lower_ycbcr_textures(nir_shader *shader, struct anv_pipeline *pipeline)
anv_nir_lower_ycbcr_textures(nir_shader *shader,
struct anv_pipeline_layout *layout)
{
bool progress = false;
@@ -458,7 +459,7 @@ anv_nir_lower_ycbcr_textures(nir_shader *shader, struct anv_pipeline *pipeline)
continue;
nir_tex_instr *tex = nir_instr_as_tex(instr);
function_progress |= try_lower_tex_ycbcr(pipeline, &builder, tex);
function_progress |= try_lower_tex_ycbcr(layout, &builder, tex);
}
}

View File

@@ -349,6 +349,7 @@ populate_cs_prog_key(const struct gen_device_info *devinfo,
static void
anv_pipeline_hash_shader(struct anv_pipeline *pipeline,
struct anv_pipeline_layout *layout,
struct anv_shader_module *module,
const char *entrypoint,
gl_shader_stage stage,
@@ -363,10 +364,8 @@ anv_pipeline_hash_shader(struct anv_pipeline *pipeline,
_mesa_sha1_update(&ctx, &pipeline->subpass->view_mask,
sizeof(pipeline->subpass->view_mask));
}
if (pipeline->layout) {
_mesa_sha1_update(&ctx, pipeline->layout->sha1,
sizeof(pipeline->layout->sha1));
}
if (layout)
_mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
_mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
_mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint));
_mesa_sha1_update(&ctx, &stage, sizeof(stage));
@@ -382,6 +381,7 @@ anv_pipeline_hash_shader(struct anv_pipeline *pipeline,
static nir_shader *
anv_pipeline_compile(struct anv_pipeline *pipeline,
void *mem_ctx,
struct anv_pipeline_layout *layout,
struct anv_shader_module *module,
const char *entrypoint,
gl_shader_stage stage,
@@ -398,7 +398,7 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
if (nir == NULL)
return NULL;
NIR_PASS_V(nir, anv_nir_lower_ycbcr_textures, pipeline);
NIR_PASS_V(nir, anv_nir_lower_ycbcr_textures, layout);
NIR_PASS_V(nir, anv_nir_lower_push_constants);
@@ -438,8 +438,8 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
pipeline->needs_data_cache = true;
/* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
if (pipeline->layout)
anv_nir_apply_pipeline_layout(pipeline, nir, prog_data, map);
if (layout)
anv_nir_apply_pipeline_layout(pipeline, layout, nir, prog_data, map);
if (stage != MESA_SHADER_COMPUTE)
brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
@@ -508,8 +508,10 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
populate_vs_prog_key(&pipeline->device->info, &key);
ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
if (cache) {
anv_pipeline_hash_shader(pipeline, module, entrypoint,
anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
MESA_SHADER_VERTEX, spec_info,
&key, sizeof(key), sha1);
bin = anv_pipeline_cache_search(cache, sha1, 20);
@@ -527,7 +529,7 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx,
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx, layout,
module, entrypoint,
MESA_SHADER_VERTEX, spec_info,
&prog_data.base.base, &map);
@@ -633,11 +635,13 @@ anv_pipeline_compile_tcs_tes(struct anv_pipeline *pipeline,
populate_sampler_prog_key(&pipeline->device->info, &tes_key.tex);
tcs_key.input_vertices = info->pTessellationState->patchControlPoints;
ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
if (cache) {
anv_pipeline_hash_shader(pipeline, tcs_module, tcs_entrypoint,
anv_pipeline_hash_shader(pipeline, layout, tcs_module, tcs_entrypoint,
MESA_SHADER_TESS_CTRL, tcs_spec_info,
&tcs_key, sizeof(tcs_key), tcs_sha1);
anv_pipeline_hash_shader(pipeline, tes_module, tes_entrypoint,
anv_pipeline_hash_shader(pipeline, layout, tes_module, tes_entrypoint,
MESA_SHADER_TESS_EVAL, tes_spec_info,
&tes_key, sizeof(tes_key), tes_sha1);
memcpy(&tcs_sha1[20], tes_sha1, 20);
@@ -666,11 +670,13 @@ anv_pipeline_compile_tcs_tes(struct anv_pipeline *pipeline,
void *mem_ctx = ralloc_context(NULL);
nir_shader *tcs_nir =
anv_pipeline_compile(pipeline, mem_ctx, tcs_module, tcs_entrypoint,
anv_pipeline_compile(pipeline, mem_ctx, layout,
tcs_module, tcs_entrypoint,
MESA_SHADER_TESS_CTRL, tcs_spec_info,
&tcs_prog_data.base.base, &tcs_map);
nir_shader *tes_nir =
anv_pipeline_compile(pipeline, mem_ctx, tes_module, tes_entrypoint,
anv_pipeline_compile(pipeline, mem_ctx, layout,
tes_module, tes_entrypoint,
MESA_SHADER_TESS_EVAL, tes_spec_info,
&tes_prog_data.base.base, &tes_map);
if (tcs_nir == NULL || tes_nir == NULL) {
@@ -771,8 +777,10 @@ anv_pipeline_compile_gs(struct anv_pipeline *pipeline,
populate_gs_prog_key(&pipeline->device->info, &key);
ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
if (cache) {
anv_pipeline_hash_shader(pipeline, module, entrypoint,
anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
MESA_SHADER_GEOMETRY, spec_info,
&key, sizeof(key), sha1);
bin = anv_pipeline_cache_search(cache, sha1, 20);
@@ -790,7 +798,7 @@ anv_pipeline_compile_gs(struct anv_pipeline *pipeline,
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx,
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx, layout,
module, entrypoint,
MESA_SHADER_GEOMETRY, spec_info,
&prog_data.base.base, &map);
@@ -849,8 +857,10 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
populate_wm_prog_key(pipeline, info, &key);
ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
if (cache) {
anv_pipeline_hash_shader(pipeline, module, entrypoint,
anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
MESA_SHADER_FRAGMENT, spec_info,
&key, sizeof(key), sha1);
bin = anv_pipeline_cache_search(cache, sha1, 20);
@@ -868,7 +878,7 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx,
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx, layout,
module, entrypoint,
MESA_SHADER_FRAGMENT, spec_info,
&prog_data.base, &map);
@@ -997,8 +1007,10 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
populate_cs_prog_key(&pipeline->device->info, &key);
ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
if (cache) {
anv_pipeline_hash_shader(pipeline, module, entrypoint,
anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
MESA_SHADER_COMPUTE, spec_info,
&key, sizeof(key), sha1);
bin = anv_pipeline_cache_search(cache, sha1, 20);
@@ -1016,7 +1028,7 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx,
nir_shader *nir = anv_pipeline_compile(pipeline, mem_ctx, layout,
module, entrypoint,
MESA_SHADER_COMPUTE, spec_info,
&prog_data.base, &map);
@@ -1279,8 +1291,6 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
assert(pCreateInfo->subpass < render_pass->subpass_count);
pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);
result = anv_reloc_list_init(&pipeline->batch_relocs, alloc);
if (result != VK_SUCCESS)
return result;

View File

@@ -1199,6 +1199,9 @@ struct anv_descriptor_set_binding_layout {
};
struct anv_descriptor_set_layout {
/* Descriptor set layouts can be destroyed at almost any time */
uint32_t ref_cnt;
/* Number of bindings in this descriptor set */
uint16_t binding_count;
@@ -1218,6 +1221,22 @@ struct anv_descriptor_set_layout {
struct anv_descriptor_set_binding_layout binding[0];
};
static inline void
anv_descriptor_set_layout_ref(struct anv_descriptor_set_layout *layout)
{
assert(layout && layout->ref_cnt >= 1);
p_atomic_inc(&layout->ref_cnt);
}
static inline void
anv_descriptor_set_layout_unref(struct anv_device *device,
struct anv_descriptor_set_layout *layout)
{
assert(layout && layout->ref_cnt >= 1);
if (p_atomic_dec_zero(&layout->ref_cnt))
vk_free(&device->alloc, layout);
}
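For clarity, the lifetime these helpers establish across the hunks above and below (an illustrative summary, not code from the patch):

/* Illustrative lifetime:
 *   vkCreateDescriptorSetLayout:   ref_cnt = 1 (the application's handle)
 *   anv_CreatePipelineLayout:      +1 per set layout it stores
 *   anv_descriptor_set_create:     +1 for the set's back-pointer
 *   vkDestroyDescriptorSetLayout:  -1, possibly not the last reference
 *   final unref (pipeline layout or set destruction): vk_free()
 */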
struct anv_descriptor {
VkDescriptorType type;
@@ -1239,7 +1258,7 @@ struct anv_descriptor {
};
struct anv_descriptor_set {
const struct anv_descriptor_set_layout *layout;
struct anv_descriptor_set_layout *layout;
uint32_t size;
uint32_t buffer_count;
struct anv_buffer_view *buffer_views;
@@ -1363,7 +1382,7 @@ anv_descriptor_set_write_template(struct anv_descriptor_set *set,
VkResult
anv_descriptor_set_create(struct anv_device *device,
struct anv_descriptor_pool *pool,
const struct anv_descriptor_set_layout *layout,
struct anv_descriptor_set_layout *layout,
struct anv_descriptor_set **out_set);
void
@@ -1675,6 +1694,7 @@ struct anv_attachment_state {
*/
struct anv_cmd_pipeline_state {
struct anv_pipeline *pipeline;
struct anv_pipeline_layout *layout;
struct anv_descriptor_set *descriptors[MAX_SETS];
uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
@@ -2124,7 +2144,6 @@ struct anv_pipeline {
struct anv_dynamic_state dynamic_state;
struct anv_subpass * subpass;
struct anv_pipeline_layout * layout;
bool needs_data_cache;

View File

@@ -1529,7 +1529,6 @@ anv_descriptor_for_binding(const struct anv_cmd_pipeline_state *pipe_state,
static uint32_t
dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state,
const struct anv_pipeline *pipeline,
const struct anv_pipeline_binding *binding)
{
assert(binding->set < MAX_SETS);
@@ -1537,7 +1536,7 @@ dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state,
pipe_state->descriptors[binding->set];
uint32_t dynamic_offset_idx =
pipeline->layout->set[binding->set].dynamic_offset_start +
pipe_state->layout->set[binding->set].dynamic_offset_start +
set->layout->binding[binding->binding].dynamic_offset_index +
binding->index;
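The flattened index is the sum of three terms; a worked example with assumed numbers (not from the source):

/* If set 1's dynamic offsets start at flattened slot 3, the binding's
 * first dynamic descriptor sits at index 2 within the set, and the
 * shader touches array element 1 of that binding, the draw reads
 * pipe_state->dynamic_offsets[3 + 2 + 1]. (Numbers are illustrative.)
 */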
@@ -1725,7 +1724,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
/* Compute the offset within the buffer */
uint32_t dynamic_offset =
dynamic_offset_for_binding(pipe_state, pipeline, binding);
dynamic_offset_for_binding(pipe_state, binding);
uint64_t offset = desc->offset + dynamic_offset;
/* Clamp to the buffer size */
offset = MIN2(offset, desc->buffer->size);
@@ -2000,8 +1999,7 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
uint32_t dynamic_offset =
dynamic_offset_for_binding(&gfx_state->base,
pipeline, binding);
dynamic_offset_for_binding(&gfx_state->base, binding);
uint32_t buf_offset =
MIN2(desc->offset + dynamic_offset, desc->buffer->size);
uint32_t buf_range =
@@ -3208,17 +3206,6 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
if (GEN_GEN == 7)
cmd_buffer->state.gfx.vb_dirty |= ~0;
/* It is possible to start a render pass with an old pipeline. Because the
* render pass and subpass index are both baked into the pipeline, this is
* highly unlikely. In order to do so, it requires that you have a render
* pass with a single subpass and that you use that render pass twice
* back-to-back and use the same pipeline at the start of the second render
* pass as at the end of the first. In order to avoid unpredictable issues
* with this edge case, we just dirty the pipeline at the start of every
* subpass.
*/
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
/* Perform transitions to the subpass layout before any writes have
* occurred.
*/

View File

@@ -1756,7 +1756,6 @@ compute_pipeline_create(
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
pipeline->device = device;
pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);
pipeline->blend_state.map = NULL;

View File

@@ -110,8 +110,7 @@ static char *loader_get_dri_config_device_id(void)
static char *drm_construct_id_path_tag(drmDevicePtr device)
{
/* Length of "pci-xxxx_xx_xx_x\0" */
#define PCI_ID_PATH_TAG_LENGTH 17
#define PCI_ID_PATH_TAG_LENGTH sizeof("pci-xxxx_xx_xx_x")
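/* Both forms evaluate to 17: sizeof on a string literal counts the
 * terminating NUL, so sizeof("pci-xxxx_xx_xx_x") == 16 + 1. Quick check
 * (illustrative, not part of the patch):
 *   _Static_assert(sizeof("pci-xxxx_xx_xx_x") == 17, "tag length");
 */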
char *tag = NULL;
if (device->bustype == DRM_BUS_PCI) {

View File

@@ -2,9 +2,6 @@
#
# mapi may be used in several ways
#
# - In default mode, mapi implements the interface defined by mapi.h. To use
# this mode, compile MAPI_FILES.
#
# - In util mode, mapi provides utility functions for use with glapi. To use
# this mode, compile MAPI_UTIL_FILES with MAPI_MODE_UTIL defined.
#
@@ -30,14 +27,6 @@ MAPI_BRIDGE_FILES = \
entry_ppc64le_tsd.h \
mapi_tmp.h
MAPI_FILES = \
entry.c \
stub.c \
stub.h \
table.c \
table.h \
$(MAPI_UTIL_FILES)
MAPI_GLAPI_FILES = \
entry.c \
mapi_glapi.c \

View File

@@ -56,7 +56,6 @@ header = """/* GLXEXT is the define used in the xserver when the GLX extension i
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "main/glheader.h"
@@ -145,19 +144,6 @@ _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) {
return disp;
}
void
_glapi_table_patch(struct _glapi_table *table, const char *name, void *wrapper)
{
for (int func_index = 0; func_index < GLAPI_TABLE_COUNT; ++func_index) {
if (!strcmp(_glapi_table_func_names[func_index], name)) {
((void **)table)[func_index] = wrapper;
return;
}
}
fprintf(stderr, "could not patch %s in dispatch table\\n", name);
}
"""

View File

@@ -161,9 +161,6 @@ _glapi_get_proc_name(unsigned int offset);
#if defined(GLX_USE_APPLEGL) || defined(GLX_USE_WINDOWSGL)
_GLAPI_EXPORT struct _glapi_table *
_glapi_create_table_from_handle(void *handle, const char *symbol_prefix);
_GLAPI_EXPORT void
_glapi_table_patch(struct _glapi_table *, const char *name, void *wrapper);
#endif

View File

@@ -27,25 +27,12 @@
#define _GLAPI_PRIV_H
#ifdef HAVE_DIX_CONFIG_H
#include <dix-config.h>
#include "glapi/mesa.h"
#else /* HAVE_DIX_CONFIG_H */
#define GL_GLEXT_PROTOTYPES
#include "GL/gl.h"
#include "GL/glext.h"
#ifndef GL_OES_fixed_point
typedef int GLfixed;
#endif
typedef int GLclampx;
#ifndef GL_OES_EGL_image
typedef void *GLeglImageOES;
#endif
#endif /* HAVE_DIX_CONFIG_H */
#include "glapi/glapi.h"

View File

@@ -393,7 +393,6 @@ VBO_FILES = \
vbo/vbo_attrib.h \
vbo/vbo_attrib_tmp.h \
vbo/vbo_context.c \
vbo/vbo_context.h \
vbo/vbo_exec_api.c \
vbo/vbo_exec_array.c \
vbo/vbo_exec.c \
@@ -405,6 +404,7 @@ VBO_FILES = \
vbo/vbo_noop.c \
vbo/vbo_noop.h \
vbo/vbo_primitive_restart.c \
vbo/vbo_private.h \
vbo/vbo_rebase.c \
vbo/vbo_save_api.c \
vbo/vbo_save.c \

View File

@@ -277,7 +277,7 @@ _mesa_init_driver_state(struct gl_context *ctx)
}
ctx->Driver.LineWidth(ctx, ctx->Line.Width);
ctx->Driver.LogicOpcode(ctx, ctx->Color.LogicOp);
ctx->Driver.LogicOpcode(ctx, ctx->Color._LogicOp);
ctx->Driver.PointSize(ctx, ctx->Point.Size);
ctx->Driver.PolygonStipple(ctx, (const GLubyte *) ctx->PolygonStipple);
ctx->Driver.Scissor(ctx);

View File

@@ -57,7 +57,7 @@ mesa_dri_drivers_la_LDFLAGS = \
-module \
-no-undefined \
-avoid-version \
$(LD_BUILD_ID) \
-Wl,--build-id=sha1 \
$(BSYMBOLIC) \
$(GC_SECTIONS) \
$(LD_NO_UNDEFINED)

View File

@@ -573,16 +573,16 @@ i830Scissor(struct gl_context * ctx)
}
static void
i830LogicOp(struct gl_context * ctx, GLenum opcode)
i830LogicOp(struct gl_context * ctx, enum gl_logicop_mode opcode)
{
struct i830_context *i830 = i830_context(ctx);
int tmp = intel_translate_logic_op(opcode);
DBG("%s\n", __func__);
assert((unsigned)opcode <= 15);
I830_STATECHANGE(i830, I830_UPLOAD_CTX);
i830->state.Ctx[I830_CTXREG_STATE4] &= ~LOGICOP_MASK;
i830->state.Ctx[I830_CTXREG_STATE4] |= LOGIC_OP_FUNC(tmp);
i830->state.Ctx[I830_CTXREG_STATE4] |= opcode;
}

View File

@@ -539,16 +539,16 @@ i915Scissor(struct gl_context * ctx)
}
static void
i915LogicOp(struct gl_context * ctx, GLenum opcode)
i915LogicOp(struct gl_context * ctx, enum gl_logicop_mode opcode)
{
struct i915_context *i915 = I915_CONTEXT(ctx);
int tmp = intel_translate_logic_op(opcode);
DBG("%s\n", __func__);
assert((unsigned)opcode <= 15);
I915_STATECHANGE(i915, I915_UPLOAD_CTX);
i915->state.Ctx[I915_CTXREG_STATE4] &= ~LOGICOP_MASK;
i915->state.Ctx[I915_CTXREG_STATE4] |= LOGIC_OP_FUNC(tmp);
i915->state.Ctx[I915_CTXREG_STATE4] |= LOGIC_OP_FUNC(opcode);
}

View File

@@ -48,27 +48,9 @@ intel_miptree_set_alpha_to_one(struct intel_context *intel,
struct intel_mipmap_tree *mt,
int x, int y, int width, int height);
static GLuint translate_raster_op(GLenum logicop)
static GLuint translate_raster_op(enum gl_logicop_mode logicop)
{
switch(logicop) {
case GL_CLEAR: return 0x00;
case GL_AND: return 0x88;
case GL_AND_REVERSE: return 0x44;
case GL_COPY: return 0xCC;
case GL_AND_INVERTED: return 0x22;
case GL_NOOP: return 0xAA;
case GL_XOR: return 0x66;
case GL_OR: return 0xEE;
case GL_NOR: return 0x11;
case GL_EQUIV: return 0x99;
case GL_INVERT: return 0x55;
case GL_OR_REVERSE: return 0xDD;
case GL_COPY_INVERTED: return 0x33;
case GL_OR_INVERTED: return 0xBB;
case GL_NAND: return 0x77;
case GL_SET: return 0xFF;
default: return 0;
}
return logicop | (logicop << 4);
}
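The one-liner works because each gl_logicop_mode value is the 4-bit (src, dst) truth table of its GL op, and the blitter's 8-bit raster op is that table replicated in both nibbles. A spot-check against the deleted switch (a sketch; it assumes Mesa's enum values COLOR_LOGICOP_AND = 0x8, COLOR_LOGICOP_XOR = 0x6, COLOR_LOGICOP_COPY = 0xC, and <assert.h>):

/* Nibble duplication reproduces the deleted lookup table. */
assert(translate_raster_op(COLOR_LOGICOP_AND)  == 0x88);   /* was GL_AND  */
assert(translate_raster_op(COLOR_LOGICOP_XOR)  == 0x66);   /* was GL_XOR  */
assert(translate_raster_op(COLOR_LOGICOP_COPY) == 0xCC);   /* was GL_COPY */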
static uint32_t
@@ -90,144 +72,23 @@ br13_for_cpp(int cpp)
}
}
/**
* Implements a rectangular block transfer (blit) of pixels between two
* miptrees.
*
* Our blitter can operate on 1, 2, or 4-byte-per-pixel data, with generous,
* but limited, pitches and sizes allowed.
*
* The src/dst coordinates are relative to the given level/slice of the
* miptree.
*
* If @src_flip or @dst_flip is set, then the rectangle within that miptree
* will be inverted (including scanline order) when copying. This is common
* in GL when copying between window system and user-created
* renderbuffers/textures.
*/
bool
intel_miptree_blit(struct intel_context *intel,
struct intel_mipmap_tree *src_mt,
int src_level, int src_slice,
uint32_t src_x, uint32_t src_y, bool src_flip,
struct intel_mipmap_tree *dst_mt,
int dst_level, int dst_slice,
uint32_t dst_x, uint32_t dst_y, bool dst_flip,
uint32_t width, uint32_t height,
GLenum logicop)
{
/* No sRGB decode or encode is done by the hardware blitter, which is
* consistent with what we want in the callers (glCopyTexSubImage(),
* glBlitFramebuffer(), texture validation, etc.).
*/
mesa_format src_format = _mesa_get_srgb_format_linear(src_mt->format);
mesa_format dst_format = _mesa_get_srgb_format_linear(dst_mt->format);
/* The blitter doesn't support doing any format conversions. We do also
* support blitting ARGB8888 to XRGB8888 (trivial, the values dropped into
* the X channel don't matter), and XRGB8888 to ARGB8888 by setting the A
* channel to 1.0 at the end.
*/
if (src_format != dst_format &&
((src_format != MESA_FORMAT_B8G8R8A8_UNORM &&
src_format != MESA_FORMAT_B8G8R8X8_UNORM) ||
(dst_format != MESA_FORMAT_B8G8R8A8_UNORM &&
dst_format != MESA_FORMAT_B8G8R8X8_UNORM))) {
perf_debug("%s: Can't use hardware blitter from %s to %s, "
"falling back.\n", __func__,
_mesa_get_format_name(src_format),
_mesa_get_format_name(dst_format));
return false;
}
/* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics
* Data Size Limitations):
*
* The BLT engine is capable of transferring very large quantities of
* graphics data. Any graphics data read from and written to the
* destination is permitted to represent a number of pixels that
* occupies up to 65,536 scan lines and up to 32,768 bytes per scan line
* at the destination. The maximum number of pixels that may be
represented per scan line's worth of graphics data depends on the
* color depth.
*
* Furthermore, intelEmitCopyBlit (which is called below) uses a signed
* 16-bit integer to represent buffer pitch, so it can only handle buffer
* pitches < 32k.
*
* As a result of these two limitations, we can only use the blitter to do
* this copy when the region's pitch is less than 32k.
*/
if (src_mt->region->pitch > 32768 ||
dst_mt->region->pitch > 32768) {
perf_debug("Falling back due to >32k pitch\n");
return false;
}
if (src_flip)
src_y = src_mt->level[src_level].height - src_y - height;
if (dst_flip)
dst_y = dst_mt->level[dst_level].height - dst_y - height;
int src_pitch = src_mt->region->pitch;
if (src_flip != dst_flip)
src_pitch = -src_pitch;
uint32_t src_image_x, src_image_y;
intel_miptree_get_image_offset(src_mt, src_level, src_slice,
&src_image_x, &src_image_y);
src_x += src_image_x;
src_y += src_image_y;
uint32_t dst_image_x, dst_image_y;
intel_miptree_get_image_offset(dst_mt, dst_level, dst_slice,
&dst_image_x, &dst_image_y);
dst_x += dst_image_x;
dst_y += dst_image_y;
if (!intelEmitCopyBlit(intel,
src_mt->cpp,
src_pitch,
src_mt->region->bo, src_mt->offset,
src_mt->region->tiling,
dst_mt->region->pitch,
dst_mt->region->bo, dst_mt->offset,
dst_mt->region->tiling,
src_x, src_y,
dst_x, dst_y,
width, height,
logicop)) {
return false;
}
if (src_mt->format == MESA_FORMAT_B8G8R8X8_UNORM &&
dst_mt->format == MESA_FORMAT_B8G8R8A8_UNORM) {
intel_miptree_set_alpha_to_one(intel, dst_mt,
dst_x, dst_y,
width, height);
}
return true;
}
/* Copy BitBlt
*/
bool
intelEmitCopyBlit(struct intel_context *intel,
GLuint cpp,
GLshort src_pitch,
drm_intel_bo *src_buffer,
GLuint src_offset,
uint32_t src_tiling,
GLshort dst_pitch,
drm_intel_bo *dst_buffer,
GLuint dst_offset,
uint32_t dst_tiling,
GLshort src_x, GLshort src_y,
GLshort dst_x, GLshort dst_y,
GLshort w, GLshort h,
GLenum logic_op)
static bool
emit_copy_blit(struct intel_context *intel,
GLuint cpp,
GLshort src_pitch,
drm_intel_bo *src_buffer,
GLuint src_offset,
uint32_t src_tiling,
GLshort dst_pitch,
drm_intel_bo *dst_buffer,
GLuint dst_offset,
uint32_t dst_tiling,
GLshort src_x, GLshort src_y,
GLshort dst_x, GLshort dst_y,
GLshort w, GLshort h,
enum gl_logicop_mode logic_op)
{
GLuint CMD, BR13, pass = 0;
int dst_y2 = dst_y + h;
@@ -338,6 +199,126 @@ intelEmitCopyBlit(struct intel_context *intel,
return true;
}
/**
* Implements a rectangular block transfer (blit) of pixels between two
* miptrees.
*
* Our blitter can operate on 1, 2, or 4-byte-per-pixel data, with generous,
* but limited, pitches and sizes allowed.
*
* The src/dst coordinates are relative to the given level/slice of the
* miptree.
*
* If @src_flip or @dst_flip is set, then the rectangle within that miptree
* will be inverted (including scanline order) when copying. This is common
* in GL when copying between window system and user-created
* renderbuffers/textures.
*/
bool
intel_miptree_blit(struct intel_context *intel,
struct intel_mipmap_tree *src_mt,
int src_level, int src_slice,
uint32_t src_x, uint32_t src_y, bool src_flip,
struct intel_mipmap_tree *dst_mt,
int dst_level, int dst_slice,
uint32_t dst_x, uint32_t dst_y, bool dst_flip,
uint32_t width, uint32_t height,
enum gl_logicop_mode logicop)
{
/* No sRGB decode or encode is done by the hardware blitter, which is
* consistent with what we want in the callers (glCopyTexSubImage(),
* glBlitFramebuffer(), texture validation, etc.).
*/
mesa_format src_format = _mesa_get_srgb_format_linear(src_mt->format);
mesa_format dst_format = _mesa_get_srgb_format_linear(dst_mt->format);
/* The blitter doesn't support doing any format conversions. We do also
* support blitting ARGB8888 to XRGB8888 (trivial, the values dropped into
* the X channel don't matter), and XRGB8888 to ARGB8888 by setting the A
* channel to 1.0 at the end.
*/
if (src_format != dst_format &&
((src_format != MESA_FORMAT_B8G8R8A8_UNORM &&
src_format != MESA_FORMAT_B8G8R8X8_UNORM) ||
(dst_format != MESA_FORMAT_B8G8R8A8_UNORM &&
dst_format != MESA_FORMAT_B8G8R8X8_UNORM))) {
perf_debug("%s: Can't use hardware blitter from %s to %s, "
"falling back.\n", __func__,
_mesa_get_format_name(src_format),
_mesa_get_format_name(dst_format));
return false;
}
/* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics
* Data Size Limitations):
*
* The BLT engine is capable of transferring very large quantities of
* graphics data. Any graphics data read from and written to the
* destination is permitted to represent a number of pixels that
* occupies up to 65,536 scan lines and up to 32,768 bytes per scan line
* at the destination. The maximum number of pixels that may be
represented per scan line's worth of graphics data depends on the
* color depth.
*
* Furthermore, emit_copy_blit (which is called below) uses a signed
* 16-bit integer to represent buffer pitch, so it can only handle buffer
* pitches < 32k.
*
* As a result of these two limitations, we can only use the blitter to do
* this copy when the region's pitch is less than 32k.
*/
if (src_mt->region->pitch > 32768 ||
dst_mt->region->pitch > 32768) {
perf_debug("Falling back due to >32k pitch\n");
return false;
}
if (src_flip)
src_y = src_mt->level[src_level].height - src_y - height;
if (dst_flip)
dst_y = dst_mt->level[dst_level].height - dst_y - height;
int src_pitch = src_mt->region->pitch;
if (src_flip != dst_flip)
src_pitch = -src_pitch;
uint32_t src_image_x, src_image_y;
intel_miptree_get_image_offset(src_mt, src_level, src_slice,
&src_image_x, &src_image_y);
src_x += src_image_x;
src_y += src_image_y;
uint32_t dst_image_x, dst_image_y;
intel_miptree_get_image_offset(dst_mt, dst_level, dst_slice,
&dst_image_x, &dst_image_y);
dst_x += dst_image_x;
dst_y += dst_image_y;
if (!emit_copy_blit(intel,
src_mt->cpp,
src_pitch,
src_mt->region->bo, src_mt->offset,
src_mt->region->tiling,
dst_mt->region->pitch,
dst_mt->region->bo, dst_mt->offset,
dst_mt->region->tiling,
src_x, src_y,
dst_x, dst_y,
width, height,
logicop)) {
return false;
}
if (src_mt->format == MESA_FORMAT_B8G8R8X8_UNORM &&
dst_mt->format == MESA_FORMAT_B8G8R8A8_UNORM) {
intel_miptree_set_alpha_to_one(intel, dst_mt,
dst_x, dst_y,
width, height);
}
return true;
}
/**
* Use blitting to clear the renderbuffers named by 'flags'.
@@ -523,7 +504,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
uint32_t dst_tiling,
GLshort x, GLshort y,
GLshort w, GLshort h,
GLenum logic_op)
enum gl_logicop_mode logic_op)
{
int dwords = ALIGN(src_size, 8) / 4;
uint32_t opcode, br13, blit_cmd;
@@ -535,7 +516,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
return false;
}
assert((logic_op >= GL_CLEAR) && (logic_op <= (GL_CLEAR + 0x0f)));
assert((unsigned)logic_op <= 0x0f);
assert(dst_pitch > 0);
if (w < 0 || h < 0)
@@ -607,13 +588,13 @@ intel_emit_linear_blit(struct intel_context *intel,
*/
pitch = ROUND_DOWN_TO(MIN2(size, (1 << 15) - 1), 4);
height = (pitch == 0) ? 1 : size / pitch;
ok = intelEmitCopyBlit(intel, 1,
pitch, src_bo, src_offset, I915_TILING_NONE,
pitch, dst_bo, dst_offset, I915_TILING_NONE,
0, 0, /* src x/y */
0, 0, /* dst x/y */
pitch, height, /* w, h */
GL_COPY);
ok = emit_copy_blit(intel, 1,
pitch, src_bo, src_offset, I915_TILING_NONE,
pitch, dst_bo, dst_offset, I915_TILING_NONE,
0, 0, /* src x/y */
0, 0, /* dst x/y */
pitch, height, /* w, h */
COLOR_LOGICOP_COPY);
if (!ok)
_mesa_problem(ctx, "Failed to linear blit %dx%d\n", pitch, height);
@@ -623,13 +604,13 @@ intel_emit_linear_blit(struct intel_context *intel,
assert (size < (1 << 15));
pitch = ALIGN(size, 4);
if (size != 0) {
ok = intelEmitCopyBlit(intel, 1,
pitch, src_bo, src_offset, I915_TILING_NONE,
pitch, dst_bo, dst_offset, I915_TILING_NONE,
0, 0, /* src x/y */
0, 0, /* dst x/y */
size, 1, /* w, h */
GL_COPY);
ok = emit_copy_blit(intel, 1,
pitch, src_bo, src_offset, I915_TILING_NONE,
pitch, dst_bo, dst_offset, I915_TILING_NONE,
0, 0, /* src x/y */
0, 0, /* dst x/y */
size, 1, /* w, h */
COLOR_LOGICOP_COPY);
if (!ok)
_mesa_problem(ctx, "Failed to linear blit %dx%d\n", size, 1);
}

View File

@@ -35,22 +35,6 @@ extern void intelCopyBuffer(const __DRIdrawable * dpriv,
extern GLbitfield intelClearWithBlit(struct gl_context * ctx, GLbitfield mask);
bool
intelEmitCopyBlit(struct intel_context *intel,
GLuint cpp,
GLshort src_pitch,
drm_intel_bo *src_buffer,
GLuint src_offset,
uint32_t src_tiling,
GLshort dst_pitch,
drm_intel_bo *dst_buffer,
GLuint dst_offset,
uint32_t dst_tiling,
GLshort srcx, GLshort srcy,
GLshort dstx, GLshort dsty,
GLshort w, GLshort h,
GLenum logicop );
bool intel_miptree_blit(struct intel_context *intel,
struct intel_mipmap_tree *src_mt,
int src_level, int src_slice,
@@ -59,7 +43,7 @@ bool intel_miptree_blit(struct intel_context *intel,
int dst_level, int dst_slice,
uint32_t dst_x, uint32_t dst_y, bool dst_flip,
uint32_t width, uint32_t height,
GLenum logicop);
enum gl_logicop_mode logicop);
bool
intelEmitImmediateColorExpandBlit(struct intel_context *intel,
@@ -72,7 +56,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
                                  uint32_t dst_tiling,
                                  GLshort x, GLshort y,
                                  GLshort w, GLshort h,
-                                 GLenum logic_op);
+                                 enum gl_logicop_mode logic_op);
void intel_emit_linear_blit(struct intel_context *intel,
                            drm_intel_bo *dst_bo,
                            unsigned int dst_offset,

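With the prototypes above converted, every remaining blit entry point takes
the derived enum rather than a raw GLenum. A hedged usage sketch (argument
values illustrative only):

   bool ok = intel_miptree_blit(intel,
                                src_mt, 0 /* level */, 0 /* slice */,
                                0, 0, false,
                                dst_mt, 0, 0,
                                0, 0, false,
                                width, height, COLOR_LOGICOP_COPY);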

@@ -421,7 +421,6 @@ extern int intel_translate_shadow_compare_func(GLenum func);
extern int intel_translate_compare_func(GLenum func);
extern int intel_translate_stencil_op(GLenum op);
extern int intel_translate_blend_factor(GLenum factor);
-extern int intel_translate_logic_op(GLenum opcode);

void intel_update_renderbuffers(__DRIcontext *context,
                                __DRIdrawable *drawable);


@@ -287,7 +287,7 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
 * intel_process_dri2_buffer().
 */
static GLboolean
-intel_alloc_window_storage(struct gl_context * ctx, struct gl_renderbuffer *rb,
+intel_alloc_window_storage(UNUSED struct gl_context *ctx, struct gl_renderbuffer *rb,
                           GLenum internalFormat, GLuint width, GLuint height)
{
   assert(rb->Name == 0);
@@ -300,8 +300,10 @@ intel_alloc_window_storage(struct gl_context * ctx, struct gl_renderbuffer *rb,
/** Dummy function for gl_renderbuffer::AllocStorage() */
static GLboolean
-intel_nop_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *rb,
-                        GLenum internalFormat, GLuint width, GLuint height)
+intel_nop_alloc_storage(UNUSED struct gl_context *ctx,
+                        UNUSED struct gl_renderbuffer *rb,
+                        UNUSED GLenum internalFormat,
+                        UNUSED GLuint width, UNUSED GLuint height)
{
   _mesa_problem(ctx, "intel_op_alloc_storage should never be called.");
   return false;
@@ -393,7 +395,8 @@ intel_new_renderbuffer(struct gl_context * ctx, GLuint name)
 */
static void
intel_bind_framebuffer(struct gl_context * ctx, GLenum target,
-                       struct gl_framebuffer *fb, struct gl_framebuffer *fbread)
+                       UNUSED struct gl_framebuffer *fb,
+                       UNUSED struct gl_framebuffer *fbread)
{
   if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) {
      intel_draw_buffer(ctx);
@@ -419,8 +422,7 @@ intel_framebuffer_renderbuffer(struct gl_context * ctx,
}

static bool
-intel_renderbuffer_update_wrapper(struct intel_context *intel,
-                                  struct intel_renderbuffer *irb,
+intel_renderbuffer_update_wrapper(struct intel_renderbuffer *irb,
                                  struct gl_texture_image *image,
                                  uint32_t layer)
{
@@ -468,7 +470,6 @@ intel_render_texture(struct gl_context * ctx,
                     struct gl_framebuffer *fb,
                     struct gl_renderbuffer_attachment *att)
{
-   struct intel_context *intel = intel_context(ctx);
   struct gl_renderbuffer *rb = att->Renderbuffer;
   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
   struct gl_texture_image *image = rb->TexImage;
@@ -495,7 +496,7 @@ intel_render_texture(struct gl_context * ctx,
   intel_miptree_check_level_layer(mt, att->TextureLevel, layer);

-   if (!intel_renderbuffer_update_wrapper(intel, irb, image, layer)) {
+   if (!intel_renderbuffer_update_wrapper(irb, image, layer)) {
      _swrast_render_texture(ctx, fb, att);
      return;
   }
@@ -641,7 +642,7 @@ intel_blit_framebuffer_with_blitter(struct gl_context *ctx,
                                   GLint srcX1, GLint srcY1,
                                   GLint dstX0, GLint dstY0,
                                   GLint dstX1, GLint dstY1,
-                                  GLbitfield mask, GLenum filter)
+                                  GLbitfield mask)
{
   struct intel_context *intel = intel_context(ctx);
@@ -714,7 +715,7 @@ intel_blit_framebuffer_with_blitter(struct gl_context *ctx,
                             dst_irb->mt,
                             dst_irb->mt_level, dst_irb->mt_layer,
                             dstX0, dstY0, dst_rb->Name == 0,
-                            dstX1 - dstX0, dstY1 - dstY0, GL_COPY)) {
+                            dstX1 - dstX0, dstY1 - dstY0, COLOR_LOGICOP_COPY)) {
         perf_debug("glBlitFramebuffer(): unknown blit failure. "
                    "Falling back to software rendering.\n");
         return mask;
@@ -739,7 +740,7 @@ intel_blit_framebuffer(struct gl_context *ctx,
   mask = intel_blit_framebuffer_with_blitter(ctx, readFb, drawFb,
                                              srcX0, srcY0, srcX1, srcY1,
                                              dstX0, dstY0, dstX1, dstY1,
-                                             mask, filter);
+                                             mask);
   if (mask == 0x0)
      return;
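The UNUSED annotations above silence unused-parameter warnings for callbacks
that must keep their signature but ignore some arguments. A minimal sketch of
such a macro, assuming a GCC-compatible compiler (Mesa's actual definition
lives in its utility headers):

   #ifdef __GNUC__
   #define UNUSED __attribute__((unused))
   #else
   #define UNUSED
   #endif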


@@ -620,7 +620,7 @@ intel_miptree_copy_slice(struct intel_context *intel,
   if (!intel_miptree_blit(intel,
                           src_mt, level, slice, 0, 0, false,
                           dst_mt, level, slice, 0, 0, false,
-                          width, height, GL_COPY)) {
+                          width, height, COLOR_LOGICOP_COPY)) {
      perf_debug("miptree validate blit for %s failed\n",
                 _mesa_get_format_name(format));
@@ -757,7 +757,7 @@ intel_miptree_map_blit(struct intel_context *intel,
                           map->x, map->y, false,
                           map->mt, 0, 0,
                           0, 0, false,
-                          map->w, map->h, GL_COPY)) {
+                          map->w, map->h, COLOR_LOGICOP_COPY)) {
      fprintf(stderr, "Failed to blit\n");
      goto fail;
   }
@@ -795,7 +795,7 @@ intel_miptree_unmap_blit(struct intel_context *intel,
                         0, 0, false,
                         mt, level, slice,
                         map->x, map->y, false,
-                        map->w, map->h, GL_COPY);
+                        map->w, map->h, COLOR_LOGICOP_COPY);

   WARN_ONCE(!ok, "Failed to blit from linear temporary mapping");
}


@@ -262,8 +262,8 @@ do_blit_bitmap( struct gl_context *ctx,
         int h = MIN2(DY, height - py);
         int w = MIN2(DX, width - px);
         GLuint sz = ALIGN(ALIGN(w,8) * h, 64)/8;
-        GLenum logic_op = ctx->Color.ColorLogicOpEnabled ?
-           ctx->Color.LogicOp : GL_COPY;
+        const enum gl_logicop_mode logic_op = ctx->Color.ColorLogicOpEnabled ?
+           ctx->Color._LogicOp : COLOR_LOGICOP_COPY;

         assert(sz <= sizeof(stipple));
         memset(stipple, 0, sz);
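Note the switch from the raw GLenum Color.LogicOp to the derived
Color._LogicOp, which core Mesa computes whenever glLogicOp() state changes.
A hypothetical sketch of that derivation (helper name assumed; the real code
lives in core Mesa, not in this driver):

   static enum gl_logicop_mode
   derive_logicop_mode(GLenum gl_op)
   {
      switch (gl_op) {
      case GL_COPY: return COLOR_LOGICOP_COPY;
      case GL_XOR:  return COLOR_LOGICOP_XOR;
      case GL_SET:  return COLOR_LOGICOP_SET;
      /* ... one case per remaining GL logic op ... */
      default:      return COLOR_LOGICOP_COPY;
      }
   }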


@@ -176,7 +176,7 @@ do_blit_copypixels(struct gl_context * ctx,
                            dstx, dsty, _mesa_is_winsys_fbo(fb),
                            width, height,
                            (ctx->Color.ColorLogicOpEnabled ?
-                            ctx->Color.LogicOp : GL_COPY))) {
+                            ctx->Color._LogicOp : COLOR_LOGICOP_COPY))) {
      DBG("%s: blit failure\n", __func__);
      return false;
   }


@@ -141,7 +141,7 @@ do_blit_readpixels(struct gl_context * ctx,
                           x, y, _mesa_is_winsys_fbo(ctx->ReadBuffer),
                           pbo_mt, 0, 0,
                           0, 0, dst_flip,
-                          width, height, GL_COPY)) {
+                          width, height, COLOR_LOGICOP_COPY)) {
      intel_miptree_release(&pbo_mt);
      return false;
   }


@@ -117,7 +117,6 @@ struct intel_screen
#define intel_check_front_buffer_rendering old_intel_check_front_buffer_rendering
#define intelInitBufferFuncs old_intelInitBufferFuncs
#define intelClearWithBlit old_intelClearWithBlit
-#define intelEmitCopyBlit old_intelEmitCopyBlit
#define intelEmitImmediateColorExpandBlit old_intelEmitImmediateColorExpandBlit
#define intel_emit_linear_blit old_intel_emit_linear_blit
#define intel_miptree_blit old_intel_miptree_blit
@@ -139,7 +138,6 @@ struct intel_screen
#define get_time old_get_time
#define intel_translate_blend_factor old_intel_translate_blend_factor
#define intel_translate_compare_func old_intel_translate_compare_func
-#define intel_translate_logic_op old_intel_translate_logic_op
#define intel_translate_shadow_compare_func old_intel_translate_shadow_compare_func
#define intel_translate_stencil_op old_intel_translate_stencil_op
#define intel_init_syncobj_functions old_intel_init_syncobj_functions


@@ -151,44 +151,3 @@ intel_translate_blend_factor(GLenum factor)
   fprintf(stderr, "Unknown value in %s: %x\n", __func__, factor);
   return BLENDFACT_ZERO;
}
-
-int
-intel_translate_logic_op(GLenum opcode)
-{
-   switch (opcode) {
-   case GL_CLEAR:
-      return LOGICOP_CLEAR;
-   case GL_AND:
-      return LOGICOP_AND;
-   case GL_AND_REVERSE:
-      return LOGICOP_AND_RVRSE;
-   case GL_COPY:
-      return LOGICOP_COPY;
-   case GL_COPY_INVERTED:
-      return LOGICOP_COPY_INV;
-   case GL_AND_INVERTED:
-      return LOGICOP_AND_INV;
-   case GL_NOOP:
-      return LOGICOP_NOOP;
-   case GL_XOR:
-      return LOGICOP_XOR;
-   case GL_OR:
-      return LOGICOP_OR;
-   case GL_OR_INVERTED:
-      return LOGICOP_OR_INV;
-   case GL_NOR:
-      return LOGICOP_NOR;
-   case GL_EQUIV:
-      return LOGICOP_EQUIV;
-   case GL_INVERT:
-      return LOGICOP_INV;
-   case GL_OR_REVERSE:
-      return LOGICOP_OR_RVRSE;
-   case GL_NAND:
-      return LOGICOP_NAND;
-   case GL_SET:
-      return LOGICOP_SET;
-   default:
-      return LOGICOP_SET;
-   }
-}
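The whole translation table above could be deleted because enum
gl_logicop_mode was defined with the same 4-bit values as the hardware
LOGICOP_* encoding (compare the new assert((unsigned)logic_op <= 0x0f)
earlier in this diff). Under that assumption the removed function collapses
to a cast, sketched here for illustration only:

   static int
   translate_logic_op(enum gl_logicop_mode opcode)
   {
      return (int) opcode;   /* values already match LOGICOP_* */
   }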


@@ -70,7 +70,7 @@ intel_copy_texsubimage(struct intel_context *intel,
                           intelImage->mt, intelImage->base.Base.Level,
                           intelImage->base.Base.Face + slice,
                           dstx, dsty, false,
-                          width, height, GL_COPY)) {
+                          width, height, COLOR_LOGICOP_COPY)) {
      return false;
   }


@@ -163,7 +163,7 @@ try_pbo_upload(struct gl_context *ctx,
                           0, 0, false,
                           intelImage->mt, image->Level, image->Face,
                           0, 0, false,
-                          image->Width, image->Height, GL_COPY)) {
+                          image->Width, image->Height, COLOR_LOGICOP_COPY)) {
      DBG("%s: blit failed\n", __func__);
      intel_miptree_release(&pbo_mt);
      return false;


@@ -111,7 +111,7 @@ intel_blit_texsubimage(struct gl_context * ctx,
                            0, 0, false,
                            intelImage->mt, texImage->Level, texImage->Face,
                            xoffset, yoffset, false,
-                           width, height, GL_COPY);
+                           width, height, COLOR_LOGICOP_COPY);
   assert(ret);

   intel_miptree_release(&temp_mt);

Some files were not shown because too many files have changed in this diff.