30 files changed, 2333 insertions, 181 deletions
diff --git a/var/spack/repos/builtin/packages/cairo/package.py b/var/spack/repos/builtin/packages/cairo/package.py
index b2911e126a..12c7838f63 100644
--- a/var/spack/repos/builtin/packages/cairo/package.py
+++ b/var/spack/repos/builtin/packages/cairo/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class Cairo(Package):
+class Cairo(AutotoolsPackage):
     """Cairo is a 2D graphics library with support for multiple output
     devices."""
     homepage = "http://cairographics.org"
@@ -40,9 +40,7 @@ class Cairo(Package):
     depends_on("pkg-config", type="build")
     depends_on("fontconfig@2.10.91:")  # Require newer version of fontconfig.
 
-    def install(self, spec, prefix):
-        configure("--prefix=%s" % prefix,
-                  "--disable-trace",  # can cause problems with libiberty
-                  "--enable-tee")
-        make()
-        make("install")
+    def configure_args(self):
+        args = ["--disable-trace",  # can cause problems with libiberty
+                "--enable-tee"]
+        return args
diff --git a/var/spack/repos/builtin/packages/dealii/package.py b/var/spack/repos/builtin/packages/dealii/package.py
index dbccd01b99..563f751e0b 100644
--- a/var/spack/repos/builtin/packages/dealii/package.py
+++ b/var/spack/repos/builtin/packages/dealii/package.py
@@ -23,10 +23,9 @@
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 from spack import *
-import sys
 
 
-class Dealii(Package):
+class Dealii(CMakePackage):
     """C++ software library providing well-documented tools to build finite
     element codes for a broad variety of PDEs."""
     homepage = "https://www.dealii.org"
@@ -118,19 +117,16 @@ class Dealii(Package):
     depends_on("numdiff",     when='@develop')
     depends_on("astyle@2.04", when='@develop')
 
-    def install(self, spec, prefix):
-        options = []
-        options.extend(std_cmake_args)
-
+    def build_type(self):
         # CMAKE_BUILD_TYPE should be DebugRelease | Debug | Release
-        for word in options[:]:
-            if word.startswith('-DCMAKE_BUILD_TYPE'):
-                options.remove(word)
+        return 'DebugRelease'
+
+    def cmake_args(self):
+        spec = self.spec
+        options = []
 
-        dsuf = 'dylib' if sys.platform == 'darwin' else 'so'
         lapack_blas = spec['lapack'].lapack_libs + spec['blas'].blas_libs
         options.extend([
-            '-DCMAKE_BUILD_TYPE=DebugRelease',
             '-DDEAL_II_COMPONENT_EXAMPLES=ON',
             '-DDEAL_II_WITH_THREADS:BOOL=ON',
             '-DBOOST_DIR=%s' % spec['boost'].prefix,
@@ -215,9 +211,9 @@ class Dealii(Package):
                 '-DNETCDF_FOUND=true',
                 '-DNETCDF_LIBRARIES=%s;%s' % (
                     join_path(spec['netcdf-cxx'].prefix.lib,
-                              'libnetcdf_c++.%s' % dsuf),
+                              'libnetcdf_c++.%s' % dso_suffix),
                     join_path(spec['netcdf'].prefix.lib,
-                              'libnetcdf.%s' % dsuf)),
+                              'libnetcdf.%s' % dso_suffix)),
                 '-DNETCDF_INCLUDE_DIRS=%s;%s' % (
                     spec['netcdf-cxx'].prefix.include,
                     spec['netcdf'].prefix.include),
@@ -238,11 +234,7 @@ class Dealii(Package):
                 '-DDEAL_II_WITH_OPENCASCADE=OFF'
             ])
 
-        cmake('.', *options)
-        make()
-        if self.run_tests:
-            make("test")
-        make("install")
+        return options
 
     def setup_environment(self, spack_env, env):
         env.set('DEAL_II_DIR', self.prefix)
diff --git a/var/spack/repos/builtin/packages/dyninst/package.py b/var/spack/repos/builtin/packages/dyninst/package.py
index 3df7ca551d..420ab0fc68 100644
--- a/var/spack/repos/builtin/packages/dyninst/package.py
+++ b/var/spack/repos/builtin/packages/dyninst/package.py
@@ -33,6 +33,13 @@ class Dyninst(Package):
     url = "https://github.com/dyninst/dyninst/archive/v9.2.0.tar.gz"
     list_url = "http://www.dyninst.org/downloads/dyninst-8.x"
 
+    # version 9.2.1b was the latest git commit when trying to port to a
+    # ppc64le system to get fixes in computeAddrWidth independent of
+    # endianness. This version can be removed if the next release includes
+    # this change. The actual commit was (NOTE: differs from commit= below)
+    # b8596ad4023ec40ac07e669ff8ea3ec06e262703
+    version('9.2.1b', git='https://github.com/dyninst/dyninst.git',
+            commit='859cb778e20b619443c943c96dd1851da763142b')
     version('9.2.0', 'ad023f85e8e57837ed9de073b59d6bab',
             url="https://github.com/dyninst/dyninst/archive/v9.2.0.tar.gz")
     version('9.1.0', '5c64b77521457199db44bec82e4988ac',
@@ -67,19 +74,21 @@ class Dyninst(Package):
         libdwarf = spec['libdwarf'].prefix
 
         with working_dir('spack-build', create=True):
-            cmake('..',
-                  '-DBoost_INCLUDE_DIR=%s'    % spec['boost'].prefix.include,
-                  '-DBoost_LIBRARY_DIR=%s'    % spec['boost'].prefix.lib,
-                  '-DBoost_NO_SYSTEM_PATHS=TRUE',
-                  '-DLIBELF_INCLUDE_DIR=%s'   % join_path(
-                      libelf.include, 'libelf'),
-                  '-DLIBELF_LIBRARIES=%s'     % join_path(
-                      libelf.lib, 'libelf.so'),
-                  '-DLIBDWARF_INCLUDE_DIR=%s' % libdwarf.include,
-                  '-DLIBDWARF_LIBRARIES=%s'   % join_path(
-                      libdwarf.lib, 'libdwarf.so'),
-                  *std_cmake_args)
-
+            args = ['..',
+                    '-DBoost_INCLUDE_DIR=%s'    % spec['boost'].prefix.include,
+                    '-DBoost_LIBRARY_DIR=%s'    % spec['boost'].prefix.lib,
+                    '-DBoost_NO_SYSTEM_PATHS=TRUE',
+                    '-DLIBELF_INCLUDE_DIR=%s'   % join_path(
+                        libelf.include, 'libelf'),
+                    '-DLIBELF_LIBRARIES=%s'     % join_path(
+                        libelf.lib, 'libelf.so'),
+                    '-DLIBDWARF_INCLUDE_DIR=%s' % libdwarf.include,
+                    '-DLIBDWARF_LIBRARIES=%s'   % join_path(
+                        libdwarf.lib, 'libdwarf.so')]
+            if spec.satisfies('arch=linux-redhat7-ppc64le'):
+                args.append('-Darch_ppc64_little_endian=1')
+            args += std_cmake_args
+            cmake(*args)
             make()
             make("install")
 
diff --git a/var/spack/repos/builtin/packages/everytrace-example/package.py b/var/spack/repos/builtin/packages/everytrace-example/package.py
index 8a85423192..8c49e04634 100644
--- a/var/spack/repos/builtin/packages/everytrace-example/package.py
+++ b/var/spack/repos/builtin/packages/everytrace-example/package.py
@@ -39,8 +39,5 @@ class EverytraceExample(CMakePackage):
     # Currently the only MPI this everytrace works with.
     depends_on('openmpi')
 
-    def configure_args(self):
-        return []
-
     def setup_environment(self, spack_env, env):
         env.prepend_path('PATH', join_path(self.prefix, 'bin'))
diff --git a/var/spack/repos/builtin/packages/everytrace/package.py b/var/spack/repos/builtin/packages/everytrace/package.py
index ee1a058009..a3f3e2cfce 100644
--- a/var/spack/repos/builtin/packages/everytrace/package.py
+++ b/var/spack/repos/builtin/packages/everytrace/package.py
@@ -42,7 +42,7 @@ class Everytrace(CMakePackage):
     depends_on('cmake', type='build')
     depends_on('mpi', when='+mpi')
 
-    def configure_args(self):
+    def cmake_args(self):
         spec = self.spec
         return [
             '-DUSE_MPI=%s' % ('YES' if '+mpi' in spec else 'NO'),
diff --git a/var/spack/repos/builtin/packages/fftw/package.py b/var/spack/repos/builtin/packages/fftw/package.py
index 3069e39226..53b635ba7c 100644
--- a/var/spack/repos/builtin/packages/fftw/package.py
+++ b/var/spack/repos/builtin/packages/fftw/package.py
@@ -39,6 +39,9 @@ class Fftw(Package):
     version('3.3.5', '6cc08a3b9c7ee06fdd5b9eb02e06f569')
     version('3.3.4', '2edab8c06b24feeb3b82bbb3ebf3e7b3')
 
+    patch('pfft-3.3.5.patch', when="@3.3.5+pfft_patches", level=0)
+    patch('pfft-3.3.4.patch', when="@3.3.4+pfft_patches", level=0)
+
     variant(
         'float', default=True,
         description='Produces a single precision version of the library')
@@ -51,8 +54,13 @@ class Fftw(Package):
                     '(works only with GCC and libquadmath)')
     variant('openmp', default=False, description="Enable OpenMP support.")
     variant('mpi', default=False, description='Activate MPI support')
+    variant(
+        'pfft_patches', default=False,
+        description='Add extra transpose functions for PFFT compatibility')
 
     depends_on('mpi', when='+mpi')
+    depends_on('automake', type='build', when='+pfft_patches')
+    depends_on('autoconf', type='build', when='+pfft_patches')
 
     # TODO : add support for architecture specific optimizations as soon as
     # targets are supported
@@ -77,6 +85,10 @@ class Fftw(Package):
         if '+mpi' in spec:
             options.append('--enable-mpi')
 
+        if '+pfft_patches' in spec:
+            autoreconf = which('autoreconf')
+            autoreconf('-ifv')
+
         configure(*options)
         make()
         if self.run_tests:
diff --git a/var/spack/repos/builtin/packages/fftw/pfft-3.3.4.patch b/var/spack/repos/builtin/packages/fftw/pfft-3.3.4.patch
new file mode 100644
index 0000000000..4740a60ae4
--- /dev/null
+++ b/var/spack/repos/builtin/packages/fftw/pfft-3.3.4.patch
@@ -0,0 +1,865 @@
+--- mpi/conf.c	2014-03-04 19:41:03.000000000 +0100
++++ mpi/conf.c	2015-09-05 05:53:19.085516467 +0200
+@@ -29,6 +29,8 @@ static const solvtab s =
+      SOLVTAB(XM(transpose_pairwise_register)),
+      SOLVTAB(XM(transpose_alltoall_register)),
+      SOLVTAB(XM(transpose_recurse_register)),
++     SOLVTAB(XM(transpose_pairwise_transposed_register)),
++     SOLVTAB(XM(transpose_alltoall_transposed_register)),
+      SOLVTAB(XM(dft_rank_geq2_register)),
+      SOLVTAB(XM(dft_rank_geq2_transposed_register)),
+      SOLVTAB(XM(dft_serial_register)),
+
+--- mpi/Makefile.am  2013-03-18 13:10:45.000000000 +0100
++++ mpi/Makefile.am  2015-09-05 05:53:19.084516437 +0200
+@@ -16,6 +16,7 @@ BUILT_SOURCES = fftw3-mpi.f03.in fftw3-m
+ CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
+ 
+ TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
++TRANSPOSE_SRC += transpose-alltoall-transposed.c transpose-pairwise-transposed.c
+ DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
+ RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
+ RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h                       
+
+--- mpi/mpi-transpose.h	2014-03-04 19:41:03.000000000 +0100
++++ mpi/mpi-transpose.h	2015-09-05 05:53:19.085516467 +0200
+@@ -59,3 +59,5 @@ int XM(mkplans_posttranspose)(const prob
+ void XM(transpose_pairwise_register)(planner *p);
+ void XM(transpose_alltoall_register)(planner *p);
+ void XM(transpose_recurse_register)(planner *p);
++void XM(transpose_pairwise_transposed_register)(planner *p);
++void XM(transpose_alltoall_transposed_register)(planner *p);
+
+--- mpi/transpose-alltoall-transposed.c	1970-01-01 01:00:00.000000000 +0100
++++ mpi/transpose-alltoall-transposed.c	2015-09-05 05:53:19.085516467 +0200
+@@ -0,0 +1,280 @@
++/*
++ * Copyright (c) 2003, 2007-11 Matteo Frigo
++ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
++ * Copyright (c) 2012 Michael Pippig
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
++ *
++ */
++
++/* plans for distributed out-of-place transpose using MPI_Alltoall,
++   and which destroy the input array (also if TRANSPOSED_IN is used) */
++
++#include "mpi-transpose.h"
++#include <string.h>
++
++typedef struct {
++     solver super;
++     int copy_transposed_out; /* whether to copy the output for TRANSPOSED_OUT,
++				which makes the first transpose out-of-place
++				but costs an extra copy and requires us
++				to destroy the input */
++} S;
++
++typedef struct {
++     plan_mpi_transpose super;
++
++     plan *cld1, *cld2, *cld2rest, *cld3;
++
++     MPI_Comm comm;
++     int *send_block_sizes, *send_block_offsets;
++     int *recv_block_sizes, *recv_block_offsets;
++
++     INT rest_Ioff, rest_Ooff;
++
++     int equal_blocks;
++} P;
++
++/* transpose locally to get contiguous chunks
++   this may take two transposes if the block sizes are unequal
++   (3 subplans, two of which operate on disjoint data) */
++static void apply_pretranspose(
++    const P *ego, R *I, R *O
++    )
++{
++  plan_rdft *cld2, *cld2rest, *cld3;
++
++  cld3 = (plan_rdft *) ego->cld3;
++  if (cld3)
++       cld3->apply(ego->cld3, O, O);
++  /* else TRANSPOSED_IN is true and user wants I transposed */
++
++  cld2 = (plan_rdft *) ego->cld2;
++  cld2->apply(ego->cld2, I, O);
++  cld2rest = (plan_rdft *) ego->cld2rest;
++  if (cld2rest) {
++       cld2rest->apply(ego->cld2rest,
++      		       I + ego->rest_Ioff, O + ego->rest_Ooff);
++  }
++}
++
++static void apply(const plan *ego_, R *I, R *O)
++{
++     const P *ego = (const P *) ego_;
++     plan_rdft *cld1 = (plan_rdft *) ego->cld1;
++
++     if (cld1) {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, O);
++
++	  /* transpose chunks globally */
++	  if (ego->equal_blocks)
++	       MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
++			    I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
++			    ego->comm);
++	  else
++	       MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
++			     FFTW_MPI_TYPE,
++			     I, ego->recv_block_sizes, ego->recv_block_offsets,
++			     FFTW_MPI_TYPE,
++			     ego->comm);
++
++          /* transpose locally to get non-transposed output */
++          cld1->apply(ego->cld1, I, O);
++     } /* else TRANSPOSED_OUT is true and user wants O transposed */
++     else {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, I);
++
++          /* transpose chunks globally */
++	  if (ego->equal_blocks)
++	       MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
++			    O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
++			    ego->comm);
++	  else
++	       MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
++			     FFTW_MPI_TYPE,
++			     O, ego->recv_block_sizes, ego->recv_block_offsets,
++			     FFTW_MPI_TYPE,
++			     ego->comm);
++     }
++}
++
++static int applicable(const S *ego, const problem *p_,
++		      const planner *plnr)
++{
++     /* in contrast to transpose-alltoall this algorithm can not preserve the input,
++      * since we need at least one transpose before the (out-of-place) Alltoall */
++     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
++     return (1
++	     && p->I != p->O
++	     && (!NO_DESTROY_INPUTP(plnr))  
++	     && ((p->flags & TRANSPOSED_OUT) || !ego->copy_transposed_out)
++	     && ONLY_TRANSPOSEDP(p->flags)
++	  );
++}
++
++static void awake(plan *ego_, enum wakefulness wakefulness)
++{
++     P *ego = (P *) ego_;
++     X(plan_awake)(ego->cld1, wakefulness);
++     X(plan_awake)(ego->cld2, wakefulness);
++     X(plan_awake)(ego->cld2rest, wakefulness);
++     X(plan_awake)(ego->cld3, wakefulness);
++}
++
++static void destroy(plan *ego_)
++{
++     P *ego = (P *) ego_;
++     X(ifree0)(ego->send_block_sizes);
++     MPI_Comm_free(&ego->comm);
++     X(plan_destroy_internal)(ego->cld3);
++     X(plan_destroy_internal)(ego->cld2rest);
++     X(plan_destroy_internal)(ego->cld2);
++     X(plan_destroy_internal)(ego->cld1);
++}
++
++static void print(const plan *ego_, printer *p)
++{
++     const P *ego = (const P *) ego_;
++     p->print(p, "(mpi-transpose-alltoall-transposed%s%(%p%)%(%p%)%(%p%)%(%p%))",
++	      ego->equal_blocks ? "/e" : "",
++	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
++}
++
++static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
++{
++     const S *ego = (const S *) ego_;
++     const problem_mpi_transpose *p;
++     P *pln;
++     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
++     INT b, bt, vn, rest_Ioff, rest_Ooff;
++     R *O;
++     int *sbs, *sbo, *rbs, *rbo;
++     int pe, my_pe, n_pes;
++     int equal_blocks = 1;
++     static const plan_adt padt = {
++          XM(transpose_solve), awake, print, destroy
++     };
++
++     if (!applicable(ego, p_, plnr))
++          return (plan *) 0;
++
++     p = (const problem_mpi_transpose *) p_;
++     vn = p->vn;
++
++     MPI_Comm_rank(p->comm, &my_pe);
++     MPI_Comm_size(p->comm, &n_pes);
++
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++
++     if (p->flags & TRANSPOSED_OUT) { /* O stays transposed */
++	  if (ego->copy_transposed_out) {
++	       cld1 = X(mkplan_f_d)(plnr,
++				  X(mkproblem_rdft_0_d)(X(mktensor_1d)
++							(bt * p->nx * vn, 1, 1),
++							p->I, O = p->O),
++				    0, 0, NO_SLOW);
++	       if (XM(any_true)(!cld1, p->comm)) goto nada;
++	  }
++	  else /* first transpose is in-place */
++              O = p->I;
++     }
++     else { /* transpose nx x bt x vn -> bt x nx x vn */
++	  cld1 = X(mkplan_f_d)(plnr, 
++			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
++						     (bt, vn, p->nx * vn,
++						      p->nx, bt * vn, vn,
++						      vn, 1, 1),
++						     p->I, O = p->O),
++			       0, 0, NO_SLOW);
++	  if (XM(any_true)(!cld1, p->comm)) goto nada;
++     }
++
++     if (XM(any_true)(!XM(mkplans_pretranspose)(p, plnr, p->I, O, my_pe,
++	 					&cld2, &cld2rest, &cld3,
++						&rest_Ioff, &rest_Ooff),
++		      p->comm)) goto nada;
++
++
++     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
++
++     pln->cld1 = cld1;
++     pln->cld2 = cld2;
++     pln->cld2rest = cld2rest;
++     pln->rest_Ioff = rest_Ioff;
++     pln->rest_Ooff = rest_Ooff;
++     pln->cld3 = cld3;
++
++     MPI_Comm_dup(p->comm, &pln->comm);
++
++     /* Compute sizes/offsets of blocks to send for all-to-all command. */
++     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
++     sbo = sbs + n_pes;
++     rbs = sbo + n_pes;
++     rbo = rbs + n_pes;
++     b = XM(block)(p->nx, p->block, my_pe);
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++     for (pe = 0; pe < n_pes; ++pe) {
++	  INT db, dbt; /* destination block sizes */
++	  db = XM(block)(p->nx, p->block, pe);
++	  dbt = XM(block)(p->ny, p->tblock, pe);
++	  if (db != p->block || dbt != p->tblock)
++	       equal_blocks = 0;
++
++	  /* MPI requires type "int" here; apparently it
++	     has no 64-bit API?  Grrr. */
++	  sbs[pe] = (int) (b * dbt * vn);
++	  sbo[pe] = (int) (pe * (b * p->tblock) * vn);
++	  rbs[pe] = (int) (db * bt * vn);
++	  rbo[pe] = (int) (pe * (p->block * bt) * vn);
++     }
++     pln->send_block_sizes = sbs;
++     pln->send_block_offsets = sbo;
++     pln->recv_block_sizes = rbs;
++     pln->recv_block_offsets = rbo;
++     pln->equal_blocks = equal_blocks;
++
++     X(ops_zero)(&pln->super.super.ops);
++     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
++     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
++     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
++     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
++     /* FIXME: should MPI operations be counted in "other" somehow? */
++
++     return &(pln->super.super);
++
++ nada:
++     X(plan_destroy_internal)(cld3);
++     X(plan_destroy_internal)(cld2rest);
++     X(plan_destroy_internal)(cld2);
++     X(plan_destroy_internal)(cld1);
++     return (plan *) 0;
++}
++
++static solver *mksolver(int copy_transposed_out)
++{
++     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
++     S *slv = MKSOLVER(S, &sadt);
++     slv->copy_transposed_out = copy_transposed_out;
++     return &(slv->super);
++}
++
++void XM(transpose_alltoall_transposed_register)(planner *p)
++{
++     int cto;
++     for (cto = 0; cto <= 1; ++cto)
++	  REGISTER_SOLVER(p, mksolver(cto));
++}
+
+--- mpi/transpose-pairwise.c	2014-03-04 19:41:03.000000000 +0100
++++ mpi/transpose-pairwise.c	2015-09-05 06:00:05.715433709 +0200
+@@ -53,7 +53,6 @@ static void transpose_chunks(int *sched,
+ {
+      if (sched) {
+ 	  int i;
+-	  MPI_Status status;
+ 
+ 	  /* TODO: explore non-synchronous send/recv? */
+ 
+@@ -74,7 +73,7 @@ static void transpose_chunks(int *sched,
+ 				      O + rbo[pe], (int) (rbs[pe]),
+ 				      FFTW_MPI_TYPE,
+ 				      pe, (pe * n_pes + my_pe) & 0xffff,
+-				      comm, &status);
++				      comm, MPI_STATUS_IGNORE);
+ 		    }
+ 	       }
+ 
+@@ -92,7 +91,7 @@ static void transpose_chunks(int *sched,
+ 				      O + rbo[pe], (int) (rbs[pe]),
+ 				      FFTW_MPI_TYPE,
+ 				      pe, (pe * n_pes + my_pe) & 0xffff,
+-				      comm, &status);
++				      comm, MPI_STATUS_IGNORE);
+ 	       }
+ 	  }
+      }
+@@ -350,6 +349,7 @@ nada:
+      X(plan_destroy_internal)(*cld3);
+      X(plan_destroy_internal)(*cld2rest);
+      X(plan_destroy_internal)(*cld2);
++     *cld2 = *cld2rest = *cld3 = NULL;
+      return 0;
+ }
+ 
+--- mpi/transpose-pairwise-transposed.c	1970-01-01 01:00:00.000000000 +0100
++++ mpi/transpose-pairwise-transposed.c	2015-09-05 06:00:07.280481042 +0200
+@@ -0,0 +1,510 @@
++/*
++ * Copyright (c) 2003, 2007-11 Matteo Frigo
++ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
++ * Copyright (c) 2012 Michael Pippig
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
++ *
++ */
++
++/* Distributed transposes using a sequence of carefully scheduled
++   pairwise exchanges.  This has the advantage that it can be done
++   in-place, or out-of-place while preserving the input, using buffer
++   space proportional to the local size divided by the number of
++   processes (i.e. to the total array size divided by the number of
++   processes squared). */
++
++#include "mpi-transpose.h"
++#include <string.h>
++
++typedef struct {
++     solver super;
++     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
++} S;
++
++typedef struct {
++     plan_mpi_transpose super;
++
++     plan *cld1, *cld2, *cld2rest, *cld3;
++     INT rest_Ioff, rest_Ooff;
++     
++     int n_pes, my_pe, *sched;
++     INT *send_block_sizes, *send_block_offsets;
++     INT *recv_block_sizes, *recv_block_offsets;
++     MPI_Comm comm;
++     int preserve_input;
++} P;
++
++static void transpose_chunks(int *sched, int n_pes, int my_pe,
++			     INT *sbs, INT *sbo, INT *rbs, INT *rbo,
++			     MPI_Comm comm,
++			     R *I, R *O)
++{
++     if (sched) {
++	  int i;
++
++	  /* TODO: explore non-synchronous send/recv? */
++
++	  if (I == O) {
++	       R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS);
++	       
++	       for (i = 0; i < n_pes; ++i) {
++		    int pe = sched[i];
++		    if (my_pe == pe) {
++			 if (rbo[pe] != sbo[pe])
++			      memmove(O + rbo[pe], O + sbo[pe],
++				      sbs[pe] * sizeof(R));
++		    }
++		    else {
++			 memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R));
++			 MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
++				      pe, (my_pe * n_pes + pe) & 0xffff,
++				      O + rbo[pe], (int) (rbs[pe]),
++				      FFTW_MPI_TYPE,
++				      pe, (pe * n_pes + my_pe) & 0xffff,
++				      comm, MPI_STATUS_IGNORE);
++		    }
++	       }
++
++	       X(ifree)(buf);
++	  }
++	  else { /* I != O */
++	       for (i = 0; i < n_pes; ++i) {
++		    int pe = sched[i];
++		    if (my_pe == pe)
++			 memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
++		    else
++			 MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
++				      FFTW_MPI_TYPE,
++				      pe, (my_pe * n_pes + pe) & 0xffff,
++				      O + rbo[pe], (int) (rbs[pe]),
++				      FFTW_MPI_TYPE,
++				      pe, (pe * n_pes + my_pe) & 0xffff,
++				      comm, MPI_STATUS_IGNORE);
++	       }
++	  }
++     }
++}
++
++/* transpose locally to get contiguous chunks
++   this may take two transposes if the block sizes are unequal
++   (3 subplans, two of which operate on disjoint data) */
++static void apply_pretranspose(
++    const P *ego, R *I, R *O
++    )
++{
++  plan_rdft *cld2, *cld2rest, *cld3;
++
++  cld3 = (plan_rdft *) ego->cld3;
++  if (cld3)
++       cld3->apply(ego->cld3, O, O);
++  /* else TRANSPOSED_IN is true and user wants I transposed */
++
++  cld2 = (plan_rdft *) ego->cld2;
++  cld2->apply(ego->cld2, I, O);
++  cld2rest = (plan_rdft *) ego->cld2rest;
++  if (cld2rest) {
++       cld2rest->apply(ego->cld2rest,
++      		       I + ego->rest_Ioff, O + ego->rest_Ooff);
++  }
++}
++
++static void apply(const plan *ego_, R *I, R *O)
++{
++     const P *ego = (const P *) ego_;
++     plan_rdft *cld1 = (plan_rdft *) ego->cld1;
++     
++     if (cld1) {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, O);
++
++          if(ego->preserve_input) I = O;
++
++          /* transpose chunks globally */
++          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
++                           ego->send_block_sizes, ego->send_block_offsets,
++			   ego->recv_block_sizes, ego->recv_block_offsets,
++			   ego->comm, O, I);
++
++          /* transpose locally to get non-transposed output */
++          cld1->apply(ego->cld1, I, O);
++     } /* else TRANSPOSED_OUT is true and user wants O transposed */
++     else if (ego->preserve_input) {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, O);
++
++          /* transpose chunks globally */
++          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
++             	           ego->send_block_sizes, ego->send_block_offsets,
++			   ego->recv_block_sizes, ego->recv_block_offsets,
++			   ego->comm, O, O);
++     }
++     else {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, I);
++
++          /* transpose chunks globally */
++          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
++             	           ego->send_block_sizes, ego->send_block_offsets,
++			   ego->recv_block_sizes, ego->recv_block_offsets,
++			   ego->comm, I, O);
++     }
++}
++
++static int applicable(const S *ego, const problem *p_,
++		      const planner *plnr)
++{
++     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
++     /* Note: this is *not* UGLY for out-of-place, destroy-input plans;
++	the planner often prefers transpose-pairwise to transpose-alltoall,
++	at least with LAM MPI on my machine. */
++     return (1
++	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
++					  && p->I != p->O))
++	     && ONLY_TRANSPOSEDP(p->flags));
++}
++
++static void awake(plan *ego_, enum wakefulness wakefulness)
++{
++     P *ego = (P *) ego_;
++     X(plan_awake)(ego->cld1, wakefulness);
++     X(plan_awake)(ego->cld2, wakefulness);
++     X(plan_awake)(ego->cld2rest, wakefulness);
++     X(plan_awake)(ego->cld3, wakefulness);
++}
++
++static void destroy(plan *ego_)
++{
++     P *ego = (P *) ego_;
++     X(ifree0)(ego->sched);
++     X(ifree0)(ego->send_block_sizes);
++     MPI_Comm_free(&ego->comm);
++     X(plan_destroy_internal)(ego->cld3);
++     X(plan_destroy_internal)(ego->cld2rest);
++     X(plan_destroy_internal)(ego->cld2);
++     X(plan_destroy_internal)(ego->cld1);
++}
++
++static void print(const plan *ego_, printer *p)
++{
++     const P *ego = (const P *) ego_;
++     p->print(p, "(mpi-transpose-pairwise-transposed%s%(%p%)%(%p%)%(%p%)%(%p%))", 
++	      ego->preserve_input==2 ?"/p":"",
++	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
++}
++
++/* Given a process which_pe and a number of processes npes, fills
++   the array sched[npes] with a sequence of processes to communicate
++   with for a deadlock-free, optimum-overlap all-to-all communication.
++   (All processes must call this routine to get their own schedules.)
++   The schedule can be re-ordered arbitrarily as long as all processes
++   apply the same permutation to their schedules.
++
++   The algorithm here is based upon the one described in:
++       J. A. M. Schreuder, "Constructing timetables for sport
++       competitions," Mathematical Programming Study 13, pp. 58-67 (1980). 
++   In a sport competition, you have N teams and want every team to
++   play every other team in as short a time as possible (maximum overlap
++   between games).  This timetabling problem is therefore identical
++   to that of an all-to-all communications problem.  In our case, there
++   is one wrinkle: as part of the schedule, the process must do
++   some data transfer with itself (local data movement), analogous
++   to a requirement that each team "play itself" in addition to other
++   teams.  With this wrinkle, it turns out that an optimal timetable
++   (N parallel games) can be constructed for any N, not just for even
++   N as in the original problem described by Schreuder.
++*/
++static void fill1_comm_sched(int *sched, int which_pe, int npes)
++{
++     int pe, i, n, s = 0;
++     A(which_pe >= 0 && which_pe < npes);
++     if (npes % 2 == 0) {
++	  n = npes;
++	  sched[s++] = which_pe;
++     }
++     else
++	  n = npes + 1;
++     for (pe = 0; pe < n - 1; ++pe) {
++	  if (npes % 2 == 0) {
++	       if (pe == which_pe) sched[s++] = npes - 1;
++	       else if (npes - 1 == which_pe) sched[s++] = pe;
++	  }
++	  else if (pe == which_pe) sched[s++] = pe;
++
++	  if (pe != which_pe && which_pe < n - 1) {
++	       i = (pe - which_pe + (n - 1)) % (n - 1);
++	       if (i < n/2)
++		    sched[s++] = (pe + i) % (n - 1);
++	       
++	       i = (which_pe - pe + (n - 1)) % (n - 1);
++	       if (i < n/2)
++		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
++	  }
++     }
++     A(s == npes);
++}
++
++/* Sort the communication schedule sched for npes so that the schedule
++   on process sortpe is ascending or descending (!ascending).  This is
++   necessary to allow in-place transposes when the problem does not
++   divide equally among the processes.  In this case there is one
++   process where the incoming blocks are bigger/smaller than the
++   outgoing blocks and thus have to be received in
++   descending/ascending order, respectively, to avoid overwriting data
++   before it is sent. */
++static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
++{
++     int *sortsched, i;
++     sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER); /* 2*npes ints: [0,npes) reference, [npes,2*npes) result */
++     fill1_comm_sched(sortsched, sortpe, npes); /* reference schedule of process sortpe */
++     if (ascending)
++	  for (i = 0; i < npes; ++i)
++	       sortsched[npes + sortsched[i]] = sched[i]; /* slot = sortpe's partner at step i */
++     else
++	  for (i = 0; i < npes; ++i)
++	       sortsched[2*npes - 1 - sortsched[i]] = sched[i]; /* same slots, counted from the end */
++     for (i = 0; i < npes; ++i)
++	  sched[i] = sortsched[npes + i]; /* copy the permuted schedule back into sched */
++     X(ifree)(sortsched);
++}
++
++/* make the plans to do the pre-MPI transpositions (shared with
++   transpose-alltoall-transposed) */
++int XM(mkplans_pretranspose)(const problem_mpi_transpose *p, planner *plnr,
++			      R *I, R *O, int my_pe,
++			      plan **cld2, plan **cld2rest, plan **cld3,
++			      INT *rest_Ioff, INT *rest_Ooff)
++{ /* returns 1 on success; on failure returns 0 with all three plans set to NULL */
++     INT vn = p->vn;
++     INT b = XM(block)(p->nx, p->block, my_pe); /* XM(block) result along nx for my_pe */
++     INT bt = p->tblock;
++     INT nyb = p->ny / bt; /* number of equal-sized blocks */
++     INT nyr = p->ny - nyb * bt; /* leftover rows after equal blocks */
++
++     *cld2 = *cld2rest = *cld3 = NULL; /* nada destroys all three unconditionally */
++     *rest_Ioff = *rest_Ooff = 0;
++
++     if (!(p->flags & TRANSPOSED_IN) && (nyr == 0 || I != O)) { /* no cld3 in this branch */
++	  INT ny = p->ny * vn;
++	  bt *= vn; /* from here on bt includes the factor vn */
++	  *cld2 = X(mkplan_f_d)(plnr, 
++				X(mkproblem_rdft_0_d)(X(mktensor_3d)
++						      (nyb, bt, b * bt,
++						       b, ny, bt,
++						       bt, 1, 1),
++						      I, O),
++				0, 0, NO_SLOW);
++	  if (!*cld2) goto nada;
++
++	  if (nyr > 0) { /* separate plan for the leftover rows */
++	       *rest_Ioff = nyb * bt;
++	       *rest_Ooff = nyb * b * bt;
++	       bt = nyr * vn;
++	       *cld2rest = X(mkplan_f_d)(plnr,
++					 X(mkproblem_rdft_0_d)(X(mktensor_2d)
++							       (b, ny, bt,
++								bt, 1, 1),
++							       I + *rest_Ioff,
++							       O + *rest_Ooff),
++                                        0, 0, NO_SLOW);
++               if (!*cld2rest) goto nada;
++	  }
++     }
++     else { /* TRANSPOSED_IN is set, or in-place (I == O) with leftover rows */
++	  *cld2 = X(mkplan_f_d)(plnr,
++				X(mkproblem_rdft_0_d)(
++				     X(mktensor_4d)
++				     (nyb, b * bt * vn, b * bt * vn,
++				      b, vn, bt * vn,
++				      bt, b * vn, vn,
++				      vn, 1, 1),
++				     I, O),
++				0, 0, NO_SLOW);
++	  if (!*cld2) goto nada;
++
++	  *rest_Ioff = *rest_Ooff = nyb * bt * b * vn; /* same offset on input and output side */
++	  *cld2rest = X(mkplan_f_d)(plnr,
++				    X(mkproblem_rdft_0_d)(
++					 X(mktensor_3d)
++					 (b, vn, nyr * vn,
++					  nyr, b * vn, vn,
++					  vn, 1, 1),
++					 I + *rest_Ioff, O + *rest_Ooff),
++				    0, 0, NO_SLOW);
++	  if (!*cld2rest) goto nada;
++
++	  if (!(p->flags & TRANSPOSED_IN)) { /* additional in-place (I -> I) plan */
++	       *cld3 = X(mkplan_f_d)(plnr,
++				     X(mkproblem_rdft_0_d)(
++					  X(mktensor_3d)
++					  (p->ny, vn, b * vn,
++					   b, p->ny * vn, vn,
++					   vn, 1, 1),
++					  I, I),
++				     0, 0, NO_SLOW);
++	       if (!*cld3) goto nada;
++	  }
++     }
++
++     return 1;
++
++nada:
++     X(plan_destroy_internal)(*cld3); /* entries may still be NULL here */
++     X(plan_destroy_internal)(*cld2rest);
++     X(plan_destroy_internal)(*cld2);
++     *cld2 = *cld2rest = *cld3 = NULL; /* do not leave dangling pointers with the caller */
++     return 0;
++}
++
++static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
++{ /* returns the new plan, or a null plan pointer if not applicable or a sub-plan failed */
++     const S *ego = (const S *) ego_;
++     const problem_mpi_transpose *p;
++     P *pln;
++     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
++     INT b, bt, vn, rest_Ioff, rest_Ooff;
++     INT *sbs, *sbo, *rbs, *rbo;
++     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1; /* sort_pe == -1: no schedule sorting needed */
++     R *I, *O;
++     static const plan_adt padt = {
++          XM(transpose_solve), awake, print, destroy
++     };
++
++     UNUSED(ego); /* NOTE(review): ego is actually used below; this looks like a leftover */
++
++     if (!applicable(ego, p_, plnr))
++          return (plan *) 0;
++
++     p = (const problem_mpi_transpose *) p_;
++     vn = p->vn;
++     I = p->I; O = p->O;
++
++     MPI_Comm_rank(p->comm, &my_pe);
++     MPI_Comm_size(p->comm, &n_pes); /* overwritten further down by the block count */
++
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++
++
++     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = p->O; /* I is only used for cld1 below */
++     
++     if (!(p->flags & TRANSPOSED_OUT)) { /* nx x bt x vn -> bt x nx x vn */
++	  cld1 = X(mkplan_f_d)(plnr, 
++			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
++						     (bt, vn, p->nx * vn,
++						      p->nx, bt * vn, vn,
++						      vn, 1, 1),
++						     I, O = p->O),
++			       0, 0, NO_SLOW);
++	  if (XM(any_true)(!cld1, p->comm)) goto nada; /* presumably a check across all processes of p->comm; confirm */
++
++     }
++     else { /* TRANSPOSED_OUT: no cld1; only choose where the pre-transposes write */
++       if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
++         O = p->O;
++       else
++         O = p->I;
++     }
++
++     if (XM(any_true)(!XM(mkplans_pretranspose)(p, plnr, p->I, O, my_pe,
++						&cld2, &cld2rest, &cld3,
++						&rest_Ioff, &rest_Ooff),
++		      p->comm)) goto nada;
++
++     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
++
++     pln->cld1 = cld1;
++     pln->cld2 = cld2;
++     pln->cld2rest = cld2rest;
++     pln->rest_Ioff = rest_Ioff;
++     pln->rest_Ooff = rest_Ooff;
++     pln->cld3 = cld3;
++     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr); /* 2 = requested by the solver itself */
++
++     MPI_Comm_dup(p->comm, &pln->comm); /* the plan owns its own duplicate of the communicator */
++
++     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
++			   XM(num_blocks)(p->ny, p->tblock));
++
++     /* Compute sizes/offsets of blocks to exchange between processors */
++     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS); /* one allocation holds sbs, sbo, rbs and rbo */
++     sbo = sbs + n_pes;
++     rbs = sbo + n_pes;
++     rbo = rbs + n_pes;
++     b = XM(block)(p->nx, p->block, my_pe);
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++     for (pe = 0; pe < n_pes; ++pe) {
++	  INT db, dbt; /* destination block sizes */
++	  db = XM(block)(p->nx, p->block, pe);
++	  dbt = XM(block)(p->ny, p->tblock, pe);
++
++	  sbs[pe] = b * dbt * vn;
++	  sbo[pe] = pe * (b * p->tblock) * vn;
++	  rbs[pe] = db * bt * vn;
++	  rbo[pe] = pe * (p->block * bt) * vn;
++
++	  if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
++	       A(sort_pe == -1); /* only one process should need sorting */
++	       sort_pe = pe;
++	       ascending = db * p->tblock > p->block * dbt;
++	  }
++     }
++     pln->n_pes = n_pes;
++     pln->my_pe = my_pe;
++     pln->send_block_sizes = sbs; /* base of the shared allocation */
++     pln->send_block_offsets = sbo;
++     pln->recv_block_sizes = rbs;
++     pln->recv_block_offsets = rbo;
++
++     if (my_pe >= n_pes) {
++	  pln->sched = 0; /* this process is not doing anything */
++     }
++     else {
++	  pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
++	  fill1_comm_sched(pln->sched, my_pe, n_pes);
++	  if (sort_pe >= 0)
++	       sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
++     }
++
++     X(ops_zero)(&pln->super.super.ops);
++     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops); /* accumulate child op counts */
++     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
++     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
++     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
++     /* FIXME: should MPI operations be counted in "other" somehow? */
++
++     return &(pln->super.super);
++
++ nada:
++     X(plan_destroy_internal)(cld3); /* any of these may still be null here */
++     X(plan_destroy_internal)(cld2rest);
++     X(plan_destroy_internal)(cld2);
++     X(plan_destroy_internal)(cld1);
++     return (plan *) 0;
++}
++
++static solver *mksolver(int preserve_input)
++{ /* builds a solver whose only parameter is the preserve_input flag */
++     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
++     S *slv = MKSOLVER(S, &sadt);
++     slv->preserve_input = preserve_input;
++     return &(slv->super); /* hand back the embedded base solver */
++}
++
++void XM(transpose_pairwise_transposed_register)(planner *p)
++{
++     int preserve_input;
++     for (preserve_input = 0; preserve_input <= 1; ++preserve_input) /* register both variants: 0 and 1 */
++	  REGISTER_SOLVER(p, mksolver(preserve_input));
++}
diff --git a/var/spack/repos/builtin/packages/fftw/pfft-3.3.5.patch b/var/spack/repos/builtin/packages/fftw/pfft-3.3.5.patch
new file mode 100644
index 0000000000..360a3757f9
--- /dev/null
+++ b/var/spack/repos/builtin/packages/fftw/pfft-3.3.5.patch
@@ -0,0 +1,858 @@
+--- mpi/conf.c	2014-03-04 19:41:03.000000000 +0100
++++ mpi/conf.c	2015-09-05 05:53:19.085516467 +0200
+@@ -29,6 +29,8 @@ static const solvtab s =
+      SOLVTAB(XM(transpose_pairwise_register)),
+      SOLVTAB(XM(transpose_alltoall_register)),
+      SOLVTAB(XM(transpose_recurse_register)),
++     SOLVTAB(XM(transpose_pairwise_transposed_register)),
++     SOLVTAB(XM(transpose_alltoall_transposed_register)),
+      SOLVTAB(XM(dft_rank_geq2_register)),
+      SOLVTAB(XM(dft_rank_geq2_transposed_register)),
+      SOLVTAB(XM(dft_serial_register)),
+
+--- mpi/Makefile.am  2013-03-18 13:10:45.000000000 +0100
++++ mpi/Makefile.am  2015-09-05 05:53:19.084516437 +0200
+@@ -16,6 +16,7 @@ BUILT_SOURCES = fftw3-mpi.f03.in fftw3-m
+ CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
+ 
+ TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
++TRANSPOSE_SRC += transpose-alltoall-transposed.c transpose-pairwise-transposed.c
+ DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
+ RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
+ RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h                       
+
+--- mpi/mpi-transpose.h	2014-03-04 19:41:03.000000000 +0100
++++ mpi/mpi-transpose.h	2015-09-05 05:53:19.085516467 +0200
+@@ -59,3 +59,5 @@ int XM(mkplans_posttranspose)(const prob
+ void XM(transpose_pairwise_register)(planner *p);
+ void XM(transpose_alltoall_register)(planner *p);
+ void XM(transpose_recurse_register)(planner *p);
++void XM(transpose_pairwise_transposed_register)(planner *p);
++void XM(transpose_alltoall_transposed_register)(planner *p);
+
+--- mpi/transpose-alltoall-transposed.c	1970-01-01 01:00:00.000000000 +0100
++++ mpi/transpose-alltoall-transposed.c	2015-09-05 05:53:19.085516467 +0200
+@@ -0,0 +1,280 @@
++/*
++ * Copyright (c) 2003, 2007-11 Matteo Frigo
++ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
++ * Copyright (c) 2012 Michael Pippig
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
++ *
++ */
++
++/* plans for distributed out-of-place transpose using MPI_Alltoall,
++   and which destroy the input array (also if TRANSPOSED_IN is used) */
++
++#include "mpi-transpose.h"
++#include <string.h>
++
++typedef struct {
++     solver super; /* embedded base solver; mksolver returns its address */
++     int copy_transposed_out; /* whether to copy the output for TRANSPOSED_OUT,
++				which makes the first transpose out-of-place
++				but costs an extra copy and requires us
++				to destroy the input */
++} S;
++
++typedef struct {
++     plan_mpi_transpose super; /* embedded base plan */
++
++     plan *cld1, *cld2, *cld2rest, *cld3; /* child plans; any of them may be null */
++
++     MPI_Comm comm; /* duplicate created in mkplan, freed in destroy */
++     int *send_block_sizes, *send_block_offsets; /* counts/displacements passed to MPI_Alltoall(v) */
++     int *recv_block_sizes, *recv_block_offsets;
++
++     INT rest_Ioff, rest_Ooff; /* offsets added to I and O when applying cld2rest */
++
++     int equal_blocks; /* nonzero: apply uses MPI_Alltoall instead of MPI_Alltoallv */
++} P;
++
++/* transpose locally to get contiguous chunks
++   this may take two transposes if the block sizes are unequal
++   (3 subplans, two of which operate on disjoint data) */
++static void apply_pretranspose(
++    const P *ego, R *I, R *O
++    )
++{
++  plan_rdft *cld2, *cld2rest, *cld3;
++
++  cld3 = (plan_rdft *) ego->cld3;
++  if (cld3)
++       cld3->apply(ego->cld3, O, O); /* optional in-place step, run before cld2 */
++  /* else TRANSPOSED_IN is true and user wants I transposed */
++
++  cld2 = (plan_rdft *) ego->cld2;
++  cld2->apply(ego->cld2, I, O); /* cld2 is applied unconditionally */
++  cld2rest = (plan_rdft *) ego->cld2rest;
++  if (cld2rest) { /* only present when a leftover plan was created */
++       cld2rest->apply(ego->cld2rest,
++      		       I + ego->rest_Ioff, O + ego->rest_Ooff);
++  }
++}
++
++static void apply(const plan *ego_, R *I, R *O)
++{
++     const P *ego = (const P *) ego_;
++     plan_rdft *cld1 = (plan_rdft *) ego->cld1; /* null selects the else branch below */
++
++     if (cld1) {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, O);
++
++	  /* transpose chunks globally */
++	  if (ego->equal_blocks)
++	       MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE, /* send from O ... */
++			    I, ego->recv_block_sizes[0], FFTW_MPI_TYPE, /* ... receive into I */
++			    ego->comm);
++	  else
++	       MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
++			     FFTW_MPI_TYPE,
++			     I, ego->recv_block_sizes, ego->recv_block_offsets,
++			     FFTW_MPI_TYPE,
++			     ego->comm);
++
++          /* transpose locally to get non-transposed output */
++          cld1->apply(ego->cld1, I, O);
++     } /* else TRANSPOSED_OUT is true and user wants O transposed */
++     else {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, I); /* in place on I */
++
++          /* transpose chunks globally */
++	  if (ego->equal_blocks)
++	       MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE, /* send from I ... */
++			    O, ego->recv_block_sizes[0], FFTW_MPI_TYPE, /* ... receive into O */
++			    ego->comm);
++	  else
++	       MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
++			     FFTW_MPI_TYPE,
++			     O, ego->recv_block_sizes, ego->recv_block_offsets,
++			     FFTW_MPI_TYPE,
++			     ego->comm);
++     }
++}
++
++static int applicable(const S *ego, const problem *p_,
++		      const planner *plnr)
++{
++     /* in contrast to transpose-alltoall this algorithm cannot preserve the input,
++      * since we need at least one transpose before the (out-of-place) Alltoall */
++     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
++     return (1
++	     && p->I != p->O /* out-of-place problems only */
++	     && (!NO_DESTROY_INPUTP(plnr))  /* planner must allow destroying the input */
++	     && ((p->flags & TRANSPOSED_OUT) || !ego->copy_transposed_out) /* copy variant only with TRANSPOSED_OUT */
++	     && ONLY_TRANSPOSEDP(p->flags)
++	  );
++}
++
++static void awake(plan *ego_, enum wakefulness wakefulness)
++{ /* forwards the wakefulness state to all four child plans */
++     P *ego = (P *) ego_;
++     X(plan_awake)(ego->cld1, wakefulness); /* children may be null; passed through as-is */
++     X(plan_awake)(ego->cld2, wakefulness);
++     X(plan_awake)(ego->cld2rest, wakefulness);
++     X(plan_awake)(ego->cld3, wakefulness);
++}
++
++static void destroy(plan *ego_)
++{ /* releases everything mkplan attached to the plan */
++     P *ego = (P *) ego_;
++     X(ifree0)(ego->send_block_sizes); /* base of the single allocation holding all four size/offset arrays */
++     MPI_Comm_free(&ego->comm); /* communicator duplicated in mkplan */
++     X(plan_destroy_internal)(ego->cld3);
++     X(plan_destroy_internal)(ego->cld2rest);
++     X(plan_destroy_internal)(ego->cld2);
++     X(plan_destroy_internal)(ego->cld1);
++}
++
++static void print(const plan *ego_, printer *p)
++{ /* prints the plan name, an optional "/e" suffix and the four child plans */
++     const P *ego = (const P *) ego_;
++     p->print(p, "(mpi-transpose-alltoall-transposed%s%(%p%)%(%p%)%(%p%)%(%p%))",
++	      ego->equal_blocks ? "/e" : "", /* "/e" marks the equal-blocks case */
++	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
++}
++
++static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
++{ /* returns the new plan, or a null plan pointer if not applicable or a sub-plan failed */
++     const S *ego = (const S *) ego_;
++     const problem_mpi_transpose *p;
++     P *pln;
++     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
++     INT b, bt, vn, rest_Ioff, rest_Ooff;
++     R *O;
++     int *sbs, *sbo, *rbs, *rbo;
++     int pe, my_pe, n_pes;
++     int equal_blocks = 1; /* cleared below as soon as one block deviates */
++     static const plan_adt padt = {
++          XM(transpose_solve), awake, print, destroy
++     };
++
++     if (!applicable(ego, p_, plnr))
++          return (plan *) 0;
++
++     p = (const problem_mpi_transpose *) p_;
++     vn = p->vn;
++
++     MPI_Comm_rank(p->comm, &my_pe);
++     MPI_Comm_size(p->comm, &n_pes); /* n_pes sizes the arrays allocated below */
++
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++
++     if (p->flags & TRANSPOSED_OUT) { /* O stays transposed */
++	  if (ego->copy_transposed_out) {
++	       cld1 = X(mkplan_f_d)(plnr,
++				  X(mkproblem_rdft_0_d)(X(mktensor_1d)
++							(bt * p->nx * vn, 1, 1),
++							p->I, O = p->O),
++				    0, 0, NO_SLOW);
++	       if (XM(any_true)(!cld1, p->comm)) goto nada; /* presumably a check across all processes of p->comm; confirm */
++	  }
++	  else /* first transpose is in-place */
++              O = p->I;
++     }
++     else { /* transpose nx x bt x vn -> bt x nx x vn */
++	  cld1 = X(mkplan_f_d)(plnr, 
++			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
++						     (bt, vn, p->nx * vn,
++						      p->nx, bt * vn, vn,
++						      vn, 1, 1),
++						     p->I, O = p->O),
++			       0, 0, NO_SLOW);
++	  if (XM(any_true)(!cld1, p->comm)) goto nada;
++     }
++
++     if (XM(any_true)(!XM(mkplans_pretranspose)(p, plnr, p->I, O, my_pe,
++	 					&cld2, &cld2rest, &cld3,
++						&rest_Ioff, &rest_Ooff),
++		      p->comm)) goto nada;
++
++
++     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
++
++     pln->cld1 = cld1;
++     pln->cld2 = cld2;
++     pln->cld2rest = cld2rest;
++     pln->rest_Ioff = rest_Ioff;
++     pln->rest_Ooff = rest_Ooff;
++     pln->cld3 = cld3;
++
++     MPI_Comm_dup(p->comm, &pln->comm); /* the plan owns its own duplicate of the communicator */
++
++     /* Compute sizes/offsets of blocks to send for all-to-all command. */
++     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS); /* one allocation holds sbs, sbo, rbs and rbo */
++     sbo = sbs + n_pes;
++     rbs = sbo + n_pes;
++     rbo = rbs + n_pes;
++     b = XM(block)(p->nx, p->block, my_pe);
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++     for (pe = 0; pe < n_pes; ++pe) {
++	  INT db, dbt; /* destination block sizes */
++	  db = XM(block)(p->nx, p->block, pe);
++	  dbt = XM(block)(p->ny, p->tblock, pe);
++	  if (db != p->block || dbt != p->tblock)
++	       equal_blocks = 0; /* apply will then use MPI_Alltoallv */
++
++	  /* MPI requires type "int" here; apparently it
++	     has no 64-bit API?  Grrr. */
++	  sbs[pe] = (int) (b * dbt * vn); /* NOTE(review): INT is narrowed to int without a range check */
++	  sbo[pe] = (int) (pe * (b * p->tblock) * vn);
++	  rbs[pe] = (int) (db * bt * vn);
++	  rbo[pe] = (int) (pe * (p->block * bt) * vn);
++     }
++     pln->send_block_sizes = sbs; /* base of the shared allocation; destroy frees through this pointer */
++     pln->send_block_offsets = sbo;
++     pln->recv_block_sizes = rbs;
++     pln->recv_block_offsets = rbo;
++     pln->equal_blocks = equal_blocks;
++
++     X(ops_zero)(&pln->super.super.ops);
++     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops); /* accumulate child op counts */
++     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
++     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
++     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
++     /* FIXME: should MPI operations be counted in "other" somehow? */
++
++     return &(pln->super.super);
++
++ nada:
++     X(plan_destroy_internal)(cld3); /* any of these may still be null here */
++     X(plan_destroy_internal)(cld2rest);
++     X(plan_destroy_internal)(cld2);
++     X(plan_destroy_internal)(cld1);
++     return (plan *) 0;
++}
++
++static solver *mksolver(int copy_transposed_out)
++{ /* builds a solver whose only parameter is the copy_transposed_out flag */
++     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
++     S *slv = MKSOLVER(S, &sadt);
++     slv->copy_transposed_out = copy_transposed_out;
++     return &(slv->super); /* hand back the embedded base solver */
++}
++
++void XM(transpose_alltoall_transposed_register)(planner *p)
++{
++     int cto; /* value passed as copy_transposed_out */
++     for (cto = 0; cto <= 1; ++cto) /* register both variants: 0 and 1 */
++	  REGISTER_SOLVER(p, mksolver(cto));
++}
+
+--- mpi/transpose-pairwise.c	2014-03-04 19:41:03.000000000 +0100
++++ mpi/transpose-pairwise.c	2015-09-05 06:00:05.715433709 +0200
+@@ -53,7 +53,6 @@ static void transpose_chunks(int *sched,
+ {
+      if (sched) {
+ 	  int i;
+-	  MPI_Status status;
+ 
+ 	  /* TODO: explore non-synchronous send/recv? */
+ 
+@@ -74,7 +73,7 @@ static void transpose_chunks(int *sched,
+ 				      O + rbo[pe], (int) (rbs[pe]),
+ 				      FFTW_MPI_TYPE,
+ 				      pe, (pe * n_pes + my_pe) & 0xffff,
+-				      comm, &status);
++				      comm, MPI_STATUS_IGNORE);
+ 		    }
+ 	       }
+ 
+@@ -92,7 +91,7 @@ static void transpose_chunks(int *sched,
+ 				      O + rbo[pe], (int) (rbs[pe]),
+ 				      FFTW_MPI_TYPE,
+ 				      pe, (pe * n_pes + my_pe) & 0xffff,
+-				      comm, &status);
++				      comm, MPI_STATUS_IGNORE);
+ 	       }
+ 	  }
+      }
+ 
+--- mpi/transpose-pairwise-transposed.c	1970-01-01 01:00:00.000000000 +0100
++++ mpi/transpose-pairwise-transposed.c	2015-09-05 06:00:07.280481042 +0200
+@@ -0,0 +1,510 @@
++/*
++ * Copyright (c) 2003, 2007-11 Matteo Frigo
++ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
++ * Copyright (c) 2012 Michael Pippig
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
++ *
++ */
++
++/* Distributed transposes using a sequence of carefully scheduled
++   pairwise exchanges.  This has the advantage that it can be done
++   in-place, or out-of-place while preserving the input, using buffer
++   space proportional to the local size divided by the number of
++   processes (i.e. to the total array size divided by the number of
++   processes squared). */
++
++#include "mpi-transpose.h"
++#include <string.h>
++
++typedef struct {
++     solver super; /* embedded base solver */
++     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
++} S;
++
++typedef struct {
++     plan_mpi_transpose super; /* embedded base plan */
++
++     plan *cld1, *cld2, *cld2rest, *cld3; /* child plans; any of them may be null */
++     INT rest_Ioff, rest_Ooff; /* offsets added to I and O when applying cld2rest */
++     
++     int n_pes, my_pe, *sched; /* sched: exchange order passed to transpose_chunks; null means nothing to do */
++     INT *send_block_sizes, *send_block_offsets; /* indexed by partner process */
++     INT *recv_block_sizes, *recv_block_offsets;
++     MPI_Comm comm; /* freed in destroy */
++     int preserve_input; /* print shows "/p" when this equals 2 */
++} P;
++
++static void transpose_chunks(int *sched, int n_pes, int my_pe,
++			     INT *sbs, INT *sbo, INT *rbs, INT *rbo,
++			     MPI_Comm comm,
++			     R *I, R *O)
++{
++     if (sched) { /* a null schedule means this process does nothing */
++	  int i;
++
++	  /* TODO: explore non-synchronous send/recv? */
++
++	  if (I == O) {
++	       R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS); /* NOTE(review): sized from sbs[0]; assumes no send block is larger - confirm */
++	       
++	       for (i = 0; i < n_pes; ++i) {
++		    int pe = sched[i]; /* partner for step i */
++		    if (my_pe == pe) {
++			 if (rbo[pe] != sbo[pe])
++			      memmove(O + rbo[pe], O + sbo[pe], /* same array, so the regions may overlap */
++				      sbs[pe] * sizeof(R));
++		    }
++		    else {
++			 memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R)); /* stage the outgoing block before receiving in place */
++			 MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
++				      pe, (my_pe * n_pes + pe) & 0xffff, /* send tag */
++				      O + rbo[pe], (int) (rbs[pe]),
++				      FFTW_MPI_TYPE,
++				      pe, (pe * n_pes + my_pe) & 0xffff, /* receive tag = partner's send tag */
++				      comm, MPI_STATUS_IGNORE);
++		    }
++	       }
++
++	       X(ifree)(buf);
++	  }
++	  else { /* I != O */
++	       for (i = 0; i < n_pes; ++i) {
++		    int pe = sched[i]; /* partner for step i */
++		    if (my_pe == pe)
++			 memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R)); /* local block: plain copy, no MPI */
++		    else
++			 MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
++				      FFTW_MPI_TYPE,
++				      pe, (my_pe * n_pes + pe) & 0xffff, /* send tag */
++				      O + rbo[pe], (int) (rbs[pe]),
++				      FFTW_MPI_TYPE,
++				      pe, (pe * n_pes + my_pe) & 0xffff, /* receive tag = partner's send tag */
++				      comm, MPI_STATUS_IGNORE);
++	       }
++	  }
++     }
++}
++
++/* transpose locally to get contiguous chunks
++   this may take two transposes if the block sizes are unequal
++   (3 subplans, two of which operate on disjoint data) */
++static void apply_pretranspose(
++    const P *ego, R *I, R *O
++    )
++{
++  plan_rdft *cld2, *cld2rest, *cld3;
++
++  cld3 = (plan_rdft *) ego->cld3;
++  if (cld3)
++       cld3->apply(ego->cld3, O, O); /* optional in-place step, run before cld2 */
++  /* else TRANSPOSED_IN is true and user wants I transposed */
++
++  cld2 = (plan_rdft *) ego->cld2;
++  cld2->apply(ego->cld2, I, O); /* cld2 is applied unconditionally */
++  cld2rest = (plan_rdft *) ego->cld2rest;
++  if (cld2rest) { /* only present when a leftover plan was created */
++       cld2rest->apply(ego->cld2rest,
++      		       I + ego->rest_Ioff, O + ego->rest_Ooff);
++  }
++}
++
++static void apply(const plan *ego_, R *I, R *O)
++{
++     const P *ego = (const P *) ego_;
++     plan_rdft *cld1 = (plan_rdft *) ego->cld1; /* null selects one of the two branches further down */
++     
++     if (cld1) {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, O);
++
++          if(ego->preserve_input) I = O; /* from here on the caller's input buffer is no longer touched */
++
++          /* transpose chunks globally */
++          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
++                           ego->send_block_sizes, ego->send_block_offsets,
++			   ego->recv_block_sizes, ego->recv_block_offsets,
++			   ego->comm, O, I); /* note the order: source is O, destination is I */
++
++          /* transpose locally to get non-transposed output */
++          cld1->apply(ego->cld1, I, O);
++     } /* else TRANSPOSED_OUT is true and user wants O transposed */
++     else if (ego->preserve_input) {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, O);
++
++          /* transpose chunks globally */
++          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
++             	           ego->send_block_sizes, ego->send_block_offsets,
++			   ego->recv_block_sizes, ego->recv_block_offsets,
++			   ego->comm, O, O); /* exchange in place on O */
++     }
++     else {
++          /* transpose locally to get contiguous chunks */
++          apply_pretranspose(ego, I, I); /* in place on I */
++
++          /* transpose chunks globally */
++          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
++             	           ego->send_block_sizes, ego->send_block_offsets,
++			   ego->recv_block_sizes, ego->recv_block_offsets,
++			   ego->comm, I, O); /* source is I, destination is O */
++     }
++}
++
++static int applicable(const S *ego, const problem *p_,
++		      const planner *plnr)
++{
++     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
++     /* Note: this is *not* UGLY for out-of-place, destroy-input plans;
++	the planner often prefers transpose-pairwise to transpose-alltoall,
++	at least with LAM MPI on my machine. */
++     return (1
++	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr) /* forced-preserve solver: only when the planner would allow destroying ... */
++					  && p->I != p->O)) /* ... and the problem is out-of-place */
++	     && ONLY_TRANSPOSEDP(p->flags));
++}
++
++static void awake(plan *ego_, enum wakefulness wakefulness)
++{ /* forwards the wakefulness state to all four child plans */
++     P *ego = (P *) ego_;
++     X(plan_awake)(ego->cld1, wakefulness); /* children may be null; passed through as-is */
++     X(plan_awake)(ego->cld2, wakefulness);
++     X(plan_awake)(ego->cld2rest, wakefulness);
++     X(plan_awake)(ego->cld3, wakefulness);
++}
++
++static void destroy(plan *ego_)
++{ /* releases the schedule, block tables, communicator and child plans */
++     P *ego = (P *) ego_;
++     X(ifree0)(ego->sched); /* sched may be null for a process with nothing to do */
++     X(ifree0)(ego->send_block_sizes); /* presumably the base of one allocation shared by all four tables; confirm in mkplan */
++     MPI_Comm_free(&ego->comm);
++     X(plan_destroy_internal)(ego->cld3);
++     X(plan_destroy_internal)(ego->cld2rest);
++     X(plan_destroy_internal)(ego->cld2);
++     X(plan_destroy_internal)(ego->cld1);
++}
++
++static void print(const plan *ego_, printer *p)
++{ /* prints the plan name, an optional "/p" suffix and the four child plans */
++     const P *ego = (const P *) ego_;
++     p->print(p, "(mpi-transpose-pairwise-transposed%s%(%p%)%(%p%)%(%p%)%(%p%))", 
++	      ego->preserve_input==2 ?"/p":"", /* "/p" only for the value 2, not for 1 */
++	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
++}
++
++/* Given a process which_pe and a number of processes npes, fills
++   the array sched[npes] with a sequence of processes to communicate
++   with for a deadlock-free, optimum-overlap all-to-all communication.
++   (All processes must call this routine to get their own schedules.)
++   The schedule can be re-ordered arbitrarily as long as all processes
++   apply the same permutation to their schedules.
++
++   The algorithm here is based upon the one described in:
++       J. A. M. Schreuder, "Constructing timetables for sport
++       competitions," Mathematical Programming Study 13, pp. 58-67 (1980). 
++   In a sport competition, you have N teams and want every team to
++   play every other team in as short a time as possible (maximum overlap
++   between games).  This timetabling problem is therefore identical
++   to that of an all-to-all communications problem.  In our case, there
++   is one wrinkle: as part of the schedule, the process must do
++   some data transfer with itself (local data movement), analogous
++   to a requirement that each team "play itself" in addition to other
++   teams.  With this wrinkle, it turns out that an optimal timetable
++   (N parallel games) can be constructed for any N, not just for even
++   N as in the original problem described by Schreuder.
++*/
++static void fill1_comm_sched(int *sched, int which_pe, int npes)
++{
++     int pe, i, n, s = 0; /* s = number of entries written to sched so far */
++     A(which_pe >= 0 && which_pe < npes); /* which_pe must be a valid process index */
++     if (npes % 2 == 0) {
++	  n = npes;
++	  sched[s++] = which_pe; /* even npes: the self-exchange is the first entry */
++     }
++     else
++	  n = npes + 1; /* odd npes: work with the next even number */
++     for (pe = 0; pe < n - 1; ++pe) {
++	  if (npes % 2 == 0) {
++	       if (pe == which_pe) sched[s++] = npes - 1;
++	       else if (npes - 1 == which_pe) sched[s++] = pe;
++	  }
++	  else if (pe == which_pe) sched[s++] = pe; /* odd npes: self-exchange */
++
++	  if (pe != which_pe && which_pe < n - 1) {
++	       i = (pe - which_pe + (n - 1)) % (n - 1); /* (pe - which_pe) mod (n - 1) */
++	       if (i < n/2)
++		    sched[s++] = (pe + i) % (n - 1);
++	       
++	       i = (which_pe - pe + (n - 1)) % (n - 1); /* (which_pe - pe) mod (n - 1) */
++	       if (i < n/2)
++		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
++	  }
++     }
++     A(s == npes); /* exactly npes entries must have been written */
++}
++
++/* Sort the communication schedule sched for npes so that the schedule
++   on process sortpe is ascending or descending (!ascending).  This is
++   necessary to allow in-place transposes when the problem does not
++   divide equally among the processes.  In this case there is one
++   process where the incoming blocks are bigger/smaller than the
++   outgoing blocks and thus have to be received in
++   descending/ascending order, respectively, to avoid overwriting data
++   before it is sent. */
++static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
++{
++     int *sortsched, i;
++     sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER); /* 2*npes ints: [0,npes) reference, [npes,2*npes) result */
++     fill1_comm_sched(sortsched, sortpe, npes); /* reference schedule of process sortpe */
++     if (ascending)
++	  for (i = 0; i < npes; ++i)
++	       sortsched[npes + sortsched[i]] = sched[i]; /* slot = sortpe's partner at step i */
++     else
++	  for (i = 0; i < npes; ++i)
++	       sortsched[2*npes - 1 - sortsched[i]] = sched[i]; /* same slots, counted from the end */
++     for (i = 0; i < npes; ++i)
++	  sched[i] = sortsched[npes + i]; /* copy the permuted schedule back into sched */
++     X(ifree)(sortsched);
++}
++
++/* make the plans to do the pre-MPI transpositions (shared with
++   transpose-alltoall-transposed) */
++int XM(mkplans_pretranspose)(const problem_mpi_transpose *p, planner *plnr,
++			      R *I, R *O, int my_pe,
++			      plan **cld2, plan **cld2rest, plan **cld3,
++			      INT *rest_Ioff, INT *rest_Ooff)
++{ /* returns 1 on success; on failure returns 0 with all three plans set to NULL */
++     INT vn = p->vn;
++     INT b = XM(block)(p->nx, p->block, my_pe); /* XM(block) result along nx for my_pe */
++     INT bt = p->tblock;
++     INT nyb = p->ny / bt; /* number of equal-sized blocks */
++     INT nyr = p->ny - nyb * bt; /* leftover rows after equal blocks */
++
++     *cld2 = *cld2rest = *cld3 = NULL; /* nada destroys all three unconditionally */
++     *rest_Ioff = *rest_Ooff = 0;
++
++     if (!(p->flags & TRANSPOSED_IN) && (nyr == 0 || I != O)) { /* no cld3 in this branch */
++	  INT ny = p->ny * vn;
++	  bt *= vn; /* from here on bt includes the factor vn */
++	  *cld2 = X(mkplan_f_d)(plnr, 
++				X(mkproblem_rdft_0_d)(X(mktensor_3d)
++						      (nyb, bt, b * bt,
++						       b, ny, bt,
++						       bt, 1, 1),
++						      I, O),
++				0, 0, NO_SLOW);
++	  if (!*cld2) goto nada;
++
++	  if (nyr > 0) { /* separate plan for the leftover rows */
++	       *rest_Ioff = nyb * bt;
++	       *rest_Ooff = nyb * b * bt;
++	       bt = nyr * vn;
++	       *cld2rest = X(mkplan_f_d)(plnr,
++					 X(mkproblem_rdft_0_d)(X(mktensor_2d)
++							       (b, ny, bt,
++								bt, 1, 1),
++							       I + *rest_Ioff,
++							       O + *rest_Ooff),
++                                        0, 0, NO_SLOW);
++               if (!*cld2rest) goto nada;
++	  }
++     }
++     else { /* TRANSPOSED_IN is set, or in-place (I == O) with leftover rows */
++	  *cld2 = X(mkplan_f_d)(plnr,
++				X(mkproblem_rdft_0_d)(
++				     X(mktensor_4d)
++				     (nyb, b * bt * vn, b * bt * vn,
++				      b, vn, bt * vn,
++				      bt, b * vn, vn,
++				      vn, 1, 1),
++				     I, O),
++				0, 0, NO_SLOW);
++	  if (!*cld2) goto nada;
++
++	  *rest_Ioff = *rest_Ooff = nyb * bt * b * vn; /* same offset on input and output side */
++	  *cld2rest = X(mkplan_f_d)(plnr,
++				    X(mkproblem_rdft_0_d)(
++					 X(mktensor_3d)
++					 (b, vn, nyr * vn,
++					  nyr, b * vn, vn,
++					  vn, 1, 1),
++					 I + *rest_Ioff, O + *rest_Ooff),
++				    0, 0, NO_SLOW);
++	  if (!*cld2rest) goto nada;
++
++	  if (!(p->flags & TRANSPOSED_IN)) { /* additional in-place (I -> I) plan */
++	       *cld3 = X(mkplan_f_d)(plnr,
++				     X(mkproblem_rdft_0_d)(
++					  X(mktensor_3d)
++					  (p->ny, vn, b * vn,
++					   b, p->ny * vn, vn,
++					   vn, 1, 1),
++					  I, I),
++				     0, 0, NO_SLOW);
++	       if (!*cld3) goto nada;
++	  }
++     }
++
++     return 1;
++
++nada:
++     X(plan_destroy_internal)(*cld3); /* entries may still be NULL here */
++     X(plan_destroy_internal)(*cld2rest);
++     X(plan_destroy_internal)(*cld2);
++     *cld2 = *cld2rest = *cld3 = NULL; /* do not leave dangling pointers with the caller */
++     return 0;
++}
++
++static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
++{
++     const S *ego = (const S *) ego_;
++     const problem_mpi_transpose *p;
++     P *pln;
++     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
++     INT b, bt, vn, rest_Ioff, rest_Ooff;
++     INT *sbs, *sbo, *rbs, *rbo;
++     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
++     R *I, *O;
++     static const plan_adt padt = {
++          XM(transpose_solve), awake, print, destroy
++     };
++
++     UNUSED(ego);
++
++     if (!applicable(ego, p_, plnr))
++          return (plan *) 0; /* no plan: solver not applicable */
++
++     p = (const problem_mpi_transpose *) p_;
++     vn = p->vn;
++     I = p->I; O = p->O;
++
++     MPI_Comm_rank(p->comm, &my_pe);
++     MPI_Comm_size(p->comm, &n_pes);
++
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++
++
++     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = p->O;
++     
++     if (!(p->flags & TRANSPOSED_OUT)) { /* nx x bt x vn -> bt x nx x vn */
++	  cld1 = X(mkplan_f_d)(plnr, 
++			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
++						     (bt, vn, p->nx * vn,
++						      p->nx, bt * vn, vn,
++						      vn, 1, 1),
++						     I, O = p->O),
++			       0, 0, NO_SLOW);
++	  if (XM(any_true)(!cld1, p->comm)) goto nada;
++
++     }
++     else { /* TRANSPOSED_OUT: no cld1, only pick the buffer used as O */
++       if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
++         O = p->O;
++       else
++         O = p->I;
++     }
++
++     if (XM(any_true)(!XM(mkplans_pretranspose)(p, plnr, p->I, O, my_pe,
++						&cld2, &cld2rest, &cld3,
++						&rest_Ioff, &rest_Ooff),
++		      p->comm)) goto nada;
++
++     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
++
++     pln->cld1 = cld1;
++     pln->cld2 = cld2;
++     pln->cld2rest = cld2rest;
++     pln->rest_Ioff = rest_Ioff;
++     pln->rest_Ooff = rest_Ooff;
++     pln->cld3 = cld3;
++     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
++
++     MPI_Comm_dup(p->comm, &pln->comm);
++
++     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block), /* not comm size */
++			   XM(num_blocks)(p->ny, p->tblock));
++
++     /* Compute sizes/offsets of blocks to exchange between processors */
++     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS); /* sbs,sbo,rbs,rbo */
++     sbo = sbs + n_pes;
++     rbs = sbo + n_pes;
++     rbo = rbs + n_pes;
++     b = XM(block)(p->nx, p->block, my_pe);
++     bt = XM(block)(p->ny, p->tblock, my_pe);
++     for (pe = 0; pe < n_pes; ++pe) {
++	  INT db, dbt; /* destination block sizes */
++	  db = XM(block)(p->nx, p->block, pe);
++	  dbt = XM(block)(p->ny, p->tblock, pe);
++
++	  sbs[pe] = b * dbt * vn;
++	  sbo[pe] = pe * (b * p->tblock) * vn;
++	  rbs[pe] = db * bt * vn;
++	  rbo[pe] = pe * (p->block * bt) * vn;
++
++	  if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
++	       A(sort_pe == -1); /* only one process should need sorting */
++	       sort_pe = pe;
++	       ascending = db * p->tblock > p->block * dbt;
++	  }
++     }
++     pln->n_pes = n_pes;
++     pln->my_pe = my_pe;
++     pln->send_block_sizes = sbs;
++     pln->send_block_offsets = sbo;
++     pln->recv_block_sizes = rbs;
++     pln->recv_block_offsets = rbo;
++
++     if (my_pe >= n_pes) {
++	  pln->sched = 0; /* this process is not doing anything */
++     }
++     else {
++	  pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
++	  fill1_comm_sched(pln->sched, my_pe, n_pes);
++	  if (sort_pe >= 0)
++	       sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
++     }
++
++     X(ops_zero)(&pln->super.super.ops); /* plan cost = sum of child costs */
++     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
++     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
++     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
++     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
++     /* FIXME: should MPI operations be counted in "other" somehow? */
++
++     return &(pln->super.super);
++
++ nada: /* planning failed: destroy the child plans made so far */
++     X(plan_destroy_internal)(cld3);
++     X(plan_destroy_internal)(cld2rest);
++     X(plan_destroy_internal)(cld2);
++     X(plan_destroy_internal)(cld1);
++     return (plan *) 0;
++}
++
++static solver *mksolver(int preserve_input) /* new solver carrying this flag */
++{
++     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
++     S *slv = MKSOLVER(S, &sadt);
++     slv->preserve_input = preserve_input; /* read by mkplan() via ego */
++     return &(slv->super);
++}
++
++void XM(transpose_pairwise_transposed_register)(planner *p)
++{
++     int preserve_input; /* one solver is registered per value, 0 and 1 */
++     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
++	  REGISTER_SOLVER(p, mksolver(preserve_input));
++}
diff --git a/var/spack/repos/builtin/packages/fontconfig/package.py b/var/spack/repos/builtin/packages/fontconfig/package.py
index 311156378a..99c9b1f15d 100644
--- a/var/spack/repos/builtin/packages/fontconfig/package.py
+++ b/var/spack/repos/builtin/packages/fontconfig/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class Fontconfig(Package):
+class Fontconfig(AutotoolsPackage):
     """Fontconfig customizing font access"""
     homepage = "http://www.freedesktop.org/wiki/Software/fontconfig/"
     url      = "http://www.freedesktop.org/software/fontconfig/release/fontconfig-2.11.1.tar.gz"
@@ -36,10 +36,6 @@ class Fontconfig(Package):
     depends_on('libxml2')
     depends_on('pkg-config', type='build')
 
-    def install(self, spec, prefix):
-        configure("--prefix=%s" % prefix,
-                  "--enable-libxml2",
-                  "--disable-docs")
-
-        make()
-        make("install")
+    def configure_args(self):
+        """Return the extra options passed to fontconfig's configure."""
+        return ["--enable-libxml2", "--disable-docs"]
diff --git a/var/spack/repos/builtin/packages/gcc/package.py b/var/spack/repos/builtin/packages/gcc/package.py
index be3b6cca51..18fe0d88f8 100644
--- a/var/spack/repos/builtin/packages/gcc/package.py
+++ b/var/spack/repos/builtin/packages/gcc/package.py
@@ -58,7 +58,7 @@ class Gcc(Package):
         provides('golang', when='@4.7.1:')
 
     patch('piclibs.patch', when='+piclibs')
-    patch('gcc-backport.patch', when='@4.7:5.3')
+    patch('gcc-backport.patch', when='@4.7:4.9.2,5:5.3')
 
     def install(self, spec, prefix):
         # libjava/configure needs a minor fix to install into spack paths.
diff --git a/var/spack/repos/builtin/packages/graphviz/package.py b/var/spack/repos/builtin/packages/graphviz/package.py
index b37121248c..bb23513d2f 100644
--- a/var/spack/repos/builtin/packages/graphviz/package.py
+++ b/var/spack/repos/builtin/packages/graphviz/package.py
@@ -24,9 +24,10 @@
 ##############################################################################
 from spack import *
 import sys
+import shutil
 
 
-class Graphviz(Package):
+class Graphviz(AutotoolsPackage):
     """Graph Visualization Software"""
     homepage = "http://www.graphviz.org"
     url      = "http://www.graphviz.org/pub/graphviz/stable/SOURCES/graphviz-2.38.0.tar.gz"
@@ -46,11 +47,13 @@ class Graphviz(Package):
     depends_on("swig")
     depends_on("python")
     depends_on("ghostscript")
+    depends_on("freetype")
+    depends_on("libtool", type='build')
     depends_on("pkg-config", type='build')
 
-    def install(self, spec, prefix):
-        options = ['--prefix=%s' % prefix]
-        if '+perl' not in spec:
+    def configure_args(self):
+        options = []
+        if '+perl' not in self.spec:
             options.append('--disable-perl')
 
         # On OSX fix the compiler error:
@@ -59,7 +62,9 @@ class Graphviz(Package):
         #       include <X11/Xlib.h>
         if sys.platform == 'darwin':
             options.append('CFLAGS=-I/opt/X11/include')
+        options.append('--with-ltdl-lib=%s/lib' % self.spec['libtool'].prefix)
 
-        configure(*options)
-        make()
-        make("install")
+        # A hack to patch config.guess in the libltdl sub directory
+        shutil.copyfile('./config/config.guess', 'libltdl/config/config.guess')
+
+        return options
diff --git a/var/spack/repos/builtin/packages/libelf/package.py b/var/spack/repos/builtin/packages/libelf/package.py
index 3304d27bdb..000b4e0957 100644
--- a/var/spack/repos/builtin/packages/libelf/package.py
+++ b/var/spack/repos/builtin/packages/libelf/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class Libelf(Package):
+class Libelf(AutotoolsPackage):
     """libelf lets you read, modify or create ELF object files in an
        architecture-independent way. The library takes care of size
        and endian issues, e.g. you can process a file for SPARC
@@ -38,13 +38,13 @@ class Libelf(Package):
     version('0.8.12', 'e21f8273d9f5f6d43a59878dc274fec7')
 
     provides('elf')
+    depends_on('automake', type='build')
 
-    def install(self, spec, prefix):
-        configure("--prefix=" + prefix,
-                  "--enable-shared",
-                  "--disable-dependency-tracking",
-                  "--disable-debug")
-        make()
+    def configure_args(self):
+        """Return the extra options passed to libelf's configure."""
+        return ["--enable-shared",
+                "--disable-dependency-tracking",
+                "--disable-debug"]
 
-        # The mkdir commands in libelf's install can fail in parallel
-        make("install", parallel=False)
+    def install(self, spec, prefix):
+        make('install', parallel=False)  # mkdir calls can fail in parallel
diff --git a/var/spack/repos/builtin/packages/libiconv/package.py b/var/spack/repos/builtin/packages/libiconv/package.py
index 982929b80a..72f67ec80d 100644
--- a/var/spack/repos/builtin/packages/libiconv/package.py
+++ b/var/spack/repos/builtin/packages/libiconv/package.py
@@ -23,9 +23,10 @@
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 from spack import *
+import shutil
 
 
-class Libiconv(Package):
+class Libiconv(AutotoolsPackage):
     """GNU libiconv provides an implementation of the iconv() function
     and the iconv program for character set conversion."""
 
@@ -38,10 +39,10 @@ class Libiconv(Package):
     # of C11 any more and thus might not exist.
     patch("gets.patch")
 
-    def install(self, spec, prefix):
-        configure('--prefix={0}'.format(prefix),
-                  '--enable-extra-encodings')
+    def configure_args(self):
+        args = ['--enable-extra-encodings']
 
-        make()
-        make('check')
-        make('install')
+        # A hack to patch config.guess in the libcharset sub directory
+        shutil.copyfile('./build-aux/config.guess',
+                        'libcharset/build-aux/config.guess')
+        return args
diff --git a/var/spack/repos/builtin/packages/libsplash/package.py b/var/spack/repos/builtin/packages/libsplash/package.py
index b58d37e6ae..c87dae19be 100644
--- a/var/spack/repos/builtin/packages/libsplash/package.py
+++ b/var/spack/repos/builtin/packages/libsplash/package.py
@@ -41,6 +41,7 @@ class Libsplash(Package):
             git='https://github.com/ComputationalRadiationPhysics/libSplash.git')
     version('master', branch='master',
             git='https://github.com/ComputationalRadiationPhysics/libSplash.git')
+    version('1.6.0', 'c05bce95abfe1ae4cd9d9817acf58d94')
     version('1.5.0', 'c1efec4c20334242c8a3b6bfdc0207e3')
     version('1.4.0', '2de37bcef6fafa1960391bf44b1b50e0')
     version('1.3.1', '524580ba088d97253d03b4611772f37c')
diff --git a/var/spack/repos/builtin/packages/libtiff/package.py b/var/spack/repos/builtin/packages/libtiff/package.py
index 6c282dee7c..70c371b3b8 100644
--- a/var/spack/repos/builtin/packages/libtiff/package.py
+++ b/var/spack/repos/builtin/packages/libtiff/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class Libtiff(Package):
+class Libtiff(AutotoolsPackage):
     """libtiff graphics format library"""
     homepage = "http://www.simplesystems.org/libtiff/"
     url      = "ftp://download.osgeo.org/libtiff/tiff-4.0.3.tar.gz"
@@ -36,9 +36,3 @@ class Libtiff(Package):
     depends_on('jpeg')
     depends_on('zlib')
     depends_on('xz')
-
-    def install(self, spec, prefix):
-        configure("--prefix=%s" % prefix)
-
-        make()
-        make("install")
diff --git a/var/spack/repos/builtin/packages/lzma/package.py b/var/spack/repos/builtin/packages/lzma/package.py
index 23d697ffe8..3eb97a2d9f 100644
--- a/var/spack/repos/builtin/packages/lzma/package.py
+++ b/var/spack/repos/builtin/packages/lzma/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class Lzma(Package):
+class Lzma(AutotoolsPackage):
     """LZMA Utils are legacy data compression software with high compression
     ratio. LZMA Utils are no longer developed, although critical bugs may be
     fixed as long as fixing them doesn't require huge changes to the code.
@@ -39,11 +39,3 @@ class Lzma(Package):
     url      = "http://tukaani.org/lzma/lzma-4.32.7.tar.gz"
 
     version('4.32.7', '2a748b77a2f8c3cbc322dbd0b4c9d06a')
-
-    def install(self, spec, prefix):
-        configure('--prefix={0}'.format(prefix))
-
-        make()
-        if self.run_tests:
-            make('check')  # one of the tests fails for me
-        make('install')
diff --git a/var/spack/repos/builtin/packages/matio/package.py b/var/spack/repos/builtin/packages/matio/package.py
index c141f7e8af..a33b23a4e9 100644
--- a/var/spack/repos/builtin/packages/matio/package.py
+++ b/var/spack/repos/builtin/packages/matio/package.py
@@ -25,15 +25,26 @@
 from spack import *
 
 
-class Matio(Package):
+class Matio(AutotoolsPackage):
     """matio is an C library for reading and writing Matlab MAT files"""
     homepage = "http://sourceforge.net/projects/matio/"
-    url = "http://downloads.sourceforge.net/project/matio/matio/1.5.2/matio-1.5.2.tar.gz"
+    url = "http://downloads.sourceforge.net/project/matio/matio/1.5.9/matio-1.5.9.tar.gz"
 
+    version('1.5.9', 'aab5b4219a3c0262afe7eeb7bdd2f463')
     version('1.5.2', '85b007b99916c63791f28398f6a4c6f1')
 
-    def install(self, spec, prefix):
-        configure('--prefix=%s' % prefix)
+    variant("zlib", default=True,
+            description='support for compressed mat files')
+    variant("hdf5", default=True,
+            description='support for version 7.3 mat files via hdf5')
 
-        make()
-        make("install")
+    depends_on("zlib", when="+zlib")
+    depends_on("hdf5", when="+hdf5")
+
+    def configure_args(self):
+        """Pass the zlib/hdf5 install prefixes for the enabled variants."""
+        spec = self.spec
+        candidates = [('+zlib', "--with-zlib=%s", 'zlib'),
+                      ('+hdf5', "--with-hdf5=%s", 'hdf5')]
+        return [flag % spec[dep].prefix
+                for variant, flag, dep in candidates if variant in spec]
diff --git a/var/spack/repos/builtin/packages/mpich/package.py b/var/spack/repos/builtin/packages/mpich/package.py
index e4ff29c00a..958fbe762c 100644
--- a/var/spack/repos/builtin/packages/mpich/package.py
+++ b/var/spack/repos/builtin/packages/mpich/package.py
@@ -26,12 +26,12 @@ from spack import *
 import os
 
 
-class Mpich(Package):
+class Mpich(AutotoolsPackage):
     """MPICH is a high performance and widely portable implementation of
     the Message Passing Interface (MPI) standard."""
 
     homepage = "http://www.mpich.org"
-    url      = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"
+    url = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"
     list_url = "http://www.mpich.org/static/downloads/"
     list_depth = 2
 
@@ -81,16 +81,19 @@ class Mpich(Package):
             join_path(self.prefix.lib, 'libmpi.{0}'.format(dso_suffix))
         ]
 
-    def install(self, spec, prefix):
+    @AutotoolsPackage.precondition('autoreconf')
+    def die_without_fortran(self):
         # Until we can pass variants such as +fortran through virtual
         # dependencies depends_on('mpi'), require Fortran compiler to
         # avoid delayed build errors in dependents.
         if (self.compiler.f77 is None) or (self.compiler.fc is None):
-            raise InstallError('Mpich requires both C and Fortran ',
-                               'compilers!')
+            raise InstallError(
+                'Mpich requires both C and Fortran compilers!'
+            )
 
-        config_args = [
-            '--prefix={0}'.format(prefix),
+    def configure_args(self):
+        spec = self.spec
+        return [
             '--enable-shared',
             '--with-pm={0}'.format('hydra' if '+hydra' in spec else 'no'),
             '--with-pmi={0}'.format('yes' if '+pmi' in spec else 'no'),
@@ -98,27 +101,8 @@ class Mpich(Package):
             '--{0}-ibverbs'.format('with' if '+verbs' in spec else 'without')
         ]
 
-        # TODO: Spack should make it so that you can't actually find
-        # these compilers if they're "disabled" for the current
-        # compiler configuration.
-        if not self.compiler.f77:
-            config_args.append("--disable-f77")
-
-        if not self.compiler.fc:
-            config_args.append("--disable-fc")
-
-        if not self.compiler.fc and not self.compiler.f77:
-            config_args.append("--disable-fortran")
-
-        configure(*config_args)
-
-        make()
-        make('check')
-        make('install')
-
-        self.filter_compilers(prefix)
-
-    def filter_compilers(self, prefix):
+    @AutotoolsPackage.sanity_check('install')
+    def filter_compilers(self):
         """Run after install to make the MPI compilers use the
         compilers that Spack built the package with.
 
@@ -126,14 +110,18 @@ class Mpich(Package):
         to Spack's generic cc, c++, f77, and f90.  We want them to
         be bound to whatever compiler they were built with."""
 
-        mpicc  = join_path(prefix.bin, 'mpicc')
-        mpicxx = join_path(prefix.bin, 'mpicxx')
-        mpif77 = join_path(prefix.bin, 'mpif77')
-        mpif90 = join_path(prefix.bin, 'mpif90')
+        mpicc = join_path(self.prefix.bin, 'mpicc')
+        mpicxx = join_path(self.prefix.bin, 'mpicxx')
+        mpif77 = join_path(self.prefix.bin, 'mpif77')
+        mpif90 = join_path(self.prefix.bin, 'mpif90')
 
         # Substitute Spack compile wrappers for the real
         # underlying compiler
-        kwargs = {'ignore_absent': True, 'backup': False, 'string': True}
+        kwargs = {
+            'ignore_absent': True,
+            'backup': False,
+            'string': True
+        }
         filter_file(env['CC'],  self.compiler.cc,  mpicc,  **kwargs)
         filter_file(env['CXX'], self.compiler.cxx, mpicxx, **kwargs)
         filter_file(env['F77'], self.compiler.f77, mpif77, **kwargs)
diff --git a/var/spack/repos/builtin/packages/ncl/package.py b/var/spack/repos/builtin/packages/ncl/package.py
new file mode 100644
index 0000000000..7d31c7a8f7
--- /dev/null
+++ b/var/spack/repos/builtin/packages/ncl/package.py
@@ -0,0 +1,233 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+from spack import *
+import os
+import shutil
+import tempfile
+
+
+class Ncl(Package):
+    """NCL is an interpreted language designed specifically for
+       scientific data analysis and visualization. Supports NetCDF 3/4,
+       GRIB 1/2, HDF 4/5, HDF-EOS 2/5, shapefile, ASCII, binary.
+       Numerous analysis functions are built-in."""
+
+    homepage = "https://www.ncl.ucar.edu"
+
+    version('6.3.0', '4834df63d3b56778441246303ab921c4',
+            url='https://www.earthsystemgrid.org/download/fileDownload.html?'
+                'logicalFileId=bec58cb3-cd9b-11e4-bb80-00c0f03d5b7c',
+            extension='tar.gz')
+    patch('spack_ncl.patch')
+
+    # This installation script is implemented according to this manual:
+    # http://www.ncl.ucar.edu/Download/build_from_src.shtml
+
+    variant('hdf4', default=False, description='Enable HDF4 support.')
+    variant('gdal', default=False, description='Enable GDAL support.')
+    variant('triangle', default=True, description='Enable Triangle support.')
+    variant('udunits2', default=True, description='Enable UDUNITS-2 support.')
+    variant('openmp', default=True, description='Enable OpenMP support.')
+
+    # Non-optional dependencies according to the manual:
+    depends_on('jpeg')
+    depends_on('netcdf')
+    depends_on('cairo')
+
+    # Also, the manual says that ncl requires zlib, but that comes as a
+    # mandatory dependency of libpng, which is a mandatory dependency of cairo.
+
+    # In Spack, we do not have an option to compile netcdf without netcdf-4
+    # support, so we will tell the ncl configuration script that we want
+    # support for netcdf-4, but the script assumes that hdf5 is compiled with
+    # szip support. We introduce this restriction with the following dependency
+    # statement.
+    depends_on('hdf5@:1.8+szip')
+
+    # In Spack, we also do not have an option to compile netcdf without DAP
+    # support, so we will tell the ncl configuration script that we have it.
+
+    # Some of the optional dependencies according to the manual:
+    depends_on('hdf', when='+hdf4')
+    depends_on('gdal', when='+gdal')
+    depends_on('udunits2', when='+udunits2')
+
+    # We need src files of triangle to appear in ncl's src tree if we want
+    # triangle's features.
+    resource(
+        name='triangle',
+        url='http://www.netlib.org/voronoi/triangle.zip',
+        md5='10aff8d7950f5e0e2fb6dd2e340be2c9',
+        placement='triangle_src',
+        when='+triangle')
+
+    def install(self, spec, prefix):
+        """Configure NCL non-interactively, then run ``make Everything``."""
+        if (self.compiler.fc is None) or (self.compiler.cc is None):
+            raise InstallError('NCL package requires both '
+                               'C and Fortran compilers.')
+
+        self.prepare_site_config()
+        self.prepare_install_config()
+        self.prepare_src_tree()
+        make('Everything', parallel=False)
+
+    def setup_environment(self, spack_env, run_env):
+        run_env.set('NCARG_ROOT', self.spec.prefix)
+
+    def prepare_site_config(self):
+        """Write ./config/Spack with the compiler and flag definitions."""
+        # Extra Fortran, C and C-to-Fortran link flags, filled in below.
+        fc_flags, cc_flags, c2f_flags = [], [], []
+
+        if '+openmp' in self.spec:
+            fc_flags.append(self.compiler.openmp_flag)
+            cc_flags.append(self.compiler.openmp_flag)
+
+        if self.compiler.name == 'gcc':
+            fc_flags.append('-fno-range-check')
+            c2f_flags.extend(['-lgfortran'])
+        elif self.compiler.name == 'intel':
+            fc_flags.append('-fp-model precise')
+            cc_flags.append('-fp-model precise')
+            c2f_flags.extend(['-lifcore', '-lifport'])
+
+        with open('./config/Spack', 'w') as f:
+            f.writelines([
+                '#define HdfDefines\n',
+                '#define CppCommand \'/usr/bin/env cpp -traditional\'\n',
+                '#define CCompiler cc\n',
+                '#define FCompiler fc\n',
+                ('#define CtoFLibraries ' + ' '.join(c2f_flags) + '\n'
+                 if len(c2f_flags) > 0
+                 else ''),
+                ('#define CtoFLibrariesUser ' + ' '.join(c2f_flags) + '\n'
+                 if len(c2f_flags) > 0
+                 else ''),
+                ('#define CcOptions ' + ' '.join(cc_flags) + '\n'
+                 if len(cc_flags) > 0
+                 else ''),
+                ('#define FcOptions ' + ' '.join(fc_flags) + '\n'
+                 if len(fc_flags) > 0
+                 else ''),
+                '#define BuildShared NO'
+            ])
+
+    def prepare_install_config(self):
+        """Run NCL's interactive ./Configure script with canned answers."""
+        # Remove the results of the previous configuration attempts.
+        self.delete_files('./Makefile', './config/Site.local')
+
+        # Answers for the interactive configuration script, in prompt order.
+        config_answers = [
+            # Enter Return to continue
+            '\n',
+            # Build NCL?
+            'y\n',
+            # Parent installation directory :
+            '\'' + self.spec.prefix + '\'\n',
+            # System temp space directory   :
+            '\'' + tempfile.mkdtemp(prefix='ncl_ncar_') + '\'\n',
+            # Build NetCDF4 feature support (optional)?
+            'y\n'
+        ]
+
+        if '+hdf4' in self.spec:
+            config_answers.extend([
+                # Build HDF4 support (optional) into NCL?
+                'y\n',
+                # Also build HDF4 support (optional) into raster library?
+                'y\n',
+                # Did you build HDF4 with szip support?
+                'y\n' if self.spec.satisfies('^hdf+szip') else 'n\n'
+            ])
+        else:
+            config_answers.extend([
+                # Build HDF4 support (optional) into NCL?
+                'n\n',
+                # Also build HDF4 support (optional) into raster library?
+                'n\n'
+            ])
+
+        config_answers.extend([
+            # Build Triangle support (optional) into NCL
+            'y\n' if '+triangle' in self.spec else 'n\n',
+            # If you are using NetCDF V4.x, did you enable NetCDF-4 support?
+            'y\n',
+            # Did you build NetCDF with OPeNDAP support?
+            'y\n',
+            # Build GDAL support (optional) into NCL?
+            'y\n' if '+gdal' in self.spec else 'n\n',
+            # Build Udunits-2 support (optional) into NCL?
+            'y\n' if '+udunits2' in self.spec else 'n\n',
+            # Build Vis5d+ support (optional) into NCL?
+            'n\n',
+            # Build HDF-EOS2 support (optional) into NCL?
+            'n\n',
+            # Build HDF5 support (optional) into NCL?
+            'y\n',
+            # Build HDF-EOS5 support (optional) into NCL?
+            'n\n',
+            # Build GRIB2 support (optional) into NCL?
+            'n\n',
+            # Enter local library search path(s) :
+            # The paths will be passed by the Spack wrapper.
+            ' \n',
+            # Enter local include search path(s) :
+            # All other paths will be passed by the Spack wrapper.
+            '\'' + join_path(self.spec['freetype'].prefix.include,
+                             'freetype2') + '\'\n',
+            # Go back and make more changes or review?
+            'n\n',
+            # Save current configuration?
+            'y\n'
+        ])
+
+        config_answers_filename = 'spack-config.in'
+        config_script = Executable('./Configure')
+
+        with open(config_answers_filename, 'w') as f:
+            f.writelines(config_answers)
+
+        with open(config_answers_filename, 'r') as f:
+            config_script(input=f)
+
+    def prepare_src_tree(self):
+        """Copy the Triangle sources into ni/src/lib/hlu when +triangle."""
+        if '+triangle' in self.spec:
+            root = self.stage.source_path
+            dst = join_path(root, 'ni', 'src', 'lib', 'hlu')
+            for name in ('triangle.h', 'triangle.c'):
+                shutil.copy(join_path(root, 'triangle_src', name), dst)
+
+    @staticmethod
+    def delete_files(*filenames):
+        for filename in filenames:
+            if os.path.exists(filename):  # nothing to do for absent files
+                try:
+                    os.remove(filename)
+                except OSError as e:  # 'as' is valid on Python 2.6+ and 3
+                    raise InstallError('Failed to delete file %s: %s' % (
+                        e.filename, e.strerror))
diff --git a/var/spack/repos/builtin/packages/ncl/spack_ncl.patch b/var/spack/repos/builtin/packages/ncl/spack_ncl.patch
new file mode 100644
index 0000000000..ebbecc43ba
--- /dev/null
+++ b/var/spack/repos/builtin/packages/ncl/spack_ncl.patch
@@ -0,0 +1,30 @@
+--- a/config/ymake	2015-03-16 22:21:42.000000000 +0100
++++ b/config/ymake	2016-10-14 13:44:49.530646098 +0200
+@@ -537,0 +538,3 @@
++# We want to have our own definitions for spack
++set sysincs = Spack
++
+--- a/Configure	2015-03-16 22:22:17.000000000 +0100
++++ b/Configure	2016-10-14 13:49:42.157631106 +0200
+@@ -1137,5 +1137,13 @@
+-    if (! -d $incs[1]) then
+-      echo "  *** Warning: <$incs[1]> does not exist"
+-      echo ""
+-      goto proc_locincdir
+-    else 
++
++    # We don't want our path(s) to be preprocessed by cpp
++    # inside ymake script. That is why we pass them in quotes (')
++    # to this script. But if we do so, the following condition
++    # is always false. That is why we comment it out and promise
++    # to pass only correct path(s). You might want to do the same
++    # thing for the libraries search path(s).
++
++    # if (! -d $incs[1]) then
++    #   echo "  *** Warning: <$incs[1]> does not exist"
++    #   echo ""
++    #   goto proc_locincdir
++    # else 
+@@ -1143 +1151 @@
+-    endif
++    # endif
diff --git a/var/spack/repos/builtin/packages/nmap/package.py b/var/spack/repos/builtin/packages/nmap/package.py
new file mode 100644
index 0000000000..f4576cde53
--- /dev/null
+++ b/var/spack/repos/builtin/packages/nmap/package.py
@@ -0,0 +1,37 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+from spack import *
+
+
+class Nmap(AutotoolsPackage):
+    """Nmap ("Network Mapper") is a free and open source (license)
+       utility for network discovery and security auditing.
+       It also provides ncat, an updated nc."""
+
+    homepage = "https://nmap.org"
+    url      = "https://nmap.org/dist/nmap-7.31.tar.bz2"
+
+    version('7.31', 'f2f6660142a777862342a58cc54258ea')
+    version('7.30', '8d86797d5c9e56de571f9630c0e6b5f8')
diff --git a/var/spack/repos/builtin/packages/opencoarrays/package.py b/var/spack/repos/builtin/packages/opencoarrays/package.py
index d9760e2afc..eb76960024 100644
--- a/var/spack/repos/builtin/packages/opencoarrays/package.py
+++ b/var/spack/repos/builtin/packages/opencoarrays/package.py
@@ -34,8 +34,9 @@ class Opencoarrays(CMakePackage):
     """
 
     homepage = "http://www.opencoarrays.org/"
-    url      = "https://github.com/sourceryinstitute/opencoarrays/releases/download/1.6.2/OpenCoarrays-1.6.2.tar.gz"
+    url      = "https://github.com/sourceryinstitute/opencoarrays/releases/download/1.7.4/OpenCoarrays-1.7.4.tar.gz"
 
+    version('1.7.4', '85ba87def461e3ff5a164de2e6482930')
     version('1.6.2', '5a4da993794f3e04ea7855a6678981ba')
 
     depends_on('cmake', type='build')
@@ -43,11 +44,8 @@ class Opencoarrays(CMakePackage):
 
     provides('coarrays')
 
-    def install(self, spec, prefix):
-        with working_dir('spack-build', create=True):
-            args = std_cmake_args
-            args.append("-DCMAKE_C_COMPILER=%s" % self.spec['mpi'].mpicc)
-            args.append("-DCMAKE_Fortran_COMPILER=%s" % self.spec['mpi'].mpifc)
-            cmake('..', *args)
-            make()
-            make("install")
+    def cmake_args(self):
+        # Point CMake at the C and Fortran compilers of the 'mpi' dependency.
+        mpi = self.spec['mpi']
+        return ["-DCMAKE_C_COMPILER=%s" % mpi.mpicc,
+                "-DCMAKE_Fortran_COMPILER=%s" % mpi.mpifc]
diff --git a/var/spack/repos/builtin/packages/pfft/package.py b/var/spack/repos/builtin/packages/pfft/package.py
new file mode 100644
index 0000000000..575f0af3c5
--- /dev/null
+++ b/var/spack/repos/builtin/packages/pfft/package.py
@@ -0,0 +1,64 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+from spack import *
+
+
+class Pfft(Package):
+    """PFFT is a software library for computing massively parallel,
+       fast Fourier transformations on distributed memory architectures.
+       PFFT can be understood as a generalization of FFTW-MPI to
+       multidimensional data decomposition."""
+
+    homepage = "https://www-user.tu-chemnitz.de/~potts/workgroup/pippig/software.php.en"
+    url      = "https://www-user.tu-chemnitz.de/~potts/workgroup/pippig/software/pfft-1.0.8-alpha.tar.gz"
+
+    version('1.0.8-alpha', '46457fbe8e38d02ff87d439b63dc0709')
+
+    depends_on('fftw+mpi+pfft_patches')
+    depends_on('mpi')
+
+    def install(self, spec, prefix):
+        """Configure, build and install PFFT once per enabled precision."""
+        # Derives from plain Package on purpose: this method runs the whole
+        # build itself, so AutotoolsPackage's configure/build phases would
+        # only add a redundant first pass that ignores the options below.
+        options = ['--prefix={0}'.format(prefix)]
+        if not self.compiler.f77 or not self.compiler.fc:
+            options.append("--disable-fortran")
+
+        # The default-precision build always runs; extra precisions follow
+        # the variants enabled on the fftw dependency.
+        precisions = [[]]
+        if '+float' in spec['fftw']:
+            precisions.append(['--enable-float'])
+        if '+long_double' in spec['fftw']:
+            precisions.append(['--enable-long-double'])
+
+        for precision in precisions:
+            configure(*(precision + options))
+            make()
+            if self.run_tests:
+                make("check")
+            make("install")
diff --git a/var/spack/repos/builtin/packages/py-netcdf/package.py b/var/spack/repos/builtin/packages/py-netcdf/package.py
index 497f81f86d..d238855d1e 100644
--- a/var/spack/repos/builtin/packages/py-netcdf/package.py
+++ b/var/spack/repos/builtin/packages/py-netcdf/package.py
@@ -35,6 +35,7 @@ class PyNetcdf(Package):
     extends('python')
     depends_on('py-numpy', type=nolink)
     depends_on('py-cython', type=nolink)
+    depends_on('py-setuptools', type=nolink)
     depends_on('netcdf')
 
     def install(self, spec, prefix):
diff --git a/var/spack/repos/builtin/packages/py-pygobject/package.py b/var/spack/repos/builtin/packages/py-pygobject/package.py
index 3af849e758..675eb8f004 100644
--- a/var/spack/repos/builtin/packages/py-pygobject/package.py
+++ b/var/spack/repos/builtin/packages/py-pygobject/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class PyPygobject(Package):
+class PyPygobject(AutotoolsPackage):
     """bindings for the GLib, and GObject,
        to be used in Python."""
 
@@ -43,6 +43,4 @@ class PyPygobject(Package):
     patch('pygobject-2.28.6-introspection-1.patch')
 
     def install(self, spec, prefix):
-        configure("--prefix=%s" % prefix)
-        make()
-        make("install", parallel=False)
+        make('install', parallel=False)
diff --git a/var/spack/repos/builtin/packages/py-pygtk/package.py b/var/spack/repos/builtin/packages/py-pygtk/package.py
index ab0a139f02..56e0b39fd5 100644
--- a/var/spack/repos/builtin/packages/py-pygtk/package.py
+++ b/var/spack/repos/builtin/packages/py-pygtk/package.py
@@ -25,7 +25,7 @@
 from spack import *
 
 
-class PyPygtk(Package):
+class PyPygtk(AutotoolsPackage):
     """bindings for the Gtk in Python"""
     homepage = "http://www.pygtk.org/"
     url      = "http://ftp.gnome.org/pub/GNOME/sources/pygtk/2.24/pygtk-2.24.0.tar.gz"
@@ -41,6 +41,4 @@ class PyPygtk(Package):
     depends_on('py-py2cairo')
 
     def install(self, spec, prefix):
-        configure("--prefix=%s" % prefix)
-        make()
-        make("install", parallel=False)
+        make('install', parallel=False)
diff --git a/var/spack/repos/builtin/packages/star-ccm-plus/package.py b/var/spack/repos/builtin/packages/star-ccm-plus/package.py
new file mode 100644
index 0000000000..ba1516b62a
--- /dev/null
+++ b/var/spack/repos/builtin/packages/star-ccm-plus/package.py
@@ -0,0 +1,78 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+from spack import *
+import glob
+import os
+
+
+class StarCcmPlus(Package):
+    """STAR-CCM+ (Computational Continuum Mechanics) CFD solver."""
+
+    homepage = "http://mdx.plm.automation.siemens.com/star-ccm-plus"
+
+    version('11.06.010_02', 'd349c6ac8293d8e6e7a53533d695588f')
+
+    variant('docs', default=False, description='Install the documentation')
+
+    # Licensing
+    license_required = True
+    license_vars = ['CDLMD_LICENSE_FILE', 'LM_LICENSE_FILE']
+
+    def url_for_version(self, version):
+        """Return a file:// URL for the tarball in the current directory."""
+        template = "file://{0}/STAR-CCM+{1}_linux-x86_64.tar.gz"
+        return template.format(os.getcwd(), version)
+
+    def install(self, spec, prefix):
+        """Run the first ``*.bin`` file found as a silent installer."""
+        # Use a trivial shell prompt: the LaunchAnywhere installer cannot
+        # handle long prompts or prompts containing special characters and
+        # backslashes, and otherwise fails with the following message:
+        #
+        # An internal LaunchAnywhere application error has occured and this
+        # application cannot proceed. (LAX)
+        #
+        # Stack Trace:
+        #     java.lang.IllegalArgumentException: Malformed \uxxxx encoding.
+        #     at java.util.Properties.loadConvert(Unknown Source)
+        #     at java.util.Properties.load0(Unknown Source)
+        #     at java.util.Properties.load(Unknown Source)
+        #     at com.zerog.common.java.util.PropertiesUtil.loadProperties(
+        #         Unknown Source)
+        #     at com.zerog.lax.LAX.<init>(Unknown Source)
+        #     at com.zerog.lax.LAX.main(Unknown Source)
+        #
+        # https://www.maplesoft.com/support/faqs/detail.aspx?sid=35272
+        env['PS1'] = '>'
+        env['PROMPT_COMMAND'] = ''
+        run_installer = Executable(glob.glob('*.bin')[0])
+        nodoc = 'false' if '+docs' in spec else 'true'
+        run_installer(
+            '-i', 'silent',
+            '-DINSTALLDIR={0}'.format(prefix),
+            '-DINSTALLFLEX=false',
+            '-DADDSYSTEMPATH=false',
+            '-DNODOC={0}'.format(nodoc)
+        )
diff --git a/var/spack/repos/builtin/packages/tau/package.py b/var/spack/repos/builtin/packages/tau/package.py
index 1801b41c37..d6b0a98d67 100644
--- a/var/spack/repos/builtin/packages/tau/package.py
+++ b/var/spack/repos/builtin/packages/tau/package.py
@@ -24,7 +24,7 @@
 ##############################################################################
 from spack import *
 import os
-import os.path
+import glob
 from llnl.util.filesystem import join_path
 
 
@@ -146,3 +146,13 @@ class Tau(Package):
                 dest = join_path(self.prefix, d)
                 if os.path.isdir(src) and not os.path.exists(dest):
                     os.symlink(join_path(subdir, d), dest)
+
+    def setup_environment(self, spack_env, run_env):
+        """Export ``TAU_MAKEFILE`` in the run-time environment."""
+        # This hook also runs while the build environment is being set up,
+        # before anything is installed; only set the variable once a
+        # Makefile.* exists in the prefix instead of failing on files[0].
+        pattern = join_path(self.prefix.lib, 'Makefile.*')
+        files = glob.glob(pattern)
+        if files:
+            run_env.set('TAU_MAKEFILE', files[0])
diff --git a/var/spack/repos/builtin/packages/texlive/package.py b/var/spack/repos/builtin/packages/texlive/package.py
index c9c677e2b1..36b3fad2f6 100644
--- a/var/spack/repos/builtin/packages/texlive/package.py
+++ b/var/spack/repos/builtin/packages/texlive/package.py
@@ -32,8 +32,10 @@ class Texlive(Package):
 
     homepage = "http://www.tug.org/texlive"
 
-    version('live', '8402774984c67fed4a18b7f6491243a6',
-            url="http://mirror.ctan.org/systems/texlive/tlnet/install-tl-unx.tar.gz")
+    # pull from specific site because the texlive mirrors do not all
+    # update in synchrony.
+    version('live', '6d171d370f3a2f2b936b9b0c87e8d0fe',
+            url="http://ctan.math.utah.edu/ctan/tex-archive/systems/texlive/tlnet/install-tl-unx.tar.gz")
 
     # There does not seem to be a complete list of schemes.
     # Examples include:
diff --git a/var/spack/repos/builtin/packages/trilinos/package.py b/var/spack/repos/builtin/packages/trilinos/package.py
index 4eb50ba64d..07393f9e9b 100644
--- a/var/spack/repos/builtin/packages/trilinos/package.py
+++ b/var/spack/repos/builtin/packages/trilinos/package.py
@@ -36,7 +36,7 @@ import sys
 # https://github.com/trilinos/Trilinos/issues/175
 
 
-class Trilinos(Package):
+class Trilinos(CMakePackage):
     """The Trilinos Project is an effort to develop algorithms and enabling
     technologies within an object-oriented software framework for the solution
     of large-scale, complex multi-physics engineering and scientific problems.
@@ -124,12 +124,12 @@ class Trilinos(Package):
             raise RuntimeError('The superlu-dist variant can only be used' +
                                ' with Trilinos @12.0.1:')
 
-    def install(self, spec, prefix):
+    def cmake_args(self):
+        spec = self.spec
         self.variants_check()
 
         cxx_flags = []
         options = []
-        options.extend(std_cmake_args)
 
         mpi_bin = spec['mpi'].prefix.bin
         # Note: -DXYZ_LIBRARY_NAMES= needs semicolon separated list of names
@@ -157,7 +157,8 @@ class Trilinos(Package):
             '-DTrilinos_ENABLE_CXX11:BOOL=ON',
             '-DTPL_ENABLE_Netcdf:BOOL=ON',
             '-DTPL_ENABLE_HYPRE:BOOL=%s' % (
-                'ON' if '+hypre' in spec else 'OFF')
+                'ON' if '+hypre' in spec else 'OFF'),
+            '-DCMAKE_INSTALL_NAME_DIR:PATH=%s/lib' % self.prefix
         ])
 
         if spec.satisfies('%intel') and spec.satisfies('@12.6.2'):
@@ -206,11 +207,6 @@ class Trilinos(Package):
                 '-DTrilinos_ENABLE_Fortran=ON'
             ])
 
-        # for build-debug only:
-        # options.extend([
-        #    '-DCMAKE_VERBOSE_MAKEFILE:BOOL=TRUE'
-        # ])
-
         # suite-sparse related
         if '+suite-sparse' in spec:
             options.extend([
@@ -330,27 +326,20 @@ class Trilinos(Package):
             options.extend([
                 '-DTrilinos_ENABLE_FEI=OFF'
             ])
+        return options
 
-        with working_dir('spack-build', create=True):
-            cmake('..', *options)
-            make()
-            make('install')
-
-            # When trilinos is built with Python, libpytrilinos is included
-            # through cmake configure files. Namely, Trilinos_LIBRARIES in
-            # TrilinosConfig.cmake contains pytrilinos. This leads to a
-            # run-time error: Symbol not found: _PyBool_Type and prevents
-            # Trilinos to be used in any C++ code, which links executable
-            # against the libraries listed in Trilinos_LIBRARIES.  See
-            # https://github.com/Homebrew/homebrew-science/issues/2148#issuecomment-103614509
-            # A workaround it to remove PyTrilinos from the COMPONENTS_LIST :
-            if '+python' in self.spec:
-                filter_file(r'(SET\(COMPONENTS_LIST.*)(PyTrilinos;)(.*)',
-                            (r'\1\3'),
-                            '%s/cmake/Trilinos/TrilinosConfig.cmake' %
-                            prefix.lib)
-
-            # The shared libraries are not installed correctly on Darwin;
-            # correct this
-            if (sys.platform == 'darwin') and ('+shared' in spec):
-                fix_darwin_install_name(prefix.lib)
+    @CMakePackage.sanity_check('install')
+    def filter_python(self):
+        """Drop PyTrilinos from COMPONENTS_LIST in TrilinosConfig.cmake."""
+        # Only builds with the +python variant are affected.
+        if '+python' not in self.spec:
+            return
+
+        # Trilinos_LIBRARIES in TrilinosConfig.cmake otherwise contains
+        # pytrilinos, and executables linked against that list fail at run
+        # time with "Symbol not found: _PyBool_Type".  See
+        # https://github.com/Homebrew/homebrew-science/issues/2148#issuecomment-103614509
+        config = '%s/cmake/Trilinos/TrilinosConfig.cmake' % self.prefix.lib
+        filter_file(r'(SET\(COMPONENTS_LIST.*)(PyTrilinos;)(.*)',
+                    r'\1\3',
+                    config)