From 8ca7c7700895032d7ec9fe728146e3ac0dbd0a64 Mon Sep 17 00:00:00 2001
From: Brian Van Essen <vanessen1@llnl.gov>
Date: Mon, 7 Aug 2017 11:41:13 -0700
Subject: Elemental cublas (#4889)

* Added a package for the MDAnalysis toolkit.

* Added a patch that allows Elemental to use cuBLAS internally.

* Added support for LBANN to use the new cuBLAS extension in Elemental.

* Added a proper variant for when LBANN does not want to use cuBLAS in
elemental.

* Added a package for the cnpy project and used it in the lbann package.

* Removed unnecessary comment lines.

* Removed blank lines

* Removed debug variant

* Add support for libjpeg-turbo

* Added additional variants for OpenCV features. Fixed bug when linking
in TIFF support, where libtiff used the regular JPEG library and
OpenCV used libjpeg-turbo.  Now libtiff can use libjpeg-turbo.

* Removed the variant for getting Elemental to use the cublas variant.
Updated the requirements for OpenCV to add new options.

* Fixed a flake8 error in OpenCV and added a path to find cnpy in lbann.

* Fixed line too long flake8 error.

* Added a flag to specify the datatype size in lbann and fixed a flake8 error.

* Added a debug build variant using the new build_type

* Fixed flake8

* Fixed how the debug build is pushed to Elemental

* Fixed a bug in the Elemental package where the blas search flags were
being overridden by the blas link flags.  Changed how the sequential
initialization variant is implemented in LBANN.

* Added support via a variant to explicitly use mkl or openblas.  This
helps work around variant forwarding problems.

* Updated package files to address pull request comments.
---
 var/spack/repos/builtin/packages/cnpy/package.py   |  34 ++
 .../packages/elemental/elemental_cublas.patch      | 668 +++++++++++++++++++++
 .../repos/builtin/packages/elemental/package.py    |  20 +-
 var/spack/repos/builtin/packages/lbann/package.py  |  26 +-
 .../repos/builtin/packages/libtiff/package.py      |   5 +-
 var/spack/repos/builtin/packages/opencv/package.py |  93 ++-
 6 files changed, 804 insertions(+), 42 deletions(-)
 create mode 100644 var/spack/repos/builtin/packages/cnpy/package.py
 create mode 100644 var/spack/repos/builtin/packages/elemental/elemental_cublas.patch

diff --git a/var/spack/repos/builtin/packages/cnpy/package.py b/var/spack/repos/builtin/packages/cnpy/package.py
new file mode 100644
index 0000000000..b62df10c2e
--- /dev/null
+++ b/var/spack/repos/builtin/packages/cnpy/package.py
@@ -0,0 +1,34 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the NOTICE and LICENSE files for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+from spack import *
+
+
+class Cnpy(CMakePackage):
+    """cnpy: library to read/write .npy and .npz files in C/C++."""
+
+    homepage = "https://github.com/rogersce/cnpy"
+    url      = "https://github.com/rogersce/cnpy"
+
+    version('master', git='https://github.com/rogersce/cnpy.git', branch="master")
diff --git a/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch
new file mode 100644
index 0000000000..9cf9b6e6b5
--- /dev/null
+++ b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch
@@ -0,0 +1,668 @@
+diff -Naur a/include/El/blas_like/level3.hpp b/include/El/blas_like/level3.hpp
+--- a/include/El/blas_like/level3.hpp	2017-06-08 07:30:43.180249917 -0700
++++ b/include/El/blas_like/level3.hpp	2017-06-08 07:35:27.325434602 -0700
+@@ -31,6 +31,10 @@
+ }
+ using namespace GemmAlgorithmNS;
+ 
++void GemmUseGPU(int min_M, int min_N, int min_K);
++
++void GemmUseCPU();
++
+ template<typename T>
+ void Gemm
+ ( Orientation orientA, Orientation orientB,
+diff -Naur a/include/El/core/imports/blas.hpp b/include/El/core/imports/blas.hpp
+--- a/include/El/core/imports/blas.hpp	2017-06-08 07:30:43.522016908 -0700
++++ b/include/El/core/imports/blas.hpp	2017-06-08 07:35:06.834030908 -0700
+@@ -916,4 +916,63 @@
+ } // namespace blas
+ } // namespace El
+ 
++
++#if defined(EL_USE_CUBLAS)
++
++namespace El {
++
++#ifdef EL_USE_64BIT_BLAS_INTS
++typedef long long int BlasInt;
++#else
++typedef int BlasInt;
++#endif
++
++namespace cublas {
++
++// NOTE: templated routines are custom and not wrappers
++
++// Level 3 BLAS
++// ============
++template<typename T>
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const T& alpha,
++  const T* A, BlasInt ALDim, 
++  const T* B, BlasInt BLDim,
++  const T& beta,
++        T* C, BlasInt CLDim );
++
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const float& alpha,
++  const float* A, BlasInt ALDim, 
++  const float* B, BlasInt BLDim,
++  const float& beta,
++        float* C, BlasInt CLDim );
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const double& alpha,
++  const double* A, BlasInt ALDim, 
++  const double* B, BlasInt BLDim,
++  const double& beta,
++        double* C, BlasInt CLDim );
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const scomplex& alpha,
++  const scomplex* A, BlasInt ALDim, 
++  const scomplex* B, BlasInt BLDim,
++  const scomplex& beta,
++        scomplex* C, BlasInt CLDim );
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const dcomplex& alpha,
++  const dcomplex* A, BlasInt ALDim, 
++  const dcomplex* B, BlasInt BLDim,
++  const dcomplex& beta,
++        dcomplex* C, BlasInt CLDim );
++
++} // namespace cublas
++} // namespace El
++#endif
++
+ #endif // ifndef EL_IMPORTS_BLAS_DECL_HPP
+diff -Naur a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp
+--- a/src/blas_like/level3/Gemm.cpp	2017-06-08 07:30:44.307096427 -0700
++++ b/src/blas_like/level3/Gemm.cpp	2017-06-08 07:34:23.062863489 -0700
+@@ -16,6 +16,20 @@
+ 
+ namespace El {
+ 
++char gemm_cpu_gpu_switch = 'c';
++int min_M = 0, min_N = 0, min_K = 0;
++
++void GemmUseGPU(int _min_M, int _min_N, int _min_K) {
++   gemm_cpu_gpu_switch = 'g';
++   min_M = _min_M;
++   min_N = _min_N;
++   min_K = _min_K;
++}
++
++void GemmUseCPU() {
++   gemm_cpu_gpu_switch = 'c';
++}
++
+ template<typename T>
+ void Gemm
+ ( Orientation orientA, Orientation orientB,
+@@ -59,11 +73,30 @@
+     const Int k = ( orientA == NORMAL ? A.Width() : A.Height() );
+     if( k != 0 )
+     {
++#if defined(EL_USE_CUBLAS)
++        if (gemm_cpu_gpu_switch == 'g' && 
++            m >= min_M &&
++            n >= min_N &&
++            k >= min_K) {
++          cublas::Gemm
++          ( transA, transB, m, n, k,
++            alpha, A.LockedBuffer(), A.LDim(),
++                   B.LockedBuffer(), B.LDim(),
++            beta,  C.Buffer(),       C.LDim() );
++        } else {
++          blas::Gemm
++          ( transA, transB, m, n, k,
++            alpha, A.LockedBuffer(), A.LDim(),
++                   B.LockedBuffer(), B.LDim(),
++            beta,  C.Buffer(),       C.LDim() );
++        }
++#else
+         blas::Gemm
+         ( transA, transB, m, n, k,
+           alpha, A.LockedBuffer(), A.LDim(),
+                  B.LockedBuffer(), B.LDim(),
+           beta,  C.Buffer(),       C.LDim() );
++#endif
+     }
+     else
+     {
+diff -Naur a/src/core/imports/blas/Gemm.hpp b/src/core/imports/blas/Gemm.hpp
+--- a/src/core/imports/blas/Gemm.hpp	2017-06-08 07:30:45.090529967 -0700
++++ b/src/core/imports/blas/Gemm.hpp	2017-06-08 07:34:46.503009958 -0700
+@@ -41,6 +41,12 @@
+ 
+ } // extern "C"
+ 
++
++#if defined(EL_USE_CUBLAS)
++#include <cublas.h>
++#include <cub/util_allocator.cuh>
++#endif
++
+ namespace El {
+ namespace blas {
+ 
+@@ -515,3 +521,515 @@
+ 
+ } // namespace blas
+ } // namespace El
++
++
++#if defined(EL_USE_CUBLAS)
++
++#define USE_CUB 1
++
++namespace El {
++namespace cublas {
++
++#if USE_CUB
++cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
++#endif
++
++template<typename T>
++void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k,
++  const T& alpha,
++  const T* A, BlasInt ALDim,
++  const T* B, BlasInt BLDim,
++  const T& beta,
++        T* C, BlasInt CLDim )
++{
++   // put something here
++    printf("integer version \n");
++}
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const Int& alpha,
++  const Int* A, BlasInt ALDim,
++  const Int* B, BlasInt BLDim,
++  const Int& beta,
++        Int* C, BlasInt CLDim );
++#ifdef EL_HAVE_QD
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const DoubleDouble& alpha,
++  const DoubleDouble* A, BlasInt ALDim,
++  const DoubleDouble* B, BlasInt BLDim,
++  const DoubleDouble& beta,
++        DoubleDouble* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const QuadDouble& alpha,
++  const QuadDouble* A, BlasInt ALDim,
++  const QuadDouble* B, BlasInt BLDim,
++  const QuadDouble& beta,
++        QuadDouble* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const Complex<DoubleDouble>& alpha,
++  const Complex<DoubleDouble>* A, BlasInt ALDim,
++  const Complex<DoubleDouble>* B, BlasInt BLDim,
++  const Complex<DoubleDouble>& beta,
++        Complex<DoubleDouble>* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const Complex<QuadDouble>& alpha,
++  const Complex<QuadDouble>* A, BlasInt ALDim,
++  const Complex<QuadDouble>* B, BlasInt BLDim,
++  const Complex<QuadDouble>& beta,
++        Complex<QuadDouble>* C, BlasInt CLDim );
++#endif
++#ifdef EL_HAVE_QUAD
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const Quad& alpha,
++  const Quad* A, BlasInt ALDim,
++  const Quad* B, BlasInt BLDim,
++  const Quad& beta,
++        Quad* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const Complex<Quad>& alpha,
++  const Complex<Quad>* A, BlasInt ALDim, 
++  const Complex<Quad>* B, BlasInt BLDim,
++  const Complex<Quad>& beta,
++        Complex<Quad>* C, BlasInt CLDim );
++#endif
++#ifdef EL_HAVE_MPC
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const BigInt& alpha,
++  const BigInt* A, BlasInt ALDim,
++  const BigInt* B, BlasInt BLDim,
++  const BigInt& beta,
++        BigInt* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const BigFloat& alpha,
++  const BigFloat* A, BlasInt ALDim,
++  const BigFloat* B, BlasInt BLDim,
++  const BigFloat& beta,
++        BigFloat* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const Complex<BigFloat>& alpha,
++  const Complex<BigFloat>* A, BlasInt ALDim,
++  const Complex<BigFloat>* B, BlasInt BLDim,
++  const Complex<BigFloat>& beta,
++        Complex<BigFloat>* C, BlasInt CLDim );
++#endif
++
++void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const float& alpha,
++  const float* A, BlasInt ALDim,
++  const float* B, BlasInt BLDim,
++  const float& beta,
++        float* C, BlasInt CLDim )
++{
++    EL_DEBUG_CSE
++    EL_DEBUG_ONLY(
++      if( std::toupper(transA) == 'N' )
++      {
++          if( ALDim < Max(m,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++      }
++      else
++      {
++          if( ALDim < Max(k,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++      }
++
++      if( std::toupper(transB) == 'N' )
++      {
++          if( BLDim < Max(k,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++      }
++      else
++      {
++          if( BLDim < Max(n,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++      }
++
++      if( CLDim < Max(m,1) )
++          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++    )
++    const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
++    const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
++ 
++    const mpi::Comm comm;
++    const Int commRank = mpi::Rank( comm );
++    if (commRank == 0) {
++       //printf("calling cublas Sgemm: m %d n %d k %d\n", m, n, k);
++    }
++
++    BlasInt rowA, colA, rowB, colB, rowC, colC;
++    // device memory size for A, B and C
++    BlasInt sizeA, sizeB, sizeC;
++    float *devA=NULL, *devB=NULL, *devC=NULL;
++    
++    rowA = fixedTransA == 'T' ? k : m;
++    colA = fixedTransA == 'T' ? m : k;
++    rowB = fixedTransB == 'T' ? n : k;
++    colB = fixedTransB == 'T' ? k : n;
++    rowC = m;
++    colC = n;
++    sizeA = rowA * colA;
++    sizeB = rowB * colB;
++    sizeC = rowC * colC;
++
++    cublasStatus stat;
++    
++#if USE_CUB
++    CubDebugExit(g_allocator.DeviceAllocate((void**)&devA, 
++                 sizeof(float) * (sizeA+sizeB+sizeC) ));
++#else
++    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(float), (void **) &devA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++#endif
++
++    devB = devA + sizeA;
++    devC = devB + sizeB;
++
++    // copy matrix A, B and C to device
++    stat = cublasSetMatrix(rowA, colA, sizeof(float), A, ALDim, devA, rowA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++    stat = cublasSetMatrix(rowB, colB, sizeof(float), B, BLDim, devB, rowB);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++    
++    if (beta != 0.0)
++    {
++       stat = cublasSetMatrix(rowC, colC, sizeof(float), C, CLDim, devC, rowC);
++       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++    }
++    
++    // cublas<t>gemm
++    cublasSgemm
++    ( fixedTransA, fixedTransB, m, n, k,
++      alpha, devA, rowA, devB, rowB, beta, devC, rowC );
++
++    // copy matrix C to host
++    stat = cublasGetMatrix(rowC, colC, sizeof(float), devC, rowC, C, CLDim);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++    // free
++#if USE_CUB
++    CubDebugExit(g_allocator.DeviceFree(devA));
++#else
++    cublasFree(devA);
++#endif
++    //printf("CUBLAS float done ...\n");
++}
++
++void Gemm
++( char transA, char transB,
++  BlasInt m, BlasInt n, BlasInt k, 
++  const double& alpha,
++  const double* A, BlasInt ALDim, 
++  const double* B, BlasInt BLDim,
++  const double& beta,
++        double* C, BlasInt CLDim )
++{
++    EL_DEBUG_CSE
++    EL_DEBUG_ONLY(
++      if( std::toupper(transA) == 'N' )
++      {
++          if( ALDim < Max(m,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++      }
++      else
++      {
++          if( ALDim < Max(k,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++      }      
++
++      if( std::toupper(transB) == 'N' )
++      {
++          if( BLDim < Max(k,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++      }
++      else
++      {
++          if( BLDim < Max(n,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++      }
++
++      if( CLDim < Max(m,1) )
++          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++    )
++    const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
++    const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
++
++    const mpi::Comm comm;
++    const Int commRank = mpi::Rank( comm );
++    if (commRank == 0) {
++       //printf("calling cublas Dgemm: m %d n %d k %d\n", m, n, k);
++    }
++
++    BlasInt rowA, colA, rowB, colB, rowC, colC;
++    // device memory size for A, B and C
++    BlasInt sizeA, sizeB, sizeC;
++    double *devA=NULL, *devB=NULL, *devC=NULL;
++    
++    rowA = fixedTransA == 'T' ? k : m;
++    colA = fixedTransA == 'T' ? m : k;
++    rowB = fixedTransB == 'T' ? n : k;
++    colB = fixedTransB == 'T' ? k : n;
++    rowC = m;
++    colC = n;
++    sizeA = rowA * colA;
++    sizeB = rowB * colB;
++    sizeC = rowC * colC;
++
++    cublasStatus stat;
++
++#if USE_CUB
++    CubDebugExit(g_allocator.DeviceAllocate((void**)&devA, 
++                 sizeof(double) * (sizeA+sizeB+sizeC) ));
++#else
++    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(double), (void **) &devA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++#endif
++
++    devB = devA + sizeA;
++    devC = devB + sizeB;
++
++    // copy matrix A, B and C to device
++    stat = cublasSetMatrix(rowA, colA, sizeof(double), A, ALDim, devA, rowA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++    stat = cublasSetMatrix(rowB, colB, sizeof(double), B, BLDim, devB, rowB);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++    
++    if (beta != 0.0)
++    {
++       stat = cublasSetMatrix(rowC, colC, sizeof(double), C, CLDim, devC, rowC);
++       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++    }
++
++    // cublas<t>gemm
++    cublasDgemm
++    ( fixedTransA, fixedTransB, m, n, k,
++      alpha, devA, rowA, devB, rowB, beta, devC, rowC );
++    
++    // copy matrix C to host
++    stat = cublasGetMatrix(rowC, colC, sizeof(double), devC, rowC, C, CLDim);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++    // free
++#if USE_CUB
++    CubDebugExit(g_allocator.DeviceFree(devA));
++#else
++    cublasFree(devA);
++#endif
++    //printf("CUBLAS double done ...\n");
++}
++
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const scomplex& alpha,
++  const scomplex* A, BlasInt ALDim,
++  const scomplex* B, BlasInt BLDim,
++  const scomplex& beta,
++        scomplex* C, BlasInt CLDim )
++{
++    EL_DEBUG_CSE
++    EL_DEBUG_ONLY(
++      if( std::toupper(transA) == 'N' )
++      {
++          if( ALDim < Max(m,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++      }
++      else
++      {
++          if( ALDim < Max(k,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++      }
++
++      if( std::toupper(transB) == 'N' )
++      {
++          if( BLDim < Max(k,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++      }
++      else
++      {
++          if( BLDim < Max(n,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++      }
++
++      if( CLDim < Max(m,1) )
++          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++    )
++
++    const char fixedTransA = transA;
++    const char fixedTransB = transB;
++
++    const mpi::Comm comm;
++    const Int commRank = mpi::Rank( comm );
++    if (commRank == 0) {
++       //printf("calling cublas Cgemm: m %d n %d k %d\n", m, n, k);
++    }
++
++    BlasInt rowA, colA, rowB, colB, rowC, colC;
++    // device memory size for A, B and C
++    BlasInt sizeA, sizeB, sizeC;
++    cuComplex *devA=NULL, *devB=NULL, *devC=NULL;
++    // Only 'N' leaves an operand untransposed; 'T' and 'C' both store A as k x m (B as n x k).
++    rowA = std::toupper(fixedTransA) == 'N' ? m : k;
++    colA = std::toupper(fixedTransA) == 'N' ? k : m;
++    rowB = std::toupper(fixedTransB) == 'N' ? k : n;
++    colB = std::toupper(fixedTransB) == 'N' ? n : k;
++    rowC = m;
++    colC = n;
++    sizeA = rowA * colA;
++    sizeB = rowB * colB;
++    sizeC = rowC * colC;
++
++    cublasStatus stat;
++    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuComplex), (void **) &devA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++
++    devB = devA + sizeA;
++    devC = devB + sizeB;
++
++    // copy matrix A, B and C to device
++    stat = cublasSetMatrix(rowA, colA, sizeof(cuComplex), A, ALDim, devA, rowA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++    stat = cublasSetMatrix(rowB, colB, sizeof(cuComplex), B, BLDim, devB, rowB);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++
++    if (beta.real() != 0.0 || beta.imag() != 0.0)
++    {
++       stat = cublasSetMatrix(rowC, colC, sizeof(cuComplex), C, CLDim, devC, rowC);
++       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++    }
++
++    // cublas<t>gemm
++    cublasCgemm
++    ( fixedTransA, fixedTransB, m, n, k,
++      *((cuComplex*) &alpha), devA, rowA, devB, rowB, *((cuComplex*) &beta), devC, rowC );
++
++    // copy matrix C to host
++    stat = cublasGetMatrix(rowC, colC, sizeof(cuComplex), devC, rowC, C, CLDim);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++    // free
++    cublasFree(devA);
++}
++
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++  const dcomplex& alpha,
++  const dcomplex* A, BlasInt ALDim,
++  const dcomplex* B, BlasInt BLDim,
++  const dcomplex& beta,
++        dcomplex* C, BlasInt CLDim )
++{
++    EL_DEBUG_CSE
++    EL_DEBUG_ONLY(
++      if( std::toupper(transA) == 'N' )
++      {
++          if( ALDim < Max(m,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++      }
++      else
++      {
++          if( ALDim < Max(k,1) )
++              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++      }
++
++      if( std::toupper(transB) == 'N' )
++      {
++          if( BLDim < Max(k,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++      }
++      else
++      {
++          if( BLDim < Max(n,1) )
++              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++      }
++
++      if( CLDim < Max(m,1) )
++          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++    )
++
++    const char fixedTransA = transA;
++    const char fixedTransB = transB;
++
++    const mpi::Comm comm;
++    const Int commRank = mpi::Rank( comm );
++    if (commRank == 0) {
++       //printf("calling cublas Zgemm: m %d n %d k %d\n", m, n, k);
++    }
++
++    BlasInt rowA, colA, rowB, colB, rowC, colC;
++    // device memory size for A, B and C
++    BlasInt sizeA, sizeB, sizeC;
++    cuDoubleComplex *devA=NULL, *devB=NULL, *devC=NULL;
++    // Only 'N' leaves an operand untransposed; 'T' and 'C' both store A as k x m (B as n x k).
++    rowA = std::toupper(fixedTransA) == 'N' ? m : k;
++    colA = std::toupper(fixedTransA) == 'N' ? k : m;
++    rowB = std::toupper(fixedTransB) == 'N' ? k : n;
++    colB = std::toupper(fixedTransB) == 'N' ? n : k;
++    rowC = m;
++    colC = n;
++    sizeA = rowA * colA;
++    sizeB = rowB * colB;
++    sizeC = rowC * colC;
++
++    cublasStatus stat;
++    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuDoubleComplex), (void **) &devA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++
++    devB = devA + sizeA;
++    devC = devB + sizeB;
++
++    // copy matrix A, B and C to device
++    stat = cublasSetMatrix(rowA, colA, sizeof(cuDoubleComplex), A, ALDim, devA, rowA);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++    stat = cublasSetMatrix(rowB, colB, sizeof(cuDoubleComplex), B, BLDim, devB, rowB);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++
++    if (beta.real() != 0.0 || beta.imag() != 0.0)
++    {
++       stat = cublasSetMatrix(rowC, colC, sizeof(cuDoubleComplex), C, CLDim, devC, rowC);
++       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++    }
++
++    cublasZgemm
++    ( fixedTransA, fixedTransB, m, n, k,
++      *((cuDoubleComplex*) &alpha), devA, rowA, devB, rowB, *((cuDoubleComplex*) &beta),
++      devC, rowC );
++
++    // copy matrix C to host
++    stat = cublasGetMatrix(rowC, colC, sizeof(cuDoubleComplex), devC, rowC, C, CLDim);
++    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++    // free
++    cublasFree(devA);
++}
++
++} // namespace cublas
++} // namespace El
++
++#endif
++
diff --git a/var/spack/repos/builtin/packages/elemental/package.py b/var/spack/repos/builtin/packages/elemental/package.py
index e118bcbd44..50fb4f9829 100644
--- a/var/spack/repos/builtin/packages/elemental/package.py
+++ b/var/spack/repos/builtin/packages/elemental/package.py
@@ -33,6 +33,7 @@ class Elemental(CMakePackage):
     homepage = "http://libelemental.org"
     url      = "https://github.com/elemental/Elemental/archive/v0.87.6.tar.gz"
 
+    version('master', git='https://github.com/elemental/Elemental.git', branch='master')
     version('0.87.7', '6c1e7442021c59a36049e37ea69b8075')
     version('0.87.6', '9fd29783d45b0a0e27c0df85f548abe9')
 
@@ -52,6 +53,8 @@ class Elemental(CMakePackage):
             description='Enable quad precision')
     variant('int64', default=False,
             description='Use 64bit integers')
+    variant('cublas', default=False,
+            description='Enable cuBLAS for local BLAS operations')
     # When this variant is set remove the normal dependencies since
     # Elemental has to build BLAS and ScaLAPACK internally
     variant('int64_blas', default=False,
@@ -62,15 +65,21 @@ class Elemental(CMakePackage):
     variant('build_type', default='Release',
             description='The build type to build',
             values=('Debug', 'Release'))
+    variant('blas', default='openblas', values=('openblas', 'mkl'),
+            description='Enable the use of OpenBlas/MKL')
 
-    # Note that this forces us to use OpenBLAS until #1712 is fixed
+    # Note that #1712 forces us to enumerate the different blas variants
     depends_on('blas', when='~openmp_blas ~int64_blas')
     # Hack to forward variant to openblas package
     # Allow Elemental to build internally when using 8-byte ints
-    depends_on('openblas +openmp', when='+openmp_blas ~int64_blas')
+    depends_on('openblas +openmp', when='blas=openblas +openmp_blas ~int64_blas')
+
+    depends_on('intel-mkl', when="blas=mkl ~openmp_blas ~int64_blas")
+    depends_on('intel-mkl +openmp', when='blas=mkl +openmp_blas ~int64_blas')
+    depends_on('intel-mkl@2017.1 +openmp +ilp64', when='blas=mkl +openmp_blas +int64_blas')
 
     # Note that this forces us to use OpenBLAS until #1712 is fixed
-    depends_on('lapack', when='~openmp_blas')
+    depends_on('lapack', when='blas=openblas ~openmp_blas')
     depends_on('metis')
     depends_on('metis +int64', when='+int64')
     depends_on('mpi')
@@ -79,6 +88,8 @@ class Elemental(CMakePackage):
     extends('python', when='+python')
     depends_on('python@:2.8', when='+python')
 
+    patch('elemental_cublas.patch', when='+cublas')
+
     @property
     def libs(self):
         shared = True if '+shared' in self.spec else False
@@ -126,8 +137,7 @@ class Elemental(CMakePackage):
                 math_libs = spec['scalapack'].libs + math_libs
 
             args.extend([
-                '-DMATH_LIBS:STRING={0}'.format(math_libs.search_flags),
-                '-DMATH_LIBS:STRING={0}'.format(math_libs.link_flags)])
+                '-DMATH_LIBS:STRING={0}'.format(math_libs.ld_flags)])
 
         if '+python' in spec:
             args.extend([
diff --git a/var/spack/repos/builtin/packages/lbann/package.py b/var/spack/repos/builtin/packages/lbann/package.py
index fea1924550..a93b9b5b66 100644
--- a/var/spack/repos/builtin/packages/lbann/package.py
+++ b/var/spack/repos/builtin/packages/lbann/package.py
@@ -39,37 +39,49 @@ class Lbann(CMakePackage):
     variant('gpu', default=False, description='Builds with support for GPUs via CUDA and cuDNN')
     variant('opencv', default=True, description='Builds with support for image processing routines with OpenCV')
     variant('seq_init', default=False, description='Force serial initialization of weight matrices.')
+    variant('dtype', default=4, description='Size (bytes) of floating point representation for weights')
+    variant('build_type', default='Release',
+            description='The build type to build',
+            values=('Debug', 'Release'))
 
     depends_on('elemental +openmp_blas +scalapack +shared +int64')
+    depends_on('elemental +openmp_blas +scalapack +shared +int64 build_type=Debug', 
+               when=('build_type=Debug'))
     depends_on('cuda', when='+gpu')
     depends_on('mpi')
-    depends_on('opencv@3.2.0', when='+opencv')
+    depends_on('opencv@3.2.0: +openmp +core +highgui +imgproc +jpeg +png +tiff +zlib', when='+opencv')
     depends_on('protobuf@3.0.2:')
+    depends_on('cnpy')
 
     def cmake_args(self):
         spec = self.spec
         # Environment variables
         CPPFLAGS = []
         CPPFLAGS.append('-DLBANN_SET_EL_RNG')
-        if '~seq_init' in spec:
-            CPPFLAGS.append('-DLBANN_PARALLEL_RANDOM_MATRICES')
+
+        CPPFLAGS.append('-DLBANN_DATATYPE={0}'.format(
+            int(spec.variants['dtype'].value)))
 
         args = [
             '-DCMAKE_INSTALL_MESSAGE=LAZY',
             '-DCMAKE_CXX_FLAGS=%s' % ' '.join(CPPFLAGS),
             '-DWITH_CUDA:BOOL=%s' % ('+gpu' in spec),
             '-DWITH_CUDNN:BOOL=%s' % ('+gpu' in spec),
+            '-DELEMENTAL_USE_CUBLAS:BOOL=%s' % (
+                '+cublas' in spec['elemental']),
             '-DWITH_TBINF=OFF',
             '-DWITH_VTUNE=OFF',
-            '-DElemental_DIR={0}'.format(self.spec['elemental'].prefix),
+            '-DElemental_DIR={0}'.format(spec['elemental'].prefix),
+            '-DCNPY_DIR={0}'.format(spec['cnpy'].prefix),
             '-DELEMENTAL_MATH_LIBS={0}'.format(
-                self.spec['elemental'].libs),
+                spec['elemental'].libs),
+            '-DSEQ_INIT:BOOL=%s' % ('+seq_init' in spec),
             '-DVERBOSE=0',
             '-DLBANN_HOME=.',
             '-DLBANN_VER=spack']
 
-        if '+opencv' in self.spec:
+        if '+opencv' in spec:
             args.extend(['-DOpenCV_DIR:STRING={0}'.format(
-                self.spec['opencv'].prefix)])
+                spec['opencv'].prefix)])
 
         return args
diff --git a/var/spack/repos/builtin/packages/libtiff/package.py b/var/spack/repos/builtin/packages/libtiff/package.py
index 2fcccad739..29db7b42d3 100644
--- a/var/spack/repos/builtin/packages/libtiff/package.py
+++ b/var/spack/repos/builtin/packages/libtiff/package.py
@@ -35,6 +35,9 @@ class Libtiff(AutotoolsPackage):
     version('4.0.6', 'd1d2e940dea0b5ad435f21f03d96dd72')
     version('4.0.3', '051c1068e6a0627f461948c365290410')
 
-    depends_on('jpeg')
+    variant('turbo', default=False, description='use libjpeg-turbo')
+
+    depends_on('jpeg', when='-turbo')
+    depends_on('libjpeg-turbo', when='+turbo')
     depends_on('zlib')
     depends_on('xz')
diff --git a/var/spack/repos/builtin/packages/opencv/package.py b/var/spack/repos/builtin/packages/opencv/package.py
index 33adadc15e..f2bda99a01 100644
--- a/var/spack/repos/builtin/packages/opencv/package.py
+++ b/var/spack/repos/builtin/packages/opencv/package.py
@@ -42,8 +42,15 @@ class Opencv(CMakePackage):
     homepage = 'http://opencv.org/'
     url = 'https://github.com/Itseez/opencv/archive/3.1.0.tar.gz'
 
-    version('3.2.0', 'a43b65488124ba33dde195fea9041b70')
-    version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3')
+    version('master', git="https://github.com/opencv/opencv.git", branch="master")
+    version('3.2.0',    'a43b65488124ba33dde195fea9041b70')
+    version('3.1.0',    '70e1dd07f0aa06606f1bc0e3fa15abd3')
+    version('2.4.13.2', 'fe52791ce523681a67036def4c25261b')
+    version('2.4.13.1', 'f6d354500d5013e60dc0fc44b07a63d1')
+    version('2.4.13',   '8feb45a71adad89b8017a777477c3eff')
+    version('2.4.12.3', '2496a4a4caf8fecfbfc294fbe6a814b0')
+    version('2.4.12.2', 'bc0c60c2ea1cf4078deef99569912fc7')
+    version('2.4.12.1', '7192f51434710904b5e3594872b897c3')
 
     variant('shared', default=True,
             description='Enables the build of shared libraries')
@@ -59,13 +66,21 @@ class Opencv(CMakePackage):
             description='Enables the build of Python extensions')
     variant('java', default=False,
             description='Activates support for Java')
+    variant('openmp', default=False, description='Activates support for OpenMP threads')
+    variant('core', default=False, description='Include opencv_core module into the OpenCV build')
+    variant('highgui', default=False, description='Include opencv_highgui module into the OpenCV build')
+    variant('imgproc', default=False, description='Include opencv_imgproc module into the OpenCV build')
+    variant('jpeg', default=False, description='Include JPEG support')
+    variant('png', default=False, description='Include PNG support')
+    variant('tiff', default=False, description='Include TIFF support')
+    variant('zlib', default=False, description='Build zlib from source')
 
     depends_on('eigen', when='+eigen', type='build')
 
-    depends_on('zlib')
-    depends_on('libpng')
-    depends_on('libjpeg-turbo')
-    depends_on('libtiff')
+    depends_on('zlib', when='+zlib')  # image/compression deps are opt-in
+    depends_on('libpng', when='+png')
+    depends_on('libjpeg-turbo', when='+jpeg')
+    depends_on('libtiff+turbo', when='+tiff')  # libtiff must use turbo too
 
     depends_on('jasper', when='+jasper')
     depends_on('cuda', when='+cuda')
@@ -94,6 +109,22 @@ class Opencv(CMakePackage):
                 'ON' if '+vtk' in spec else 'OFF')),
             '-DBUILD_opencv_java:BOOL={0}'.format((
                 'ON' if '+java' in spec else 'OFF')),
+            '-DBUILD_opencv_core:BOOL={0}'.format((  # per-module switches
+                'ON' if '+core' in spec else 'OFF')),
+            '-DBUILD_opencv_highgui:BOOL={0}'.format((
+                'ON' if '+highgui' in spec else 'OFF')),
+            '-DBUILD_opencv_imgproc:BOOL={0}'.format((
+                'ON' if '+imgproc' in spec else 'OFF')),
+            '-DWITH_JPEG:BOOL={0}'.format((  # codec/library switches
+                'ON' if '+jpeg' in spec else 'OFF')),
+            '-DWITH_PNG:BOOL={0}'.format((
+                'ON' if '+png' in spec else 'OFF')),
+            '-DWITH_TIFF:BOOL={0}'.format((
+                'ON' if '+tiff' in spec else 'OFF')),
+            '-DWITH_ZLIB:BOOL={0}'.format((  # TODO confirm OpenCV option
+                'ON' if '+zlib' in spec else 'OFF')),
+            '-DWITH_OPENMP:BOOL={0}'.format((
+                'ON' if '+openmp' in spec else 'OFF')),
         ]
 
         # Media I/O
@@ -115,31 +146,35 @@ class Opencv(CMakePackage):
             '-DPNG_INCLUDE_DIR:PATH={0}'.format(libpng.prefix.include)
         ])
 
-        libjpeg = spec['libjpeg-turbo']
-        args.extend([
-            '-DJPEG_LIBRARY:FILEPATH={0}'.format(
-                join_path(libjpeg.prefix.lib,
-                          'libjpeg.{0}'.format(dso_suffix))),
-            '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include)
-        ])
+        if '+jpeg' in spec:  # libjpeg-turbo is only a dep when +jpeg
+            libjpeg = spec['libjpeg-turbo']
+            args.extend([
+                '-DBUILD_JPEG:BOOL=OFF',
+                '-DJPEG_LIBRARY:FILEPATH={0}'.format(
+                    join_path(libjpeg.prefix.lib,
+                              'libjpeg.{0}'.format(dso_suffix))),
+                '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include)
+            ])
 
-        libtiff = spec['libtiff']
-        args.extend([
-            '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format((
-                'DEBUG' if '+debug' in spec else 'RELEASE'),
-                join_path(libtiff.prefix.lib,
-                          'libtiff.{0}'.format(dso_suffix))),
-            '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include)
-        ])
+        if '+tiff' in spec:  # libtiff is only a dependency when +tiff
+            libtiff = spec['libtiff']
+            args.extend([
+                '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format((
+                    'DEBUG' if '+debug' in spec else 'RELEASE'),
+                    join_path(libtiff.prefix.lib,
+                              'libtiff.{0}'.format(dso_suffix))),
+                '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include)
+            ])
 
-        jasper = spec['jasper']
-        args.extend([
-            '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format((
-                'DEBUG' if '+debug' in spec else 'RELEASE'),
-                join_path(jasper.prefix.lib,
-                          'libjasper.{0}'.format(dso_suffix))),
-            '-DJASPER_INCLUDE_DIR:PATH={0}'.format(jasper.prefix.include)
-        ])
+        if '+jasper' in spec:  # jasper is only a dependency when +jasper
+            jasper = spec['jasper']
+            args.extend([
+                '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format((
+                    'DEBUG' if '+debug' in spec else 'RELEASE'),
+                    join_path(jasper.prefix.lib,
+                              'libjasper.{0}'.format(dso_suffix))),
+                '-DJASPER_INCLUDE_DIR:PATH={0}'.format(jasper.prefix.include)
+            ])
 
         # GUI
         if '+gtk' not in spec:
-- 
cgit v1.2.3-70-g09d2