summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Van Essen <vanessen1@llnl.gov>2017-08-07 11:41:13 -0700
committerAdam J. Stewart <ajstewart426@gmail.com>2017-08-07 13:41:13 -0500
commit8ca7c7700895032d7ec9fe728146e3ac0dbd0a64 (patch)
tree1a9c46db743b2a209a0cfa94aa8a1eee41920569
parent755081968f8967a534b2e84757647e98c7b34b70 (diff)
downloadspack-8ca7c7700895032d7ec9fe728146e3ac0dbd0a64.tar.gz
spack-8ca7c7700895032d7ec9fe728146e3ac0dbd0a64.tar.bz2
spack-8ca7c7700895032d7ec9fe728146e3ac0dbd0a64.tar.xz
spack-8ca7c7700895032d7ec9fe728146e3ac0dbd0a64.zip
Elemental cublas (#4889)
* Added a package for the MDAnalysis toolkit. * Added a patch that allows Elemental to use cuBLAS internally. * Added support for LBANN to use the new cuBLAS extension in Elemental. * Added a proper variant for when LBANN does not want to use cuBLAS in elemental. * Added a package for the cnpy project and used it in the lbann package. * Removed unnecessary comment lines. * Removed blank lines * Removed debug variant * Add support for libjpeg-turbo * Added additional variants for OpenCV features. Fixed bug when linking in TIFF support, where libtiff used the regular JPEG library and OpenCV used libjpeg-turbo. Now libtiff can use libjpeg-turbo. * Removed the variant for getting Elemental to use the cublas variant. Updated the requirements for OpenCV to add new options. * Fixed a flake8 error in OpenCV and added a path to find cnpy in lbann. * Fixed line too long flake8 error. * Added a flag to specify the datatype size in lbann and fixed a flake8 error. * Added a debug build variant using the new build_type * Fixed flake8 * Fixed how the debug build is pushed to Elemental * Fixed a bug in the Elemental package where the blas search flags were being overridden by the blas link flags. Changed how the sequential initialization variant is implemented in LBANN. * Added support via a variant to explicitly use mkl or openblas. This helps work around variant forwarding problems. * Updated package files to address pull request comments.
-rw-r--r--var/spack/repos/builtin/packages/cnpy/package.py34
-rw-r--r--var/spack/repos/builtin/packages/elemental/elemental_cublas.patch668
-rw-r--r--var/spack/repos/builtin/packages/elemental/package.py20
-rw-r--r--var/spack/repos/builtin/packages/lbann/package.py26
-rw-r--r--var/spack/repos/builtin/packages/libtiff/package.py5
-rw-r--r--var/spack/repos/builtin/packages/opencv/package.py93
6 files changed, 804 insertions, 42 deletions
diff --git a/var/spack/repos/builtin/packages/cnpy/package.py b/var/spack/repos/builtin/packages/cnpy/package.py
new file mode 100644
index 0000000000..b62df10c2e
--- /dev/null
+++ b/var/spack/repos/builtin/packages/cnpy/package.py
@@ -0,0 +1,34 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the NOTICE and LICENSE files for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+from spack import *
+
+
+class Cnpy(CMakePackage):
+ """cnpy: library to read/write .npy and .npz files in C/C++."""
+
+ homepage = "https://github.com/rogersce/cnpy"
+ url = "https://github.com/rogersce/cnpy"
+
+ version('master', git='https://github.com/rogersce/cnpy.git', branch="master")
diff --git a/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch
new file mode 100644
index 0000000000..9cf9b6e6b5
--- /dev/null
+++ b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch
@@ -0,0 +1,668 @@
+diff -Naur a/include/El/blas_like/level3.hpp b/include/El/blas_like/level3.hpp
+--- a/include/El/blas_like/level3.hpp 2017-06-08 07:30:43.180249917 -0700
++++ b/include/El/blas_like/level3.hpp 2017-06-08 07:35:27.325434602 -0700
+@@ -31,6 +31,10 @@
+ }
+ using namespace GemmAlgorithmNS;
+
++void GemmUseGPU(int min_M, int min_N, int min_K);
++
++void GemmUseCPU();
++
+ template<typename T>
+ void Gemm
+ ( Orientation orientA, Orientation orientB,
+diff -Naur a/include/El/core/imports/blas.hpp b/include/El/core/imports/blas.hpp
+--- a/include/El/core/imports/blas.hpp 2017-06-08 07:30:43.522016908 -0700
++++ b/include/El/core/imports/blas.hpp 2017-06-08 07:35:06.834030908 -0700
+@@ -916,4 +916,63 @@
+ } // namespace blas
+ } // namespace El
+
++
++#if defined(EL_USE_CUBLAS)
++
++namespace El {
++
++#ifdef EL_USE_64BIT_BLAS_INTS
++typedef long long int BlasInt;
++#else
++typedef int BlasInt;
++#endif
++
++namespace cublas {
++
++// NOTE: templated routines are custom and not wrappers
++
++// Level 3 BLAS
++// ============
++template<typename T>
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const T& alpha,
++ const T* A, BlasInt ALDim,
++ const T* B, BlasInt BLDim,
++ const T& beta,
++ T* C, BlasInt CLDim );
++
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const float& alpha,
++ const float* A, BlasInt ALDim,
++ const float* B, BlasInt BLDim,
++ const float& beta,
++ float* C, BlasInt CLDim );
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const double& alpha,
++ const double* A, BlasInt ALDim,
++ const double* B, BlasInt BLDim,
++ const double& beta,
++ double* C, BlasInt CLDim );
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const scomplex& alpha,
++ const scomplex* A, BlasInt ALDim,
++ const scomplex* B, BlasInt BLDim,
++ const scomplex& beta,
++ scomplex* C, BlasInt CLDim );
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const dcomplex& alpha,
++ const dcomplex* A, BlasInt ALDim,
++ const dcomplex* B, BlasInt BLDim,
++ const dcomplex& beta,
++ dcomplex* C, BlasInt CLDim );
++
++} // namespace cublas
++} // namespace El
++#endif
++
+ #endif // ifndef EL_IMPORTS_BLAS_DECL_HPP
+diff -Naur a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp
+--- a/src/blas_like/level3/Gemm.cpp 2017-06-08 07:30:44.307096427 -0700
++++ b/src/blas_like/level3/Gemm.cpp 2017-06-08 07:34:23.062863489 -0700
+@@ -16,6 +16,20 @@
+
+ namespace El {
+
++char gemm_cpu_gpu_switch = 'c';
++int min_M = 0, min_N = 0, min_K = 0;
++
++void GemmUseGPU(int _min_M, int _min_N, int _min_K) {
++ gemm_cpu_gpu_switch = 'g';
++ min_M = _min_M;
++ min_N = _min_N;
++ min_K = _min_K;
++}
++
++void GemmUseCPU() {
++ gemm_cpu_gpu_switch = 'c';
++}
++
+ template<typename T>
+ void Gemm
+ ( Orientation orientA, Orientation orientB,
+@@ -59,11 +73,30 @@
+ const Int k = ( orientA == NORMAL ? A.Width() : A.Height() );
+ if( k != 0 )
+ {
++#if defined(EL_USE_CUBLAS)
++ if (gemm_cpu_gpu_switch == 'g' &&
++ m >= min_M &&
++ n >= min_N &&
++ k >= min_K) {
++ cublas::Gemm
++ ( transA, transB, m, n, k,
++ alpha, A.LockedBuffer(), A.LDim(),
++ B.LockedBuffer(), B.LDim(),
++ beta, C.Buffer(), C.LDim() );
++ } else {
++ blas::Gemm
++ ( transA, transB, m, n, k,
++ alpha, A.LockedBuffer(), A.LDim(),
++ B.LockedBuffer(), B.LDim(),
++ beta, C.Buffer(), C.LDim() );
++ }
++#else
+ blas::Gemm
+ ( transA, transB, m, n, k,
+ alpha, A.LockedBuffer(), A.LDim(),
+ B.LockedBuffer(), B.LDim(),
+ beta, C.Buffer(), C.LDim() );
++#endif
+ }
+ else
+ {
+diff -Naur a/src/core/imports/blas/Gemm.hpp b/src/core/imports/blas/Gemm.hpp
+--- a/src/core/imports/blas/Gemm.hpp 2017-06-08 07:30:45.090529967 -0700
++++ b/src/core/imports/blas/Gemm.hpp 2017-06-08 07:34:46.503009958 -0700
+@@ -41,6 +41,12 @@
+
+ } // extern "C"
+
++
++#if defined(EL_USE_CUBLAS)
++#include <cublas.h>
++#include <cub/util_allocator.cuh>
++#endif
++
+ namespace El {
+ namespace blas {
+
+@@ -515,3 +521,515 @@
+
+ } // namespace blas
+ } // namespace El
++
++
++#if EL_USE_CUBLAS
++
++#define USE_CUB 1
++
++namespace El {
++namespace cublas {
++
++#if USE_CUB
++cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
++#endif
++
++template<typename T>
++void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const T& alpha,
++ const T* A, BlasInt ALDim,
++ const T* B, BlasInt BLDim,
++ const T& beta,
++ T* C, BlasInt CLDim )
++{
++ // put something here
++ printf("integer version \n");
++}
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const Int& alpha,
++ const Int* A, BlasInt ALDim,
++ const Int* B, BlasInt BLDim,
++ const Int& beta,
++ Int* C, BlasInt CLDim );
++#ifdef EL_HAVE_QD
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const DoubleDouble& alpha,
++ const DoubleDouble* A, BlasInt ALDim,
++ const DoubleDouble* B, BlasInt BLDim,
++ const DoubleDouble& beta,
++ DoubleDouble* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const QuadDouble& alpha,
++ const QuadDouble* A, BlasInt ALDim,
++ const QuadDouble* B, BlasInt BLDim,
++ const QuadDouble& beta,
++ QuadDouble* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const Complex<DoubleDouble>& alpha,
++ const Complex<DoubleDouble>* A, BlasInt ALDim,
++ const Complex<DoubleDouble>* B, BlasInt BLDim,
++ const Complex<DoubleDouble>& beta,
++ Complex<DoubleDouble>* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const Complex<QuadDouble>& alpha,
++ const Complex<QuadDouble>* A, BlasInt ALDim,
++ const Complex<QuadDouble>* B, BlasInt BLDim,
++ const Complex<QuadDouble>& beta,
++ Complex<QuadDouble>* C, BlasInt CLDim );
++#endif
++#ifdef EL_HAVE_QUAD
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const Quad& alpha,
++ const Quad* A, BlasInt ALDim,
++ const Quad* B, BlasInt BLDim,
++ const Quad& beta,
++ Quad* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const Complex<Quad>& alpha,
++ const Complex<Quad>* A, BlasInt ALDim,
++ const Complex<Quad>* B, BlasInt BLDim,
++ const Complex<Quad>& beta,
++ Complex<Quad>* C, BlasInt CLDim );
++#endif
++#ifdef EL_HAVE_MPC
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const BigInt& alpha,
++ const BigInt* A, BlasInt ALDim,
++ const BigInt* B, BlasInt BLDim,
++ const BigInt& beta,
++ BigInt* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const BigFloat& alpha,
++ const BigFloat* A, BlasInt ALDim,
++ const BigFloat* B, BlasInt BLDim,
++ const BigFloat& beta,
++ BigFloat* C, BlasInt CLDim );
++template void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const Complex<BigFloat>& alpha,
++ const Complex<BigFloat>* A, BlasInt ALDim,
++ const Complex<BigFloat>* B, BlasInt BLDim,
++ const Complex<BigFloat>& beta,
++ Complex<BigFloat>* C, BlasInt CLDim );
++#endif
++
++void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const float& alpha,
++ const float* A, BlasInt ALDim,
++ const float* B, BlasInt BLDim,
++ const float& beta,
++ float* C, BlasInt CLDim )
++{
++ EL_DEBUG_CSE
++ EL_DEBUG_ONLY(
++ if( std::toupper(transA) == 'N' )
++ {
++ if( ALDim < Max(m,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++ }
++ else
++ {
++ if( ALDim < Max(k,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++ }
++
++ if( std::toupper(transB) == 'N' )
++ {
++ if( BLDim < Max(k,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++ }
++ else
++ {
++ if( BLDim < Max(n,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++ }
++
++ if( CLDim < Max(m,1) )
++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++ )
++ const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
++ const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
++
++ const mpi::Comm comm;
++ const Int commRank = mpi::Rank( comm );
++ if (commRank == 0) {
++ //printf("calling cublas Sgemm: m %d n %d k %d\n", m, n, k);
++ }
++
++ BlasInt rowA, colA, rowB, colB, rowC, colC;
++ // device memory size for A, B and C
++ BlasInt sizeA, sizeB, sizeC;
++ float *devA=NULL, *devB=NULL, *devC=NULL;
++
++ rowA = fixedTransA == 'T' ? k : m;
++ colA = fixedTransA == 'T' ? m : k;
++ rowB = fixedTransB == 'T' ? n : k;
++ colB = fixedTransB == 'T' ? k : n;
++ rowC = m;
++ colC = n;
++ sizeA = rowA * colA;
++ sizeB = rowB * colB;
++ sizeC = rowC * colC;
++
++ cublasStatus stat;
++
++#if USE_CUB
++ CubDebugExit(g_allocator.DeviceAllocate((void**)&devA,
++ sizeof(float) * (sizeA+sizeB+sizeC) ));
++#else
++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(float), (void **) &devA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++#endif
++
++ devB = devA + sizeA;
++ devC = devB + sizeB;
++
++ // copy matrix A, B and C to device
++ stat = cublasSetMatrix(rowA, colA, sizeof(float), A, ALDim, devA, rowA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++ stat = cublasSetMatrix(rowB, colB, sizeof(float), B, BLDim, devB, rowB);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++
++ if (beta != 0.0)
++ {
++ stat = cublasSetMatrix(rowC, colC, sizeof(float), C, CLDim, devC, rowC);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++ }
++
++ // cublas<t>gemm
++ cublasSgemm
++ ( fixedTransA, fixedTransB, m, n, k,
++ alpha, devA, rowA, devB, rowB, beta, devC, rowC );
++
++ // copy matrix C to host
++ stat = cublasGetMatrix(rowC, colC, sizeof(float), devC, rowC, C, CLDim);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++ // free
++#if USE_CUB
++ CubDebugExit(g_allocator.DeviceFree(devA));
++#else
++ cublasFree(devA);
++#endif
++ //printf("CUBLAS float done ...\n");
++}
++
++void Gemm
++( char transA, char transB,
++ BlasInt m, BlasInt n, BlasInt k,
++ const double& alpha,
++ const double* A, BlasInt ALDim,
++ const double* B, BlasInt BLDim,
++ const double& beta,
++ double* C, BlasInt CLDim )
++{
++ EL_DEBUG_CSE
++ EL_DEBUG_ONLY(
++ if( std::toupper(transA) == 'N' )
++ {
++ if( ALDim < Max(m,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++ }
++ else
++ {
++ if( ALDim < Max(k,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++ }
++
++ if( std::toupper(transB) == 'N' )
++ {
++ if( BLDim < Max(k,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++ }
++ else
++ {
++ if( BLDim < Max(n,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++ }
++
++ if( CLDim < Max(m,1) )
++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++ )
++ const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
++ const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
++
++ const mpi::Comm comm;
++ const Int commRank = mpi::Rank( comm );
++ if (commRank == 0) {
++ //printf("calling cublas Dgemm: m %d n %d k %d\n", m, n, k);
++ }
++
++ BlasInt rowA, colA, rowB, colB, rowC, colC;
++ // device memory size for A, B and C
++ BlasInt sizeA, sizeB, sizeC;
++ double *devA=NULL, *devB=NULL, *devC=NULL;
++
++ rowA = fixedTransA == 'T' ? k : m;
++ colA = fixedTransA == 'T' ? m : k;
++ rowB = fixedTransB == 'T' ? n : k;
++ colB = fixedTransB == 'T' ? k : n;
++ rowC = m;
++ colC = n;
++ sizeA = rowA * colA;
++ sizeB = rowB * colB;
++ sizeC = rowC * colC;
++
++ cublasStatus stat;
++
++#if USE_CUB
++ CubDebugExit(g_allocator.DeviceAllocate((void**)&devA,
++ sizeof(double) * (sizeA+sizeB+sizeC) ));
++#else
++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(double), (void **) &devA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++#endif
++
++ devB = devA + sizeA;
++ devC = devB + sizeB;
++
++ // copy matrix A, B and C to device
++ stat = cublasSetMatrix(rowA, colA, sizeof(double), A, ALDim, devA, rowA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++ stat = cublasSetMatrix(rowB, colB, sizeof(double), B, BLDim, devB, rowB);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++
++ if (beta != 0.0)
++ {
++ stat = cublasSetMatrix(rowC, colC, sizeof(double), C, CLDim, devC, rowC);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++ }
++
++ // cublas<t>gemm
++ cublasDgemm
++ ( fixedTransA, fixedTransB, m, n, k,
++ alpha, devA, rowA, devB, rowB, beta, devC, rowC );
++
++ // copy matrix C to host
++ stat = cublasGetMatrix(rowC, colC, sizeof(double), devC, rowC, C, CLDim);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++ // free
++#if USE_CUB
++ CubDebugExit(g_allocator.DeviceFree(devA));
++#else
++ cublasFree(devA);
++#endif
++ //printf("CUBLAS double done ...\n");
++}
++
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const scomplex& alpha,
++ const scomplex* A, BlasInt ALDim,
++ const scomplex* B, BlasInt BLDim,
++ const scomplex& beta,
++ scomplex* C, BlasInt CLDim )
++{
++ EL_DEBUG_CSE
++ EL_DEBUG_ONLY(
++ if( std::toupper(transA) == 'N' )
++ {
++ if( ALDim < Max(m,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++ }
++ else
++ {
++ if( ALDim < Max(k,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++ }
++
++ if( std::toupper(transB) == 'N' )
++ {
++ if( BLDim < Max(k,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++ }
++ else
++ {
++ if( BLDim < Max(n,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++ }
++
++ if( CLDim < Max(m,1) )
++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++ )
++
++ const char fixedTransA = transA;
++ const char fixedTransB = transB;
++
++ const mpi::Comm comm;
++ const Int commRank = mpi::Rank( comm );
++ if (commRank == 0) {
++ //printf("calling cublas Cgemm: m %d n %d k %d\n", m, n, k);
++ }
++
++ BlasInt rowA, colA, rowB, colB, rowC, colC;
++ // device memory size for A, B and C
++ BlasInt sizeA, sizeB, sizeC;
++ cuComplex *devA=NULL, *devB=NULL, *devC=NULL;
++
++ rowA = fixedTransA == 'T' ? k : m;
++ colA = fixedTransA == 'T' ? m : k;
++ rowB = fixedTransB == 'T' ? n : k;
++ colB = fixedTransB == 'T' ? k : n;
++ rowC = m;
++ colC = n;
++ sizeA = rowA * colA;
++ sizeB = rowB * colB;
++ sizeC = rowC * colC;
++
++ cublasStatus stat;
++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuComplex), (void **) &devA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++
++ devB = devA + sizeA;
++ devC = devB + sizeB;
++
++ // copy matrix A, B and C to device
++ stat = cublasSetMatrix(rowA, colA, sizeof(cuComplex), A, ALDim, devA, rowA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++ stat = cublasSetMatrix(rowB, colB, sizeof(cuComplex), B, BLDim, devB, rowB);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++
++ if (beta.real() != 0.0 || beta.imag() != 0.0)
++ {
++ stat = cublasSetMatrix(rowC, colC, sizeof(cuComplex), C, CLDim, devC, rowC);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++ }
++
++ // cublas<t>gemm
++ cublasCgemm
++ ( fixedTransA, fixedTransB, m, n, k,
++ *((cuComplex*) &alpha), devA, rowA, devB, rowB, *((cuComplex*) &beta), devC, rowC );
++
++ // copy matrix C to host
++ stat = cublasGetMatrix(rowC, colC, sizeof(cuComplex), devC, rowC, C, CLDim);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++ // free
++ cublasFree(devA);
++}
++
++void Gemm
++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
++ const dcomplex& alpha,
++ const dcomplex* A, BlasInt ALDim,
++ const dcomplex* B, BlasInt BLDim,
++ const dcomplex& beta,
++ dcomplex* C, BlasInt CLDim )
++{
++ EL_DEBUG_CSE
++ EL_DEBUG_ONLY(
++ if( std::toupper(transA) == 'N' )
++ {
++ if( ALDim < Max(m,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
++ }
++ else
++ {
++ if( ALDim < Max(k,1) )
++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
++ }
++
++ if( std::toupper(transB) == 'N' )
++ {
++ if( BLDim < Max(k,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
++ }
++ else
++ {
++ if( BLDim < Max(n,1) )
++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
++ }
++
++ if( CLDim < Max(m,1) )
++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
++ )
++
++ const char fixedTransA = transA;
++ const char fixedTransB = transB;
++
++ const mpi::Comm comm;
++ const Int commRank = mpi::Rank( comm );
++ if (commRank == 0) {
++ //printf("calling cublas Zgemm: m %d n %d k %d\n", m, n, k);
++ }
++
++ BlasInt rowA, colA, rowB, colB, rowC, colC;
++ // device memory size for A, B and C
++ BlasInt sizeA, sizeB, sizeC;
++ cuDoubleComplex *devA=NULL, *devB=NULL, *devC=NULL;
++
++ rowA = fixedTransA == 'T' ? k : m;
++ colA = fixedTransA == 'T' ? m : k;
++ rowB = fixedTransB == 'T' ? n : k;
++ colB = fixedTransB == 'T' ? k : n;
++ rowC = m;
++ colC = n;
++ sizeA = rowA * colA;
++ sizeB = rowB * colB;
++ sizeC = rowC * colC;
++
++ cublasStatus stat;
++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuDoubleComplex), (void **) &devA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
++
++ devB = devA + sizeA;
++ devC = devB + sizeB;
++
++ // copy matrix A, B and C to device
++ stat = cublasSetMatrix(rowA, colA, sizeof(cuDoubleComplex), A, ALDim, devA, rowA);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
++
++ stat = cublasSetMatrix(rowB, colB, sizeof(cuDoubleComplex), B, BLDim, devB, rowB);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
++
++ if (beta.real() != 0.0 || beta.imag() != 0.0)
++ {
++ stat = cublasSetMatrix(rowC, colC, sizeof(cuDoubleComplex), C, CLDim, devC, rowC);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
++ }
++
++ cublasZgemm
++ ( fixedTransA, fixedTransB, m, n, k,
++ *((cuDoubleComplex*) &alpha), devA, rowA, devB, rowB, *((cuDoubleComplex*) &beta),
++ devC, rowC );
++
++ // copy matrix C to host
++ stat = cublasGetMatrix(rowC, colC, sizeof(cuDoubleComplex), devC, rowC, C, CLDim);
++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
++
++ // free
++ cublasFree(devA);
++}
++
++} // namespace cublas
++} // namespace El
++
++#endif
++
diff --git a/var/spack/repos/builtin/packages/elemental/package.py b/var/spack/repos/builtin/packages/elemental/package.py
index e118bcbd44..50fb4f9829 100644
--- a/var/spack/repos/builtin/packages/elemental/package.py
+++ b/var/spack/repos/builtin/packages/elemental/package.py
@@ -33,6 +33,7 @@ class Elemental(CMakePackage):
homepage = "http://libelemental.org"
url = "https://github.com/elemental/Elemental/archive/v0.87.6.tar.gz"
+ version('master', git='https://github.com/elemental/Elemental.git', branch='master')
version('0.87.7', '6c1e7442021c59a36049e37ea69b8075')
version('0.87.6', '9fd29783d45b0a0e27c0df85f548abe9')
@@ -52,6 +53,8 @@ class Elemental(CMakePackage):
description='Enable quad precision')
variant('int64', default=False,
description='Use 64bit integers')
+ variant('cublas', default=False,
+ description='Enable cuBLAS for local BLAS operations')
# When this variant is set remove the normal dependencies since
# Elemental has to build BLAS and ScaLAPACK internally
variant('int64_blas', default=False,
@@ -62,15 +65,21 @@ class Elemental(CMakePackage):
variant('build_type', default='Release',
description='The build type to build',
values=('Debug', 'Release'))
+ variant('blas', default='openblas', values=('openblas', 'mkl'),
+ description='Enable the use of OpenBlas/MKL')
- # Note that this forces us to use OpenBLAS until #1712 is fixed
+ # Note that #1712 forces us to enumerate the different blas variants
depends_on('blas', when='~openmp_blas ~int64_blas')
# Hack to forward variant to openblas package
# Allow Elemental to build internally when using 8-byte ints
- depends_on('openblas +openmp', when='+openmp_blas ~int64_blas')
+ depends_on('openblas +openmp', when='blas=openblas +openmp_blas ~int64_blas')
+
+ depends_on('intel-mkl', when="blas=mkl ~openmp_blas ~int64_blas")
+ depends_on('intel-mkl +openmp', when='blas=mkl +openmp_blas ~int64_blas')
+ depends_on('intel-mkl@2017.1 +openmp +ilp64', when='blas=mkl +openmp_blas +int64_blas')
# Note that this forces us to use OpenBLAS until #1712 is fixed
- depends_on('lapack', when='~openmp_blas')
+ depends_on('lapack', when='blas=openblas ~openmp_blas')
depends_on('metis')
depends_on('metis +int64', when='+int64')
depends_on('mpi')
@@ -79,6 +88,8 @@ class Elemental(CMakePackage):
extends('python', when='+python')
depends_on('python@:2.8', when='+python')
+ patch('elemental_cublas.patch', when='+cublas')
+
@property
def libs(self):
shared = True if '+shared' in self.spec else False
@@ -126,8 +137,7 @@ class Elemental(CMakePackage):
math_libs = spec['scalapack'].libs + math_libs
args.extend([
- '-DMATH_LIBS:STRING={0}'.format(math_libs.search_flags),
- '-DMATH_LIBS:STRING={0}'.format(math_libs.link_flags)])
+ '-DMATH_LIBS:STRING={0}'.format(math_libs.ld_flags)])
if '+python' in spec:
args.extend([
diff --git a/var/spack/repos/builtin/packages/lbann/package.py b/var/spack/repos/builtin/packages/lbann/package.py
index fea1924550..a93b9b5b66 100644
--- a/var/spack/repos/builtin/packages/lbann/package.py
+++ b/var/spack/repos/builtin/packages/lbann/package.py
@@ -39,37 +39,49 @@ class Lbann(CMakePackage):
variant('gpu', default=False, description='Builds with support for GPUs via CUDA and cuDNN')
variant('opencv', default=True, description='Builds with support for image processing routines with OpenCV')
variant('seq_init', default=False, description='Force serial initialization of weight matrices.')
+ variant('dtype', default=4, description='Size (bits) of floating point representation for weights')
+ variant('build_type', default='Release',
+ description='The build type to build',
+ values=('Debug', 'Release'))
depends_on('elemental +openmp_blas +scalapack +shared +int64')
+ depends_on('elemental +openmp_blas +scalapack +shared +int64 build_type=Debug',
+ when=('build_type=Debug'))
depends_on('cuda', when='+gpu')
depends_on('mpi')
- depends_on('opencv@3.2.0', when='+opencv')
+ depends_on('opencv@3.2.0: +openmp +core +highgui +imgproc +jpeg +png +tiff +zlib', when='+opencv')
depends_on('protobuf@3.0.2:')
+ depends_on('cnpy')
def cmake_args(self):
spec = self.spec
# Environment variables
CPPFLAGS = []
CPPFLAGS.append('-DLBANN_SET_EL_RNG')
- if '~seq_init' in spec:
- CPPFLAGS.append('-DLBANN_PARALLEL_RANDOM_MATRICES')
+
+ CPPFLAGS.append('-DLBANN_DATATYPE={0}'.format(
+ int(spec.variants['dtype'].value)))
args = [
'-DCMAKE_INSTALL_MESSAGE=LAZY',
'-DCMAKE_CXX_FLAGS=%s' % ' '.join(CPPFLAGS),
'-DWITH_CUDA:BOOL=%s' % ('+gpu' in spec),
'-DWITH_CUDNN:BOOL=%s' % ('+gpu' in spec),
+ '-DELEMENTAL_USE_CUBLAS:BOOL=%s' % (
+ '+cublas' in spec['elemental']),
'-DWITH_TBINF=OFF',
'-DWITH_VTUNE=OFF',
- '-DElemental_DIR={0}'.format(self.spec['elemental'].prefix),
+ '-DElemental_DIR={0}'.format(spec['elemental'].prefix),
+ '-DCNPY_DIR={0}'.format(spec['cnpy'].prefix),
'-DELEMENTAL_MATH_LIBS={0}'.format(
- self.spec['elemental'].libs),
+ spec['elemental'].libs),
+ '-DSEQ_INIT:BOOL=%s' % ('+seq_init' in spec),
'-DVERBOSE=0',
'-DLBANN_HOME=.',
'-DLBANN_VER=spack']
- if '+opencv' in self.spec:
+ if '+opencv' in spec:
args.extend(['-DOpenCV_DIR:STRING={0}'.format(
- self.spec['opencv'].prefix)])
+ spec['opencv'].prefix)])
return args
diff --git a/var/spack/repos/builtin/packages/libtiff/package.py b/var/spack/repos/builtin/packages/libtiff/package.py
index 2fcccad739..29db7b42d3 100644
--- a/var/spack/repos/builtin/packages/libtiff/package.py
+++ b/var/spack/repos/builtin/packages/libtiff/package.py
@@ -35,6 +35,9 @@ class Libtiff(AutotoolsPackage):
version('4.0.6', 'd1d2e940dea0b5ad435f21f03d96dd72')
version('4.0.3', '051c1068e6a0627f461948c365290410')
- depends_on('jpeg')
+ variant('turbo', default=False, description='use libjpeg-turbo')
+
+ depends_on('jpeg', when='-turbo')
+ depends_on('libjpeg-turbo', when='+turbo')
depends_on('zlib')
depends_on('xz')
diff --git a/var/spack/repos/builtin/packages/opencv/package.py b/var/spack/repos/builtin/packages/opencv/package.py
index 33adadc15e..f2bda99a01 100644
--- a/var/spack/repos/builtin/packages/opencv/package.py
+++ b/var/spack/repos/builtin/packages/opencv/package.py
@@ -42,8 +42,15 @@ class Opencv(CMakePackage):
homepage = 'http://opencv.org/'
url = 'https://github.com/Itseez/opencv/archive/3.1.0.tar.gz'
- version('3.2.0', 'a43b65488124ba33dde195fea9041b70')
- version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3')
+ version('master', git="https://github.com/opencv/opencv.git", branch="master")
+ version('3.2.0', 'a43b65488124ba33dde195fea9041b70')
+ version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3')
+ version('2.4.13.2', 'fe52791ce523681a67036def4c25261b')
+ version('2.4.13.1', 'f6d354500d5013e60dc0fc44b07a63d1')
+ version('2.4.13', '8feb45a71adad89b8017a777477c3eff')
+ version('2.4.12.3', '2496a4a4caf8fecfbfc294fbe6a814b0')
+ version('2.4.12.2', 'bc0c60c2ea1cf4078deef99569912fc7')
+ version('2.4.12.1', '7192f51434710904b5e3594872b897c3')
variant('shared', default=True,
description='Enables the build of shared libraries')
@@ -59,13 +66,21 @@ class Opencv(CMakePackage):
description='Enables the build of Python extensions')
variant('java', default=False,
description='Activates support for Java')
+ variant('openmp', default=False, description='Activates support for OpenMP threads')
+ variant('core', default=False, description='Include opencv_core module into the OpenCV build')
+ variant('highgui', default=False, description='Include opencv_highgui module into the OpenCV build')
+ variant('imgproc', default=False, description='Include opencv_imgproc module into the OpenCV build')
+ variant('jpeg', default=False, description='Include JPEG support')
+ variant('png', default=False, description='Include PNG support')
+ variant('tiff', default=False, description='Include TIFF support')
+ variant('zlib', default=False, description='Build zlib from source')
depends_on('eigen', when='+eigen', type='build')
- depends_on('zlib')
- depends_on('libpng')
- depends_on('libjpeg-turbo')
- depends_on('libtiff')
+ depends_on('zlib', when='+zlib')
+ depends_on('libpng', when='+png')
+ depends_on('libjpeg-turbo', when='+jpeg')
+ depends_on('libtiff+turbo', when='+tiff')
depends_on('jasper', when='+jasper')
depends_on('cuda', when='+cuda')
@@ -94,6 +109,22 @@ class Opencv(CMakePackage):
'ON' if '+vtk' in spec else 'OFF')),
'-DBUILD_opencv_java:BOOL={0}'.format((
'ON' if '+java' in spec else 'OFF')),
+ '-DBUILD_opencv_core:BOOL={0}'.format((
+ 'ON' if '+core' in spec else 'OFF')),
+ '-DBUILD_opencv_highgui:BOOL={0}'.format((
+ 'ON' if '+highgui' in spec else 'OFF')),
+ '-DBUILD_opencv_imgproc:BOOL={0}'.format((
+ 'ON' if '+imgproc' in spec else 'OFF')),
+ '-DWITH_JPEG:BOOL={0}'.format((
+ 'ON' if '+jpeg' in spec else 'OFF')),
+ '-DWITH_PNG:BOOL={0}'.format((
+ 'ON' if '+png' in spec else 'OFF')),
+ '-DWITH_TIFF:BOOL={0}'.format((
+ 'ON' if '+tiff' in spec else 'OFF')),
+ '-DWITH_ZLIB:BOOL={0}'.format((
+ 'ON' if '+zlib' in spec else 'OFF')),
+ '-DWITH_OPENMP:BOOL={0}'.format((
+ 'ON' if '+openmp' in spec else 'OFF')),
]
# Media I/O
@@ -115,31 +146,35 @@ class Opencv(CMakePackage):
'-DPNG_INCLUDE_DIR:PATH={0}'.format(libpng.prefix.include)
])
- libjpeg = spec['libjpeg-turbo']
- args.extend([
- '-DJPEG_LIBRARY:FILEPATH={0}'.format(
- join_path(libjpeg.prefix.lib,
- 'libjpeg.{0}'.format(dso_suffix))),
- '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include)
- ])
+ if '+jpeg' in spec:
+ libjpeg = spec['libjpeg-turbo']
+ cmake_options.extend([
+ '-DBUILD_JPEG:BOOL=OFF',
+ '-DJPEG_LIBRARY:FILEPATH={0}'.format(
+ join_path(libjpeg.prefix.lib,
+ 'libjpeg.{0}'.format(dso_suffix))),
+ '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include)
+ ])
- libtiff = spec['libtiff']
- args.extend([
- '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format((
- 'DEBUG' if '+debug' in spec else 'RELEASE'),
- join_path(libtiff.prefix.lib,
- 'libtiff.{0}'.format(dso_suffix))),
- '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include)
- ])
+ if '+tiff' in spec:
+ libtiff = spec['libtiff']
+ cmake_options.extend([
+ '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format((
+ 'DEBUG' if '+debug' in spec else 'RELEASE'),
+ join_path(libtiff.prefix.lib,
+ 'libtiff.{0}'.format(dso_suffix))),
+ '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include)
+ ])
- jasper = spec['jasper']
- args.extend([
- '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format((
- 'DEBUG' if '+debug' in spec else 'RELEASE'),
- join_path(jasper.prefix.lib,
- 'libjasper.{0}'.format(dso_suffix))),
- '-DJASPER_INCLUDE_DIR:PATH={0}'.format(jasper.prefix.include)
- ])
+ if '+jasper' in spec:
+ jasper = spec['jasper']
+ cmake_options.extend([
+ '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format((
+ 'DEBUG' if '+debug' in spec else 'RELEASE'),
+ join_path(jasper.prefix.lib,
+ 'libjasper.{0}'.format(dso_suffix))),
+ '-DJASPER_INCLUDE_DIR:PATH={0}'.format(jasper.prefix.include)
+ ])
# GUI
if '+gtk' not in spec: