Diffstat (limited to 'var/spack/repos/builtin/packages/magma/magma-2.5.0.patch')
-rw-r--r--  var/spack/repos/builtin/packages/magma/magma-2.5.0.patch | 428 +++++++++++++++++++++++++++++++++++++++
1 file changed, 428 insertions(+), 0 deletions(-)
diff --git a/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch b/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
new file mode 100644
index 0000000000..1ac800c5f8
--- /dev/null
+++ b/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
@@ -0,0 +1,428 @@
+diff -r 89706c0efbdb .hgtags
+--- a/.hgtags	Wed Jan 02 14:17:26 2019 -0500
++++ b/.hgtags	Wed Apr 03 15:50:54 2019 -0700
+@@ -1,3 +1,4 @@
+ 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
+ b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
+ 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
+diff -r 89706c0efbdb include/magma_types.h
+--- a/include/magma_types.h	Wed Jan 02 14:17:26 2019 -0500
++++ b/include/magma_types.h	Wed Apr 03 15:50:54 2019 -0700
+@@ -77,7 +77,7 @@
+     typedef magma_int_t magma_device_t;
+ 
+     // Half precision in CUDA
+-    #if defined(__cplusplus) && CUDA_VERSION > 7500
++    #if defined(__cplusplus) && CUDA_VERSION >= 7500
+     #include <cuda_fp16.h>
+     typedef __half magmaHalf;
+     #else
+diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
+--- a/sparse/blas/magma_zsampleselect.cu	Wed Jan 02 14:17:26 2019 -0500
++++ b/sparse/blas/magma_zsampleselect.cu	Wed Apr 03 15:50:54 2019 -0700
+@@ -15,9 +15,12 @@
+ 
+ #define PRECISION_z
+ 
++
+ namespace magma_sampleselect {
+ 
+-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
++{
++#if (__CUDA_ARCH__ >= 350)
+     auto idx = threadIdx.x + blockDim.x * blockIdx.x;
+     if (idx >= size) {
+         return;
+@@ -25,6 +28,7 @@
+ 
+     auto v = in[idx];
+     out[idx] = real(v) * real(v) + imag(v) * imag(v);
++#endif
+ }
+ 
+ } // namespace magma_sampleselect
+@@ -164,36 +168,43 @@
+     magma_queue_t queue )
+ {
+     magma_int_t info = 0;
++    magma_int_t arch = magma_getdevice_arch();
+ 
+-    auto num_blocks = magma_ceildiv(total_size, block_size);
+-    auto local_work = (total_size + num_threads - 1) / num_threads;
+-    auto required_size = sizeof(double) * (total_size + searchtree_size)
++    if( arch >= 350 ) {
++        auto num_blocks = magma_ceildiv(total_size, block_size);
++        auto local_work = (total_size + num_threads - 1) / num_threads;
++        auto required_size = sizeof(double) * (total_size + searchtree_size)
+             + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
+-    auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
++        auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
+ 
+-    double* gputmp = (double*)*tmp_ptr;
+-    double* gputree = gputmp + total_size;
+-    uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
+-    int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
+-    int32_t* gpucounts = gpurankout + 1;
+-    int32_t* gpulocalcounts = gpucounts + searchtree_width;
+-    uint32_t bucketidx{};
++        double* gputmp = (double*)*tmp_ptr;
++        double* gputree = gputmp + total_size;
++        uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
++        int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
++        int32_t* gpucounts = gpurankout + 1;
++        int32_t* gpulocalcounts = gpucounts + searchtree_width;
++        uint32_t bucketidx{};
+ 
+-    CHECK(realloc_result);
++        CHECK(realloc_result);
+ 
+-    compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (val, gputmp, total_size);
+-    build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, total_size);
+-    count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, gpulocalcounts, total_size, local_work);
+-    reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
+-        (gpulocalcounts, gpucounts, num_grouped_blocks);
+-    sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
+-        (gpucounts, subset_size, gpubucketidx, gpurankout);
+-    magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
+-    magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
+-    *thrs = std::sqrt(*thrs);
++        compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
++            (val, gputmp, total_size);
++        build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, total_size);
++        count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, gpulocalcounts, total_size, local_work);
++        reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
++            (gpulocalcounts, gpucounts, num_grouped_blocks);
++        sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
++            (gpucounts, subset_size, gpubucketidx, gpurankout);
++        magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
++        magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
++        *thrs = std::sqrt(*thrs);
++    }
++    else {
++        printf("error: this functionality needs CUDA architecture >= 3.5\n");
++        info = MAGMA_ERR_NOT_SUPPORTED;
++    }
+ 
+ cleanup:
+     return info;
+diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
+--- a/src/xhsgetrf_gpu.cpp	Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xhsgetrf_gpu.cpp	Wed Apr 03 15:50:54 2019 -0700
+@@ -16,6 +16,131 @@
+ #include <cuda_fp16.h>
+ #endif
+ 
++#if CUDA_VERSION < 9020
++// conversion float to half are not defined for host in CUDA version <9.2
++// thus uses the conversion below when CUDA VERSION is < 9.2.
++#include <string.h>
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++//  * Redistributions of source code must retain the above copyright
++//    notice, this list of conditions and the following disclaimer.
++//  * Redistributions in binary form must reproduce the above copyright
++//    notice, this list of conditions and the following disclaimer in the
++//    documentation and/or other materials provided with the distribution.
++//  * Neither the name of NVIDIA CORPORATION nor the names of its
++//    contributors may be used to endorse or promote products derived
++//    from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++// This code modified from the public domain code here:
++// https://gist.github.com/rygorous/2156668
++// The URL above includes more robust conversion routines
++// that handle Inf and NaN correctly.
++//
++// It is recommended to use the more robust versions in production code.
++
++typedef unsigned uint;
++
++union FP32
++{
++    uint u;
++    float f;
++    struct
++    {
++        uint Mantissa : 23;
++        uint Exponent : 8;
++        uint Sign : 1;
++    };
++};
++
++union FP16
++{
++    unsigned short u;
++    struct
++    {
++        uint Mantissa : 10;
++        uint Exponent : 5;
++        uint Sign : 1;
++    };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++    FP32 f32infty = { 255 << 23 };
++    FP32 f16max = { (127 + 16) << 23 };
++    FP32 magic = { 15 << 23 };
++    FP32 expinf = { (255 ^ 31) << 23 };
++    uint sign_mask = 0x80000000u;
++    FP16 o = { 0 };
++
++    FP32 f = *((FP32*)&fl);
++
++    uint sign = f.u & sign_mask;
++    f.u ^= sign;
++
++    if (!(f.f < f32infty.u)) // Inf or NaN
++        o.u = f.u ^ expinf.u;
++    else
++    {
++        if (f.f > f16max.f) f.f = f16max.f;
++        f.f *= magic.f;
++    }
++
++    o.u = f.u >> 13; // Take the mantissa bits
++    o.u |= sign >> 16;
++    half tmp;
++    memcpy(&tmp, &o, sizeof(half));
++    //return *((half*)&o);
++    return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++    FP16 h;
++    memcpy(&h, &hf, sizeof(half));
++
++    static const FP32 magic = { 113 << 23 };
++    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++    FP32 o;
++
++    o.u = (h.u & 0x7fff) << 13;    // exponent/mantissa bits
++    uint exp = shifted_exp & o.u;  // just the exponent
++    o.u += (127 - 15) << 23;       // exponent adjust
++
++    // handle exponent special cases
++    if (exp == shifted_exp) // Inf/NaN?
++        o.u += (128 - 16) << 23;   // extra exp adjust
++    else if (exp == 0) // Zero/Denormal?
++    {
++        o.u += 1 << 23;            // extra exp adjust
++        o.f -= magic.f;            // renormalize
++    }
++
++    o.u |= (h.u & 0x8000) << 16;   // sign bit
++    return o.f;
++}
++#endif
++
+ #include "magma_internal.h"
+ //#include "nvToolsExt.h"
+ 
+@@ -106,10 +231,13 @@
+     float c_one = MAGMA_S_ONE;
+     float c_neg_one = MAGMA_S_NEG_ONE;
+     #if 1
++    #if CUDA_VERSION >= 9020
+     const magmaHalf h_one = (magmaHalf) 1.0;
+     const magmaHalf h_neg_one = (magmaHalf)-1.0;
+-    //const magmaHalf h_one = approx_float_to_half(1.0);
+-    //const magmaHalf h_neg_one = approx_float_to_half(-1.0);
++    #else
++    const magmaHalf h_one = approx_float_to_half(1.0);
++    const magmaHalf h_neg_one = approx_float_to_half(-1.0);
++    #endif
+     #else
+     FP32 float_one = *((FP32*)&c_one);
+     FP16 half_one = float_to_half_full(float_one);
+diff -r 89706c0efbdb src/xshgetrf_gpu.cpp
+--- a/src/xshgetrf_gpu.cpp	Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xshgetrf_gpu.cpp	Wed Apr 03 15:50:54 2019 -0700
+@@ -92,7 +92,7 @@
+     magma_mp_type_t enable_tc,
+     magma_mp_type_t mp_algo_type )
+ {
+-#if CUDA_VERSION >= 7500
++#if CUDA_VERSION >= 9000
+ #ifdef HAVE_clBLAS
+     #define dA(i_, j_)  dA, (dA_offset + (i_) + (j_)*ldda)
+     #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
+diff -r 89706c0efbdb testing/testing_hgemm.cpp
+--- a/testing/testing_hgemm.cpp	Wed Jan 02 14:17:26 2019 -0500
++++ b/testing/testing_hgemm.cpp	Wed Apr 03 15:50:54 2019 -0700
+@@ -22,6 +22,131 @@
+ #include "magma_operators.h"
+ #include "testings.h"
+ 
++#if CUDA_VERSION < 9020
++// conversion float to half are not defined for host in CUDA version <9.2
++// thus uses the conversion below when CUDA VERSION is < 9.2.
++#include <string.h>
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++//  * Redistributions of source code must retain the above copyright
++//    notice, this list of conditions and the following disclaimer.
++//  * Redistributions in binary form must reproduce the above copyright
++//    notice, this list of conditions and the following disclaimer in the
++//    documentation and/or other materials provided with the distribution.
++//  * Neither the name of NVIDIA CORPORATION nor the names of its
++//    contributors may be used to endorse or promote products derived
++//    from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++// This code modified from the public domain code here:
++// https://gist.github.com/rygorous/2156668
++// The URL above includes more robust conversion routines
++// that handle Inf and NaN correctly.
++//
++// It is recommended to use the more robust versions in production code.
++
++typedef unsigned uint;
++
++union FP32
++{
++    uint u;
++    float f;
++    struct
++    {
++        uint Mantissa : 23;
++        uint Exponent : 8;
++        uint Sign : 1;
++    };
++};
++
++union FP16
++{
++    unsigned short u;
++    struct
++    {
++        uint Mantissa : 10;
++        uint Exponent : 5;
++        uint Sign : 1;
++    };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++    FP32 f32infty = { 255 << 23 };
++    FP32 f16max = { (127 + 16) << 23 };
++    FP32 magic = { 15 << 23 };
++    FP32 expinf = { (255 ^ 31) << 23 };
++    uint sign_mask = 0x80000000u;
++    FP16 o = { 0 };
++
++    FP32 f = *((FP32*)&fl);
++
++    uint sign = f.u & sign_mask;
++    f.u ^= sign;
++
++    if (!(f.f < f32infty.u)) // Inf or NaN
++        o.u = f.u ^ expinf.u;
++    else
++    {
++        if (f.f > f16max.f) f.f = f16max.f;
++        f.f *= magic.f;
++    }
++
++    o.u = f.u >> 13; // Take the mantissa bits
++    o.u |= sign >> 16;
++    half tmp;
++    memcpy(&tmp, &o, sizeof(half));
++    //return *((half*)&o);
++    return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++    FP16 h;
++    memcpy(&h, &hf, sizeof(half));
++
++    static const FP32 magic = { 113 << 23 };
++    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++    FP32 o;
++
++    o.u = (h.u & 0x7fff) << 13;    // exponent/mantissa bits
++    uint exp = shifted_exp & o.u;  // just the exponent
++    o.u += (127 - 15) << 23;       // exponent adjust
++
++    // handle exponent special cases
++    if (exp == shifted_exp) // Inf/NaN?
++        o.u += (128 - 16) << 23;   // extra exp adjust
++    else if (exp == 0) // Zero/Denormal?
++    {
++        o.u += 1 << 23;            // extra exp adjust
++        o.f -= magic.f;            // renormalize
++    }
++
++    o.u |= (h.u & 0x8000) << 16;   // sign bit
++    return o.f;
++}
++#endif
++
+ /* ////////////////////////////////////////////////////////////////////////////
+    -- Testing sgemm
+ */
+@@ -47,8 +172,13 @@
+     float c_neg_one = MAGMA_S_NEG_ONE;
+     float alpha = MAGMA_S_MAKE(  0.29, -0.86 );
+     float beta  = MAGMA_S_MAKE( -0.48,  0.38 );
+-    magmaHalf h_alpha = (magmaHalf)alpha;
+-    magmaHalf h_beta  = (magmaHalf)beta;
++    #if CUDA_VERSION >= 9020
++    const magmaHalf h_alpha = (magmaHalf) alpha;
++    const magmaHalf h_beta  = (magmaHalf) beta;
++    #else
++    const magmaHalf h_alpha = approx_float_to_half(alpha);
++    const magmaHalf h_beta  = approx_float_to_half(beta);
++    #endif
+     magma_opts opts;
+     opts.parse_opts( argc, argv );
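Note on the sparse/blas/magma_zsampleselect.cu hunks: they apply a common CUDA double guard. The kernel body is compiled only for sm_35 and newer via __CUDA_ARCH__, and the host wrapper checks the device's capability at run time before launching, returning MAGMA_ERR_NOT_SUPPORTED otherwise. Below is a minimal self-contained sketch of that pattern outside of MAGMA; scale_kernel and check_and_launch are hypothetical names, and the arch encoding (major*100 + minor*10) mirrors what magma_getdevice_arch() reports.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical kernel: the body exists only when compiled for sm_35+,
    // mirroring the #if (__CUDA_ARCH__ >= 350) guard in compute_abs above.
    __global__ void scale_kernel(double* out, const double* in, int n)
    {
    #if (__CUDA_ARCH__ >= 350)
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx < n) out[idx] = 2.0 * in[idx];
    #endif
    }

    // Hypothetical host wrapper: refuse to launch on devices older than
    // sm_35, the same run-time check the patch adds to the sampleselect path.
    int check_and_launch(double* out, const double* in, int n)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
            return -1;
        int arch = prop.major * 100 + prop.minor * 10;  // e.g. sm_3.5 -> 350
        if (arch < 350) {
            printf("error: this functionality needs CUDA architecture >= 3.5\n");
            return -1;  // MAGMA returns MAGMA_ERR_NOT_SUPPORTED here
        }
        scale_kernel<<<(n + 255) / 256, 256>>>(out, in, n);
        return 0;
    }

Compiling the body out for older architectures keeps the binary loadable on them; the run-time check then turns what would be a silent no-op launch into an explicit error.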
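Note on the approx_float_to_half/half_to_float fallback: CUDA exposes host-side float-to-__half conversion only from version 9.2, so on older toolkits the 16-bit pattern must be assembled by hand. As a rough illustration of the bit manipulation involved (a hypothetical sketch, not the patch's NVIDIA-derived routine), float_to_half_basic below rebiases the 8-bit float exponent to half's 5 bits and truncates the 23-bit mantissa to 10 bits; it handles only normal, in-range values and omits the rounding and Inf/NaN care that the robust versions at https://gist.github.com/rygorous/2156668 provide.

    #include <cstdio>
    #include <cstring>
    #include <cstdint>

    // Stand-in for CUDA's __half so the sketch compiles without the toolkit.
    struct half16 { uint16_t bits; };

    // Hypothetical helper: float -> half for normal, in-range values only
    // (no rounding, no Inf/NaN/denormal handling).
    static half16 float_to_half_basic(float f)
    {
        uint32_t x;
        std::memcpy(&x, &f, sizeof x);                    // well-defined type pun
        uint16_t sign = (uint16_t)((x >> 16) & 0x8000u);  // move sign to bit 15
        int32_t  exp  = (int32_t)((x >> 23) & 0xFF) - 127 + 15;  // rebias 8 -> 5 bits
        uint16_t man  = (uint16_t)((x >> 13) & 0x3FFu);   // top 10 mantissa bits
        half16 h;
        h.bits = (uint16_t)(sign | (uint16_t)(exp << 10) | man);
        return h;
    }

    int main()
    {
        // Half-precision 1.0 is 0x3C00 and -1.0 is 0xBC00, the constants the
        // patch materializes as h_one / h_neg_one when CUDA_VERSION < 9020.
        std::printf("1.0f  -> 0x%04X\n", (unsigned)float_to_half_basic( 1.0f).bits);
        std::printf("-1.0f -> 0x%04X\n", (unsigned)float_to_half_basic(-1.0f).bits);
        return 0;
    }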