Diffstat (limited to 'var/spack/repos/builtin/packages/magma/magma-2.5.0.patch')
-rw-r--r--  var/spack/repos/builtin/packages/magma/magma-2.5.0.patch  428
1 file changed, 428 insertions(+), 0 deletions(-)
diff --git a/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch b/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
new file mode 100644
index 0000000000..1ac800c5f8
--- /dev/null
+++ b/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
@@ -0,0 +1,428 @@
+diff -r 89706c0efbdb .hgtags
+--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500
++++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700
+@@ -1,3 +1,4 @@
+ 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
+ b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
+ 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
+diff -r 89706c0efbdb include/magma_types.h
+--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500
++++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700
+@@ -77,7 +77,7 @@
+ typedef magma_int_t magma_device_t;
+
+ // Half precision in CUDA
+- #if defined(__cplusplus) && CUDA_VERSION > 7500
++ #if defined(__cplusplus) && CUDA_VERSION >= 7500
+ #include <cuda_fp16.h>
+ typedef __half magmaHalf;
+ #else
+diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
+--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500
++++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700
+@@ -15,9 +15,12 @@
+
+ #define PRECISION_z
+
++
+ namespace magma_sampleselect {
+
+-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
++{
++#if (__CUDA_ARCH__ >= 350)
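++ // on architectures below sm_35 this kernel body compiles to a no-op; the host
++ // wrapper checks magma_getdevice_arch() and returns MAGMA_ERR_NOT_SUPPORTED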
+ auto idx = threadIdx.x + blockDim.x * blockIdx.x;
+ if (idx >= size) {
+ return;
+@@ -25,6 +28,7 @@
+
+ auto v = in[idx];
+ out[idx] = real(v) * real(v) + imag(v) * imag(v);
++#endif
+ }
+
+ } // namespace magma_sampleselect
+@@ -164,36 +168,43 @@
+ magma_queue_t queue )
+ {
+ magma_int_t info = 0;
++ magma_int_t arch = magma_getdevice_arch();
+
+- auto num_blocks = magma_ceildiv(total_size, block_size);
+- auto local_work = (total_size + num_threads - 1) / num_threads;
+- auto required_size = sizeof(double) * (total_size + searchtree_size)
++ if( arch >= 350 ) {
++ auto num_blocks = magma_ceildiv(total_size, block_size);
++ auto local_work = (total_size + num_threads - 1) / num_threads;
++ auto required_size = sizeof(double) * (total_size + searchtree_size)
+ + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
+- auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
++ auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
+
+- double* gputmp = (double*)*tmp_ptr;
+- double* gputree = gputmp + total_size;
+- uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
+- int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
+- int32_t* gpucounts = gpurankout + 1;
+- int32_t* gpulocalcounts = gpucounts + searchtree_width;
+- uint32_t bucketidx{};
++ double* gputmp = (double*)*tmp_ptr;
++ double* gputree = gputmp + total_size;
++ uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
++ int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
++ int32_t* gpucounts = gpurankout + 1;
++ int32_t* gpulocalcounts = gpucounts + searchtree_width;
++ uint32_t bucketidx{};
+
+- CHECK(realloc_result);
++ CHECK(realloc_result);
+
+- compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
+- (val, gputmp, total_size);
+- build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
+- (gputmp, gputree, total_size);
+- count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
+- (gputmp, gputree, gpulocalcounts, total_size, local_work);
+- reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
+- (gpulocalcounts, gpucounts, num_grouped_blocks);
+- sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
+- (gpucounts, subset_size, gpubucketidx, gpurankout);
+- magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
+- magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
+- *thrs = std::sqrt(*thrs);
++ compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
++ (val, gputmp, total_size);
++ build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
++ (gputmp, gputree, total_size);
++ count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
++ (gputmp, gputree, gpulocalcounts, total_size, local_work);
++ reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
++ (gpulocalcounts, gpucounts, num_grouped_blocks);
++ sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
++ (gpucounts, subset_size, gpubucketidx, gpurankout);
++ magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
++ magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
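++ // compute_abs stored |z|^2 = re^2 + im^2, so take the square root to get
++ // the magnitude threshold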
++ *thrs = std::sqrt(*thrs);
++ }
++ else {
++ printf("error: this functionality needs CUDA architecture >= 3.5\n");
++ info = MAGMA_ERR_NOT_SUPPORTED;
++ }
+
+ cleanup:
+ return info;
+diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
+--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
+@@ -16,6 +16,131 @@
+ #include <cuda_fp16.h>
+ #endif
+
++#if CUDA_VERSION < 9020
++// Conversions from float to half are not defined for the host in CUDA versions < 9.2,
++// so the conversion routines below are used when CUDA_VERSION is < 9.2.
++#include <string.h>
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++// * Redistributions of source code must retain the above copyright
++// notice, this list of conditions and the following disclaimer.
++// * Redistributions in binary form must reproduce the above copyright
++// notice, this list of conditions and the following disclaimer in the
++// documentation and/or other materials provided with the distribution.
++// * Neither the name of NVIDIA CORPORATION nor the names of its
++// contributors may be used to endorse or promote products derived
++// from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++// This code modified from the public domain code here:
++// https://gist.github.com/rygorous/2156668
++// The URL above includes more robust conversion routines
++// that handle Inf and NaN correctly.
++//
++// It is recommended to use the more robust versions in production code.
++
++typedef unsigned uint;
++
++union FP32
++{
++ uint u;
++ float f;
++ struct
++ {
++ uint Mantissa : 23;
++ uint Exponent : 8;
++ uint Sign : 1;
++ };
++};
++
++union FP16
++{
++ unsigned short u;
++ struct
++ {
++ uint Mantissa : 10;
++ uint Exponent : 5;
++ uint Sign : 1;
++ };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
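++// Example: 1.0f (float bits 0x3F800000) maps to 0x3C00, the IEEE-754
++// binary16 encoding of 1.0.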
++static half approx_float_to_half(float fl)
++{
++ FP32 f32infty = { 255 << 23 };
++ FP32 f16max = { (127 + 16) << 23 };
++ FP32 magic = { 15 << 23 };
++ FP32 expinf = { (255 ^ 31) << 23 };
++ uint sign_mask = 0x80000000u;
++ FP16 o = { 0 };
++
++ FP32 f = *((FP32*)&fl);
++
++ uint sign = f.u & sign_mask;
++ f.u ^= sign;
++
++ if (!(f.f < f32infty.u)) // Inf or NaN
++ o.u = f.u ^ expinf.u;
++ else
++ {
++ if (f.f > f16max.f) f.f = f16max.f;
++ f.f *= magic.f;
++ }
++
++ o.u = f.u >> 13; // Take the mantissa bits
++ o.u |= sign >> 16;
++ half tmp;
++ memcpy(&tmp, &o, sizeof(half));
++ //return *((half*)&o);
++ return tmp;
++}
++
++// from half->float code - just for verification.
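++// Example: 0x3C00 expands back to float bits 0x3F800000, i.e. 1.0f.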
++static float half_to_float(half hf)
++{
++ FP16 h;
++ memcpy(&h, &hf, sizeof(half));
++
++ static const FP32 magic = { 113 << 23 };
++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++ FP32 o;
++
++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
++ uint exp = shifted_exp & o.u; // just the exponent
++ o.u += (127 - 15) << 23; // exponent adjust
++
++ // handle exponent special cases
++ if (exp == shifted_exp) // Inf/NaN?
++ o.u += (128 - 16) << 23; // extra exp adjust
++ else if (exp == 0) // Zero/Denormal?
++ {
++ o.u += 1 << 23; // extra exp adjust
++ o.f -= magic.f; // renormalize
++ }
++
++ o.u |= (h.u & 0x8000) << 16; // sign bit
++ return o.f;
++}
++#endif
++
+ #include "magma_internal.h"
+ //#include "nvToolsExt.h"
+
+@@ -106,10 +231,13 @@
+ float c_one = MAGMA_S_ONE;
+ float c_neg_one = MAGMA_S_NEG_ONE;
+ #if 1
++ #if CUDA_VERSION >= 9020
+ const magmaHalf h_one = (magmaHalf) 1.0;
+ const magmaHalf h_neg_one = (magmaHalf)-1.0;
+- //const magmaHalf h_one = approx_float_to_half(1.0);
+- //const magmaHalf h_neg_one = approx_float_to_half(-1.0);
++ #else
++ const magmaHalf h_one = approx_float_to_half(1.0);
++ const magmaHalf h_neg_one = approx_float_to_half(-1.0);
++ #endif
+ #else
+ FP32 float_one = *((FP32*)&c_one);
+ FP16 half_one = float_to_half_full(float_one);
+diff -r 89706c0efbdb src/xshgetrf_gpu.cpp
+--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
+@@ -92,7 +92,7 @@
+ magma_mp_type_t enable_tc,
+ magma_mp_type_t mp_algo_type )
+ {
+-#if CUDA_VERSION >= 7500
++#if CUDA_VERSION >= 9000
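++// assumption: the half-precision / Tensor Core GEMM path used by this routine
++// is only fully supported from CUDA 9.0, hence the CUDA_VERSION >= 9000 guard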
+ #ifdef HAVE_clBLAS
+ #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda)
+ #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
+diff -r 89706c0efbdb testing/testing_hgemm.cpp
+--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500
++++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700
+@@ -22,6 +22,131 @@
+ #include "magma_operators.h"
+ #include "testings.h"
+
++#if CUDA_VERSION < 9020
++// Conversions from float to half are not defined for the host in CUDA versions < 9.2,
++// so the conversion routines below are used when CUDA_VERSION is < 9.2.
++#include <string.h>
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++// * Redistributions of source code must retain the above copyright
++// notice, this list of conditions and the following disclaimer.
++// * Redistributions in binary form must reproduce the above copyright
++// notice, this list of conditions and the following disclaimer in the
++// documentation and/or other materials provided with the distribution.
++// * Neither the name of NVIDIA CORPORATION nor the names of its
++// contributors may be used to endorse or promote products derived
++// from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++// This code modified from the public domain code here:
++// https://gist.github.com/rygorous/2156668
++// The URL above includes more robust conversion routines
++// that handle Inf and NaN correctly.
++//
++// It is recommended to use the more robust versions in production code.
++
++typedef unsigned uint;
++
++union FP32
++{
++ uint u;
++ float f;
++ struct
++ {
++ uint Mantissa : 23;
++ uint Exponent : 8;
++ uint Sign : 1;
++ };
++};
++
++union FP16
++{
++ unsigned short u;
++ struct
++ {
++ uint Mantissa : 10;
++ uint Exponent : 5;
++ uint Sign : 1;
++ };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++ FP32 f32infty = { 255 << 23 };
++ FP32 f16max = { (127 + 16) << 23 };
++ FP32 magic = { 15 << 23 };
++ FP32 expinf = { (255 ^ 31) << 23 };
++ uint sign_mask = 0x80000000u;
++ FP16 o = { 0 };
++
++ FP32 f = *((FP32*)&fl);
++
++ uint sign = f.u & sign_mask;
++ f.u ^= sign;
++
++ if (!(f.f < f32infty.u)) // Inf or NaN
++ o.u = f.u ^ expinf.u;
++ else
++ {
++ if (f.f > f16max.f) f.f = f16max.f;
++ f.f *= magic.f;
++ }
++
++ o.u = f.u >> 13; // Take the mantissa bits
++ o.u |= sign >> 16;
++ half tmp;
++ memcpy(&tmp, &o, sizeof(half));
++ //return *((half*)&o);
++ return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++ FP16 h;
++ memcpy(&h, &hf, sizeof(half));
++
++ static const FP32 magic = { 113 << 23 };
++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++ FP32 o;
++
++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
++ uint exp = shifted_exp & o.u; // just the exponent
++ o.u += (127 - 15) << 23; // exponent adjust
++
++ // handle exponent special cases
++ if (exp == shifted_exp) // Inf/NaN?
++ o.u += (128 - 16) << 23; // extra exp adjust
++ else if (exp == 0) // Zero/Denormal?
++ {
++ o.u += 1 << 23; // extra exp adjust
++ o.f -= magic.f; // renormalize
++ }
++
++ o.u |= (h.u & 0x8000) << 16; // sign bit
++ return o.f;
++}
++#endif
++
+ /* ////////////////////////////////////////////////////////////////////////////
+ -- Testing sgemm
+ */
+@@ -47,8 +172,13 @@
+ float c_neg_one = MAGMA_S_NEG_ONE;
+ float alpha = MAGMA_S_MAKE( 0.29, -0.86 );
+ float beta = MAGMA_S_MAKE( -0.48, 0.38 );
+- magmaHalf h_alpha = (magmaHalf)alpha;
+- magmaHalf h_beta = (magmaHalf)beta;
++ #if CUDA_VERSION >= 9020
++ const magmaHalf h_alpha = (magmaHalf) alpha;
++ const magmaHalf h_beta = (magmaHalf) beta;
++ #else
++ const magmaHalf h_alpha = approx_float_to_half(alpha);
++ const magmaHalf h_beta = approx_float_to_half(beta);
++ #endif
+ magma_opts opts;
+ opts.parse_opts( argc, argv );
+