Diffstat (limited to 'var/spack/repos/builtin/packages/magma/magma-2.5.0.patch')
-rw-r--r--  var/spack/repos/builtin/packages/magma/magma-2.5.0.patch | 428 +++++++++++++++++++++++++++++++++++++++
1 file changed, 428 insertions(+), 0 deletions(-)
diff --git a/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch b/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
new file mode 100644
index 0000000000..1ac800c5f8
--- /dev/null
+++ b/var/spack/repos/builtin/packages/magma/magma-2.5.0.patch
@@ -0,0 +1,428 @@
+diff -r 89706c0efbdb .hgtags
+--- a/.hgtags	Wed Jan 02 14:17:26 2019 -0500
++++ b/.hgtags	Wed Apr 03 15:50:54 2019 -0700
+@@ -1,3 +1,4 @@
+ 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
+ b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
+ 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
+diff -r 89706c0efbdb include/magma_types.h
+--- a/include/magma_types.h	Wed Jan 02 14:17:26 2019 -0500
++++ b/include/magma_types.h	Wed Apr 03 15:50:54 2019 -0700
+@@ -77,7 +77,7 @@
+     typedef magma_int_t magma_device_t;
+ 
+     // Half precision in CUDA
+-    #if defined(__cplusplus) && CUDA_VERSION > 7500
++    #if defined(__cplusplus) && CUDA_VERSION >= 7500
+     #include <cuda_fp16.h>
+     typedef __half magmaHalf;
+     #else
+diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
+--- a/sparse/blas/magma_zsampleselect.cu	Wed Jan 02 14:17:26 2019 -0500
++++ b/sparse/blas/magma_zsampleselect.cu	Wed Apr 03 15:50:54 2019 -0700
+@@ -15,9 +15,12 @@
+ 
+ #define PRECISION_z
+ 
++
+ namespace magma_sampleselect {
+ 
+-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
++{
++#if (__CUDA_ARCH__ >= 350)
+     auto idx = threadIdx.x + blockDim.x * blockIdx.x;
+     if (idx >= size) {
+         return;
+@@ -25,6 +28,7 @@
+ 
+     auto v = in[idx];
+     out[idx] = real(v) * real(v) + imag(v) * imag(v);
++#endif
+ }
+ 
+ } // namespace magma_sampleselect
+@@ -164,36 +168,43 @@
+     magma_queue_t queue )
+ {
+     magma_int_t info = 0;
++    magma_int_t arch = magma_getdevice_arch();
+ 
+-    auto num_blocks = magma_ceildiv(total_size, block_size);
+-    auto local_work = (total_size + num_threads - 1) / num_threads;
+-    auto required_size = sizeof(double) * (total_size + searchtree_size)
++    if( arch >= 350 ) {
++        auto num_blocks = magma_ceildiv(total_size, block_size);
++        auto local_work = (total_size + num_threads - 1) / num_threads;
++        auto required_size = sizeof(double) * (total_size + searchtree_size)
+             + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
+-    auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
++        auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
+ 
+-    double* gputmp = (double*)*tmp_ptr;
+-    double* gputree = gputmp + total_size;
+-    uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
+-    int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
+-    int32_t* gpucounts = gpurankout + 1;
+-    int32_t* gpulocalcounts = gpucounts + searchtree_width;
+-    uint32_t bucketidx{};
++        double* gputmp = (double*)*tmp_ptr;
++        double* gputree = gputmp + total_size;
++        uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
++        int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
++        int32_t* gpucounts = gpurankout + 1;
++        int32_t* gpulocalcounts = gpucounts + searchtree_width;
++        uint32_t bucketidx{};
+ 
+-    CHECK(realloc_result);
++        CHECK(realloc_result);
+ 
+-    compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (val, gputmp, total_size);
+-    build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, total_size);
+-    count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, gpulocalcounts, total_size, local_work);
+-    reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
+-        (gpulocalcounts, gpucounts, num_grouped_blocks);
+-    sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
+-        (gpucounts, subset_size, gpubucketidx, gpurankout);
+-    magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
+-    magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
+-    *thrs = std::sqrt(*thrs);
++        compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
++            (val, gputmp, total_size);
++        build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, total_size);
++        count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, gpulocalcounts, total_size, local_work);
++        reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
++            (gpulocalcounts, gpucounts, num_grouped_blocks);
++        sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
++            (gpucounts, subset_size, gpubucketidx, gpurankout);
++        magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
++        magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
++        *thrs = std::sqrt(*thrs);
++    }
++    else {
++        printf("error: this functionality needs CUDA architecture >= 3.5\n");
++        info = MAGMA_ERR_NOT_SUPPORTED;
++    }
+ 
+ cleanup:
+     return info;
+diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
+--- a/src/xhsgetrf_gpu.cpp	Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xhsgetrf_gpu.cpp	Wed Apr 03 15:50:54 2019 -0700
+@@ -16,6 +16,131 @@
+ #include <cuda_fp16.h>
+ #endif
+ 
++#if CUDA_VERSION < 9020
++// conversion float to half are not defined for host in CUDA version <9.2
++// thus uses the conversion below when CUDA VERSION is < 9.2.
++#include <string.h>
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++//  * Redistributions of source code must retain the above copyright
++//    notice, this list of conditions and the following disclaimer.
++//  * Redistributions in binary form must reproduce the above copyright
++//    notice, this list of conditions and the following disclaimer in the
++//    documentation and/or other materials provided with the distribution.
++//  * Neither the name of NVIDIA CORPORATION nor the names of its
++//    contributors may be used to endorse or promote products derived
++//    from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++// This code modified from the public domain code here:
++// https://gist.github.com/rygorous/2156668
++// The URL above includes more robust conversion routines
++// that handle Inf and NaN correctly.
++//
++// It is recommended to use the more robust versions in production code.
++
++typedef unsigned uint;
++
++union FP32
++{
++    uint u;
++    float f;
++    struct
++    {
++        uint Mantissa : 23;
++        uint Exponent : 8;
++        uint Sign : 1;
++    };
++};
++
++union FP16
++{
++    unsigned short u;
++    struct
++    {
++        uint Mantissa : 10;
++        uint Exponent : 5;
++        uint Sign : 1;
++    };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++    FP32 f32infty = { 255 << 23 };
++    FP32 f16max = { (127 + 16) << 23 };
++    FP32 magic = { 15 << 23 };
++    FP32 expinf = { (255 ^ 31) << 23 };
++    uint sign_mask = 0x80000000u;
++    FP16 o = { 0 };
++
++    FP32 f = *((FP32*)&fl);
++
++    uint sign = f.u & sign_mask;
++    f.u ^= sign;
++
++    if (!(f.f < f32infty.u)) // Inf or NaN
++        o.u = f.u ^ expinf.u;
++    else
++    {
++        if (f.f > f16max.f) f.f = f16max.f;
++        f.f *= magic.f;
++    }
++
++    o.u = f.u >> 13; // Take the mantissa bits
++    o.u |= sign >> 16;
++    half tmp;
++    memcpy(&tmp, &o, sizeof(half));
++    //return *((half*)&o);
++    return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++    FP16 h;
++    memcpy(&h, &hf, sizeof(half));
++
++    static const FP32 magic = { 113 << 23 };
++    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++    FP32 o;
++
++    o.u = (h.u & 0x7fff) << 13;    // exponent/mantissa bits
++    uint exp = shifted_exp & o.u;  // just the exponent
++    o.u += (127 - 15) << 23;       // exponent adjust
++
++    // handle exponent special cases
++    if (exp == shifted_exp) // Inf/NaN?
++        o.u += (128 - 16) << 23;   // extra exp adjust
++    else if (exp == 0) // Zero/Denormal?
++    {
++        o.u += 1 << 23;            // extra exp adjust
++        o.f -= magic.f;            // renormalize
++    }
++
++    o.u |= (h.u & 0x8000) << 16;   // sign bit
++    return o.f;
++}
++#endif
++
+ #include "magma_internal.h"
+ //#include "nvToolsExt.h"
+ 
+@@ -106,10 +231,13 @@
+     float c_one = MAGMA_S_ONE;
+     float c_neg_one = MAGMA_S_NEG_ONE;
+     #if 1
++    #if CUDA_VERSION >= 9020
+     const magmaHalf h_one = (magmaHalf) 1.0;
+     const magmaHalf h_neg_one = (magmaHalf)-1.0;
+-    //const magmaHalf h_one = approx_float_to_half(1.0);
+-    //const magmaHalf h_neg_one = approx_float_to_half(-1.0);
++    #else
++    const magmaHalf h_one = approx_float_to_half(1.0);
++    const magmaHalf h_neg_one = approx_float_to_half(-1.0);
++    #endif
+     #else
+     FP32 float_one = *((FP32*)&c_one);
+     FP16 half_one = float_to_half_full(float_one);
+diff -r 89706c0efbdb src/xshgetrf_gpu.cpp
+--- a/src/xshgetrf_gpu.cpp	Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xshgetrf_gpu.cpp	Wed Apr 03 15:50:54 2019 -0700
+@@ -92,7 +92,7 @@
+     magma_mp_type_t enable_tc,
+     magma_mp_type_t mp_algo_type )
+ {
+-#if CUDA_VERSION >= 7500
++#if CUDA_VERSION >= 9000
+ #ifdef HAVE_clBLAS
+     #define dA(i_, j_)  dA, (dA_offset + (i_) + (j_)*ldda)
+     #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
+diff -r 89706c0efbdb testing/testing_hgemm.cpp
+--- a/testing/testing_hgemm.cpp	Wed Jan 02 14:17:26 2019 -0500
++++ b/testing/testing_hgemm.cpp	Wed Apr 03 15:50:54 2019 -0700
+@@ -22,6 +22,131 @@
+ #include "magma_operators.h"
+ #include "testings.h"
+ 
++#if CUDA_VERSION < 9020
++// conversion float to half are not defined for host in CUDA version <9.2
++// thus uses the conversion below when CUDA VERSION is < 9.2.
++#include <string.h>
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++//  * Redistributions of source code must retain the above copyright
++//    notice, this list of conditions and the following disclaimer.
++//  * Redistributions in binary form must reproduce the above copyright
++//    notice, this list of conditions and the following disclaimer in the
++//    documentation and/or other materials provided with the distribution.
++//  * Neither the name of NVIDIA CORPORATION nor the names of its
++//    contributors may be used to endorse or promote products derived
++//    from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++// This code modified from the public domain code here:
++// https://gist.github.com/rygorous/2156668
++// The URL above includes more robust conversion routines
++// that handle Inf and NaN correctly.
++//
++// It is recommended to use the more robust versions in production code.
++
++typedef unsigned uint;
++
++union FP32
++{
++    uint u;
++    float f;
++    struct
++    {
++        uint Mantissa : 23;
++        uint Exponent : 8;
++        uint Sign : 1;
++    };
++};
++
++union FP16
++{
++    unsigned short u;
++    struct
++    {
++        uint Mantissa : 10;
++        uint Exponent : 5;
++        uint Sign : 1;
++    };
++};
++
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++// Approximate solution. This is faster but converts some sNaNs to
++// infinity and doesn't round correctly. Handle with care.
++static half approx_float_to_half(float fl)
++{
++    FP32 f32infty = { 255 << 23 };
++    FP32 f16max = { (127 + 16) << 23 };
++    FP32 magic = { 15 << 23 };
++    FP32 expinf = { (255 ^ 31) << 23 };
++    uint sign_mask = 0x80000000u;
++    FP16 o = { 0 };
++
++    FP32 f = *((FP32*)&fl);
++
++    uint sign = f.u & sign_mask;
++    f.u ^= sign;
++
++    if (!(f.f < f32infty.u)) // Inf or NaN
++        o.u = f.u ^ expinf.u;
++    else
++    {
++        if (f.f > f16max.f) f.f = f16max.f;
++        f.f *= magic.f;
++    }
++
++    o.u = f.u >> 13; // Take the mantissa bits
++    o.u |= sign >> 16;
++    half tmp;
++    memcpy(&tmp, &o, sizeof(half));
++    //return *((half*)&o);
++    return tmp;
++}
++
++// from half->float code - just for verification.
++static float half_to_float(half hf)
++{
++    FP16 h;
++    memcpy(&h, &hf, sizeof(half));
++
++    static const FP32 magic = { 113 << 23 };
++    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
++    FP32 o;
++
++    o.u = (h.u & 0x7fff) << 13;    // exponent/mantissa bits
++    uint exp = shifted_exp & o.u;  // just the exponent
++    o.u += (127 - 15) << 23;       // exponent adjust
++
++    // handle exponent special cases
++    if (exp == shifted_exp) // Inf/NaN?
++        o.u += (128 - 16) << 23;   // extra exp adjust
++    else if (exp == 0) // Zero/Denormal?
++    {
++        o.u += 1 << 23;            // extra exp adjust
++        o.f -= magic.f;            // renormalize
++    }
++
++    o.u |= (h.u & 0x8000) << 16;   // sign bit
++    return o.f;
++}
++#endif
++
+ /* ////////////////////////////////////////////////////////////////////////////
+    -- Testing sgemm
+ */
+@@ -47,8 +172,13 @@
+     float c_neg_one = MAGMA_S_NEG_ONE;
+     float alpha = MAGMA_S_MAKE(  0.29, -0.86 );
+     float beta  = MAGMA_S_MAKE( -0.48,  0.38 );
+-    magmaHalf h_alpha = (magmaHalf)alpha;
+-    magmaHalf h_beta  = (magmaHalf)beta;
++    #if CUDA_VERSION >= 9020
++    const magmaHalf h_alpha = (magmaHalf) alpha;
++    const magmaHalf h_beta  = (magmaHalf) beta;
++    #else
++    const magmaHalf h_alpha = approx_float_to_half(alpha);
++    const magmaHalf h_beta  = approx_float_to_half(beta);
++    #endif
+     magma_opts opts;
+     opts.parse_opts( argc, argv );
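Note on the sparse/blas/magma_zsampleselect.cu hunks: they apply a common CUDA double guard. The kernel body is compiled only for sm_35 and newer via __CUDA_ARCH__, and the host wrapper checks the device's capability at run time before launching, returning MAGMA_ERR_NOT_SUPPORTED otherwise. Below is a minimal self-contained sketch of that pattern outside of MAGMA; scale_kernel and check_and_launch are hypothetical names, and the arch encoding (major*100 + minor*10) mirrors what magma_getdevice_arch() reports.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical kernel: the body exists only when compiled for sm_35+,
    // mirroring the #if (__CUDA_ARCH__ >= 350) guard in compute_abs above.
    __global__ void scale_kernel(double* out, const double* in, int n)
    {
    #if (__CUDA_ARCH__ >= 350)
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx < n) out[idx] = 2.0 * in[idx];
    #endif
    }

    // Hypothetical host wrapper: refuse to launch on devices older than
    // sm_35, the same run-time check the patch adds to the sampleselect path.
    int check_and_launch(double* out, const double* in, int n)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
            return -1;
        int arch = prop.major * 100 + prop.minor * 10;  // e.g. sm_3.5 -> 350
        if (arch < 350) {
            printf("error: this functionality needs CUDA architecture >= 3.5\n");
            return -1;  // MAGMA returns MAGMA_ERR_NOT_SUPPORTED here
        }
        scale_kernel<<<(n + 255) / 256, 256>>>(out, in, n);
        return 0;
    }

Compiling the body out for older architectures keeps the binary loadable on them; the run-time check then turns what would be a silent no-op launch into an explicit error.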
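Note on the approx_float_to_half/half_to_float fallback: CUDA exposes host-side float-to-__half conversion only from version 9.2, so on older toolkits the 16-bit pattern must be assembled by hand. As a rough illustration of the bit manipulation involved (a hypothetical sketch, not the patch's NVIDIA-derived routine), float_to_half_basic below rebiases the 8-bit float exponent to half's 5 bits and truncates the 23-bit mantissa to 10 bits; it handles only normal, in-range values and omits the rounding and Inf/NaN care that the robust versions at https://gist.github.com/rygorous/2156668 provide.

    #include <cstdio>
    #include <cstring>
    #include <cstdint>

    // Stand-in for CUDA's __half so the sketch compiles without the toolkit.
    struct half16 { uint16_t bits; };

    // Hypothetical helper: float -> half for normal, in-range values only
    // (no rounding, no Inf/NaN/denormal handling).
    static half16 float_to_half_basic(float f)
    {
        uint32_t x;
        std::memcpy(&x, &f, sizeof x);                    // well-defined type pun
        uint16_t sign = (uint16_t)((x >> 16) & 0x8000u);  // move sign to bit 15
        int32_t  exp  = (int32_t)((x >> 23) & 0xFF) - 127 + 15;  // rebias 8 -> 5 bits
        uint16_t man  = (uint16_t)((x >> 13) & 0x3FFu);   // top 10 mantissa bits
        half16 h;
        h.bits = (uint16_t)(sign | (uint16_t)(exp << 10) | man);
        return h;
    }

    int main()
    {
        // Half-precision 1.0 is 0x3C00 and -1.0 is 0xBC00, the constants the
        // patch materializes as h_one / h_neg_one when CUDA_VERSION < 9020.
        std::printf("1.0f  -> 0x%04X\n", (unsigned)float_to_half_basic( 1.0f).bits);
        std::printf("-1.0f -> 0x%04X\n", (unsigned)float_to_half_basic(-1.0f).bits);
        return 0;
    }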