From d2c2e000a77d1d0f59a311a1e0ad49f8498d787a Mon Sep 17 00:00:00 2001
From: kolamsrinivas <67026525+kolamsrinivas@users.noreply.github.com>
Date: Wed, 15 Jul 2020 16:45:22 -0700
Subject: changes to py-torch recipe to enable rocm build (#17410)

* changes to recipe to enable rocm build

* fixing flake8 issue

* addressed the review comment
---
 .../repos/builtin/packages/py-torch/package.py | 15 +++-
 .../repos/builtin/packages/py-torch/rocm.patch | 98 ++++++++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 var/spack/repos/builtin/packages/py-torch/rocm.patch

diff --git a/var/spack/repos/builtin/packages/py-torch/package.py b/var/spack/repos/builtin/packages/py-torch/package.py
index 1ffeb81b23..bebf83d3d6 100644
--- a/var/spack/repos/builtin/packages/py-torch/package.py
+++ b/var/spack/repos/builtin/packages/py-torch/package.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
 from spack import *
+import os
 
 
 class PyTorch(PythonPackage, CudaPackage):
@@ -68,6 +69,7 @@ class PyTorch(PythonPackage, CudaPackage):
     variant('cuda', default=True, description='Build with CUDA')
     variant('cudnn', default=True, description='Enables the cuDNN build')
+    variant('rocm', default=False, description='Build with ROCm')
     variant('magma', default=False, description='Enables the MAGMA build')
     variant('fbgemm', default=False, description='Enables the FBGEMM build')
     variant('test', default=False, description='Enables the test build')
@@ -112,6 +114,7 @@ class PyTorch(PythonPackage, CudaPackage):
     conflicts('cuda_arch=none', when='+cuda',
               msg='Must specify CUDA compute capabilities of your GPU, see '
               'https://developer.nvidia.com/cuda-gpus')
+    conflicts('+rocm', when='+cuda')
 
     # Required dependencies
     depends_on('cmake@3.5:', type='build')
@@ -173,6 +176,9 @@ class PyTorch(PythonPackage, CudaPackage):
     # Fixes CMake configuration error when XNNPACK is disabled
     patch('xnnpack.patch', when='@1.5.0:1.5.999')
 
+    # Fixes build error when ROCm is enabled for the pytorch-1.5 release
+    patch('rocm.patch', when='@1.5.0:1.5.999+rocm')
+
     # https://github.com/pytorch/pytorch/pull/37086
     # Fixes compilation with Clang 9.0.0 and Apple Clang 11.0.3
     patch('https://github.com/pytorch/pytorch/commit/e921cd222a8fbeabf5a3e74e83e0d8dfb01aa8b5.patch',
@@ -244,7 +250,9 @@ class PyTorch(PythonPackage, CudaPackage):
 
         enable_or_disable('fbgemm')
         enable_or_disable('test', keyword='BUILD')
-
+        enable_or_disable('rocm')
+        if '+rocm' in self.spec:
+            env.set('USE_MKLDNN', 0)
         if '+miopen' in self.spec:
             env.set('MIOPEN_LIB_DIR', self.spec['miopen'].libs.directories[0])
             env.set('MIOPEN_INCLUDE_DIR', self.spec['miopen'].prefix.include)
@@ -297,6 +305,11 @@ class PyTorch(PythonPackage, CudaPackage):
         enable_or_disable('zstd', newer=True)
         enable_or_disable('tbb', newer=True)
 
+    @run_before('install')
+    def build_amd(self):
+        if '+rocm' in self.spec:
+            python(os.path.join('tools', 'amd_build', 'build_amd.py'))
+
     def install_test(self):
         with working_dir('test'):
             python('run_test.py')

diff --git a/var/spack/repos/builtin/packages/py-torch/rocm.patch b/var/spack/repos/builtin/packages/py-torch/rocm.patch
new file mode 100644
index 0000000000..b50cc7e159
--- /dev/null
+++ b/var/spack/repos/builtin/packages/py-torch/rocm.patch
@@ -0,0 +1,98 @@
+diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
+index 9cd678dfb4cc7..4630465115c7c 100644
+--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
++++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
+@@ -67,6 +67,14 @@ namespace at { namespace cuda {
+ //
+ // HIP doesn't have
+ // cuGetErrorString (maps to non-functional hipGetErrorString___)
++//
++// HIP from ROCm 3.5 on renamed hipOccupancyMaxActiveBlocksPerMultiprocessor
++// to hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
++#if HIP_VERSION < 305
++#define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipOccupancyMaxActiveBlocksPerMultiprocessor
++#else
++#define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR cuOccupancyMaxActiveBlocksPerMultiprocessor
++#endif
+ 
+ #define AT_FORALL_NVRTC(_) \
+   _(nvrtcVersion) \
+@@ -76,7 +84,7 @@ namespace at { namespace cuda {
+   _(nvrtcGetPTX) \
+   _(cuModuleLoadData) \
+   _(cuModuleGetFunction) \
+-  _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \
++  _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR)\
+   _(nvrtcGetErrorString) \
+   _(nvrtcGetProgramLogSize) \
+   _(nvrtcGetProgramLog) \
+diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu
+index da1995123ecfc..f935eb4ef3d0e 100644
+--- a/aten/src/ATen/native/cuda/SoftMax.cu
++++ b/aten/src/ATen/native/cuda/SoftMax.cu
+@@ -127,8 +127,8 @@ void SpatialSoftMax_getLaunchSizes(
+   uint32_t block_threads = block.x * block.y;
+   smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t);
+   int max_active_blocks;
+-#ifdef __HIP_PLATFORM_HCC__
+-  // XXX HIP function signature is not compatible yet.
++#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION < 305
++  // HIP function signature is not compatible yet.
+   uint32_t max_blocks;
+   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks,
+                                                 k, block_threads, smem_size);
+diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+index 5586e49919727..27315ee475277 100644
+--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
++++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+@@ -140,10 +140,10 @@ FusedKernelCUDA::FusedKernelCUDA(
+       nvrtc().cuModuleGetFunction(&function_, module_, name_.c_str()));
+ 
+   // Computes max blocks
+-#ifdef __HIP_PLATFORM_HCC__
+-  // XXX HIP function signature is not compatible yet
++#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION < 305
++  // HIP function signature is not compatible yet
+   uint32_t max_blocks;
+-  AT_CUDA_DRIVER_CHECK(nvrtc().cuOccupancyMaxActiveBlocksPerMultiprocessor(
++  AT_CUDA_DRIVER_CHECK(nvrtc().hipOccupancyMaxActiveBlocksPerMultiprocessor(
+       &max_blocks, function_, 128, 0));
+   maxBlocks_ = max_blocks;
+ #else
+diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
+index 7e21363cbe6af..26f269d92ae38 100644
+--- a/torch/utils/hipify/cuda_to_hip_mappings.py
++++ b/torch/utils/hipify/cuda_to_hip_mappings.py
+@@ -2890,7 +2890,7 @@
+     (
+         "cuOccupancyMaxActiveBlocksPerMultiprocessor",
+         (
+-            "hipOccupancyMaxActiveBlocksPerMultiprocessor",
++            "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor",
+             CONV_OCCUPANCY,
+             API_DRIVER,
+         ),
+@@ -2898,7 +2898,7 @@
+     (
+         "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags",
+         (
+-            "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags",
++            "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags",
+             CONV_OCCUPANCY,
+             API_DRIVER,
+             HIP_UNSUPPORTED,
+@@ -2906,12 +2906,12 @@
+     ),
+     (
+         "cuOccupancyMaxPotentialBlockSize",
+-        ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER),
++        ("hipModuleOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER),
+     ),
+     (
+         "cuOccupancyMaxPotentialBlockSizeWithFlags",
+         (
+-            "hipOccupancyMaxPotentialBlockSizeWithFlags",
"hipModuleOccupancyMaxPotentialBlockSizeWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, -- cgit v1.2.3-60-g2f50