From 878e89eceae69652a8d9a25abda1c8585b0ff08c Mon Sep 17 00:00:00 2001
From: Harsh Bhatia
Date: Fri, 16 Oct 2020 00:47:22 -0700
Subject: faiss: added new package at v1.6.3 (#17290)

Co-authored-by: Massimiliano Culpo
---
 .../builtin/packages/faiss/fixes-in-v1.5.3.patch   |   51 +
 .../builtin/packages/faiss/fixes-in-v1.6.3.patch   | 1239 ++++++++++++++++++++
 var/spack/repos/builtin/packages/faiss/package.py  |  126 ++
 3 files changed, 1416 insertions(+)
 create mode 100644 var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch
 create mode 100644 var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch
 create mode 100644 var/spack/repos/builtin/packages/faiss/package.py

diff --git a/var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch
new file mode 100644
index 0000000000..9b164c28a1
--- /dev/null
+++ b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch
@@ -0,0 +1,51 @@
+From 6a581e31bc461654460452be5255083ae2d33a6f Mon Sep 17 00:00:00 2001
+From: Harsh Bhatia
+Date: Mon, 29 Jun 2020 10:59:33 -0700
+Subject: [PATCH] fixed v1.5.3
+
+---
+ faiss             | 1 +
+ gpu/test/Makefile | 4 ++++
+ tests/Makefile    | 2 +-
+ 3 files changed, 6 insertions(+), 1 deletion(-)
+ create mode 120000 faiss
+
+diff --git a/faiss b/faiss
+new file mode 120000
+index 0000000..945c9b4
+--- /dev/null
++++ b/faiss
+@@ -0,0 +1 @@
++.
+\ No newline at end of file
+diff --git a/gpu/test/Makefile b/gpu/test/Makefile
+index 6836314..8aa67e8 100644
+--- a/gpu/test/Makefile
++++ b/gpu/test/Makefile
+@@ -17,6 +17,10 @@ TESTS_BIN = $(TESTS_OBJ:.o=) $(CUDA_TESTS_OBJ:.o=)
+ 
+ # test_gpu_index.py test_pytorch_faiss.py
+ 
++build: $(TESTS_BIN)
++TestUtils.o: TestUtils.cpp
++	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ ../../libfaiss.a -Igtest/include -I../..
++
+ run: $(TESTS_BIN) $(CUDA_TESTS_BIN)
+ 	for t in $(TESTS_BIN) $(CUDA_TESTS_BIN); do ./$$t || exit; done
+ 
+diff --git a/tests/Makefile b/tests/Makefile
+index c46c292..d22d70d 100644
+--- a/tests/Makefile
++++ b/tests/Makefile
+@@ -18,7 +18,7 @@ tests: $(TESTS_OBJ) ../libfaiss.a gtest/make/gtest_main.a
+ 	$(CXX) -o $@ $^ $(LDFLAGS) $(LIBS)
+ 
+ %.o: %.cpp gtest
+-	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c -o $@ $< -Igtest/include -I../..
++	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c -o $@ $< -Igtest/include -I../.. -I..
+ + gtest/make/gtest_main.a: gtest + $(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest_main.a +-- +1.8.3.1 + diff --git a/var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch new file mode 100644 index 0000000000..0916bb9225 --- /dev/null +++ b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch @@ -0,0 +1,1239 @@ +From 931f610a2075301b26d3d811bdbf30e4394da614 Mon Sep 17 00:00:00 2001 +From: Harsh Bhatia +Date: Sun, 28 Jun 2020 15:09:17 -0700 +Subject: [PATCH] fixes in v1.6.3 + +--- + gpu/impl/PQCodeDistances.cu | 567 ----------------------------- + gpu/impl/PQScanMultiPassNoPrecomputed.cu | 597 ------------------------------- + gpu/test/Makefile | 10 +- + gpu/test/demo_ivfpq_indexing_gpu.cpp | 1 + + 4 files changed, 8 insertions(+), 1167 deletions(-) + delete mode 100644 gpu/impl/PQCodeDistances.cu + delete mode 100644 gpu/impl/PQScanMultiPassNoPrecomputed.cu + +diff --git a/gpu/impl/PQCodeDistances.cu b/gpu/impl/PQCodeDistances.cu +deleted file mode 100644 +index 817990b..0000000 +--- a/gpu/impl/PQCodeDistances.cu ++++ /dev/null +@@ -1,567 +0,0 @@ +-/** +- * Copyright (c) Facebook, Inc. and its affiliates. +- * +- * This source code is licensed under the MIT license found in the +- * LICENSE file in the root directory of this source tree. +- */ +- +- +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-namespace faiss { namespace gpu { +- +-template +-struct Converter { +-}; +- +-template <> +-struct Converter { +- inline static __device__ half to(float v) { return __float2half(v); } +-}; +- +-template <> +-struct Converter { +- inline static __device__ float to(float v) { return v; } +-}; +- +-// Kernel responsible for calculating distance from residual vector to +-// each product quantizer code centroid +-template +-__global__ void +-__launch_bounds__(288, 4) +-pqCodeDistances(Tensor queries, +- int queriesPerBlock, +- Tensor coarseCentroids, +- Tensor pqCentroids, +- Tensor topQueryToCentroid, +- // (query id)(coarse)(subquantizer)(code) -> dist +- Tensor outCodeDistances) { +- const auto numSubQuantizers = pqCentroids.getSize(0); +- const auto dimsPerSubQuantizer = pqCentroids.getSize(1); +- assert(DimsPerSubQuantizer == dimsPerSubQuantizer); +- const auto codesPerSubQuantizer = pqCentroids.getSize(2); +- +- bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer; +- int loadingThreadId = threadIdx.x - codesPerSubQuantizer; +- +- extern __shared__ float smem[]; +- +- // Each thread calculates a single code +- float subQuantizerData[DimsPerSubQuantizer]; +- +- auto code = threadIdx.x; +- auto subQuantizer = blockIdx.y; +- +- // Each thread will load the pq centroid data for the code that it +- // is processing +-#pragma unroll +- for (int i = 0; i < DimsPerSubQuantizer; ++i) { +- subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg(); +- } +- +- // Where we store our query vector +- float* smemQuery = smem; +- +- // Where we store our residual vector; this is double buffered so we +- // can be loading the next one while processing the current one +- float* smemResidual1 = &smemQuery[DimsPerSubQuantizer]; +- float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer]; +- +- // Where we pre-load the coarse centroid IDs +- int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer]; +- +- // Each thread is calculating the distance for a single code, +- // performing the reductions locally +- +- // 
Handle multiple queries per block +- auto startQueryId = blockIdx.x * queriesPerBlock; +- auto numQueries = queries.getSize(0) - startQueryId; +- if (numQueries > queriesPerBlock) { +- numQueries = queriesPerBlock; +- } +- +- for (int query = 0; query < numQueries; ++query) { +- auto queryId = startQueryId + query; +- +- auto querySubQuantizer = +- queries[queryId][subQuantizer * DimsPerSubQuantizer].data(); +- +- // Load current query vector +- for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) { +- smemQuery[i] = querySubQuantizer[i]; +- } +- +- // Load list of coarse centroids found +- for (int i = threadIdx.x; +- i < topQueryToCentroid.getSize(1); i += blockDim.x) { +- coarseIds[i] = topQueryToCentroid[queryId][i]; +- } +- +- // We need coarseIds below +- // FIXME: investigate loading separately, so we don't need this +- __syncthreads(); +- +- // Preload first buffer of residual data +- if (isLoadingThread) { +- for (int i = loadingThreadId; +- i < DimsPerSubQuantizer; +- i += blockDim.x - codesPerSubQuantizer) { +- auto coarseId = coarseIds[0]; +- // In case NaNs were in the original query data +- coarseId = coarseId == -1 ? 0 : coarseId; +- auto coarseCentroidSubQuantizer = +- coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); +- +- if (L2Distance) { +- smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; +- } else { +- smemResidual1[i] = coarseCentroidSubQuantizer[i]; +- } +- } +- } +- +- // The block walks the list for a single query +- for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) { +- // Wait for smemResidual1 to be loaded +- __syncthreads(); +- +- if (isLoadingThread) { +- // Preload second buffer of residual data +- for (int i = loadingThreadId; +- i < DimsPerSubQuantizer; +- i += blockDim.x - codesPerSubQuantizer) { +- // FIXME: try always making this centroid id 0 so we can +- // terminate +- if (coarse != (topQueryToCentroid.getSize(1) - 1)) { +- auto coarseId = coarseIds[coarse + 1]; +- // In case NaNs were in the original query data +- coarseId = coarseId == -1 ? 
0 : coarseId; +- +- auto coarseCentroidSubQuantizer = +- coarseCentroids[coarseId] +- [subQuantizer * dimsPerSubQuantizer].data(); +- +- if (L2Distance) { +- smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; +- } else { +- smemResidual2[i] = coarseCentroidSubQuantizer[i]; +- } +- } +- } +- } else { +- // These are the processing threads +- float dist = 0.0f; +- +- constexpr int kUnroll = 4; +- constexpr int kRemainder = DimsPerSubQuantizer % kUnroll; +- constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder; +- float vals[kUnroll]; +- +- // Calculate residual - pqCentroid for each dim that we're +- // processing +- +- // Unrolled loop +- if (L2Distance) { +-#pragma unroll +- for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] = smemResidual1[i * kUnroll + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] -= subQuantizerData[i * kUnroll + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] *= vals[j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- dist += vals[j]; +- } +- } +- } else { +- // Inner product: query slice against the reconstructed sub-quantizer +- // for this coarse cell (query o (centroid + subQCentroid)) +-#pragma unroll +- for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] = smemResidual1[i * kUnroll + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] += subQuantizerData[i * kUnroll + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] *= smemQuery[i * kUnroll + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- dist += vals[j]; +- } +- } +- } +- +- // Remainder loop +- if (L2Distance) { +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- vals[j] = smemResidual1[kRemainderBase + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- vals[j] -= subQuantizerData[kRemainderBase + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- vals[j] *= vals[j]; +- } +- } else { +- // Inner product +- // Inner product: query slice against the reconstructed sub-quantizer +- // for this coarse cell (query o (centroid + subQCentroid)) +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- vals[j] = smemResidual1[kRemainderBase + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- vals[j] += subQuantizerData[kRemainderBase + j]; +- } +- +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- vals[j] *= smemQuery[kRemainderBase + j]; +- } +- } +- +-#pragma unroll +- for (int j = 0; j < kRemainder; ++j) { +- dist += vals[j]; +- } +- +- // We have the distance for our code; write it out +- outCodeDistances[queryId][coarse][subQuantizer][code] = +- Converter::to(dist); +- } // !isLoadingThread +- +- // Swap residual buffers +- float* tmp = smemResidual1; +- smemResidual1 = smemResidual2; +- smemResidual2 = tmp; +- } +- } +-} +- +-__global__ void +-residualVector(Tensor queries, +- Tensor coarseCentroids, +- Tensor topQueryToCentroid, +- int numSubDim, +- // output is transposed: +- // (sub q)(query id)(centroid id)(sub dim) +- Tensor residual) { +- // block x is query id +- // block y is centroid id +- // thread x is dim +- auto queryId = blockIdx.x; +- auto centroidId = blockIdx.y; +- +- int realCentroidId = topQueryToCentroid[queryId][centroidId]; +- +- for (int dim = threadIdx.x; dim < 
queries.getSize(1); dim += blockDim.x) { +- float q = queries[queryId][dim]; +- float c = coarseCentroids[realCentroidId][dim]; +- +- residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = +- q - c; +- } +-} +- +-void +-runResidualVector(Tensor& pqCentroids, +- Tensor& queries, +- Tensor& coarseCentroids, +- Tensor& topQueryToCentroid, +- Tensor& residual, +- cudaStream_t stream) { +- auto grid = +- dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1)); +- auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); +- +- residualVector<<>>( +- queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1), +- residual); +- +- CUDA_TEST_ERROR(); +-} +- +-void +-runPQCodeDistancesMM(Tensor& pqCentroids, +- Tensor& queries, +- Tensor& coarseCentroids, +- Tensor& topQueryToCentroid, +- NoTypeTensor<4, true>& outCodeDistances, +- bool useFloat16Lookup, +- DeviceMemory& mem, +- cublasHandle_t handle, +- cudaStream_t stream) { +- // Calculate (q - c) residual vector +- // (sub q)(query id)(centroid id)(sub dim) +- DeviceTensor residual( +- mem, +- {pqCentroids.getSize(0), +- topQueryToCentroid.getSize(0), +- topQueryToCentroid.getSize(1), +- pqCentroids.getSize(1)}, +- stream); +- +- runResidualVector(pqCentroids, queries, +- coarseCentroids, topQueryToCentroid, +- residual, stream); +- +- // Calculate ||q - c||^2 +- DeviceTensor residualNorms( +- mem, +- {pqCentroids.getSize(0) * +- topQueryToCentroid.getSize(0) * +- topQueryToCentroid.getSize(1)}, +- stream); +- +- auto residualView2 = residual.view<2>( +- {pqCentroids.getSize(0) * +- topQueryToCentroid.getSize(0) * +- topQueryToCentroid.getSize(1), +- pqCentroids.getSize(1)}); +- +- runL2Norm(residualView2, true, residualNorms, true, stream); +- +- // Perform a batch MM: +- // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} => +- // (sub q) x {(q * c)(code)} +- auto residualView3 = residual.view<3>( +- {pqCentroids.getSize(0), +- topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), +- pqCentroids.getSize(1)}); +- +- DeviceTensor residualDistance( +- mem, +- {pqCentroids.getSize(0), +- topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), +- pqCentroids.getSize(2)}, +- stream); +- +- runIteratedMatrixMult(residualDistance, false, +- residualView3, false, +- pqCentroids, false, +- -2.0f, 0.0f, +- handle, +- stream); +- +- // Sum ||q - c||^2 along rows +- auto residualDistanceView2 = residualDistance.view<2>( +- {pqCentroids.getSize(0) * +- topQueryToCentroid.getSize(0) * +- topQueryToCentroid.getSize(1), +- pqCentroids.getSize(2)}); +- +- runSumAlongRows(residualNorms, residualDistanceView2, false, stream); +- +- Tensor outCodeDistancesF; +- DeviceTensor outCodeDistancesFloatMem; +- +- if (useFloat16Lookup) { +- outCodeDistancesFloatMem = DeviceTensor( +- mem, {outCodeDistances.getSize(0), +- outCodeDistances.getSize(1), +- outCodeDistances.getSize(2), +- outCodeDistances.getSize(3)}, +- stream); +- +- outCodeDistancesF = outCodeDistancesFloatMem; +- } else { +- outCodeDistancesF = outCodeDistances.toTensor(); +- } +- +- // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which +- // is where we build our output distances) +- auto outCodeDistancesView = outCodeDistancesF.view<3>( +- {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), +- outCodeDistances.getSize(2), +- outCodeDistances.getSize(3)}); +- +- runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream); +- +- // Calculate code norms per each sub-dim +- // (sub q)(sub dim)(code) is 
pqCentroids +- // transpose to (sub q)(code)(sub dim) +- DeviceTensor pqCentroidsTranspose( +- mem, +- {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)}, +- stream); +- +- runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream); +- +- auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>( +- {pqCentroids.getSize(0) * pqCentroids.getSize(2), +- pqCentroids.getSize(1)}); +- +- DeviceTensor pqCentroidsNorm( +- mem, +- {pqCentroids.getSize(0) * pqCentroids.getSize(2)}, +- stream); +- +- runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream); +- +- // View output as (q * c)(sub q * code), and add centroid norm to +- // each row +- auto outDistancesCodeViewCols = outCodeDistancesView.view<2>( +- {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), +- outCodeDistances.getSize(2) * outCodeDistances.getSize(3)}); +- +- runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); +- +- if (useFloat16Lookup) { +- // Need to convert back +- auto outCodeDistancesH = outCodeDistances.toTensor(); +- convertTensor(stream, +- outCodeDistancesF, +- outCodeDistancesH); +- } +-} +- +-void +-runPQCodeDistances(Tensor& pqCentroids, +- Tensor& queries, +- Tensor& coarseCentroids, +- Tensor& topQueryToCentroid, +- NoTypeTensor<4, true>& outCodeDistances, +- bool l2Distance, +- bool useFloat16Lookup, +- cudaStream_t stream) { +- const auto numSubQuantizers = pqCentroids.getSize(0); +- const auto dimsPerSubQuantizer = pqCentroids.getSize(1); +- const auto codesPerSubQuantizer = pqCentroids.getSize(2); +- +- // FIXME: tune +- // Reuse of pq centroid data is based on both # of queries * nprobe, +- // and we should really be tiling in both dimensions +- constexpr int kQueriesPerBlock = 8; +- +- auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock), +- numSubQuantizers); +- +- // Reserve one block of threads for double buffering +- // FIXME: probably impractical for large # of dims? 
+- auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize); +- auto block = dim3(codesPerSubQuantizer + loadingThreads); +- +- auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) +- + topQueryToCentroid.getSize(1) * sizeof(int); +- +-#define RUN_CODE(DIMS, L2) \ +- do { \ +- if (useFloat16Lookup) { \ +- auto outCodeDistancesT = outCodeDistances.toTensor(); \ +- \ +- pqCodeDistances<<>>( \ +- queries, kQueriesPerBlock, \ +- coarseCentroids, pqCentroids, \ +- topQueryToCentroid, outCodeDistancesT); \ +- } else { \ +- auto outCodeDistancesT = outCodeDistances.toTensor(); \ +- \ +- pqCodeDistances<<>>( \ +- queries, kQueriesPerBlock, \ +- coarseCentroids, pqCentroids, \ +- topQueryToCentroid, outCodeDistancesT); \ +- } \ +- } while (0) +- +-#define CODE_L2(DIMS) \ +- do { \ +- if (l2Distance) { \ +- RUN_CODE(DIMS, true); \ +- } else { \ +- RUN_CODE(DIMS, false); \ +- } \ +- } while (0) +- +- switch (dimsPerSubQuantizer) { +- case 1: +- CODE_L2(1); +- break; +- case 2: +- CODE_L2(2); +- break; +- case 3: +- CODE_L2(3); +- break; +- case 4: +- CODE_L2(4); +- break; +- case 6: +- CODE_L2(6); +- break; +- case 8: +- CODE_L2(8); +- break; +- case 10: +- CODE_L2(10); +- break; +- case 12: +- CODE_L2(12); +- break; +- case 16: +- CODE_L2(16); +- break; +- case 20: +- CODE_L2(20); +- break; +- case 24: +- CODE_L2(24); +- break; +- case 28: +- CODE_L2(28); +- break; +- case 32: +- CODE_L2(32); +- break; +- // FIXME: larger sizes require too many registers - we need the +- // MM implementation working +- default: +- FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer " +- "not currently supported"); +- } +- +-#undef RUN_CODE +-#undef CODE_L2 +- +- CUDA_TEST_ERROR(); +-} +- +-} } // namespace +diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/gpu/impl/PQScanMultiPassNoPrecomputed.cu +deleted file mode 100644 +index a514694..0000000 +--- a/gpu/impl/PQScanMultiPassNoPrecomputed.cu ++++ /dev/null +@@ -1,597 +0,0 @@ +-/** +- * Copyright (c) Facebook, Inc. and its affiliates. +- * +- * This source code is licensed under the MIT license found in the +- * LICENSE file in the root directory of this source tree. +- */ +- +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +- +-namespace faiss { namespace gpu { +- +-// This must be kept in sync with PQCodeDistances.cu +-bool isSupportedNoPrecomputedSubDimSize(int dims) { +- switch (dims) { +- case 1: +- case 2: +- case 3: +- case 4: +- case 6: +- case 8: +- case 10: +- case 12: +- case 16: +- case 20: +- case 24: +- case 28: +- case 32: +- return true; +- default: +- // FIXME: larger sizes require too many registers - we need the +- // MM implementation working +- return false; +- } +-} +- +-template +-struct LoadCodeDistances { +- static inline __device__ void load(LookupT* smem, +- LookupT* codes, +- int numCodes) { +- constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); +- +- // We can only use the vector type if the data is guaranteed to be +- // aligned. The codes are innermost, so if it is evenly divisible, +- // then any slice will be aligned. 
+- if (numCodes % kWordSize == 0) { +- // Load the data by float4 for efficiency, and then handle any remainder +- // limitVec is the number of whole vec words we can load, in terms +- // of whole blocks performing the load +- constexpr int kUnroll = 2; +- int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); +- limitVec *= kUnroll * blockDim.x; +- +- LookupVecT* smemV = (LookupVecT*) smem; +- LookupVecT* codesV = (LookupVecT*) codes; +- +- for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { +- LookupVecT vals[kUnroll]; +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] = +- LoadStore::load(&codesV[i + j * blockDim.x]); +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); +- } +- } +- +- // This is where we start loading the remainder that does not evenly +- // fit into kUnroll x blockDim.x +- int remainder = limitVec * kWordSize; +- +- for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { +- smem[i] = codes[i]; +- } +- } else { +- // Potential unaligned load +- constexpr int kUnroll = 4; +- +- int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); +- +- int i = threadIdx.x; +- for (; i < limit; i += kUnroll * blockDim.x) { +- LookupT vals[kUnroll]; +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- vals[j] = codes[i + j * blockDim.x]; +- } +- +-#pragma unroll +- for (int j = 0; j < kUnroll; ++j) { +- smem[i + j * blockDim.x] = vals[j]; +- } +- } +- +- for (; i < numCodes; i += blockDim.x) { +- smem[i] = codes[i]; +- } +- } +- } +-}; +- +-template +-__global__ void +-pqScanNoPrecomputedMultiPass(Tensor queries, +- Tensor pqCentroids, +- Tensor topQueryToCentroid, +- Tensor codeDistances, +- void** listCodes, +- int* listLengths, +- Tensor prefixSumOffsets, +- Tensor distance) { +- const auto codesPerSubQuantizer = pqCentroids.getSize(2); +- +- // Where the pq code -> residual distance is stored +- extern __shared__ char smemCodeDistances[]; +- LookupT* codeDist = (LookupT*) smemCodeDistances; +- +- // Each block handles a single query +- auto queryId = blockIdx.y; +- auto probeId = blockIdx.x; +- +- // This is where we start writing out data +- // We ensure that before the array (at offset -1), there is a 0 value +- int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); +- float* distanceOut = distance[outBase].data(); +- +- auto listId = topQueryToCentroid[queryId][probeId]; +- // Safety guard in case NaNs in input cause no list ID to be generated +- if (listId == -1) { +- return; +- } +- +- unsigned char* codeList = (unsigned char*) listCodes[listId]; +- int limit = listLengths[listId]; +- +- constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 
1 : +- (NumSubQuantizers / 4); +- unsigned int code32[kNumCode32]; +- unsigned int nextCode32[kNumCode32]; +- +- // We double-buffer the code loading, which improves memory utilization +- if (threadIdx.x < limit) { +- LoadCode32::load(code32, codeList, threadIdx.x); +- } +- +- LoadCodeDistances::load( +- codeDist, +- codeDistances[queryId][probeId].data(), +- codeDistances.getSize(2) * codeDistances.getSize(3)); +- +- // Prevent WAR dependencies +- __syncthreads(); +- +- // Each thread handles one code element in the list, with a +- // block-wide stride +- for (int codeIndex = threadIdx.x; +- codeIndex < limit; +- codeIndex += blockDim.x) { +- // Prefetch next codes +- if (codeIndex + blockDim.x < limit) { +- LoadCode32::load( +- nextCode32, codeList, codeIndex + blockDim.x); +- } +- +- float dist = 0.0f; +- +-#pragma unroll +- for (int word = 0; word < kNumCode32; ++word) { +- constexpr int kBytesPerCode32 = +- NumSubQuantizers < 4 ? NumSubQuantizers : 4; +- +- if (kBytesPerCode32 == 1) { +- auto code = code32[0]; +- dist = ConvertTo::to(codeDist[code]); +- +- } else { +-#pragma unroll +- for (int byte = 0; byte < kBytesPerCode32; ++byte) { +- auto code = getByte(code32[word], byte * 8, 8); +- +- auto offset = +- codesPerSubQuantizer * (word * kBytesPerCode32 + byte); +- +- dist += ConvertTo::to(codeDist[offset + code]); +- } +- } +- } +- +- // Write out intermediate distance result +- // We do not maintain indices here, in order to reduce global +- // memory traffic. Those are recovered in the final selection step. +- distanceOut[codeIndex] = dist; +- +- // Rotate buffers +-#pragma unroll +- for (int word = 0; word < kNumCode32; ++word) { +- code32[word] = nextCode32[word]; +- } +- } +-} +- +-void +-runMultiPassTile(Tensor& queries, +- Tensor& centroids, +- Tensor& pqCentroidsInnermostCode, +- NoTypeTensor<4, true>& codeDistances, +- Tensor& topQueryToCentroid, +- bool useFloat16Lookup, +- int bytesPerCode, +- int numSubQuantizers, +- int numSubQuantizerCodes, +- thrust::device_vector& listCodes, +- thrust::device_vector& listIndices, +- IndicesOptions indicesOptions, +- thrust::device_vector& listLengths, +- Tensor& thrustMem, +- Tensor& prefixSumOffsets, +- Tensor& allDistances, +- Tensor& heapDistances, +- Tensor& heapIndices, +- int k, +- faiss::MetricType metric, +- Tensor& outDistances, +- Tensor& outIndices, +- cudaStream_t stream) { +- // We only support two metrics at the moment +- FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT || +- metric == MetricType::METRIC_L2); +- +- bool l2Distance = metric == MetricType::METRIC_L2; +- +- // Calculate offset lengths, so we know where to write out +- // intermediate results +- runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, +- thrustMem, stream); +- +- // Calculate residual code distances, since this is without +- // precomputed codes +- runPQCodeDistances(pqCentroidsInnermostCode, +- queries, +- centroids, +- topQueryToCentroid, +- codeDistances, +- l2Distance, +- useFloat16Lookup, +- stream); +- +- // Convert all codes to a distance, and write out (distance, +- // index) values for all intermediate results +- { +- auto kThreadsPerBlock = 256; +- +- auto grid = dim3(topQueryToCentroid.getSize(1), +- topQueryToCentroid.getSize(0)); +- auto block = dim3(kThreadsPerBlock); +- +- // pq centroid distances +- auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); +- +- smem *= numSubQuantizers * numSubQuantizerCodes; +- FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); +- +-#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ +- do { \ +- auto codeDistancesT = codeDistances.toTensor(); \ +- \ +- pqScanNoPrecomputedMultiPass \ +- <<>>( \ +- queries, \ +- pqCentroidsInnermostCode, \ +- topQueryToCentroid, \ +- codeDistancesT, \ +- listCodes.data().get(), \ +- listLengths.data().get(), \ +- prefixSumOffsets, \ +- allDistances); \ +- } while (0) +- +-#define RUN_PQ(NUM_SUB_Q) \ +- do { \ +- if (useFloat16Lookup) { \ +- RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ +- } else { \ +- RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ +- } \ +- } while (0) +- +- switch (bytesPerCode) { +- case 1: +- RUN_PQ(1); +- break; +- case 2: +- RUN_PQ(2); +- break; +- case 3: +- RUN_PQ(3); +- break; +- case 4: +- RUN_PQ(4); +- break; +- case 8: +- RUN_PQ(8); +- break; +- case 12: +- RUN_PQ(12); +- break; +- case 16: +- RUN_PQ(16); +- break; +- case 20: +- RUN_PQ(20); +- break; +- case 24: +- RUN_PQ(24); +- break; +- case 28: +- RUN_PQ(28); +- break; +- case 32: +- RUN_PQ(32); +- break; +- case 40: +- RUN_PQ(40); +- break; +- case 48: +- RUN_PQ(48); +- break; +- case 56: +- RUN_PQ(56); +- break; +- case 64: +- RUN_PQ(64); +- break; +- case 96: +- RUN_PQ(96); +- break; +- default: +- FAISS_ASSERT(false); +- break; +- } +- +-#undef RUN_PQ +-#undef RUN_PQ_OPT +- } +- +- CUDA_TEST_ERROR(); +- +- // k-select the output in chunks, to increase parallelism +- runPass1SelectLists(prefixSumOffsets, +- allDistances, +- topQueryToCentroid.getSize(1), +- k, +- !l2Distance, // L2 distance chooses smallest +- heapDistances, +- heapIndices, +- stream); +- +- // k-select final output +- auto flatHeapDistances = heapDistances.downcastInner<2>(); +- auto flatHeapIndices = heapIndices.downcastInner<2>(); +- +- runPass2SelectLists(flatHeapDistances, +- flatHeapIndices, +- listIndices, +- indicesOptions, +- prefixSumOffsets, +- topQueryToCentroid, +- k, +- !l2Distance, // L2 distance chooses smallest +- outDistances, +- outIndices, +- stream); +-} +- +-void runPQScanMultiPassNoPrecomputed(Tensor& queries, +- Tensor& centroids, +- Tensor& pqCentroidsInnermostCode, +- Tensor& topQueryToCentroid, +- bool useFloat16Lookup, +- int bytesPerCode, +- int numSubQuantizers, +- int numSubQuantizerCodes, +- thrust::device_vector& listCodes, +- thrust::device_vector& listIndices, +- IndicesOptions indicesOptions, +- thrust::device_vector& listLengths, +- int maxListLength, +- int k, +- faiss::MetricType metric, +- // output +- Tensor& outDistances, +- // output +- Tensor& outIndices, +- GpuResources* res) { +- constexpr int kMinQueryTileSize = 8; +- constexpr int kMaxQueryTileSize = 128; +- constexpr int kThrustMemSize = 16384; +- +- int nprobe = topQueryToCentroid.getSize(1); +- +- auto& mem = res->getMemoryManagerCurrentDevice(); +- auto stream = res->getDefaultStreamCurrentDevice(); +- +- // Make a reservation for Thrust to do its dirty work (global memory +- // cross-block reduction space); hopefully this is large enough. +- DeviceTensor thrustMem1( +- mem, {kThrustMemSize}, stream); +- DeviceTensor thrustMem2( +- mem, {kThrustMemSize}, stream); +- DeviceTensor* thrustMem[2] = +- {&thrustMem1, &thrustMem2}; +- +- // How much temporary storage is available? +- // If possible, we'd like to fit within the space available. 
+- size_t sizeAvailable = mem.getSizeAvailable(); +- +- // We run two passes of heap selection +- // This is the size of the first-level heap passes +- constexpr int kNProbeSplit = 8; +- int pass2Chunks = std::min(nprobe, kNProbeSplit); +- +- size_t sizeForFirstSelectPass = +- pass2Chunks * k * (sizeof(float) + sizeof(int)); +- +- // How much temporary storage we need per each query +- size_t sizePerQuery = +- 2 * // streams +- ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets +- nprobe * maxListLength * sizeof(float) + // allDistances +- // residual distances +- nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + +- sizeForFirstSelectPass); +- +- int queryTileSize = (int) (sizeAvailable / sizePerQuery); +- +- if (queryTileSize < kMinQueryTileSize) { +- queryTileSize = kMinQueryTileSize; +- } else if (queryTileSize > kMaxQueryTileSize) { +- queryTileSize = kMaxQueryTileSize; +- } +- +- // FIXME: we should adjust queryTileSize to deal with this, since +- // indexing is in int32 +- FAISS_ASSERT(queryTileSize * nprobe * maxListLength < +- std::numeric_limits::max()); +- +- // Temporary memory buffers +- // Make sure there is space prior to the start which will be 0, and +- // will handle the boundary condition without branches +- DeviceTensor prefixSumOffsetSpace1( +- mem, {queryTileSize * nprobe + 1}, stream); +- DeviceTensor prefixSumOffsetSpace2( +- mem, {queryTileSize * nprobe + 1}, stream); +- +- DeviceTensor prefixSumOffsets1( +- prefixSumOffsetSpace1[1].data(), +- {queryTileSize, nprobe}); +- DeviceTensor prefixSumOffsets2( +- prefixSumOffsetSpace2[1].data(), +- {queryTileSize, nprobe}); +- DeviceTensor* prefixSumOffsets[2] = +- {&prefixSumOffsets1, &prefixSumOffsets2}; +- +- // Make sure the element before prefixSumOffsets is 0, since we +- // depend upon simple, boundary-less indexing to get proper results +- CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), +- 0, +- sizeof(int), +- stream)); +- CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), +- 0, +- sizeof(int), +- stream)); +- +- int codeDistanceTypeSize = useFloat16Lookup ? 
sizeof(half) : sizeof(float); +- +- int totalCodeDistancesSize = +- queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * +- codeDistanceTypeSize; +- +- DeviceTensor codeDistances1Mem( +- mem, {totalCodeDistancesSize}, stream); +- NoTypeTensor<4, true> codeDistances1( +- codeDistances1Mem.data(), +- codeDistanceTypeSize, +- {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); +- +- DeviceTensor codeDistances2Mem( +- mem, {totalCodeDistancesSize}, stream); +- NoTypeTensor<4, true> codeDistances2( +- codeDistances2Mem.data(), +- codeDistanceTypeSize, +- {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); +- +- NoTypeTensor<4, true>* codeDistances[2] = +- {&codeDistances1, &codeDistances2}; +- +- DeviceTensor allDistances1( +- mem, {queryTileSize * nprobe * maxListLength}, stream); +- DeviceTensor allDistances2( +- mem, {queryTileSize * nprobe * maxListLength}, stream); +- DeviceTensor* allDistances[2] = +- {&allDistances1, &allDistances2}; +- +- DeviceTensor heapDistances1( +- mem, {queryTileSize, pass2Chunks, k}, stream); +- DeviceTensor heapDistances2( +- mem, {queryTileSize, pass2Chunks, k}, stream); +- DeviceTensor* heapDistances[2] = +- {&heapDistances1, &heapDistances2}; +- +- DeviceTensor heapIndices1( +- mem, {queryTileSize, pass2Chunks, k}, stream); +- DeviceTensor heapIndices2( +- mem, {queryTileSize, pass2Chunks, k}, stream); +- DeviceTensor* heapIndices[2] = +- {&heapIndices1, &heapIndices2}; +- +- auto streams = res->getAlternateStreamsCurrentDevice(); +- streamWait(streams, {stream}); +- +- int curStream = 0; +- +- for (int query = 0; query < queries.getSize(0); query += queryTileSize) { +- int numQueriesInTile = +- std::min(queryTileSize, queries.getSize(0) - query); +- +- auto prefixSumOffsetsView = +- prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); +- +- auto codeDistancesView = +- codeDistances[curStream]->narrowOutermost(0, numQueriesInTile); +- auto coarseIndicesView = +- topQueryToCentroid.narrowOutermost(query, numQueriesInTile); +- auto queryView = +- queries.narrowOutermost(query, numQueriesInTile); +- +- auto heapDistancesView = +- heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); +- auto heapIndicesView = +- heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); +- +- auto outDistanceView = +- outDistances.narrowOutermost(query, numQueriesInTile); +- auto outIndicesView = +- outIndices.narrowOutermost(query, numQueriesInTile); +- +- runMultiPassTile(queryView, +- centroids, +- pqCentroidsInnermostCode, +- codeDistancesView, +- coarseIndicesView, +- useFloat16Lookup, +- bytesPerCode, +- numSubQuantizers, +- numSubQuantizerCodes, +- listCodes, +- listIndices, +- indicesOptions, +- listLengths, +- *thrustMem[curStream], +- prefixSumOffsetsView, +- *allDistances[curStream], +- heapDistancesView, +- heapIndicesView, +- k, +- metric, +- outDistanceView, +- outIndicesView, +- streams[curStream]); +- +- curStream = (curStream + 1) % 2; +- } +- +- streamWait({stream}, streams); +-} +- +-} } // namespace +diff --git a/gpu/test/Makefile b/gpu/test/Makefile +index 6836314..697e8eb 100644 +--- a/gpu/test/Makefile ++++ b/gpu/test/Makefile +@@ -17,14 +17,18 @@ TESTS_BIN = $(TESTS_OBJ:.o=) $(CUDA_TESTS_OBJ:.o=) + + # test_gpu_index.py test_pytorch_faiss.py + ++build: $(TESTS_BIN) ++TestUtils.o: TestUtils.cpp ++ $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ ../../libfaiss.a -Igtest/include -I../.. 
++
+ run: $(TESTS_BIN) $(CUDA_TESTS_BIN)
+ 	for t in $(TESTS_BIN) $(CUDA_TESTS_BIN); do ./$$t || exit; done
+ 
+ $(CUDA_TESTS_OBJ): %.o: %.cu gtest
+-	$(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include
++	$(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include -I../..
+ 
+ $(TESTS_OBJ): %.o: %.cpp gtest
+-	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include
++	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include -I../..
+ 
+ $(TESTS_BIN): %: %.o TestUtils.o ../../libfaiss.a gtest/make/gtest.a
+ 	$(CXX) -o $@ $^ $(LDFLAGS) $(LIBS)
+@@ -33,7 +37,7 @@ demo_ivfpq_indexing_gpu: demo_ivfpq_indexing_gpu.o ../../libfaiss.a
+ 	$(CXX) -o $@ $^ $(LDFLAGS) $(LIBS)
+ 
+ demo_ivfpq_indexing_gpu.o: demo_ivfpq_indexing_gpu.cpp
+-	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^
++	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ -I../..
+ 
+ gtest/make/gtest.a: gtest
+ 	$(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest.a
+diff --git a/gpu/test/demo_ivfpq_indexing_gpu.cpp b/gpu/test/demo_ivfpq_indexing_gpu.cpp
+index 852a43c..5fde30c 100644
+--- a/gpu/test/demo_ivfpq_indexing_gpu.cpp
++++ b/gpu/test/demo_ivfpq_indexing_gpu.cpp
+@@ -17,6 +17,7 @@
+ 
+ #include
+ #include
++#include
+ 
+ #include
+ #include
+-- 
+1.8.3.1
+
diff --git a/var/spack/repos/builtin/packages/faiss/package.py b/var/spack/repos/builtin/packages/faiss/package.py
new file mode 100644
index 0000000000..a660693268
--- /dev/null
+++ b/var/spack/repos/builtin/packages/faiss/package.py
@@ -0,0 +1,126 @@
+# Copyright 2013-2019 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import os
+from spack import *
+
+
+class Faiss(AutotoolsPackage, CudaPackage):
+    """Faiss is a library for efficient similarity search and clustering of
+    dense vectors.
+
+    Faiss contains algorithms that search in sets of vectors of any size, up
+    to ones that possibly do not fit in RAM. It also contains supporting code
+    for evaluation and parameter tuning. Faiss is written in C++ with
+    complete wrappers for Python/numpy. Some of the most useful algorithms
+    are implemented on the GPU. It is developed by Facebook AI Research.
+    """
+
+    homepage = "https://github.com/facebookresearch/faiss"
+    url      = "https://github.com/facebookresearch/faiss/archive/v1.6.3.tar.gz"
+
+    maintainers = ['bhatiaharsh']
+
+    version('1.6.3', sha256='e1a41c159f0b896975fbb133e0240a233af5c9286c09a28fde6aefff5336e542')
+    version('1.5.3', sha256='b24d347b0285d01c2ed663ccc7596cd0ea95071f3dd5ebb573ccfc28f15f043b')
+
+    variant('python', default=False, description='Build Python bindings')
+    variant('tests', default=False, description='Build Tests')
+
+    conflicts('+tests', when='~python', msg='+tests must be accompanied by +python')
+
+    depends_on('python@3.7:', when='+python', type=('build', 'run'))
+    depends_on('py-numpy', when='+python', type=('build', 'run'))
+    depends_on('py-scipy', when='+tests', type=('build', 'run'))
+
+    depends_on('blas')
+    depends_on('py-setuptools', when='+python', type='build')
+    depends_on('swig', when='+python', type='build')
+
+    # patch for v1.5.3
+    # faiss assumes that the "source directory" will always
+    # be called "faiss" (not spack-src or faiss-1.5.3)
+    # so, we will have to create a symlink to self (faiss did that in 1.6.3)
+    # and add an include path
+    patch('fixes-in-v1.5.3.patch', when='@1.5.3')
+
+    # patch for v1.6.3
+    # for v1.6.3, GPU build has a bug (two files need to be deleted)
+    # https://github.com/facebookresearch/faiss/issues/1159
+    # also, some include paths in gpu/tests/Makefile are missing
+    patch('fixes-in-v1.6.3.patch', when='@1.6.3')
+
+    def configure_args(self):
+        args = []
+        args.extend(self.with_or_without('cuda', activation_value='prefix'))
+        return args
+
+    def build(self, spec, prefix):
+
+        make()
+
+        if '+python' in self.spec:
+            make('-C', 'python')
+
+        # CPU tests
+        if '+tests' in self.spec:
+            with working_dir('tests'):
+                make('gtest')
+                make('tests')
+
+        # GPU tests
+        if '+tests+cuda' in self.spec:
+            with working_dir(os.path.join('gpu', 'test')):
+                make('gtest')
+                make('build')       # target added by the patch
+                make('demo_ivfpq_indexing_gpu')
+
+    def install(self, spec, prefix):
+
+        make('install')
+
+        if '+python' in self.spec:
+            with working_dir('python'):
+                setup_py('install', '--prefix=' + prefix,
+                         '--single-version-externally-managed', '--root=/')
+
+        if '+tests' not in self.spec:
+            return
+
+        if not os.path.isdir(self.prefix.bin):
+            os.makedirs(self.prefix.bin)
+
+        def _prefix_and_install(file):
+            os.rename(file, 'faiss_' + file)
+            install('faiss_' + file, self.prefix.bin)
+
+        # CPU tests
+        with working_dir('tests'):
+            # rename the exec to keep consistent with gpu tests
+            os.rename('tests', 'TestCpu')
+            _prefix_and_install('TestCpu')
+
+        # GPU tests
+        if '+cuda' in self.spec:
+            with working_dir(os.path.join('gpu', 'test')):
+                _prefix_and_install('TestGpuIndexFlat')
+                _prefix_and_install('TestGpuIndexBinaryFlat')
+                _prefix_and_install('TestGpuIndexIVFFlat')
+                _prefix_and_install('TestGpuIndexIVFPQ')
+                _prefix_and_install('TestGpuMemoryException')
+                _prefix_and_install('TestGpuSelect')
+                _prefix_and_install('demo_ivfpq_indexing_gpu')
+
+    @run_after('configure')
+    def _fix_makefile(self):
+
+        # spack injects its own optimization flags
+        makefile = FileFilter('makefile.inc')
+        makefile.filter('CPUFLAGS = -mavx2 -mf16c',
+                        '#CPUFLAGS = -mavx2 -mf16c')
+
+    def setup_run_environment(self, env):
+        if '+python' in self.spec:
+            env.prepend_path('PYTHONPATH', site_packages_dir)