author     Harsh Bhatia <bhatia4@llnl.gov>   2020-10-16 00:47:22 -0700
committer  GitHub <noreply@github.com>       2020-10-16 09:47:22 +0200
commit     878e89eceae69652a8d9a25abda1c8585b0ff08c (patch)
tree       6d79012ddde1a574c666254f399f2caf4c84e55e
parent     7a6268593ce957a54bbe8fbcb7b3e1ea1b71ee2c (diff)
download   spack-878e89eceae69652a8d9a25abda1c8585b0ff08c.tar.gz
           spack-878e89eceae69652a8d9a25abda1c8585b0ff08c.tar.bz2
           spack-878e89eceae69652a8d9a25abda1c8585b0ff08c.tar.xz
           spack-878e89eceae69652a8d9a25abda1c8585b0ff08c.zip
faiss: added new package at v1.6.3 (#17290)
Co-authored-by: Massimiliano Culpo <massimiliano.culpo@gmail.com>
3 files changed, 1416 insertions, 0 deletions
diff --git a/var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch
new file mode 100644
index 0000000000..9b164c28a1
--- /dev/null
+++ b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.5.3.patch
@@ -0,0 +1,51 @@
+From 6a581e31bc461654460452be5255083ae2d33a6f Mon Sep 17 00:00:00 2001
+From: Harsh Bhatia <bhatia4@llnl.gov>
+Date: Mon, 29 Jun 2020 10:59:33 -0700
+Subject: [PATCH] fixed v1.5.3
+
+---
+ faiss             | 1 +
+ gpu/test/Makefile | 4 ++++
+ tests/Makefile    | 2 +-
+ 3 files changed, 6 insertions(+), 1 deletion(-)
+ create mode 120000 faiss
+
+diff --git a/faiss b/faiss
+new file mode 120000
+index 0000000..945c9b4
+--- /dev/null
++++ b/faiss
+@@ -0,0 +1 @@
++.
+\ No newline at end of file
+diff --git a/gpu/test/Makefile b/gpu/test/Makefile
+index 6836314..8aa67e8 100644
+--- a/gpu/test/Makefile
++++ b/gpu/test/Makefile
+@@ -17,6 +17,10 @@ TESTS_BIN = $(TESTS_OBJ:.o=) $(CUDA_TESTS_OBJ:.o=)
+ 
+ # test_gpu_index.py test_pytorch_faiss.py
+ 
++build: $(TESTS_BIN)
++TestUtils.o: TestUtils.cpp
++        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ ../../libfaiss.a -Igtest/include -I../..
++
+ run: $(TESTS_BIN) $(CUDA_TESTS_BIN)
+         for t in $(TESTS_BIN) $(CUDA_TESTS_BIN); do ./$$t || exit; done
+ 
+diff --git a/tests/Makefile b/tests/Makefile
+index c46c292..d22d70d 100644
+--- a/tests/Makefile
++++ b/tests/Makefile
+@@ -18,7 +18,7 @@ tests: $(TESTS_OBJ) ../libfaiss.a gtest/make/gtest_main.a
+         $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS)
+ 
+ %.o: %.cpp gtest
+-        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c -o $@ $< -Igtest/include -I../..
++        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c -o $@ $< -Igtest/include -I../.. -I..
+ 
+ gtest/make/gtest_main.a: gtest
+         $(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest_main.a
+-- 
+1.8.3.1
+
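A note on the patch above: its core trick is the `faiss` symlink whose target is `.`, which lets the v1.5.3 sources resolve their own `#include <faiss/...>` headers even though Spack unpacks the tree under a name like `spack-src` rather than `faiss`. A minimal sketch of the same workaround done from the recipe side instead of shipping a patch file (the `FaissSketch` class and hook below are hypothetical, not what this commit does):

    import os

    from spack import *


    class FaissSketch(AutotoolsPackage):
        """Hypothetical recipe fragment: emulate the symlink part of
        fixes-in-v1.5.3.patch without a patch file."""

        @run_before('configure')
        def _symlink_source_dir(self):
            # faiss 1.5.3 expects its sources to live under a directory
            # named 'faiss'; a self-referential symlink makes
            # '#include <faiss/...>' resolve from the stage directory,
            # whatever Spack happens to call it.
            if not os.path.islink('faiss'):
                os.symlink('.', 'faiss')

The commit ships the patch file instead, which also carries the `-I..` include-path fix for `tests/Makefile`.
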
diff --git a/var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch
new file mode 100644
index 0000000000..0916bb9225
--- /dev/null
+++ b/var/spack/repos/builtin/packages/faiss/fixes-in-v1.6.3.patch
@@ -0,0 +1,1239 @@
+From 931f610a2075301b26d3d811bdbf30e4394da614 Mon Sep 17 00:00:00 2001
+From: Harsh Bhatia <bhatia4@llnl.gov>
+Date: Sun, 28 Jun 2020 15:09:17 -0700
+Subject: [PATCH] fixes in v1.6.3
+
+---
+ gpu/impl/PQCodeDistances.cu              | 567 -----------------------------
+ gpu/impl/PQScanMultiPassNoPrecomputed.cu | 597 ------------------------------
+ gpu/test/Makefile                        |  10 +-
+ gpu/test/demo_ivfpq_indexing_gpu.cpp     |   1 +
+ 4 files changed, 8 insertions(+), 1167 deletions(-)
+ delete mode 100644 gpu/impl/PQCodeDistances.cu
+ delete mode 100644 gpu/impl/PQScanMultiPassNoPrecomputed.cu
+
+diff --git a/gpu/impl/PQCodeDistances.cu b/gpu/impl/PQCodeDistances.cu
+deleted file mode 100644
+index 817990b..0000000
+--- a/gpu/impl/PQCodeDistances.cu
++++ /dev/null
+@@ -1,567 +0,0 @@
+-/**
+- * Copyright (c) Facebook, Inc. and its affiliates.
+- *
+- * This source code is licensed under the MIT license found in the
+- * LICENSE file in the root directory of this source tree.
+- */
+-
+-
+-#include <faiss/gpu/impl/PQCodeDistances.cuh>
+-
+-#include <faiss/gpu/impl/BroadcastSum.cuh>
+-#include <faiss/gpu/impl/Distance.cuh>
+-#include <faiss/gpu/impl/L2Norm.cuh>
+-#include <faiss/gpu/utils/ConversionOperators.cuh>
+-#include <faiss/gpu/utils/DeviceDefs.cuh>
+-#include <faiss/gpu/utils/DeviceUtils.h>
+-#include <faiss/gpu/utils/Float16.cuh>
+-#include <faiss/gpu/utils/MatrixMult.cuh>
+-#include <faiss/gpu/utils/PtxUtils.cuh>
+-#include <faiss/gpu/utils/StaticUtils.h>
+-#include <faiss/gpu/utils/Transpose.cuh>
+-
+-namespace faiss { namespace gpu {
+-
+-template <typename T>
+-struct Converter {
+-};
+-
+-template <>
+-struct Converter<half> {
+-  inline static __device__ half to(float v) { return __float2half(v); }
+-};
+-
+-template <>
+-struct Converter<float> {
+-  inline static __device__ float to(float v) { return v; }
+-};
+-
+-// Kernel responsible for calculating distance from residual vector to
+-// each product quantizer code centroid
+-template <typename OutCodeT, int DimsPerSubQuantizer, bool L2Distance>
+-__global__ void
+-__launch_bounds__(288, 4)
+-pqCodeDistances(Tensor<float, 2, true> queries,
+-                int queriesPerBlock,
+-                Tensor<float, 2, true> coarseCentroids,
+-                Tensor<float, 3, true> pqCentroids,
+-                Tensor<int, 2, true> topQueryToCentroid,
+-                // (query id)(coarse)(subquantizer)(code) -> dist
+-                Tensor<OutCodeT, 4, true> outCodeDistances) {
+-  const auto numSubQuantizers = pqCentroids.getSize(0);
+-  const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
+-  assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
+-  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
+-
+-  bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
+-  int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
+-
+-  extern __shared__ float smem[];
+-
+-  // Each thread calculates a single code
+-  float subQuantizerData[DimsPerSubQuantizer];
+-
+-  auto code = threadIdx.x;
+-  auto subQuantizer = blockIdx.y;
+-
+-  // Each thread will load the pq centroid data for the code that it
+-  // is processing
+-#pragma unroll
+-  for (int i = 0; i < DimsPerSubQuantizer; ++i) {
+-    subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
+-  }
+-
+-  // Where we store our query vector
+-  float* smemQuery = smem;
+-
+-  // Where we store our residual vector; this is double buffered so we
+-  // can be loading the next one while processing the current one
+-  float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
+-  float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
+-
+-  // Where we pre-load the coarse centroid IDs
+-  int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
+-
+-  // Each thread is calculating the distance for a single code,
+-  // performing the reductions locally
+-
+-  // Handle multiple queries per block
+-  auto startQueryId = blockIdx.x * queriesPerBlock;
+-  auto numQueries = queries.getSize(0) - startQueryId;
+-  if (numQueries > queriesPerBlock) {
+-    numQueries = queriesPerBlock;
+-  }
+-
+-  for (int query = 0; query < numQueries; ++query) {
+-    auto queryId = startQueryId + query;
+-
+-    auto querySubQuantizer =
+-      queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
+-
+-    // Load current query vector
+-    for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
+-      smemQuery[i] = querySubQuantizer[i];
+-    }
+-
+-    // Load list of coarse centroids found
+-    for (int i = threadIdx.x;
+-         i < topQueryToCentroid.getSize(1); i += blockDim.x) {
+-      coarseIds[i] = topQueryToCentroid[queryId][i];
+-    }
+-
+-    // We need coarseIds below
+-    // FIXME: investigate loading separately, so we don't need this
+-    __syncthreads();
+-
+-    // Preload first buffer of residual data
+-    if (isLoadingThread) {
+-      for (int i = loadingThreadId;
+-           i < DimsPerSubQuantizer;
+-           i += blockDim.x - codesPerSubQuantizer) {
+-        auto coarseId = coarseIds[0];
+-        // In case NaNs were in the original query data
+-        coarseId = coarseId == -1 ? 0 : coarseId;
+-        auto coarseCentroidSubQuantizer =
+-          coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
+-
+-        if (L2Distance) {
+-          smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
+-        } else {
+-          smemResidual1[i] = coarseCentroidSubQuantizer[i];
+-        }
+-      }
+-    }
+-
+-    // The block walks the list for a single query
+-    for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
+-      // Wait for smemResidual1 to be loaded
+-      __syncthreads();
+-
+-      if (isLoadingThread) {
+-        // Preload second buffer of residual data
+-        for (int i = loadingThreadId;
+-             i < DimsPerSubQuantizer;
+-             i += blockDim.x - codesPerSubQuantizer) {
+-          // FIXME: try always making this centroid id 0 so we can
+-          // terminate
+-          if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
+-            auto coarseId = coarseIds[coarse + 1];
+-            // In case NaNs were in the original query data
+-            coarseId = coarseId == -1 ? 0 : coarseId;
+-
+-            auto coarseCentroidSubQuantizer =
+-              coarseCentroids[coarseId]
+-              [subQuantizer * dimsPerSubQuantizer].data();
+-
+-            if (L2Distance) {
+-              smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
+-            } else {
+-              smemResidual2[i] = coarseCentroidSubQuantizer[i];
+-            }
+-          }
+-        }
+-      } else {
+-        // These are the processing threads
+-        float dist = 0.0f;
+-
+-        constexpr int kUnroll = 4;
+-        constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
+-        constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
+-        float vals[kUnroll];
+-
+-        // Calculate residual - pqCentroid for each dim that we're
+-        // processing
+-
+-        // Unrolled loop
+-        if (L2Distance) {
+-#pragma unroll
+-          for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              vals[j] = smemResidual1[i * kUnroll + j];
+-            }
+-
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              vals[j] -= subQuantizerData[i * kUnroll + j];
+-            }
+-
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              vals[j] *= vals[j];
+-            }
+-
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              dist += vals[j];
+-            }
+-          }
+-        } else {
+-          // Inner product: query slice against the reconstructed sub-quantizer
+-          // for this coarse cell (query o (centroid + subQCentroid))
+-#pragma unroll
+-          for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              vals[j] = smemResidual1[i * kUnroll + j];
+-            }
+-
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              vals[j] += subQuantizerData[i * kUnroll + j];
+-            }
+-
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              vals[j] *= smemQuery[i * kUnroll + j];
+-            }
+-
+-#pragma unroll
+-            for (int j = 0; j < kUnroll; ++j) {
+-              dist += vals[j];
+-            }
+-          }
+-        }
+-
+-        // Remainder loop
+-        if (L2Distance) {
+-#pragma unroll
+-          for (int j = 0; j < kRemainder; ++j) {
+-            vals[j] = smemResidual1[kRemainderBase + j];
+-          }
+-
+-#pragma unroll
+-          for (int j = 0; j < kRemainder; ++j) {
+-            vals[j] -= subQuantizerData[kRemainderBase + j];
+-          }
+-
+-#pragma unroll
+-          for (int j = 0; j < kRemainder; ++j) {
+-            vals[j] *= vals[j];
+-          }
+-        } else {
+-          // Inner product
+-          // Inner product: query slice against the reconstructed sub-quantizer
+-          // for this coarse cell (query o (centroid + subQCentroid))
+-#pragma unroll
+-          for (int j = 0; j < kRemainder; ++j) {
+-            vals[j] = smemResidual1[kRemainderBase + j];
+-          }
+-
+-#pragma unroll
+-          for (int j = 0; j < kRemainder; ++j) {
+-            vals[j] += subQuantizerData[kRemainderBase + j];
+-          }
+-
+-#pragma unroll
+-          for (int j = 0; j < kRemainder; ++j) {
+-            vals[j] *= smemQuery[kRemainderBase + j];
+-          }
+-        }
+-
+-#pragma unroll
+-        for (int j = 0; j < kRemainder; ++j) {
+-          dist += vals[j];
+-        }
+-
+-        // We have the distance for our code; write it out
+-        outCodeDistances[queryId][coarse][subQuantizer][code] =
+-          Converter<OutCodeT>::to(dist);
+-      } // !isLoadingThread
+-
+-      // Swap residual buffers
+-      float* tmp = smemResidual1;
+-      smemResidual1 = smemResidual2;
+-      smemResidual2 = tmp;
+-    }
+-  }
+-}
+-
+-__global__ void
+-residualVector(Tensor<float, 2, true> queries,
+-               Tensor<float, 2, true> coarseCentroids,
+-               Tensor<int, 2, true> topQueryToCentroid,
+-               int numSubDim,
+-               // output is transposed:
+-               // (sub q)(query id)(centroid id)(sub dim)
+-               Tensor<float, 4, true> residual) {
+-  // block x is query id
+-  // block y is centroid id
+-  // thread x is dim
+-  auto queryId = blockIdx.x;
+-  auto centroidId = blockIdx.y;
+-
+-  int realCentroidId = topQueryToCentroid[queryId][centroidId];
+-
+-  for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
+-    float q = queries[queryId][dim];
+-    float c = coarseCentroids[realCentroidId][dim];
+-
+-    residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] =
+-      q - c;
+-  }
+-}
+-
+-void
+-runResidualVector(Tensor<float, 3, true>& pqCentroids,
+-                  Tensor<float, 2, true>& queries,
+-                  Tensor<float, 2, true>& coarseCentroids,
+-                  Tensor<int, 2, true>& topQueryToCentroid,
+-                  Tensor<float, 4, true>& residual,
+-                  cudaStream_t stream) {
+-  auto grid =
+-    dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
+-  auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
+-
+-  residualVector<<<grid, block, 0, stream>>>(
+-    queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
+-    residual);
+-
+-  CUDA_TEST_ERROR();
+-}
+-
+-void
+-runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
+-                     Tensor<float, 2, true>& queries,
+-                     Tensor<float, 2, true>& coarseCentroids,
+-                     Tensor<int, 2, true>& topQueryToCentroid,
+-                     NoTypeTensor<4, true>& outCodeDistances,
+-                     bool useFloat16Lookup,
+-                     DeviceMemory& mem,
+-                     cublasHandle_t handle,
+-                     cudaStream_t stream) {
+-  // Calculate (q - c) residual vector
+-  // (sub q)(query id)(centroid id)(sub dim)
+-  DeviceTensor<float, 4, true> residual(
+-    mem,
+-    {pqCentroids.getSize(0),
+-     topQueryToCentroid.getSize(0),
+-     topQueryToCentroid.getSize(1),
+-     pqCentroids.getSize(1)},
+-    stream);
+-
+-  runResidualVector(pqCentroids, queries,
+-                    coarseCentroids, topQueryToCentroid,
+-                    residual, stream);
+-
+-  // Calculate ||q - c||^2
+-  DeviceTensor<float, 1, true> residualNorms(
+-    mem,
+-    {pqCentroids.getSize(0) *
+-     topQueryToCentroid.getSize(0) *
+-     topQueryToCentroid.getSize(1)},
+-    stream);
+-
+-  auto residualView2 = residual.view<2>(
+-    {pqCentroids.getSize(0) *
+-     topQueryToCentroid.getSize(0) *
+-     topQueryToCentroid.getSize(1),
+-     pqCentroids.getSize(1)});
+-
+-  runL2Norm(residualView2, true, residualNorms, true, stream);
+-
+-  // Perform a batch MM:
+-  // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
+-  // (sub q) x {(q * c)(code)}
+-  auto residualView3 = residual.view<3>(
+-    {pqCentroids.getSize(0),
+-     topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+-     pqCentroids.getSize(1)});
+-
+-  DeviceTensor<float, 3, true> residualDistance(
+-    mem,
+-    {pqCentroids.getSize(0),
+-     topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+-     pqCentroids.getSize(2)},
+-    stream);
+-
+-  runIteratedMatrixMult(residualDistance, false,
+-                        residualView3, false,
+-                        pqCentroids, false,
+-                        -2.0f, 0.0f,
+-                        handle,
+-                        stream);
+-
+-  // Sum ||q - c||^2 along rows
+-  auto residualDistanceView2 = residualDistance.view<2>(
+-    {pqCentroids.getSize(0) *
+-     topQueryToCentroid.getSize(0) *
+-     topQueryToCentroid.getSize(1),
+-     pqCentroids.getSize(2)});
+-
+-  runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
+-
+-  Tensor<float, 4, true> outCodeDistancesF;
+-  DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
+-
+-  if (useFloat16Lookup) {
+-    outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
+-      mem, {outCodeDistances.getSize(0),
+-            outCodeDistances.getSize(1),
+-            outCodeDistances.getSize(2),
+-            outCodeDistances.getSize(3)},
+-      stream);
+-
+-    outCodeDistancesF = outCodeDistancesFloatMem;
+-  } else {
+-    outCodeDistancesF = outCodeDistances.toTensor<float>();
+-  }
+-
+-  // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
+-  // is where we build our output distances)
+-  auto outCodeDistancesView = outCodeDistancesF.view<3>(
+-    {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+-     outCodeDistances.getSize(2),
+-     outCodeDistances.getSize(3)});
+-
+-  runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
+-
+-  // Calculate code norms per each sub-dim
+-  // (sub q)(sub dim)(code) is pqCentroids
+-  // transpose to (sub q)(code)(sub dim)
+-  DeviceTensor<float, 3, true> pqCentroidsTranspose(
+-    mem,
+-    {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
+-    stream);
+-
+-  runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
+-
+-  auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
+-    {pqCentroids.getSize(0) * pqCentroids.getSize(2),
+-     pqCentroids.getSize(1)});
+-
+-  DeviceTensor<float, 1, true> pqCentroidsNorm(
+-    mem,
+-    {pqCentroids.getSize(0) * pqCentroids.getSize(2)},
+-    stream);
+-
+-  runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
+-
+-  // View output as (q * c)(sub q * code), and add centroid norm to
+-  // each row
+-  auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
+-    {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+-     outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
+-
+-  runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
+-
+-  if (useFloat16Lookup) {
+-    // Need to convert back
+-    auto outCodeDistancesH = outCodeDistances.toTensor<half>();
+-    convertTensor<float, half, 4>(stream,
+-                                  outCodeDistancesF,
+-                                  outCodeDistancesH);
+-  }
+-}
+-
+-void
+-runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
+-                   Tensor<float, 2, true>& queries,
+-                   Tensor<float, 2, true>& coarseCentroids,
+-                   Tensor<int, 2, true>& topQueryToCentroid,
+-                   NoTypeTensor<4, true>& outCodeDistances,
+-                   bool l2Distance,
+-                   bool useFloat16Lookup,
+-                   cudaStream_t stream) {
+-  const auto numSubQuantizers = pqCentroids.getSize(0);
+-  const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
+-  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
+-
+-  // FIXME: tune
+-  // Reuse of pq centroid data is based on both # of queries * nprobe,
+-  // and we should really be tiling in both dimensions
+-  constexpr int kQueriesPerBlock = 8;
+-
+-  auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
+-                   numSubQuantizers);
+-
+-  // Reserve one block of threads for double buffering
+-  // FIXME: probably impractical for large # of dims?
+-  auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
+-  auto block = dim3(codesPerSubQuantizer + loadingThreads);
+-
+-  auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
+-    + topQueryToCentroid.getSize(1) * sizeof(int);
+-
+-#define RUN_CODE(DIMS, L2)                                              \
+-  do {                                                                  \
+-    if (useFloat16Lookup) {                                             \
+-      auto outCodeDistancesT = outCodeDistances.toTensor<half>();       \
+-                                                                        \
+-      pqCodeDistances<half, DIMS, L2><<<grid, block, smem, stream>>>(   \
+-        queries, kQueriesPerBlock,                                      \
+-        coarseCentroids, pqCentroids,                                   \
+-        topQueryToCentroid, outCodeDistancesT);                         \
+-    } else {                                                            \
+-      auto outCodeDistancesT = outCodeDistances.toTensor<float>();      \
+-                                                                        \
+-      pqCodeDistances<float, DIMS, L2><<<grid, block, smem, stream>>>(  \
+-        queries, kQueriesPerBlock,                                      \
+-        coarseCentroids, pqCentroids,                                   \
+-        topQueryToCentroid, outCodeDistancesT);                         \
+-    }                                                                   \
+-  } while (0)
+-
+-#define CODE_L2(DIMS)                           \
+-  do {                                          \
+-    if (l2Distance) {                           \
+-      RUN_CODE(DIMS, true);                     \
+-    } else {                                    \
+-      RUN_CODE(DIMS, false);                    \
+-    }                                           \
+-  } while (0)
+-
+-  switch (dimsPerSubQuantizer) {
+-    case 1:
+-      CODE_L2(1);
+-      break;
+-    case 2:
+-      CODE_L2(2);
+-      break;
+-    case 3:
+-      CODE_L2(3);
+-      break;
+-    case 4:
+-      CODE_L2(4);
+-      break;
+-    case 6:
+-      CODE_L2(6);
+-      break;
+-    case 8:
+-      CODE_L2(8);
+-      break;
+-    case 10:
+-      CODE_L2(10);
+-      break;
+-    case 12:
+-      CODE_L2(12);
+-      break;
+-    case 16:
+-      CODE_L2(16);
+-      break;
+-    case 20:
+-      CODE_L2(20);
+-      break;
+-    case 24:
+-      CODE_L2(24);
+-      break;
+-    case 28:
+-      CODE_L2(28);
+-      break;
+-    case 32:
+-      CODE_L2(32);
+-      break;
+-    // FIXME: larger sizes require too many registers - we need the
+-    // MM implementation working
+-    default:
+-      FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
+-                      "not currently supported");
+-  }
+-
+-#undef RUN_CODE
+-#undef CODE_L2
+-
+-  CUDA_TEST_ERROR();
+-}
+-
+-} } // namespace
+diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/gpu/impl/PQScanMultiPassNoPrecomputed.cu
+deleted file mode 100644
+index a514694..0000000
+--- a/gpu/impl/PQScanMultiPassNoPrecomputed.cu
++++ /dev/null
+@@ -1,597 +0,0 @@
+-/**
+- * Copyright (c) Facebook, Inc. and its affiliates.
+- *
+- * This source code is licensed under the MIT license found in the
+- * LICENSE file in the root directory of this source tree.
+- */
+-
+-
+-#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh>
+-#include <faiss/gpu/GpuResources.h>
+-#include <faiss/gpu/impl/PQCodeDistances.cuh>
+-#include <faiss/gpu/impl/PQCodeLoad.cuh>
+-#include <faiss/gpu/impl/IVFUtils.cuh>
+-#include <faiss/gpu/utils/ConversionOperators.cuh>
+-#include <faiss/gpu/utils/DeviceTensor.cuh>
+-#include <faiss/gpu/utils/DeviceUtils.h>
+-#include <faiss/gpu/utils/Float16.cuh>
+-#include <faiss/gpu/utils/LoadStoreOperators.cuh>
+-#include <faiss/gpu/utils/NoTypeTensor.cuh>
+-#include <faiss/gpu/utils/StaticUtils.h>
+-
+-#include <faiss/gpu/utils/HostTensor.cuh>
+-
+-namespace faiss { namespace gpu {
+-
+-// This must be kept in sync with PQCodeDistances.cu
+-bool isSupportedNoPrecomputedSubDimSize(int dims) {
+-  switch (dims) {
+-    case 1:
+-    case 2:
+-    case 3:
+-    case 4:
+-    case 6:
+-    case 8:
+-    case 10:
+-    case 12:
+-    case 16:
+-    case 20:
+-    case 24:
+-    case 28:
+-    case 32:
+-      return true;
+-    default:
+-      // FIXME: larger sizes require too many registers - we need the
+-      // MM implementation working
+-      return false;
+-  }
+-}
+-
+-template <typename LookupT, typename LookupVecT>
+-struct LoadCodeDistances {
+-  static inline __device__ void load(LookupT* smem,
+-                                     LookupT* codes,
+-                                     int numCodes) {
+-    constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
+-
+-    // We can only use the vector type if the data is guaranteed to be
+-    // aligned. The codes are innermost, so if it is evenly divisible,
+-    // then any slice will be aligned.
+-    if (numCodes % kWordSize == 0) {
+-      // Load the data by float4 for efficiency, and then handle any remainder
+-      // limitVec is the number of whole vec words we can load, in terms
+-      // of whole blocks performing the load
+-      constexpr int kUnroll = 2;
+-      int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
+-      limitVec *= kUnroll * blockDim.x;
+-
+-      LookupVecT* smemV = (LookupVecT*) smem;
+-      LookupVecT* codesV = (LookupVecT*) codes;
+-
+-      for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
+-        LookupVecT vals[kUnroll];
+-
+-#pragma unroll
+-        for (int j = 0; j < kUnroll; ++j) {
+-          vals[j] =
+-            LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
+-        }
+-
+-#pragma unroll
+-        for (int j = 0; j < kUnroll; ++j) {
+-          LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
+-        }
+-      }
+-
+-      // This is where we start loading the remainder that does not evenly
+-      // fit into kUnroll x blockDim.x
+-      int remainder = limitVec * kWordSize;
+-
+-      for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
+-        smem[i] = codes[i];
+-      }
+-    } else {
+-      // Potential unaligned load
+-      constexpr int kUnroll = 4;
+-
+-      int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
+-
+-      int i = threadIdx.x;
+-      for (; i < limit; i += kUnroll * blockDim.x) {
+-        LookupT vals[kUnroll];
+-
+-#pragma unroll
+-        for (int j = 0; j < kUnroll; ++j) {
+-          vals[j] = codes[i + j * blockDim.x];
+-        }
+-
+-#pragma unroll
+-        for (int j = 0; j < kUnroll; ++j) {
+-          smem[i + j * blockDim.x] = vals[j];
+-        }
+-      }
+-
+-      for (; i < numCodes; i += blockDim.x) {
+-        smem[i] = codes[i];
+-      }
+-    }
+-  }
+-};
+-
+-template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
+-__global__ void
+-pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
+-                             Tensor<float, 3, true> pqCentroids,
+-                             Tensor<int, 2, true> topQueryToCentroid,
+-                             Tensor<LookupT, 4, true> codeDistances,
+-                             void** listCodes,
+-                             int* listLengths,
+-                             Tensor<int, 2, true> prefixSumOffsets,
+-                             Tensor<float, 1, true> distance) {
+-  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
+-
+-  // Where the pq code -> residual distance is stored
+-  extern __shared__ char smemCodeDistances[];
+-  LookupT* codeDist = (LookupT*) smemCodeDistances;
+-
+-  // Each block handles a single query
+-  auto queryId = blockIdx.y;
+-  auto probeId = blockIdx.x;
+-
+-  // This is where we start writing out data
+-  // We ensure that before the array (at offset -1), there is a 0 value
+-  int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
+-  float* distanceOut = distance[outBase].data();
+-
+-  auto listId = topQueryToCentroid[queryId][probeId];
+-  // Safety guard in case NaNs in input cause no list ID to be generated
+-  if (listId == -1) {
+-    return;
+-  }
+-
+-  unsigned char* codeList = (unsigned char*) listCodes[listId];
+-  int limit = listLengths[listId];
+-
+-  constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
+-    (NumSubQuantizers / 4);
+-  unsigned int code32[kNumCode32];
+-  unsigned int nextCode32[kNumCode32];
+-
+-  // We double-buffer the code loading, which improves memory utilization
+-  if (threadIdx.x < limit) {
+-    LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
+-  }
+-
+-  LoadCodeDistances<LookupT, LookupVecT>::load(
+-    codeDist,
+-    codeDistances[queryId][probeId].data(),
+-    codeDistances.getSize(2) * codeDistances.getSize(3));
+-
+-  // Prevent WAR dependencies
+-  __syncthreads();
+-
+-  // Each thread handles one code element in the list, with a
+-  // block-wide stride
+-  for (int codeIndex = threadIdx.x;
+-       codeIndex < limit;
+-       codeIndex += blockDim.x) {
+-    // Prefetch next codes
+-    if (codeIndex + blockDim.x < limit) {
+-      LoadCode32<NumSubQuantizers>::load(
+-        nextCode32, codeList, codeIndex + blockDim.x);
+-    }
+-
+-    float dist = 0.0f;
+-
+-#pragma unroll
+-    for (int word = 0; word < kNumCode32; ++word) {
+-      constexpr int kBytesPerCode32 =
+-        NumSubQuantizers < 4 ? NumSubQuantizers : 4;
+-
+-      if (kBytesPerCode32 == 1) {
+-        auto code = code32[0];
+-        dist = ConvertTo<float>::to(codeDist[code]);
+-
+-      } else {
+-#pragma unroll
+-        for (int byte = 0; byte < kBytesPerCode32; ++byte) {
+-          auto code = getByte(code32[word], byte * 8, 8);
+-
+-          auto offset =
+-            codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
+-
+-          dist += ConvertTo<float>::to(codeDist[offset + code]);
+-        }
+-      }
+-    }
+-
+-    // Write out intermediate distance result
+-    // We do not maintain indices here, in order to reduce global
+-    // memory traffic. Those are recovered in the final selection step.
+-    distanceOut[codeIndex] = dist;
+-
+-    // Rotate buffers
+-#pragma unroll
+-    for (int word = 0; word < kNumCode32; ++word) {
+-      code32[word] = nextCode32[word];
+-    }
+-  }
+-}
+-
+-void
+-runMultiPassTile(Tensor<float, 2, true>& queries,
+-                 Tensor<float, 2, true>& centroids,
+-                 Tensor<float, 3, true>& pqCentroidsInnermostCode,
+-                 NoTypeTensor<4, true>& codeDistances,
+-                 Tensor<int, 2, true>& topQueryToCentroid,
+-                 bool useFloat16Lookup,
+-                 int bytesPerCode,
+-                 int numSubQuantizers,
+-                 int numSubQuantizerCodes,
+-                 thrust::device_vector<void*>& listCodes,
+-                 thrust::device_vector<void*>& listIndices,
+-                 IndicesOptions indicesOptions,
+-                 thrust::device_vector<int>& listLengths,
+-                 Tensor<char, 1, true>& thrustMem,
+-                 Tensor<int, 2, true>& prefixSumOffsets,
+-                 Tensor<float, 1, true>& allDistances,
+-                 Tensor<float, 3, true>& heapDistances,
+-                 Tensor<int, 3, true>& heapIndices,
+-                 int k,
+-                 faiss::MetricType metric,
+-                 Tensor<float, 2, true>& outDistances,
+-                 Tensor<long, 2, true>& outIndices,
+-                 cudaStream_t stream) {
+-  // We only support two metrics at the moment
+-  FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
+-               metric == MetricType::METRIC_L2);
+-
+-  bool l2Distance = metric == MetricType::METRIC_L2;
+-
+-  // Calculate offset lengths, so we know where to write out
+-  // intermediate results
+-  runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
+-                     thrustMem, stream);
+-
+-  // Calculate residual code distances, since this is without
+-  // precomputed codes
+-  runPQCodeDistances(pqCentroidsInnermostCode,
+-                     queries,
+-                     centroids,
+-                     topQueryToCentroid,
+-                     codeDistances,
+-                     l2Distance,
+-                     useFloat16Lookup,
+-                     stream);
+-
+-  // Convert all codes to a distance, and write out (distance,
+-  // index) values for all intermediate results
+-  {
+-    auto kThreadsPerBlock = 256;
+-
+-    auto grid = dim3(topQueryToCentroid.getSize(1),
+-                     topQueryToCentroid.getSize(0));
+-    auto block = dim3(kThreadsPerBlock);
+-
+-    // pq centroid distances
+-    auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
+-
+-    smem *= numSubQuantizers * numSubQuantizerCodes;
+-    FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
+-
+-#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T)                   \
+-    do {                                                                \
+-      auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>();         \
+-                                                                        \
+-      pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T>   \
+-        <<<grid, block, smem, stream>>>(                                \
+-          queries,                                                      \
+-          pqCentroidsInnermostCode,                                     \
+-          topQueryToCentroid,                                           \
+-          codeDistancesT,                                               \
+-          listCodes.data().get(),                                       \
+-          listLengths.data().get(),                                     \
+-          prefixSumOffsets,                                             \
+-          allDistances);                                                \
+-    } while (0)
+-
+-#define RUN_PQ(NUM_SUB_Q)                       \
+-    do {                                        \
+-      if (useFloat16Lookup) {                   \
+-        RUN_PQ_OPT(NUM_SUB_Q, half, Half8);     \
+-      } else {                                  \
+-        RUN_PQ_OPT(NUM_SUB_Q, float, float4);   \
+-      }                                         \
+-    } while (0)
+-
+-    switch (bytesPerCode) {
+-      case 1:
+-        RUN_PQ(1);
+-        break;
+-      case 2:
+-        RUN_PQ(2);
+-        break;
+-      case 3:
+-        RUN_PQ(3);
+-        break;
+-      case 4:
+-        RUN_PQ(4);
+-        break;
+-      case 8:
+-        RUN_PQ(8);
+-        break;
+-      case 12:
+-        RUN_PQ(12);
+-        break;
+-      case 16:
+-        RUN_PQ(16);
+-        break;
+-      case 20:
+-        RUN_PQ(20);
+-        break;
+-      case 24:
+-        RUN_PQ(24);
+-        break;
+-      case 28:
+-        RUN_PQ(28);
+-        break;
+-      case 32:
+-        RUN_PQ(32);
+-        break;
+-      case 40:
+-        RUN_PQ(40);
+-        break;
+-      case 48:
+-        RUN_PQ(48);
+-        break;
+-      case 56:
+-        RUN_PQ(56);
+-        break;
+-      case 64:
+-        RUN_PQ(64);
+-        break;
+-      case 96:
+-        RUN_PQ(96);
+-        break;
+-      default:
+-        FAISS_ASSERT(false);
+-        break;
+-    }
+-
+-#undef RUN_PQ
+-#undef RUN_PQ_OPT
+-  }
+-
+-  CUDA_TEST_ERROR();
+-
+-  // k-select the output in chunks, to increase parallelism
+-  runPass1SelectLists(prefixSumOffsets,
+-                      allDistances,
+-                      topQueryToCentroid.getSize(1),
+-                      k,
+-                      !l2Distance, // L2 distance chooses smallest
+-                      heapDistances,
+-                      heapIndices,
+-                      stream);
+-
+-  // k-select final output
+-  auto flatHeapDistances = heapDistances.downcastInner<2>();
+-  auto flatHeapIndices = heapIndices.downcastInner<2>();
+-
+-  runPass2SelectLists(flatHeapDistances,
+-                      flatHeapIndices,
+-                      listIndices,
+-                      indicesOptions,
+-                      prefixSumOffsets,
+-                      topQueryToCentroid,
+-                      k,
+-                      !l2Distance, // L2 distance chooses smallest
+-                      outDistances,
+-                      outIndices,
+-                      stream);
+-}
+-
+-void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
+-                                     Tensor<float, 2, true>& centroids,
+-                                     Tensor<float, 3, true>& pqCentroidsInnermostCode,
+-                                     Tensor<int, 2, true>& topQueryToCentroid,
+-                                     bool useFloat16Lookup,
+-                                     int bytesPerCode,
+-                                     int numSubQuantizers,
+-                                     int numSubQuantizerCodes,
+-                                     thrust::device_vector<void*>& listCodes,
+-                                     thrust::device_vector<void*>& listIndices,
+-                                     IndicesOptions indicesOptions,
+-                                     thrust::device_vector<int>& listLengths,
+-                                     int maxListLength,
+-                                     int k,
+-                                     faiss::MetricType metric,
+-                                     // output
+-                                     Tensor<float, 2, true>& outDistances,
+-                                     // output
+-                                     Tensor<long, 2, true>& outIndices,
+-                                     GpuResources* res) {
+-  constexpr int kMinQueryTileSize = 8;
+-  constexpr int kMaxQueryTileSize = 128;
+-  constexpr int kThrustMemSize = 16384;
+-
+-  int nprobe = topQueryToCentroid.getSize(1);
+-
+-  auto& mem = res->getMemoryManagerCurrentDevice();
+-  auto stream = res->getDefaultStreamCurrentDevice();
+-
+-  // Make a reservation for Thrust to do its dirty work (global memory
+-  // cross-block reduction space); hopefully this is large enough.
+-  DeviceTensor<char, 1, true> thrustMem1(
+-    mem, {kThrustMemSize}, stream);
+-  DeviceTensor<char, 1, true> thrustMem2(
+-    mem, {kThrustMemSize}, stream);
+-  DeviceTensor<char, 1, true>* thrustMem[2] =
+-    {&thrustMem1, &thrustMem2};
+-
+-  // How much temporary storage is available?
+-  // If possible, we'd like to fit within the space available.
+-  size_t sizeAvailable = mem.getSizeAvailable();
+-
+-  // We run two passes of heap selection
+-  // This is the size of the first-level heap passes
+-  constexpr int kNProbeSplit = 8;
+-  int pass2Chunks = std::min(nprobe, kNProbeSplit);
+-
+-  size_t sizeForFirstSelectPass =
+-    pass2Chunks * k * (sizeof(float) + sizeof(int));
+-
+-  // How much temporary storage we need per each query
+-  size_t sizePerQuery =
+-    2 * // streams
+-    ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
+-     nprobe * maxListLength * sizeof(float) + // allDistances
+-     // residual distances
+-     nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
+-     sizeForFirstSelectPass);
+-
+-  int queryTileSize = (int) (sizeAvailable / sizePerQuery);
+-
+-  if (queryTileSize < kMinQueryTileSize) {
+-    queryTileSize = kMinQueryTileSize;
+-  } else if (queryTileSize > kMaxQueryTileSize) {
+-    queryTileSize = kMaxQueryTileSize;
+-  }
+-
+-  // FIXME: we should adjust queryTileSize to deal with this, since
+-  // indexing is in int32
+-  FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
+-               std::numeric_limits<int>::max());
+-
+-  // Temporary memory buffers
+-  // Make sure there is space prior to the start which will be 0, and
+-  // will handle the boundary condition without branches
+-  DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
+-    mem, {queryTileSize * nprobe + 1}, stream);
+-  DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
+-    mem, {queryTileSize * nprobe + 1}, stream);
+-
+-  DeviceTensor<int, 2, true> prefixSumOffsets1(
+-    prefixSumOffsetSpace1[1].data(),
+-    {queryTileSize, nprobe});
+-  DeviceTensor<int, 2, true> prefixSumOffsets2(
+-    prefixSumOffsetSpace2[1].data(),
+-    {queryTileSize, nprobe});
+-  DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
+-    {&prefixSumOffsets1, &prefixSumOffsets2};
+-
+-  // Make sure the element before prefixSumOffsets is 0, since we
+-  // depend upon simple, boundary-less indexing to get proper results
+-  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
+-                              0,
+-                              sizeof(int),
+-                              stream));
+-  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
+-                              0,
+-                              sizeof(int),
+-                              stream));
+-
+-  int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
+-
+-  int totalCodeDistancesSize =
+-    queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
+-    codeDistanceTypeSize;
+-
+-  DeviceTensor<char, 1, true> codeDistances1Mem(
+-    mem, {totalCodeDistancesSize}, stream);
+-  NoTypeTensor<4, true> codeDistances1(
+-    codeDistances1Mem.data(),
+-    codeDistanceTypeSize,
+-    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
+-
+-  DeviceTensor<char, 1, true> codeDistances2Mem(
+-    mem, {totalCodeDistancesSize}, stream);
+-  NoTypeTensor<4, true> codeDistances2(
+-    codeDistances2Mem.data(),
+-    codeDistanceTypeSize,
+-    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
+-
+-  NoTypeTensor<4, true>* codeDistances[2] =
+-    {&codeDistances1, &codeDistances2};
+-
+-  DeviceTensor<float, 1, true> allDistances1(
+-    mem, {queryTileSize * nprobe * maxListLength}, stream);
+-  DeviceTensor<float, 1, true> allDistances2(
+-    mem, {queryTileSize * nprobe * maxListLength}, stream);
+-  DeviceTensor<float, 1, true>* allDistances[2] =
+-    {&allDistances1, &allDistances2};
+-
+-  DeviceTensor<float, 3, true> heapDistances1(
+-    mem, {queryTileSize, pass2Chunks, k}, stream);
+-  DeviceTensor<float, 3, true> heapDistances2(
+-    mem, {queryTileSize, pass2Chunks, k}, stream);
+-  DeviceTensor<float, 3, true>* heapDistances[2] =
+-    {&heapDistances1, &heapDistances2};
+-
+-  DeviceTensor<int, 3, true> heapIndices1(
+-    mem, {queryTileSize, pass2Chunks, k}, stream);
+-  DeviceTensor<int, 3, true> heapIndices2(
+-    mem, {queryTileSize, pass2Chunks, k}, stream);
+-  DeviceTensor<int, 3, true>* heapIndices[2] =
+-    {&heapIndices1, &heapIndices2};
+-
+-  auto streams = res->getAlternateStreamsCurrentDevice();
+-  streamWait(streams, {stream});
+-
+-  int curStream = 0;
+-
+-  for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
+-    int numQueriesInTile =
+-      std::min(queryTileSize, queries.getSize(0) - query);
+-
+-    auto prefixSumOffsetsView =
+-      prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
+-
+-    auto codeDistancesView =
+-      codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
+-    auto coarseIndicesView =
+-      topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
+-    auto queryView =
+-      queries.narrowOutermost(query, numQueriesInTile);
+-
+-    auto heapDistancesView =
+-      heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
+-    auto heapIndicesView =
+-      heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
+-
+-    auto outDistanceView =
+-      outDistances.narrowOutermost(query, numQueriesInTile);
+-    auto outIndicesView =
+-      outIndices.narrowOutermost(query, numQueriesInTile);
+-
+-    runMultiPassTile(queryView,
+-                     centroids,
+-                     pqCentroidsInnermostCode,
+-                     codeDistancesView,
+-                     coarseIndicesView,
+-                     useFloat16Lookup,
+-                     bytesPerCode,
+-                     numSubQuantizers,
+-                     numSubQuantizerCodes,
+-                     listCodes,
+-                     listIndices,
+-                     indicesOptions,
+-                     listLengths,
+-                     *thrustMem[curStream],
+-                     prefixSumOffsetsView,
+-                     *allDistances[curStream],
+-                     heapDistancesView,
+-                     heapIndicesView,
+-                     k,
+-                     metric,
+-                     outDistanceView,
+-                     outIndicesView,
+-                     streams[curStream]);
+-
+-    curStream = (curStream + 1) % 2;
+-  }
+-
+-  streamWait({stream}, streams);
+-}
+-
+-} } // namespace
+diff --git a/gpu/test/Makefile b/gpu/test/Makefile
+index 6836314..697e8eb 100644
+--- a/gpu/test/Makefile
++++ b/gpu/test/Makefile
+@@ -17,14 +17,18 @@ TESTS_BIN = $(TESTS_OBJ:.o=) $(CUDA_TESTS_OBJ:.o=)
+ 
+ # test_gpu_index.py test_pytorch_faiss.py
+ 
++build: $(TESTS_BIN)
++TestUtils.o: TestUtils.cpp
++        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ ../../libfaiss.a -Igtest/include -I../..
++
+ run: $(TESTS_BIN) $(CUDA_TESTS_BIN)
+         for t in $(TESTS_BIN) $(CUDA_TESTS_BIN); do ./$$t || exit; done
+ 
+ $(CUDA_TESTS_OBJ): %.o: %.cu gtest
+-        $(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include
++        $(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include -I../..
+ 
+ $(TESTS_OBJ): %.o: %.cpp gtest
+-        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include
++        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include -I../..
+ 
+ $(TESTS_BIN): %: %.o TestUtils.o ../../libfaiss.a gtest/make/gtest.a
+         $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS)
+@@ -33,7 +37,7 @@ demo_ivfpq_indexing_gpu: demo_ivfpq_indexing_gpu.o ../../libfaiss.a
+         $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS)
+ 
+ demo_ivfpq_indexing_gpu.o: demo_ivfpq_indexing_gpu.cpp
+-        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^
++        $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ -I../..
+ 
+ gtest/make/gtest.a: gtest
+         $(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest.a
+diff --git a/gpu/test/demo_ivfpq_indexing_gpu.cpp b/gpu/test/demo_ivfpq_indexing_gpu.cpp
+index 852a43c..5fde30c 100644
+--- a/gpu/test/demo_ivfpq_indexing_gpu.cpp
++++ b/gpu/test/demo_ivfpq_indexing_gpu.cpp
+@@ -17,6 +17,7 @@
+ 
+ #include <faiss/gpu/StandardGpuResources.h>
+ #include <faiss/gpu/GpuIndexIVFPQ.h>
++#include <faiss/gpu/GpuCloner.h>
+ 
+ #include <faiss/gpu/GpuAutoTune.h>
+ #include <faiss/index_io.h>
+-- 
+1.8.3.1
+
diff --git a/var/spack/repos/builtin/packages/faiss/package.py b/var/spack/repos/builtin/packages/faiss/package.py
new file mode 100644
index 0000000000..a660693268
--- /dev/null
+++ b/var/spack/repos/builtin/packages/faiss/package.py
@@ -0,0 +1,126 @@
+# Copyright 2013-2019 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import os
+from spack import *
+
+
+class Faiss(AutotoolsPackage, CudaPackage):
+    """Faiss is a library for efficient similarity search and clustering of
+    dense vectors.
+
+    Faiss contains algorithms that search in sets of vectors of any size,
+    up to ones that possibly do not fit in RAM. It also contains supporting
+    code for evaluation and parameter tuning. Faiss is written in C++ with
+    complete wrappers for Python/numpy. Some of the most useful algorithms
+    are implemented on the GPU. It is developed by Facebook AI Research.
+    """
+
+    homepage = "https://github.com/facebookresearch/faiss"
+    url      = "https://github.com/facebookresearch/faiss/archive/v1.6.3.tar.gz"
+
+    maintainers = ['bhatiaharsh']
+
+    version('1.6.3', sha256='e1a41c159f0b896975fbb133e0240a233af5c9286c09a28fde6aefff5336e542')
+    version('1.5.3', sha256='b24d347b0285d01c2ed663ccc7596cd0ea95071f3dd5ebb573ccfc28f15f043b')
+
+    variant('python', default=False, description='Build Python bindings')
+    variant('tests', default=False, description='Build Tests')
+
+    conflicts('+tests', when='~python', msg='+tests must be accompanied by +python')
+
+    depends_on('python@3.7:', when='+python', type=('build', 'run'))
+    depends_on('py-numpy', when='+python', type=('build', 'run'))
+    depends_on('py-scipy', when='+tests', type=('build', 'run'))
+
+    depends_on('blas')
+    depends_on('py-setuptools', when='+python', type='build')
+    depends_on('swig', when='+python', type='build')
+
+    # patch for v1.5.3
+    # faiss assumes that the "source directory" will always
+    # be called "faiss" (not spack-src or faiss-1.5.3)
+    # so, we will have to create a symlink to self (faiss did that in 1.6.3)
+    # and add an include path
+    patch('fixes-in-v1.5.3.patch', when='@1.5.3')
+
+    # patch for v1.6.3
+    # for v1.6.3, GPU build has a bug (two files need to be deleted)
+    # https://github.com/facebookresearch/faiss/issues/1159
+    # also, some include paths in gpu/tests/Makefile are missing
+    patch('fixes-in-v1.6.3.patch', when='@1.6.3')
+
+    def configure_args(self):
+        args = []
+        args.extend(self.with_or_without('cuda', activation_value='prefix'))
+        return args
+
+    def build(self, spec, prefix):
+
+        make()
+
+        if '+python' in self.spec:
+            make('-C', 'python')
+
+        # CPU tests
+        if '+tests' in self.spec:
+            with working_dir('tests'):
+                make('gtest')
+                make('tests')
+
+        # GPU tests
+        if '+tests+cuda' in self.spec:
+            with working_dir(os.path.join('gpu', 'test')):
+                make('gtest')
+                make('build')           # target added by the patch
+                make('demo_ivfpq_indexing_gpu')
+
+    def install(self, spec, prefix):
+
+        make('install')
+
+        if '+python' in self.spec:
+            with working_dir('python'):
+                setup_py('install', '--prefix=' + prefix,
+                         '--single-version-externally-managed', '--root=/')
+
+        if '+tests' not in self.spec:
+            return
+
+        if not os.path.isdir(self.prefix.bin):
+            os.makedirs(self.prefix.bin)
+
+        def _prefix_and_install(file):
+            os.rename(file, 'faiss_' + file)
+            install('faiss_' + file, self.prefix.bin)
+
+        # CPU tests
+        with working_dir('tests'):
+            # rename the exec to keep consistent with gpu tests
+            os.rename('tests', 'TestCpu')
+            _prefix_and_install('TestCpu')
+
+        # GPU tests
+        if '+cuda' in self.spec:
+            with working_dir(os.path.join('gpu', 'test')):
+                _prefix_and_install('TestGpuIndexFlat')
+                _prefix_and_install('TestGpuIndexBinaryFlat')
+                _prefix_and_install('TestGpuIndexIVFFlat')
+                _prefix_and_install('TestGpuIndexIVFPQ')
+                _prefix_and_install('TestGpuMemoryException')
+                _prefix_and_install('TestGpuSelect')
+                _prefix_and_install('demo_ivfpq_indexing_gpu')
+
+    @run_after('configure')
+    def _fix_makefile(self):
+
+        # spack injects its own optimization flags
+        makefile = FileFilter('makefile.inc')
+        makefile.filter('CPUFLAGS = -mavx2 -mf16c',
+                        '#CPUFLAGS = -mavx2 -mf16c')
+
+    def setup_run_environment(self, env):
+        if '+python' in self.spec:
+            env.prepend_path('PYTHONPATH', site_packages_dir)