From 931f610a2075301b26d3d811bdbf30e4394da614 Mon Sep 17 00:00:00 2001 From: Harsh Bhatia Date: Sun, 28 Jun 2020 15:09:17 -0700 Subject: [PATCH] fixes in v1.6.3 --- gpu/impl/PQCodeDistances.cu | 567 ----------------------------- gpu/impl/PQScanMultiPassNoPrecomputed.cu | 597 ------------------------------- gpu/test/Makefile | 10 +- gpu/test/demo_ivfpq_indexing_gpu.cpp | 1 + 4 files changed, 8 insertions(+), 1167 deletions(-) delete mode 100644 gpu/impl/PQCodeDistances.cu delete mode 100644 gpu/impl/PQScanMultiPassNoPrecomputed.cu diff --git a/gpu/impl/PQCodeDistances.cu b/gpu/impl/PQCodeDistances.cu deleted file mode 100644 index 817990b..0000000 --- a/gpu/impl/PQCodeDistances.cu +++ /dev/null @@ -1,567 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { namespace gpu { - -template -struct Converter { -}; - -template <> -struct Converter { - inline static __device__ half to(float v) { return __float2half(v); } -}; - -template <> -struct Converter { - inline static __device__ float to(float v) { return v; } -}; - -// Kernel responsible for calculating distance from residual vector to -// each product quantizer code centroid -template -__global__ void -__launch_bounds__(288, 4) -pqCodeDistances(Tensor queries, - int queriesPerBlock, - Tensor coarseCentroids, - Tensor pqCentroids, - Tensor topQueryToCentroid, - // (query id)(coarse)(subquantizer)(code) -> dist - Tensor outCodeDistances) { - const auto numSubQuantizers = pqCentroids.getSize(0); - const auto dimsPerSubQuantizer = pqCentroids.getSize(1); - assert(DimsPerSubQuantizer == dimsPerSubQuantizer); - const auto codesPerSubQuantizer = pqCentroids.getSize(2); - - bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer; - int loadingThreadId = threadIdx.x - codesPerSubQuantizer; - - extern __shared__ float smem[]; - - // Each thread calculates a single code - float subQuantizerData[DimsPerSubQuantizer]; - - auto code = threadIdx.x; - auto subQuantizer = blockIdx.y; - - // Each thread will load the pq centroid data for the code that it - // is processing -#pragma unroll - for (int i = 0; i < DimsPerSubQuantizer; ++i) { - subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg(); - } - - // Where we store our query vector - float* smemQuery = smem; - - // Where we store our residual vector; this is double buffered so we - // can be loading the next one while processing the current one - float* smemResidual1 = &smemQuery[DimsPerSubQuantizer]; - float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer]; - - // Where we pre-load the coarse centroid IDs - int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer]; - - // Each thread is calculating the distance for a single code, - // performing the reductions locally - - // Handle multiple queries per block - auto startQueryId = blockIdx.x * queriesPerBlock; - auto numQueries = queries.getSize(0) - startQueryId; - if (numQueries > queriesPerBlock) { - numQueries = queriesPerBlock; - } - - for (int query = 0; query < numQueries; ++query) { - auto queryId = startQueryId + query; - - auto querySubQuantizer = - queries[queryId][subQuantizer * DimsPerSubQuantizer].data(); - - // Load current query vector - for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) { - smemQuery[i] = 
querySubQuantizer[i]; - } - - // Load list of coarse centroids found - for (int i = threadIdx.x; - i < topQueryToCentroid.getSize(1); i += blockDim.x) { - coarseIds[i] = topQueryToCentroid[queryId][i]; - } - - // We need coarseIds below - // FIXME: investigate loading separately, so we don't need this - __syncthreads(); - - // Preload first buffer of residual data - if (isLoadingThread) { - for (int i = loadingThreadId; - i < DimsPerSubQuantizer; - i += blockDim.x - codesPerSubQuantizer) { - auto coarseId = coarseIds[0]; - // In case NaNs were in the original query data - coarseId = coarseId == -1 ? 0 : coarseId; - auto coarseCentroidSubQuantizer = - coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); - - if (L2Distance) { - smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; - } else { - smemResidual1[i] = coarseCentroidSubQuantizer[i]; - } - } - } - - // The block walks the list for a single query - for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) { - // Wait for smemResidual1 to be loaded - __syncthreads(); - - if (isLoadingThread) { - // Preload second buffer of residual data - for (int i = loadingThreadId; - i < DimsPerSubQuantizer; - i += blockDim.x - codesPerSubQuantizer) { - // FIXME: try always making this centroid id 0 so we can - // terminate - if (coarse != (topQueryToCentroid.getSize(1) - 1)) { - auto coarseId = coarseIds[coarse + 1]; - // In case NaNs were in the original query data - coarseId = coarseId == -1 ? 0 : coarseId; - - auto coarseCentroidSubQuantizer = - coarseCentroids[coarseId] - [subQuantizer * dimsPerSubQuantizer].data(); - - if (L2Distance) { - smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; - } else { - smemResidual2[i] = coarseCentroidSubQuantizer[i]; - } - } - } - } else { - // These are the processing threads - float dist = 0.0f; - - constexpr int kUnroll = 4; - constexpr int kRemainder = DimsPerSubQuantizer % kUnroll; - constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder; - float vals[kUnroll]; - - // Calculate residual - pqCentroid for each dim that we're - // processing - - // Unrolled loop - if (L2Distance) { -#pragma unroll - for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = smemResidual1[i * kUnroll + j]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] -= subQuantizerData[i * kUnroll + j]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] *= vals[j]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - dist += vals[j]; - } - } - } else { - // Inner product: query slice against the reconstructed sub-quantizer - // for this coarse cell (query o (centroid + subQCentroid)) -#pragma unroll - for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = smemResidual1[i * kUnroll + j]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] += subQuantizerData[i * kUnroll + j]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] *= smemQuery[i * kUnroll + j]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - dist += vals[j]; - } - } - } - - // Remainder loop - if (L2Distance) { -#pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] = smemResidual1[kRemainderBase + j]; - } - -#pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] -= subQuantizerData[kRemainderBase + j]; - } - -#pragma unroll - for (int j = 0; j < kRemainder; 
++j) { - vals[j] *= vals[j]; - } - } else { - // Inner product - // Inner product: query slice against the reconstructed sub-quantizer - // for this coarse cell (query o (centroid + subQCentroid)) -#pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] = smemResidual1[kRemainderBase + j]; - } - -#pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] += subQuantizerData[kRemainderBase + j]; - } - -#pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] *= smemQuery[kRemainderBase + j]; - } - } - -#pragma unroll - for (int j = 0; j < kRemainder; ++j) { - dist += vals[j]; - } - - // We have the distance for our code; write it out - outCodeDistances[queryId][coarse][subQuantizer][code] = - Converter::to(dist); - } // !isLoadingThread - - // Swap residual buffers - float* tmp = smemResidual1; - smemResidual1 = smemResidual2; - smemResidual2 = tmp; - } - } -} - -__global__ void -residualVector(Tensor queries, - Tensor coarseCentroids, - Tensor topQueryToCentroid, - int numSubDim, - // output is transposed: - // (sub q)(query id)(centroid id)(sub dim) - Tensor residual) { - // block x is query id - // block y is centroid id - // thread x is dim - auto queryId = blockIdx.x; - auto centroidId = blockIdx.y; - - int realCentroidId = topQueryToCentroid[queryId][centroidId]; - - for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) { - float q = queries[queryId][dim]; - float c = coarseCentroids[realCentroidId][dim]; - - residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = - q - c; - } -} - -void -runResidualVector(Tensor& pqCentroids, - Tensor& queries, - Tensor& coarseCentroids, - Tensor& topQueryToCentroid, - Tensor& residual, - cudaStream_t stream) { - auto grid = - dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1)); - auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); - - residualVector<<>>( - queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1), - residual); - - CUDA_TEST_ERROR(); -} - -void -runPQCodeDistancesMM(Tensor& pqCentroids, - Tensor& queries, - Tensor& coarseCentroids, - Tensor& topQueryToCentroid, - NoTypeTensor<4, true>& outCodeDistances, - bool useFloat16Lookup, - DeviceMemory& mem, - cublasHandle_t handle, - cudaStream_t stream) { - // Calculate (q - c) residual vector - // (sub q)(query id)(centroid id)(sub dim) - DeviceTensor residual( - mem, - {pqCentroids.getSize(0), - topQueryToCentroid.getSize(0), - topQueryToCentroid.getSize(1), - pqCentroids.getSize(1)}, - stream); - - runResidualVector(pqCentroids, queries, - coarseCentroids, topQueryToCentroid, - residual, stream); - - // Calculate ||q - c||^2 - DeviceTensor residualNorms( - mem, - {pqCentroids.getSize(0) * - topQueryToCentroid.getSize(0) * - topQueryToCentroid.getSize(1)}, - stream); - - auto residualView2 = residual.view<2>( - {pqCentroids.getSize(0) * - topQueryToCentroid.getSize(0) * - topQueryToCentroid.getSize(1), - pqCentroids.getSize(1)}); - - runL2Norm(residualView2, true, residualNorms, true, stream); - - // Perform a batch MM: - // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} => - // (sub q) x {(q * c)(code)} - auto residualView3 = residual.view<3>( - {pqCentroids.getSize(0), - topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), - pqCentroids.getSize(1)}); - - DeviceTensor residualDistance( - mem, - {pqCentroids.getSize(0), - topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), - pqCentroids.getSize(2)}, - stream); - - runIteratedMatrixMult(residualDistance, false, - 
residualView3, false, - pqCentroids, false, - -2.0f, 0.0f, - handle, - stream); - - // Sum ||q - c||^2 along rows - auto residualDistanceView2 = residualDistance.view<2>( - {pqCentroids.getSize(0) * - topQueryToCentroid.getSize(0) * - topQueryToCentroid.getSize(1), - pqCentroids.getSize(2)}); - - runSumAlongRows(residualNorms, residualDistanceView2, false, stream); - - Tensor outCodeDistancesF; - DeviceTensor outCodeDistancesFloatMem; - - if (useFloat16Lookup) { - outCodeDistancesFloatMem = DeviceTensor( - mem, {outCodeDistances.getSize(0), - outCodeDistances.getSize(1), - outCodeDistances.getSize(2), - outCodeDistances.getSize(3)}, - stream); - - outCodeDistancesF = outCodeDistancesFloatMem; - } else { - outCodeDistancesF = outCodeDistances.toTensor(); - } - - // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which - // is where we build our output distances) - auto outCodeDistancesView = outCodeDistancesF.view<3>( - {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), - outCodeDistances.getSize(2), - outCodeDistances.getSize(3)}); - - runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream); - - // Calculate code norms per each sub-dim - // (sub q)(sub dim)(code) is pqCentroids - // transpose to (sub q)(code)(sub dim) - DeviceTensor pqCentroidsTranspose( - mem, - {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)}, - stream); - - runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream); - - auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>( - {pqCentroids.getSize(0) * pqCentroids.getSize(2), - pqCentroids.getSize(1)}); - - DeviceTensor pqCentroidsNorm( - mem, - {pqCentroids.getSize(0) * pqCentroids.getSize(2)}, - stream); - - runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream); - - // View output as (q * c)(sub q * code), and add centroid norm to - // each row - auto outDistancesCodeViewCols = outCodeDistancesView.view<2>( - {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), - outCodeDistances.getSize(2) * outCodeDistances.getSize(3)}); - - runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); - - if (useFloat16Lookup) { - // Need to convert back - auto outCodeDistancesH = outCodeDistances.toTensor(); - convertTensor(stream, - outCodeDistancesF, - outCodeDistancesH); - } -} - -void -runPQCodeDistances(Tensor& pqCentroids, - Tensor& queries, - Tensor& coarseCentroids, - Tensor& topQueryToCentroid, - NoTypeTensor<4, true>& outCodeDistances, - bool l2Distance, - bool useFloat16Lookup, - cudaStream_t stream) { - const auto numSubQuantizers = pqCentroids.getSize(0); - const auto dimsPerSubQuantizer = pqCentroids.getSize(1); - const auto codesPerSubQuantizer = pqCentroids.getSize(2); - - // FIXME: tune - // Reuse of pq centroid data is based on both # of queries * nprobe, - // and we should really be tiling in both dimensions - constexpr int kQueriesPerBlock = 8; - - auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock), - numSubQuantizers); - - // Reserve one block of threads for double buffering - // FIXME: probably impractical for large # of dims? 
- auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize); - auto block = dim3(codesPerSubQuantizer + loadingThreads); - - auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) - + topQueryToCentroid.getSize(1) * sizeof(int); - -#define RUN_CODE(DIMS, L2) \ - do { \ - if (useFloat16Lookup) { \ - auto outCodeDistancesT = outCodeDistances.toTensor(); \ - \ - pqCodeDistances<<>>( \ - queries, kQueriesPerBlock, \ - coarseCentroids, pqCentroids, \ - topQueryToCentroid, outCodeDistancesT); \ - } else { \ - auto outCodeDistancesT = outCodeDistances.toTensor(); \ - \ - pqCodeDistances<<>>( \ - queries, kQueriesPerBlock, \ - coarseCentroids, pqCentroids, \ - topQueryToCentroid, outCodeDistancesT); \ - } \ - } while (0) - -#define CODE_L2(DIMS) \ - do { \ - if (l2Distance) { \ - RUN_CODE(DIMS, true); \ - } else { \ - RUN_CODE(DIMS, false); \ - } \ - } while (0) - - switch (dimsPerSubQuantizer) { - case 1: - CODE_L2(1); - break; - case 2: - CODE_L2(2); - break; - case 3: - CODE_L2(3); - break; - case 4: - CODE_L2(4); - break; - case 6: - CODE_L2(6); - break; - case 8: - CODE_L2(8); - break; - case 10: - CODE_L2(10); - break; - case 12: - CODE_L2(12); - break; - case 16: - CODE_L2(16); - break; - case 20: - CODE_L2(20); - break; - case 24: - CODE_L2(24); - break; - case 28: - CODE_L2(28); - break; - case 32: - CODE_L2(32); - break; - // FIXME: larger sizes require too many registers - we need the - // MM implementation working - default: - FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer " - "not currently supported"); - } - -#undef RUN_CODE -#undef CODE_L2 - - CUDA_TEST_ERROR(); -} - -} } // namespace diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/gpu/impl/PQScanMultiPassNoPrecomputed.cu deleted file mode 100644 index a514694..0000000 --- a/gpu/impl/PQScanMultiPassNoPrecomputed.cu +++ /dev/null @@ -1,597 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace faiss { namespace gpu { - -// This must be kept in sync with PQCodeDistances.cu -bool isSupportedNoPrecomputedSubDimSize(int dims) { - switch (dims) { - case 1: - case 2: - case 3: - case 4: - case 6: - case 8: - case 10: - case 12: - case 16: - case 20: - case 24: - case 28: - case 32: - return true; - default: - // FIXME: larger sizes require too many registers - we need the - // MM implementation working - return false; - } -} - -template -struct LoadCodeDistances { - static inline __device__ void load(LookupT* smem, - LookupT* codes, - int numCodes) { - constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); - - // We can only use the vector type if the data is guaranteed to be - // aligned. The codes are innermost, so if it is evenly divisible, - // then any slice will be aligned. 
- if (numCodes % kWordSize == 0) { - // Load the data by float4 for efficiency, and then handle any remainder - // limitVec is the number of whole vec words we can load, in terms - // of whole blocks performing the load - constexpr int kUnroll = 2; - int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); - limitVec *= kUnroll * blockDim.x; - - LookupVecT* smemV = (LookupVecT*) smem; - LookupVecT* codesV = (LookupVecT*) codes; - - for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { - LookupVecT vals[kUnroll]; - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = - LoadStore::load(&codesV[i + j * blockDim.x]); - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); - } - } - - // This is where we start loading the remainder that does not evenly - // fit into kUnroll x blockDim.x - int remainder = limitVec * kWordSize; - - for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { - smem[i] = codes[i]; - } - } else { - // Potential unaligned load - constexpr int kUnroll = 4; - - int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); - - int i = threadIdx.x; - for (; i < limit; i += kUnroll * blockDim.x) { - LookupT vals[kUnroll]; - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = codes[i + j * blockDim.x]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - smem[i + j * blockDim.x] = vals[j]; - } - } - - for (; i < numCodes; i += blockDim.x) { - smem[i] = codes[i]; - } - } - } -}; - -template -__global__ void -pqScanNoPrecomputedMultiPass(Tensor queries, - Tensor pqCentroids, - Tensor topQueryToCentroid, - Tensor codeDistances, - void** listCodes, - int* listLengths, - Tensor prefixSumOffsets, - Tensor distance) { - const auto codesPerSubQuantizer = pqCentroids.getSize(2); - - // Where the pq code -> residual distance is stored - extern __shared__ char smemCodeDistances[]; - LookupT* codeDist = (LookupT*) smemCodeDistances; - - // Each block handles a single query - auto queryId = blockIdx.y; - auto probeId = blockIdx.x; - - // This is where we start writing out data - // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); - float* distanceOut = distance[outBase].data(); - - auto listId = topQueryToCentroid[queryId][probeId]; - // Safety guard in case NaNs in input cause no list ID to be generated - if (listId == -1) { - return; - } - - unsigned char* codeList = (unsigned char*) listCodes[listId]; - int limit = listLengths[listId]; - - constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 
1 : - (NumSubQuantizers / 4); - unsigned int code32[kNumCode32]; - unsigned int nextCode32[kNumCode32]; - - // We double-buffer the code loading, which improves memory utilization - if (threadIdx.x < limit) { - LoadCode32::load(code32, codeList, threadIdx.x); - } - - LoadCodeDistances::load( - codeDist, - codeDistances[queryId][probeId].data(), - codeDistances.getSize(2) * codeDistances.getSize(3)); - - // Prevent WAR dependencies - __syncthreads(); - - // Each thread handles one code element in the list, with a - // block-wide stride - for (int codeIndex = threadIdx.x; - codeIndex < limit; - codeIndex += blockDim.x) { - // Prefetch next codes - if (codeIndex + blockDim.x < limit) { - LoadCode32::load( - nextCode32, codeList, codeIndex + blockDim.x); - } - - float dist = 0.0f; - -#pragma unroll - for (int word = 0; word < kNumCode32; ++word) { - constexpr int kBytesPerCode32 = - NumSubQuantizers < 4 ? NumSubQuantizers : 4; - - if (kBytesPerCode32 == 1) { - auto code = code32[0]; - dist = ConvertTo::to(codeDist[code]); - - } else { -#pragma unroll - for (int byte = 0; byte < kBytesPerCode32; ++byte) { - auto code = getByte(code32[word], byte * 8, 8); - - auto offset = - codesPerSubQuantizer * (word * kBytesPerCode32 + byte); - - dist += ConvertTo::to(codeDist[offset + code]); - } - } - } - - // Write out intermediate distance result - // We do not maintain indices here, in order to reduce global - // memory traffic. Those are recovered in the final selection step. - distanceOut[codeIndex] = dist; - - // Rotate buffers -#pragma unroll - for (int word = 0; word < kNumCode32; ++word) { - code32[word] = nextCode32[word]; - } - } -} - -void -runMultiPassTile(Tensor& queries, - Tensor& centroids, - Tensor& pqCentroidsInnermostCode, - NoTypeTensor<4, true>& codeDistances, - Tensor& topQueryToCentroid, - bool useFloat16Lookup, - int bytesPerCode, - int numSubQuantizers, - int numSubQuantizerCodes, - thrust::device_vector& listCodes, - thrust::device_vector& listIndices, - IndicesOptions indicesOptions, - thrust::device_vector& listLengths, - Tensor& thrustMem, - Tensor& prefixSumOffsets, - Tensor& allDistances, - Tensor& heapDistances, - Tensor& heapIndices, - int k, - faiss::MetricType metric, - Tensor& outDistances, - Tensor& outIndices, - cudaStream_t stream) { - // We only support two metrics at the moment - FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT || - metric == MetricType::METRIC_L2); - - bool l2Distance = metric == MetricType::METRIC_L2; - - // Calculate offset lengths, so we know where to write out - // intermediate results - runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, - thrustMem, stream); - - // Calculate residual code distances, since this is without - // precomputed codes - runPQCodeDistances(pqCentroidsInnermostCode, - queries, - centroids, - topQueryToCentroid, - codeDistances, - l2Distance, - useFloat16Lookup, - stream); - - // Convert all codes to a distance, and write out (distance, - // index) values for all intermediate results - { - auto kThreadsPerBlock = 256; - - auto grid = dim3(topQueryToCentroid.getSize(1), - topQueryToCentroid.getSize(0)); - auto block = dim3(kThreadsPerBlock); - - // pq centroid distances - auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); - - smem *= numSubQuantizers * numSubQuantizerCodes; - FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); - -#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ - do { \ - auto codeDistancesT = codeDistances.toTensor(); \ - \ - pqScanNoPrecomputedMultiPass \ - <<>>( \ - queries, \ - pqCentroidsInnermostCode, \ - topQueryToCentroid, \ - codeDistancesT, \ - listCodes.data().get(), \ - listLengths.data().get(), \ - prefixSumOffsets, \ - allDistances); \ - } while (0) - -#define RUN_PQ(NUM_SUB_Q) \ - do { \ - if (useFloat16Lookup) { \ - RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ - } else { \ - RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ - } \ - } while (0) - - switch (bytesPerCode) { - case 1: - RUN_PQ(1); - break; - case 2: - RUN_PQ(2); - break; - case 3: - RUN_PQ(3); - break; - case 4: - RUN_PQ(4); - break; - case 8: - RUN_PQ(8); - break; - case 12: - RUN_PQ(12); - break; - case 16: - RUN_PQ(16); - break; - case 20: - RUN_PQ(20); - break; - case 24: - RUN_PQ(24); - break; - case 28: - RUN_PQ(28); - break; - case 32: - RUN_PQ(32); - break; - case 40: - RUN_PQ(40); - break; - case 48: - RUN_PQ(48); - break; - case 56: - RUN_PQ(56); - break; - case 64: - RUN_PQ(64); - break; - case 96: - RUN_PQ(96); - break; - default: - FAISS_ASSERT(false); - break; - } - -#undef RUN_PQ -#undef RUN_PQ_OPT - } - - CUDA_TEST_ERROR(); - - // k-select the output in chunks, to increase parallelism - runPass1SelectLists(prefixSumOffsets, - allDistances, - topQueryToCentroid.getSize(1), - k, - !l2Distance, // L2 distance chooses smallest - heapDistances, - heapIndices, - stream); - - // k-select final output - auto flatHeapDistances = heapDistances.downcastInner<2>(); - auto flatHeapIndices = heapIndices.downcastInner<2>(); - - runPass2SelectLists(flatHeapDistances, - flatHeapIndices, - listIndices, - indicesOptions, - prefixSumOffsets, - topQueryToCentroid, - k, - !l2Distance, // L2 distance chooses smallest - outDistances, - outIndices, - stream); -} - -void runPQScanMultiPassNoPrecomputed(Tensor& queries, - Tensor& centroids, - Tensor& pqCentroidsInnermostCode, - Tensor& topQueryToCentroid, - bool useFloat16Lookup, - int bytesPerCode, - int numSubQuantizers, - int numSubQuantizerCodes, - thrust::device_vector& listCodes, - thrust::device_vector& listIndices, - IndicesOptions indicesOptions, - thrust::device_vector& listLengths, - int maxListLength, - int k, - faiss::MetricType metric, - // output - Tensor& outDistances, - // output - Tensor& outIndices, - GpuResources* res) { - constexpr int kMinQueryTileSize = 8; - constexpr int kMaxQueryTileSize = 128; - constexpr int kThrustMemSize = 16384; - - int nprobe = topQueryToCentroid.getSize(1); - - auto& mem = res->getMemoryManagerCurrentDevice(); - auto stream = res->getDefaultStreamCurrentDevice(); - - // Make a reservation for Thrust to do its dirty work (global memory - // cross-block reduction space); hopefully this is large enough. - DeviceTensor thrustMem1( - mem, {kThrustMemSize}, stream); - DeviceTensor thrustMem2( - mem, {kThrustMemSize}, stream); - DeviceTensor* thrustMem[2] = - {&thrustMem1, &thrustMem2}; - - // How much temporary storage is available? - // If possible, we'd like to fit within the space available. 
- size_t sizeAvailable = mem.getSizeAvailable(); - - // We run two passes of heap selection - // This is the size of the first-level heap passes - constexpr int kNProbeSplit = 8; - int pass2Chunks = std::min(nprobe, kNProbeSplit); - - size_t sizeForFirstSelectPass = - pass2Chunks * k * (sizeof(float) + sizeof(int)); - - // How much temporary storage we need per each query - size_t sizePerQuery = - 2 * // streams - ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets - nprobe * maxListLength * sizeof(float) + // allDistances - // residual distances - nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + - sizeForFirstSelectPass); - - int queryTileSize = (int) (sizeAvailable / sizePerQuery); - - if (queryTileSize < kMinQueryTileSize) { - queryTileSize = kMinQueryTileSize; - } else if (queryTileSize > kMaxQueryTileSize) { - queryTileSize = kMaxQueryTileSize; - } - - // FIXME: we should adjust queryTileSize to deal with this, since - // indexing is in int32 - FAISS_ASSERT(queryTileSize * nprobe * maxListLength < - std::numeric_limits::max()); - - // Temporary memory buffers - // Make sure there is space prior to the start which will be 0, and - // will handle the boundary condition without branches - DeviceTensor prefixSumOffsetSpace1( - mem, {queryTileSize * nprobe + 1}, stream); - DeviceTensor prefixSumOffsetSpace2( - mem, {queryTileSize * nprobe + 1}, stream); - - DeviceTensor prefixSumOffsets1( - prefixSumOffsetSpace1[1].data(), - {queryTileSize, nprobe}); - DeviceTensor prefixSumOffsets2( - prefixSumOffsetSpace2[1].data(), - {queryTileSize, nprobe}); - DeviceTensor* prefixSumOffsets[2] = - {&prefixSumOffsets1, &prefixSumOffsets2}; - - // Make sure the element before prefixSumOffsets is 0, since we - // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), - 0, - sizeof(int), - stream)); - CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), - 0, - sizeof(int), - stream)); - - int codeDistanceTypeSize = useFloat16Lookup ? 
sizeof(half) : sizeof(float); - - int totalCodeDistancesSize = - queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * - codeDistanceTypeSize; - - DeviceTensor codeDistances1Mem( - mem, {totalCodeDistancesSize}, stream); - NoTypeTensor<4, true> codeDistances1( - codeDistances1Mem.data(), - codeDistanceTypeSize, - {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); - - DeviceTensor codeDistances2Mem( - mem, {totalCodeDistancesSize}, stream); - NoTypeTensor<4, true> codeDistances2( - codeDistances2Mem.data(), - codeDistanceTypeSize, - {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); - - NoTypeTensor<4, true>* codeDistances[2] = - {&codeDistances1, &codeDistances2}; - - DeviceTensor allDistances1( - mem, {queryTileSize * nprobe * maxListLength}, stream); - DeviceTensor allDistances2( - mem, {queryTileSize * nprobe * maxListLength}, stream); - DeviceTensor* allDistances[2] = - {&allDistances1, &allDistances2}; - - DeviceTensor heapDistances1( - mem, {queryTileSize, pass2Chunks, k}, stream); - DeviceTensor heapDistances2( - mem, {queryTileSize, pass2Chunks, k}, stream); - DeviceTensor* heapDistances[2] = - {&heapDistances1, &heapDistances2}; - - DeviceTensor heapIndices1( - mem, {queryTileSize, pass2Chunks, k}, stream); - DeviceTensor heapIndices2( - mem, {queryTileSize, pass2Chunks, k}, stream); - DeviceTensor* heapIndices[2] = - {&heapIndices1, &heapIndices2}; - - auto streams = res->getAlternateStreamsCurrentDevice(); - streamWait(streams, {stream}); - - int curStream = 0; - - for (int query = 0; query < queries.getSize(0); query += queryTileSize) { - int numQueriesInTile = - std::min(queryTileSize, queries.getSize(0) - query); - - auto prefixSumOffsetsView = - prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); - - auto codeDistancesView = - codeDistances[curStream]->narrowOutermost(0, numQueriesInTile); - auto coarseIndicesView = - topQueryToCentroid.narrowOutermost(query, numQueriesInTile); - auto queryView = - queries.narrowOutermost(query, numQueriesInTile); - - auto heapDistancesView = - heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); - auto heapIndicesView = - heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); - - auto outDistanceView = - outDistances.narrowOutermost(query, numQueriesInTile); - auto outIndicesView = - outIndices.narrowOutermost(query, numQueriesInTile); - - runMultiPassTile(queryView, - centroids, - pqCentroidsInnermostCode, - codeDistancesView, - coarseIndicesView, - useFloat16Lookup, - bytesPerCode, - numSubQuantizers, - numSubQuantizerCodes, - listCodes, - listIndices, - indicesOptions, - listLengths, - *thrustMem[curStream], - prefixSumOffsetsView, - *allDistances[curStream], - heapDistancesView, - heapIndicesView, - k, - metric, - outDistanceView, - outIndicesView, - streams[curStream]); - - curStream = (curStream + 1) % 2; - } - - streamWait({stream}, streams); -} - -} } // namespace diff --git a/gpu/test/Makefile b/gpu/test/Makefile index 6836314..697e8eb 100644 --- a/gpu/test/Makefile +++ b/gpu/test/Makefile @@ -17,14 +17,18 @@ TESTS_BIN = $(TESTS_OBJ:.o=) $(CUDA_TESTS_OBJ:.o=) # test_gpu_index.py test_pytorch_faiss.py +build: $(TESTS_BIN) +TestUtils.o: TestUtils.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ ../../libfaiss.a -Igtest/include -I../.. 
+ run: $(TESTS_BIN) $(CUDA_TESTS_BIN) for t in $(TESTS_BIN) $(CUDA_TESTS_BIN); do ./$$t || exit; done $(CUDA_TESTS_OBJ): %.o: %.cu gtest - $(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include + $(NVCC) $(NVCCFLAGS) -g -O3 -o $@ -c $< -Igtest/include -I../.. $(TESTS_OBJ): %.o: %.cpp gtest - $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $< -Igtest/include -I../.. $(TESTS_BIN): %: %.o TestUtils.o ../../libfaiss.a gtest/make/gtest.a $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS) @@ -33,7 +37,7 @@ demo_ivfpq_indexing_gpu: demo_ivfpq_indexing_gpu.o ../../libfaiss.a $(CXX) -o $@ $^ $(LDFLAGS) $(LIBS) demo_ivfpq_indexing_gpu.o: demo_ivfpq_indexing_gpu.cpp - $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -o $@ -c $^ -I../.. gtest/make/gtest.a: gtest $(MAKE) -C gtest/make CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)" gtest.a diff --git a/gpu/test/demo_ivfpq_indexing_gpu.cpp b/gpu/test/demo_ivfpq_indexing_gpu.cpp index 852a43c..5fde30c 100644 --- a/gpu/test/demo_ivfpq_indexing_gpu.cpp +++ b/gpu/test/demo_ivfpq_indexing_gpu.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include -- 1.8.3.1
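Note on the first deleted file, PQCodeDistances.cu: for L2 search, runPQCodeDistancesMM fills the (query, coarse, subQuantizer, code) lookup table by expanding the squared distance between the residual r = q - c and each sub-quantizer centroid p as ||r||^2 - 2(r . p) + ||p||^2, which is what the runL2Norm calls, the alpha = -2.0f batched GEMM, and the row/column sums implement. A minimal host-side sketch of that expansion, with hypothetical names and no FAISS dependencies, is:

#include <cstddef>
#include <vector>

// Distance from one residual sub-vector r = q - c to one PQ sub-centroid p,
// accumulated the same way the batched path does: ||r||^2 - 2(r . p) + ||p||^2.
float codeDistanceL2(const std::vector<float>& r, const std::vector<float>& p) {
    float rNorm = 0.0f, pNorm = 0.0f, dot = 0.0f;
    for (std::size_t i = 0; i < r.size(); ++i) {
        rNorm += r[i] * r[i];   // ||r||^2  (runL2Norm over the residual)
        pNorm += p[i] * p[i];   // ||p||^2  (runL2Norm over pqCentroids)
        dot   += r[i] * p[i];   // r . p    (the alpha = -2.0f GEMM term)
    }
    return rNorm - 2.0f * dot + pNorm;
}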
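Note on the second deleted file, PQScanMultiPassNoPrecomputed.cu: once the table above is built, pqScanNoPrecomputedMultiPass scores each encoded list vector by summing one table entry per sub-quantizer byte (codeDist[offset + code] in the kernel). A small CPU-side sketch of that lookup, again with hypothetical names rather than the FAISS API, is:

#include <cstdint>
#include <vector>

// Distance of one PQ-encoded vector for a given (query, probe) pair.
// codeDistances is laid out sub-quantizer-major: (subQuantizer, code).
float pqLookupDistance(const std::uint8_t* code,                 // numSubQuantizers bytes
                       const std::vector<float>& codeDistances,  // numSubQuantizers * codesPerSubQuantizer entries
                       int numSubQuantizers,
                       int codesPerSubQuantizer) {
    float dist = 0.0f;
    for (int sq = 0; sq < numSubQuantizers; ++sq) {
        // Each byte selects one centroid of sub-quantizer sq; its contribution
        // was precomputed per (query, probe) by runPQCodeDistances.
        dist += codeDistances[sq * codesPerSubQuantizer + code[sq]];
    }
    return dist;
}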