BVLC · artyom-beilis · Apr 20, 2021 · Sep 13, 2021 · Nov 2, 2021
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
@@ -198,7 +198,11 @@ function(detect_cuDNN)
     set(HAVE_CUDNN  TRUE PARENT_SCOPE)
     set(CUDNN_FOUND TRUE PARENT_SCOPE)
 
-    file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    if(EXISTS ${CUDNN_INCLUDE}/cudnn_version.h)
+      file(READ ${CUDNN_INCLUDE}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
+    else()
+      file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    endif()
 
     # cuDNN v3 and beyond
     string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
@@ -224,7 +228,7 @@ function(detect_cuDNN)
 
     string(COMPARE LESS "${CUDNN_VERSION_MAJOR}" 3 cuDNNVersionIncompatible)
     if(cuDNNVersionIncompatible)
-      message(FATAL_ERROR "cuDNN version >3 is required.")
+      message("cuDNN version >3 is required.")
     endif()
 
     set(CUDNN_VERSION "${CUDNN_VERSION}" PARENT_SCOPE)

diff --git a/include/caffe/layers/cudnn_conv_layer.hpp b/include/caffe/layers/cudnn_conv_layer.hpp
@@ -30,7 +30,9 @@ template <typename Dtype>
 class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
  public:
   explicit CuDNNConvolutionLayer(const LayerParameter& param)
-      : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+      : ConvolutionLayer<Dtype>(param),
+        handles_setup_(false),
+        shapes_ready_(false) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -43,7 +45,10 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  void findOptimalAlgorithm(int index, size_t workspace_limit_bytes);
+
   bool handles_setup_;
+  bool shapes_ready_;
   cudnnHandle_t* handle_;
   cudaStream_t*  stream_;
 
@@ -58,6 +63,8 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
   int bottom_offset_, top_offset_, bias_offset_;
 
+  std::vector<int> cudnn_shape_;
+
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;

diff --git a/include/caffe/layers/cudnn_deconv_layer.hpp b/include/caffe/layers/cudnn_deconv_layer.hpp
@@ -25,7 +25,9 @@ template <typename Dtype>
 class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
  public:
   explicit CuDNNDeconvolutionLayer(const LayerParameter& param)
-    : DeconvolutionLayer<Dtype>(param), handles_setup_(false) {}
+    : DeconvolutionLayer<Dtype>(param),
+      handles_setup_(false),
+      shapes_ready_(false) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
                           const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -39,7 +41,11 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
                             const vector<bool>& propagate_down,
                             const vector<Blob<Dtype>*>& bottom);
 
+  void findOptimalAlgorithm(int index, size_t workspace_limit_bytes);
+  void getWorkSpaces(int index);
+
   bool handles_setup_;
+  bool shapes_ready_;
   cudnnHandle_t* handle_;
   cudaStream_t*  stream_;
 
@@ -54,6 +60,8 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
   int bottom_offset_, top_offset_, bias_offset_;
 
+  std::vector<int> cudnn_shape_;
+
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;

diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
@@ -3,6 +3,8 @@
 #ifdef USE_CUDNN
 
 #include <cudnn.h>
+#include <algorithm>
+#include <vector>
 
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
@@ -50,6 +52,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
     case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
       return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
+#endif
+#if CUDNN_VERSION_MIN(8, 0, 0)
+    case CUDNN_STATUS_VERSION_MISMATCH:
+      return "CUDNN_STATUS_VERSION_MISMATCH";
 #endif
   }
   return "Unknown cudnn status";
@@ -161,6 +167,40 @@ inline void createActivationDescriptor(cudnnActivationDescriptor_t* activ_desc,
                                            CUDNN_PROPAGATE_NAN, Dtype(0)));
 }
 
+template<typename T>
+inline T findFirstSuitableAlgorithm(std::vector<T> const &v,
+                                    size_t count, size_t limit) {
+  count = std::min(count, v.size());
+  for (size_t i = 0; i < count; i++) {
+    if (v[i].memory <= limit) {
+      return v[i];
+    }
+  }
+  if (!v.empty()) {
+    return v[0];
+  }
+  return T();
+}
+
+/// Check that new shape identical to previous up to batch size
+/// that allowed to be smaller
+inline bool areConvShapesCompatible(std::vector<int> const &src,
+                                    std::vector<int> const &reshaped) {
+  if (src.size() != reshaped.size() || src.size() < 2) {
+    return false;
+  }
+  if (src[0] < reshaped[0]) {
+    return false;
+  }
+  for (size_t i = 1; i < src.size(); i++) {
+    if (src[i] != reshaped[i]) {
+        return false;
+    }
+  }
+  return true;
+}
+
+
 }  // namespace cudnn
 
 }  // namespace caffe

diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -85,12 +85,133 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   }
 
   handles_setup_ = true;
+  shapes_ready_ = false;
 }
 
+#if CUDNN_VERSION_MIN(7, 0, 0)
+// Using FindAlgorithm
+template <typename Dtype>
+void CuDNNConvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
+                                size_t workspace_limit_bytes) {
+
+  int nfwd = 1, nbwd_filter = 1, nbwd_data = 1;
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(
+                handle_[0], &nfwd));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
+                handle_[0], &nbwd_filter));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
+                handle_[0], &nbwd_data));
+
+  std::vector<cudnnConvolutionFwdAlgoPerf_t>
+  fwd_v(std::max(nfwd, 1));
+
+  std::vector<cudnnConvolutionBwdFilterAlgoPerf_t>
+  bwd_filter_v(std::max(nbwd_filter, 1));
+
+  std::vector<cudnnConvolutionBwdDataAlgoPerf_t>
+  bwd_data_v(std::max(nbwd_data, 1));
+
+  cudnnConvolutionFwdAlgoPerf_t       fwd_perf;
+  cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf;
+  cudnnConvolutionBwdDataAlgoPerf_t   bwd_data_perf;
+  int count = 0;
+  // choose forward and backward algorithms + workspace(s)
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
+    bottom_descs_[index],
+    filter_desc_,
+    conv_descs_[index],
+    top_descs_[index],
+    fwd_v.size(), &count, &fwd_v[0]));
+
+  fwd_perf = cudnn::findFirstSuitableAlgorithm(fwd_v,
+               count, workspace_limit_bytes);
+  fwd_algo_[index] =fwd_perf.algo;
+  workspace_fwd_sizes_[index] = fwd_perf.memory;
+
+  // choose backward algorithm for filter
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(handle_[0],
+        bottom_descs_[index],
+        top_descs_[index],
+        conv_descs_[index],
+        filter_desc_,
+        bwd_filter_v.size(),
+        &count,
+        &bwd_filter_v[0]));
+
+  bwd_filter_perf = cudnn::findFirstSuitableAlgorithm(bwd_filter_v,
+                                count, workspace_limit_bytes);
+  bwd_filter_algo_[index] = bwd_filter_perf.algo;
+  workspace_bwd_filter_sizes_[index] = bwd_filter_perf.memory;
+
+  // choose backward algo for data
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
+        filter_desc_, top_descs_[index], conv_descs_[index],
+        bottom_descs_[index],
+        bwd_data_v.size(), &count, &bwd_data_v[0]));
+
+  bwd_data_perf = cudnn::findFirstSuitableAlgorithm(bwd_data_v,
+                            count, workspace_limit_bytes);
+  bwd_data_algo_[index] = bwd_data_perf.algo;
+  workspace_bwd_data_sizes_[index] = bwd_data_perf.memory;
+
+}
+#else
+
+template <typename Dtype>
+void CuDNNConvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
+                                        size_t workspace_limit_bytes) {
+  // choose forward and backward algorithms + workspace(s)
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
+    bottom_descs_[index],
+    filter_desc_,
+    conv_descs_[index],
+    top_descs_[index],
+    CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+    workspace_limit_bytes,
+    &fwd_algo_[index]));
+
+  // choose backward algorithm for filter
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
+        bottom_descs_[index], top_descs_[index], conv_descs_[index],
+        filter_desc_,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes, &bwd_filter_algo_[index]) );
+
+  // choose backward algo for data
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
+    filter_desc_, top_descs_[index], conv_descs_[index],
+    bottom_descs_[index],
+    CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+    workspace_limit_bytes, &bwd_data_algo_[index]));
+
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
+    bottom_descs_[index],
+    filter_desc_,
+    conv_descs_[index],
+    top_descs_[index],
+    fwd_algo_[index],
+    &(workspace_fwd_sizes_[index])));
+
+  // get workspace for backwards filter algorithm
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
+    bottom_descs_[index], top_descs_[index], conv_descs_[index],
+    filter_desc_, bwd_filter_algo_[index],
+    &workspace_bwd_filter_sizes_[index]));
+
+
+  // get workspace size
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
+    filter_desc_, top_descs_[index], conv_descs_[index], bottom_descs_[index],
+    bwd_data_algo_[index], &workspace_bwd_data_sizes_[index]) );
+}
+#endif
+
+
 template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   ConvolutionLayer<Dtype>::Reshape(bottom, top);
+
   CHECK_EQ(2, this->num_spatial_axes_)
       << "CuDNNConvolution input must have 2 spatial axes "
       << "(e.g., height and width). "
@@ -112,6 +233,9 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   // planning strategy and a rewrite of Caffe's GPU memory mangagement
   size_t workspace_limit_bytes = 8*1024*1024;
 
+  bool select_algo = !shapes_ready_ 
+	  	     || cudnn_shape_ != bottom[0]->shape();
+
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
         this->num_,
@@ -127,46 +251,14 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         filter_desc_, pad_h, pad_w,
         stride_h, stride_w);
 
-    // choose forward and backward algorithms + workspace(s)
-    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
-      bottom_descs_[i],
-      filter_desc_,
-      conv_descs_[i],
-      top_descs_[i],
-      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-      workspace_limit_bytes,
-      &fwd_algo_[i]));
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
-      bottom_descs_[i],
-      filter_desc_,
-      conv_descs_[i],
-      top_descs_[i],
-      fwd_algo_[i],
-      &(workspace_fwd_sizes_[i])));
-
-    // choose backward algorithm for filter
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
-          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-          workspace_limit_bytes, &bwd_filter_algo_[i]) );
-
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
-          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-          bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
-
-    // choose backward algo for data
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
-          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
-          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes, &bwd_data_algo_[i]));
-
-    // get workspace size
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
-          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
-          bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
+    if (select_algo) {
+      findOptimalAlgorithm(i, workspace_limit_bytes);
+    }
+  }
+  if (select_algo) {
+    cudnn_shape_ = bottom[0]->shape();
   }
+  shapes_ready_ = true;
 
   // reduce over all workspace sizes to get a maximum to allocate / reallocate
   size_t total_workspace_fwd = 0;
@@ -185,15 +277,22 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   size_t max_workspace = std::max(total_workspace_fwd,
                              total_workspace_bwd_data);
   max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
+  // ensure alignment
+  const size_t aligmnent_workspace = 128;
+  max_workspace = (max_workspace + aligmnent_workspace - 1) 
+	  / aligmnent_workspace * aligmnent_workspace;
   // ensure all groups have enough workspace
   size_t total_max_workspace = max_workspace *
                                (this->group_ * CUDNN_STREAMS_PER_GROUP);
 
   // this is the total amount of storage needed over all groups + streams
   if (total_max_workspace > workspaceSizeInBytes) {
     DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
+    static size_t total_ws_global = 0;
+    total_ws_global += total_max_workspace - workspaceSizeInBytes;
     workspaceSizeInBytes = total_max_workspace;
 
+
     // free the existing workspace and allocate a new (larger) one
     cudaFree(this->workspaceData);