Skip to content

Commit

Permalink
Cus backend (tensorflow#5)
Browse files Browse the repository at this point in the history
* avoid autotune for cus type

* support cus matmul with cutlass using float32 as computation type

* add cus support for more ops

* add stream and scratch memory arguments to the cutlass conv function interface

* avoid recompilation when changing env vars

* temporarily disabled GpuConvAlgorithmPicker

* add hlo compare for cus

* emulate fp16; add forceinline
  • Loading branch information
serach24 committed Jul 23, 2022
1 parent 6829ee9 commit 9b1eb96
Show file tree
Hide file tree
Showing 15 changed files with 1,620 additions and 956 deletions.
1 change: 1 addition & 0 deletions .bazelrc
Expand Up @@ -92,6 +92,7 @@

# Allow builds using libc++ as a linker library
# This is mostly for OSSFuzz, so we also pass in the flags from environment to clean build file
build --incompatible_strict_action_env
build:libc++ --action_env=CC
build:libc++ --action_env=CXX
build:libc++ --action_env=CXXFLAGS=-stdlib=libc++
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc
Expand Up @@ -262,10 +262,15 @@ static StatusOr<absl::optional<se::blas::AlgorithmType>> DoGemmAutotune(

int64 batch_size = gemm_config.batch_size();
absl::optional<se::blas::AlgorithmType> result;
auto op0_type = instr->operand(0)->shape().element_type();
auto op1_type = instr->operand(1)->shape().element_type();
if (batch_size != 1) {
// TODO(b/112111608): Implement auto tune for batched gemm.
VLOG(2) << "Batch size is non-singular, using generic algorithm";
result = absl::nullopt;
} else if (op0_type == CUS || op1_type == CUS) {
VLOG(2) << "data type is cus, no need to autotune";
result = absl::nullopt;
} else {
TF_ASSIGN_OR_RETURN(result,
DoUncachedGemmAutotune(instr, stream, allocator));
Expand Down
16 changes: 9 additions & 7 deletions tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
Expand Up @@ -30,9 +30,9 @@ limitations under the License.
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Linker/Linker.h"
#include "mlir/IR/Module.h" // from @llvm-project
#include "llvm/Support/SourceMgr.h"
#include "mlir/IR/Module.h" // from @llvm-project
#include "mlir/InitAllDialects.h" // from @llvm-project
#include "tensorflow/compiler/xla/protobuf_util.h"
#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
Expand Down Expand Up @@ -455,7 +455,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment(
// the gte(customcall, 0) would probably already be into a fusion node. We
// can't simplify across HloComputation boundaries, so in this case we
// wouldn't be able to simplify away the new_tuple bits.
pipeline.AddPass<GpuConvAlgorithmPicker>(stream_exec, device_allocator);
// pipeline.AddPass<GpuConvAlgorithmPicker>(stream_exec, device_allocator);

// Clean up new_tuple described above.
pipeline.AddPass<TupleSimplifier>();
Expand Down Expand Up @@ -713,17 +713,19 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
if (user_pre_optimization_hook_) {
user_pre_optimization_hook_(*llvm_module);
}

llvm::SMDiagnostic diagnostic;
auto m = llvm::parseIRFile("tensorflow/core/platform/cus.bc", diagnostic,
llvm_context);
llvm::Linker::linkModules(*llvm_module, std::move(m));

string ir_module_string_before_opt;
const bool embed_ir_in_executable =
module->config().debug_options().xla_embed_ir_in_executable();
if (embed_ir_in_executable) {
ir_module_string_before_opt = llvm_ir::DumpModuleToString(*llvm_module);
}

llvm::SMDiagnostic diagnostic;
auto m = llvm::parseIRFile("tensorflow/core/platform/cus.bc", diagnostic, llvm_context);
llvm::Linker::linkModules(*llvm_module, std::move(m));

llvm_ir::DumpIrIfEnabled(*module, *llvm_module, /*optimized=*/false);

{
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/compiler/xla/service/hlo_evaluator.cc
Expand Up @@ -779,6 +779,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
Compare<float>(compare->shape(), direction,
lhs_literal, rhs_literal));
} break;
case CUS: {
TF_ASSIGN_OR_RETURN(evaluated_[compare],
Compare<cus>(compare->shape(), direction,
lhs_literal, rhs_literal));
} break;
case F64: {
TF_ASSIGN_OR_RETURN(evaluated_[compare],
Compare<double>(compare->shape(), direction,
Expand Down
115 changes: 57 additions & 58 deletions tensorflow/core/kernels/cutlass_conv.cu.cc
Expand Up @@ -3,33 +3,33 @@
#include <vector>

#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/kernel/default_conv2d_wgrad.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"
#include "tensorflow/stream_executor/platform/logging.h"

#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/host_tensor.h"

cudaError_t cutlassCusConvForward(MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size,
void* input_data, Tensor4DCoord filter_size,
void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta) {
cudaError_t cutlassCusConvForward(CUstream stream, MatrixCoord stride,
Tensor4DCoord padding, MatrixCoord dilation,
Tensor4DCoord input_size, void* input_data,
Tensor4DCoord filter_size, void* filter_data,
Tensor4DCoord output_size, void* output_data,
float alpha, float beta, void* workSpace,
size_t workSpaceSizeInBytes) {
using ElementA = cus;
using ElementB = cus;
using ElementC = cus;
using ElementAccumulator = cus;
using ElementAccumulator = float;
using ElementCompute = cus;
using SmArch = cutlass::arch::Sm75;

using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
Expand All @@ -39,7 +39,9 @@ cudaError_t cutlassCusConvForward(MatrixCoord stride, Tensor4DCoord padding,
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementCompute,
cutlass::epilogue::thread::ScaleType::Default>; // Data type for alpha/beta in linear combination
cutlass::epilogue::thread::ScaleType::Default>; // Data type for
// alpha/beta in linear
// combination

using Conv2d = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC, ElementB,
Expand All @@ -50,8 +52,7 @@ cudaError_t cutlassCusConvForward(MatrixCoord stride, Tensor4DCoord padding,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic>::Kernel;

using ImplicitGemm =
cutlass::conv::device::ImplicitGemmConvolution<Conv2d>;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2d>;

cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;

Expand Down Expand Up @@ -83,27 +84,26 @@ cudaError_t cutlassCusConvForward(MatrixCoord stride, Tensor4DCoord padding,

ImplicitGemm implicitGemmOp;
size_t workspace_size = implicitGemmOp.get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

cutlass::Status status = implicitGemmOp.can_implement(arguments);
if (status != cutlass::Status::kSuccess) VLOG(3)<< "operation not possible" << cutlassGetStatusString(status);
implicitGemmOp.initialize(arguments, workspace.get());
status = implicitGemmOp();
if (status != cutlass::Status::kSuccess)
VLOG(3) << "operation not possible" << cutlassGetStatusString(status);
status = implicitGemmOp(arguments, workSpace, stream);
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
return cudaSuccess;
}

cudaError_t cutlassCusConvBackwardData(MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size,
void* input_data, Tensor4DCoord filter_size,
void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta) {
cudaError_t cutlassCusConvBackwardData(
CUstream stream, MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size, void* input_data,
Tensor4DCoord filter_size, void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta, void* workSpace,
size_t workSpaceSizeInBytes) {
using ElementA = cus;
using ElementB = cus;
using ElementC = cus;
using ElementAccumulator = cus;
using ElementAccumulator = float;
using ElementCompute = cus;
using SmArch = cutlass::arch::Sm75;

Expand All @@ -118,7 +118,9 @@ cudaError_t cutlassCusConvBackwardData(MatrixCoord stride, Tensor4DCoord padding
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementCompute,
cutlass::epilogue::thread::ScaleType::Default>; // Data type for alpha/beta in linear combination
cutlass::epilogue::thread::ScaleType::Default>; // Data type for
// alpha/beta in linear
// combination

using Conv2d = typename cutlass::conv::kernel::DefaultConv2dDgrad<
ElementA, cutlass::layout::TensorNHWC, ElementB,
Expand All @@ -130,8 +132,7 @@ cudaError_t cutlassCusConvBackwardData(MatrixCoord stride, Tensor4DCoord padding
cutlass::conv::IteratorAlgorithm::kAnalytic,
cutlass::conv::StrideSupport::kStrided>::Kernel;

using ImplicitGemm =
cutlass::conv::device::ImplicitGemmConvolution<Conv2d>;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2d>;

cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;

Expand Down Expand Up @@ -163,28 +164,27 @@ cudaError_t cutlassCusConvBackwardData(MatrixCoord stride, Tensor4DCoord padding

ImplicitGemm implicitGemmOp;
size_t workspace_size = implicitGemmOp.get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

cutlass::Status status = implicitGemmOp.can_implement(arguments);
if (status != cutlass::Status::kSuccess) VLOG(3)<< "operation not possible" << cutlassGetStatusString(status);
implicitGemmOp.initialize(arguments, workspace.get());
status = implicitGemmOp();
if (status != cutlass::Status::kSuccess)
VLOG(3) << "operation not possible" << cutlassGetStatusString(status);
status = implicitGemmOp(arguments, workSpace, stream);
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
return cudaSuccess;
}


cudaError_t cutlassCusConvBackwardFilter(MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size,
void* input_data, Tensor4DCoord filter_size,
void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta) {
cudaError_t cutlassCusConvBackwardFilter(
CUstream stream, MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size, void* input_data,
Tensor4DCoord filter_size, void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta, void* workSpace,
size_t workSpaceSizeInBytes) {
using ElementA = cus;
using ElementB = cus;
using ElementC = cus;
using ElementAccumulator = cus;
using ElementAccumulator = float;
using ElementCompute = cus;
using SmArch = cutlass::arch::Sm75;

Expand All @@ -199,7 +199,9 @@ cudaError_t cutlassCusConvBackwardFilter(MatrixCoord stride, Tensor4DCoord paddi
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementCompute,
cutlass::epilogue::thread::ScaleType::Default>; // Data type for alpha/beta in linear combination
cutlass::epilogue::thread::ScaleType::Default>; // Data type for
// alpha/beta in linear
// combination

using Conv2d = typename cutlass::conv::kernel::DefaultConv2dWgrad<
ElementA, cutlass::layout::TensorNHWC, ElementB,
Expand All @@ -210,8 +212,7 @@ cudaError_t cutlassCusConvBackwardFilter(MatrixCoord stride, Tensor4DCoord paddi
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic>::Kernel;

using ImplicitGemm =
cutlass::conv::device::ImplicitGemmConvolution<Conv2d>;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2d>;

cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;

Expand All @@ -236,36 +237,34 @@ cudaError_t cutlassCusConvBackwardFilter(MatrixCoord stride, Tensor4DCoord paddi
cutlass::TensorRef<cus, cutlass::layout::TensorNHWC> tensor_c(
static_cast<cus*>(output_data), layout_c);

// typename ImplicitGemm::Arguments arguments{
// problem_size, tensor_x,
// tensor_w, tensor_c,
// tensor_c, {static_cast<cus>(alpha), static_cast<cus>(beta)}};
// typename ImplicitGemm::Arguments arguments{
// problem_size, tensor_x,
// tensor_w, tensor_c,
// tensor_c, {static_cast<cus>(alpha), static_cast<cus>(beta)}};

typename ImplicitGemm::Arguments arguments{
typename ImplicitGemm::Arguments arguments{
problem_size, tensor_c,
tensor_x, tensor_w,
tensor_w, {static_cast<cus>(alpha), static_cast<cus>(beta)}};

ImplicitGemm implicitGemmOp;
size_t workspace_size = implicitGemmOp.get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

cutlass::Status status = implicitGemmOp.can_implement(arguments);
if (status != cutlass::Status::kSuccess) VLOG(3)<< "operation not possible" << cutlassGetStatusString(status);
implicitGemmOp.initialize(arguments, workspace.get());
status = implicitGemmOp();
if (status != cutlass::Status::kSuccess)
VLOG(3) << "operation not possible" << cutlassGetStatusString(status);
status = implicitGemmOp(arguments, workSpace, stream);
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
return cudaSuccess;
}


cudaError_t cutlassCusBiasActivationConv(
MatrixCoord stride, Tensor4DCoord padding, MatrixCoord dilation,
Tensor4DCoord input_size, const void* input_data, Tensor4DCoord filter_size,
const void* filter_data, Tensor4DCoord output_size, void* output_data,
float alpha, float beta) {
CUstream stream, MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size, const void* input_data,
Tensor4DCoord filter_size, const void* filter_data,
Tensor4DCoord output_size, void* output_data, float alpha, float beta) {
return cudaErrorUnknown;
}

21 changes: 12 additions & 9 deletions tensorflow/core/kernels/cutlass_conv.h
Expand Up @@ -2,8 +2,8 @@
#define TENSORFLOW_CORE_KERNELS_CUTLASS_CONV_H_

#include "tensorflow/core/platform/cus.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/driver_types.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/matrix_coord.h"

Expand All @@ -12,26 +12,29 @@ using cutlass::Tensor4DCoord;

using tensorflow::cus;

cudaError_t cutlassCusConvForward(MatrixCoord stride, Tensor4DCoord padding,
cudaError_t cutlassCusConvForward(CUstream stream, MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size,
void* input_data, Tensor4DCoord filter_size,
void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta);
void* output_data, float alpha, float beta,
void *workSpace, size_t workSpaceSizeInBytes);

cudaError_t cutlassCusConvBackwardData(MatrixCoord stride, Tensor4DCoord padding,
cudaError_t cutlassCusConvBackwardData(CUstream stream, MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size,
void* input_data, Tensor4DCoord filter_size,
void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta);
void* output_data, float alpha, float beta,
void *workSpace, size_t workSpaceSizeInBytes);

cudaError_t cutlassCusConvBackwardFilter(MatrixCoord stride, Tensor4DCoord padding,
cudaError_t cutlassCusConvBackwardFilter(CUstream stream, MatrixCoord stride, Tensor4DCoord padding,
MatrixCoord dilation, Tensor4DCoord input_size,
void* input_data, Tensor4DCoord filter_size,
void* filter_data, Tensor4DCoord output_size,
void* output_data, float alpha, float beta);
void* output_data, float alpha, float beta,
void *workSpace, size_t workSpaceSizeInBytes);

cudaError_t cutlassCusBiasActivationConv(
MatrixCoord stride, Tensor4DCoord padding, MatrixCoord dilation,
CUstream stream, MatrixCoord stride, Tensor4DCoord padding, MatrixCoord dilation,
Tensor4DCoord input_size, const void* input_data, Tensor4DCoord filter_size,
const void* filter_data, Tensor4DCoord output_size, void* output_data,
void* alpha, void* beta);
Expand Down

0 comments on commit 9b1eb96

Please sign in to comment.