From 1fb7430dd5f9ecbb993dc1f169d5fab83c267337 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Thu, 11 Nov 2021 07:49:16 -0700 Subject: [PATCH] sve - initial SVE backend framework --- Makefile | 17 +++- README.md | 4 + backends/ceed-backend-list.h | 2 + backends/sve/ceed-sve-blocked.c | 45 +++++++++ backends/sve/ceed-sve-serial.c | 45 +++++++++ backends/sve/ceed-sve-tensor-f32.c | 150 +++++++++++++++++++++++++++++ backends/sve/ceed-sve-tensor-f64.c | 150 +++++++++++++++++++++++++++++ backends/sve/ceed-sve.h | 17 ++++ 8 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 backends/sve/ceed-sve-blocked.c create mode 100644 backends/sve/ceed-sve-serial.c create mode 100644 backends/sve/ceed-sve-tensor-f32.c create mode 100644 backends/sve/ceed-sve-tensor-f64.c create mode 100644 backends/sve/ceed-sve.h diff --git a/Makefile b/Makefile index 8a3bfd5f82..4c1b494bbb 100644 --- a/Makefile +++ b/Makefile @@ -236,6 +236,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c)) ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c)) opt.c := $(sort $(wildcard backends/opt/*.c)) avx.c := $(sort $(wildcard backends/avx/*.c)) +sve.c := $(sort $(wildcard backends/sve/*.c)) xsmm.c := $(sort $(wildcard backends/xsmm/*.c)) cuda.c := $(sort $(wildcard backends/cuda/*.c)) cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp)) @@ -301,6 +302,7 @@ info: $(info ------------------------------------) $(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS))) $(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS))) + $(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS))) $(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS))) $(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS))) $(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS))) @@ -341,7 +343,7 @@ ifeq ($(MEMCHK),1) BACKENDS_MAKE += $(MEMCHK_BACKENDS) endif -# AVX Backed +# AVX Backends AVX_STATUS = Disabled 
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+avx,-mavx) AVX := $(filter $(AVX_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1)) @@ -352,6 +354,17 @@ ifneq ($(AVX),) BACKENDS_MAKE += $(AVX_BACKENDS) endif +# SVE Backends +SVE_STATUS = Disabled +SVE_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve) +SVE ?= +SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked +ifneq ($(SVE),) + SVE_STATUS = Enabled + libceed.c += $(sve.c) + BACKENDS_MAKE += $(SVE_BACKENDS) +endif + # Collect list of libraries and paths for use in linking and pkg-config PKG_LIBS = # Stubs that will not be RPATH'd @@ -434,7 +447,7 @@ ifneq ($(HIP_LIB_DIR),) BACKENDS_MAKE += $(HIP_BACKENDS) endif -# MAGMA Backend +# MAGMA Backends ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),) MAGMA_ARCH=$(shell nm -g $(MAGMA_DIR)/lib/libmagma.* | grep -c "hipblas") ifeq ($(MAGMA_ARCH), 0) #CUDA MAGMA diff --git a/README.md b/README.md index e2012c629d..ca52b88d48 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,8 @@ There are multiple supported backends, which can be selected at runtime in the e | `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes | | `/cpu/self/avx/serial` | Serial AVX implementation | Yes | | `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes | +| `/cpu/self/sve/serial` | Serial SVE implementation | Yes | +| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes | || | **CPU Valgrind** | | `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes | @@ -180,6 +182,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance. +The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance. + The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](http://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values. 
To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`. A 'development' or 'debugging' version of Valgrind with headers is required to use this backend. diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h index d350f365ed..0745a3b43e 100644 --- a/backends/ceed-backend-list.h +++ b/backends/ceed-backend-list.h @@ -28,5 +28,7 @@ MACRO(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked") MACRO(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial") MACRO(CeedRegister_Ref, 1, "/cpu/self/ref/serial") MACRO(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked") +MACRO(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial") +MACRO(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked") MACRO(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked") MACRO(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial") diff --git a/backends/sve/ceed-sve-blocked.c b/backends/sve/ceed-sve-blocked.c new file mode 100644 index 0000000000..50d9fe0154 --- /dev/null +++ b/backends/sve/ceed-sve-blocked.c @@ -0,0 +1,45 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include <ceed/ceed.h> +#include <ceed/backend.h> +#include <stdbool.h> +#include <string.h> + +#include "ceed-sve.h" + +//------------------------------------------------------------------------------ +// Backend Init +//------------------------------------------------------------------------------ +static int CeedInit_Sve(const char *resource, Ceed ceed) { + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked")) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); + + // Create reference CEED that implementation will be dispatched + // through unless overridden + Ceed ceed_ref; + CeedInit("/cpu/self/opt/blocked", &ceed_ref); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve)); + } else { + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve)); + } + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Backend Register +//------------------------------------------------------------------------------ +CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); } +//------------------------------------------------------------------------------ diff --git a/backends/sve/ceed-sve-serial.c b/backends/sve/ceed-sve-serial.c new file mode 100644 index 0000000000..c988164978 --- /dev/null +++ b/backends/sve/ceed-sve-serial.c @@ -0,0 +1,45 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved.
See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include +#include + +#include "ceed-sve.h" + +//------------------------------------------------------------------------------ +// Backend Init +//------------------------------------------------------------------------------ +static int CeedInit_Sve(const char *resource, Ceed ceed) { + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/sve/serial")) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); + + // Create reference CEED that implementation will be dispatched + // through unless overridden + Ceed ceed_ref; + CeedInit("/cpu/self/opt/serial", &ceed_ref); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); + + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve)); + } else { + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve)); + } + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Backend Register +//------------------------------------------------------------------------------ +CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); } +//------------------------------------------------------------------------------ diff --git a/backends/sve/ceed-sve-tensor-f32.c b/backends/sve/ceed-sve-tensor-f32.c new file mode 100644 index 0000000000..2971db4b7c --- /dev/null +++ b/backends/sve/ceed-sve-tensor-f32.c @@ -0,0 +1,150 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include <ceed/ceed.h> +#include <ceed/backend.h> +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif +#include <stdbool.h> + +#include "ceed-sve.h" + +//------------------------------------------------------------------------------ +// Blocked Tensor Contract +//------------------------------------------------------------------------------ +static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, +                                                 CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v, +                                                 const CeedInt JJ) { +  CeedInt t_stride_0 = B, t_stride_1 = 1; +  if (t_mode == CEED_TRANSPOSE) { +    t_stride_0 = 1; +    t_stride_1 = J; +  } + +  for (CeedInt a = 0; a < A; a++) { +    for (CeedInt b = 0; b < B; b++) { +      // Blocks of JJ rows +      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { +        for (CeedInt jj = 0; jj < JJ; jj++) { // unroll +          // C vectorization by compiler +          for (int32_t c = 0; c < C; c += svcntw()) { +            svbool_t pg = svwhilelt_b32(c, C); +            // Load u, v into vectors +            svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]); +            svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]); +            // Basis matrix value +            float tq = t[(j + jj) * t_stride_0 + b * t_stride_1]; +            // fmadd +            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq)); +          } +        } +      } +    } +  } + +  // Remainder of rows +  CeedInt j = (J / JJ) * JJ; +  if (j < J) { +    for (CeedInt a = 0; a < A; a++) { +      for (CeedInt b = 0; b < B; b++) { +        // Blocks of JJ rows +        for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled +          // C vectorization by compiler +          for (int32_t c = 0; c < C; c += svcntw()) { +            svbool_t pg = svwhilelt_b32(c, C); +            // Load u, v into vectors +            svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]); +            svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]); +            // 
Basis matrix value + float tq = t[(j + jj) * t_stride_0 + b * t_stride_1]; + // fmadd + svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq)); + } + } + } + } + } + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Blocked Tensor Contract +//------------------------------------------------------------------------------ +static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v, + const CeedInt JJ) { + CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { + t_stride_0 = 1; + t_stride_1 = J; + } + + for (CeedInt a = 0; a < A; a++) { + for (CeedInt b = 0; b < B; b++) { + for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { + for (CeedInt jj = 0; jj < JJ; jj++) { // unroll + v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b]; + } + } + } + } + + CeedInt j = (J / JJ) * JJ; + if (j < J) { + for (CeedInt a = 0; a < A; a++) { + for (CeedInt b = 0; b < B; b++) { + for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled + v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b]; + } + } + } + } + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Tensor Contract - Common Sizes +//------------------------------------------------------------------------------ +static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { + return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8); +} +static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt 
C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { + return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8); +} + +//------------------------------------------------------------------------------ +// Tensor Contract Apply +//------------------------------------------------------------------------------ +static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { + if (!add) { + for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0; + } + + if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v); + else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Tensor Contract Create +//------------------------------------------------------------------------------ +int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) { + Ceed ceed; + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve)); + + return CEED_ERROR_SUCCESS; +} +//------------------------------------------------------------------------------ diff --git a/backends/sve/ceed-sve-tensor-f64.c b/backends/sve/ceed-sve-tensor-f64.c new file mode 100644 index 0000000000..970c57d93a --- /dev/null +++ b/backends/sve/ceed-sve-tensor-f64.c @@ -0,0 +1,150 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif +#include + +#include "ceed-sve.h" + +//------------------------------------------------------------------------------ +// Blocked Tensor Contract +//------------------------------------------------------------------------------ +static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v, + const CeedInt JJ) { + CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { + t_stride_0 = 1; + t_stride_1 = J; + } + + for (CeedInt a = 0; a < A; a++) { + for (CeedInt b = 0; b < B; b++) { + // Blocks of JJ rows + for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { + for (CeedInt jj = 0; jj < JJ; jj++) { // unroll + // C vectorization by compiler + for (int32_t c = 0; c < C; c += svcntd()) { + svbool_t pg = svwhilelt_b64(c, C); + // Load u, v into vectors + svfloat64_t u_vec = svld1(pg, &u[(a * B + b) * C + c]); + svfloat64_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]); + // Basis matrix value + double tq = t[(j + jj) * t_stride_0 + b * t_stride_1]; + // fmadd + svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq)); + } + } + } + } + } + + // Remainder of rows + CeedInt j = (J / JJ) * JJ; + if (j < J) { + for (CeedInt a = 0; a < A; a++) { + for (CeedInt b = 0; b < B; b++) { + // Blocks of JJ rows + for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled + // C vectorization by compiler + for (int32_t c = 0; c < C; c += svcntd()) { + svbool_t pg = svwhilelt_b64(c, C); + // Load u, v into vectors + svfloat64_t u_vec = svld1(pg, &u[(a * B + b) * C + c]); + svfloat64_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]); + // Basis matrix value + double tq = t[(j + jj) * t_stride_0 + b * t_stride_1]; + 
// fmadd + svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq)); + } + } + } + } + } + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Blocked Tensor Contract +//------------------------------------------------------------------------------ +static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v, + const CeedInt JJ) { + CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { + t_stride_0 = 1; + t_stride_1 = J; + } + + for (CeedInt a = 0; a < A; a++) { + for (CeedInt b = 0; b < B; b++) { + for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { + for (CeedInt jj = 0; jj < JJ; jj++) { // unroll + v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b]; + } + } + } + } + + CeedInt j = (J / JJ) * JJ; + if (j < J) { + for (CeedInt a = 0; a < A; a++) { + for (CeedInt b = 0; b < B; b++) { + for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled + v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b]; + } + } + } + } + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Tensor Contract - Common Sizes +//------------------------------------------------------------------------------ +static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) { + return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8); +} +static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, 
const CeedInt add, const double *restrict u, double *restrict v) { + return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8); +} + +//------------------------------------------------------------------------------ +// Tensor Contract Apply +//------------------------------------------------------------------------------ +static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) { + if (!add) { + for (CeedInt q = 0; q < A * J * C; q++) v[q] = (double)0.0; + } + + if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v); + else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Tensor Contract Create +//------------------------------------------------------------------------------ +int CeedTensorContractCreate_f64_Sve(CeedBasis basis, CeedTensorContract contract) { + Ceed ceed; + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve)); + + return CEED_ERROR_SUCCESS; +} +//------------------------------------------------------------------------------ diff --git a/backends/sve/ceed-sve.h b/backends/sve/ceed-sve.h new file mode 100644 index 0000000000..b9e0be853c --- /dev/null +++ b/backends/sve/ceed-sve.h @@ -0,0 +1,17 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#ifndef _ceed_sve_h +#define _ceed_sve_h + +#include +#include + +CEED_INTERN int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_f64_Sve(CeedBasis basis, CeedTensorContract contract); + +#endif // _ceed_sve_h