gramineproject · vijaydhanraj · Feb 17, 2021 · Mar 5, 2021 · Mar 22, 2021 · Mar 31, 2021
diff --git a/Documentation/devel/performance.rst b/Documentation/devel/performance.rst
@@ -386,10 +386,10 @@ workloads. The manifest options include:
 - ``libos.check_invalid_pointers = false`` -- disable checks of invalid pointers
   on system call invocations. Most real-world applications never provide invalid
   arguments to system calls, so there is no need in additional checks.
-- ``sgx.preheat_enclave = true`` -- pre-fault all enclave pages during enclave
-  initialization. This shifts the overhead of page faults on non-present enclave
-  pages from runtime to enclave startup time. Using this option makes sense only
-  if the whole enclave memory fits into :term:`EPC`.
+- ``sgx.preheat_enclave_sz = "1"`` -- pre-fault all enclave pages during enclave
+  initialization when ``sgx.edmm_enable_heap = false``. This shifts the overhead
+  of page faults on non-present enclave pages from runtime to enclave startup time.
+  Using this option makes sense only if the whole enclave memory fits into :term:`EPC`.
 
 If your application periodically fails and complains about seemingly irrelevant
 things, it may be due to insufficient enclave memory. Please try to increase

diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst
@@ -420,6 +420,59 @@ more CPU cores and burning more CPU cycles. For example, a single-threaded
 Redis instance on Linux becomes 5-threaded on Graphene with Exitless. Thus,
 Exitless may negatively impact throughput but may improve latency.
 
+EDMM dynamic heap (Experimental)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+    sgx.edmm_enable_heap = [true|false]
+    (Default: false)
+
+This syntax enables the EDMM dynamic heap feature available on Intel
+:term:`SGX2`-capable hardware. When enabled, EPC pages are not added when
+creating the enclave but are allocated dynamically: when Graphene requests more
+heap memory, the enclave executes EACCEPT, which triggers a page fault (#PF).
+The Intel SGX driver (legacy driver) handles the fault by EAUGing the page and
+returning control to the enclave. The enclave then resumes at the same EACCEPT
+instruction, which now succeeds.
+
+One of the key advantages of EDMM is that the enclave ends up using only the
+EPC pages that it requires and the user does not need to tailor the enclave
+size precisely for each workload. EDMM does help to reduce the loading time of
+a large enclave application but can impact the runtime as there is a penalty
+for additional asynchronous enclave exits (AEXs) caused by #PFs.
+
+EDMM Batch Allocation (Experimental)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+    sgx.edmm_batch_allocation = [true|false]
+    (Default: false)
+
+The SGX driver allocates EPC pages dynamically by faulting in pages one at a time.
+This incurs a huge overhead due to an enclave exit for each page. This syntax enables
+the use of a new IOCTL, introduced in the SGX driver, which takes the requested
+range and EAUGs all the pages in one shot. The enclave then EACCEPTs all
+the requested pages.
+
+.. note ::
+   New SGX driver IOCTL is experimental and is not yet available as part of official
+   Intel SGX OOT driver release. This option is not yet ready for public usage.
+
+EDMM Lazy Free optimization (Experimental)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+    sgx.edmm_lazyfree_th = [NUM]
+    (Default: 0)
+
+This syntax specifies the **percentage** of the total heap that can be freed in a lazy manner.
+Until this threshold is reached, Graphene does not release any dynamically allocated memory.
+This optimization helps reduce the expensive enclave entries/exits associated with the dynamic
+freeing of EPC pages.
+
 Optional CPU features (AVX, AVX512, MPX, PKRU)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -562,11 +615,15 @@ Pre-heating enclave
 
 ::
 
-    sgx.preheat_enclave = [true|false]
-    (Default: false)
+    sgx.preheat_enclave_sz = ["0"|"1"|"SIZE"]
+    (Default: "0")
+
+When set to "1", this option instructs Graphene to pre-fault all heap pages during
+initialization; setting it to "0" disables the feature. When ``sgx.edmm_enable_heap``
+is enabled, the user can set the exact amount of heap to preheat by specifying ``SIZE``.
+For example, when the size is "64M", Graphene will pre-fault the top 64M of heap pages.
 
-When enabled, this option instructs Graphene to pre-fault all heap pages during
-initialization. This has a negative impact on the total run time, but shifts the
+This has a negative impact on the total run time, but shifts the
 :term:`EPC` page faults cost to the initialization phase, which can be useful in
 a scenario where a server starts and receives connections / work packages only
 after some time. It also makes the later run time and latency much more

diff --git a/LibOS/shim/test/regression/.gitignore b/LibOS/shim/test/regression/.gitignore
@@ -19,6 +19,7 @@
 /devfs
 /device_passthrough
 /double_fork
+/edmm_heap_mmap
 /env_from_file
 /env_from_host
 /epoll_epollet

diff --git a/LibOS/shim/test/regression/Makefile b/LibOS/shim/test/regression/Makefile
@@ -17,6 +17,7 @@ c_executables = \
 	devfs \
 	device_passthrough \
 	double_fork \
+	edmm_heap_mmap \
 	epoll_epollet \
 	epoll_wait_timeout \
 	eventfd \
@@ -156,6 +157,9 @@ LDLIBS-attestation += ../../../../common/src/crypto/mbedtls/install/lib/libmbedc
 CFLAGS-fp_multithread += -pthread -fno-builtin  # see comment in the test's source
 LDLIBS-fp_multithread += -lm
 
+CFLAGS-edmm_heap_mmap += -pthread
+LDLIBS-edmm_heap_mmap += -lm
+
 proc_common: proc_common.o dump.o
 	$(call cmd,cmulti)
 

diff --git a/LibOS/shim/test/regression/edmm_heap_mmap.c b/LibOS/shim/test/regression/edmm_heap_mmap.c
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2020 Intel Corporation */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <math.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#define min(a, b)               (((a) < (b)) ? (a) : (b)) /* NB: evaluates one argument twice */
+#define MAIN_THREAD_CNT         1  /* subtracted from the thread budget in run() */
+#define INTERNAL_THREAD_CNT     2  /* presumably Graphene-internal helper threads -- confirm */
+#define MANIFEST_SGX_THREAD_CNT 16 /* corresponds to sgx.thread_num in the manifest template */
+
+/* barrier on which the parent (in run()) and every child (in dowork()) wait exactly once */
+pthread_barrier_t barrier;
+
+static pid_t mygettid(void) { /* thread ID of the caller, obtained via the raw gettid syscall */
+    return (pid_t)syscall(SYS_gettid);
+}
+
+double g_per_mmap_diff[MANIFEST_SGX_THREAD_CNT] = {0};   /* avg mmap time (ms), slot tid - 1 */
+double g_per_munmap_diff[MANIFEST_SGX_THREAD_CNT] = {0}; /* avg munmap time (ms), slot tid - 1 */
+
+/* Times 500 mmap/memset/munmap cycles of a 0x4000-byte anonymous private mapping filled with
+ * `val`, then stores the per-call averages (in ms) in g_per_*_diff[] at index (tid - 1). */
+static void mmap_munmap_memory(int val) {
+    size_t mmap_length = 0x4000;
+    struct timeval tv1 = {0};
+    struct timeval tv2 = {0};
+    long long mmap_diff = 0;   /* summed mmap latency in us; must start at 0, only += below */
+    long long munmap_diff = 0; /* summed munmap latency in us */
+
+    for (int i = 0; i < 500; i++) {
+        if (gettimeofday(&tv1, NULL)) {
+            printf("Cannot get time 1: %m\n");
+        }
+        void* a = mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+                      -1, 0);
+        if (a == MAP_FAILED) {
+            err(EXIT_FAILURE, "mmap failed for tid=%d for size 0x%zx\n",  mygettid(),
+                 mmap_length);
+        }
+        if (gettimeofday(&tv2, NULL)) {
+            printf("Cannot get time 2: %m\n");
+        }
+
+        mmap_diff += (tv2.tv_sec - tv1.tv_sec) * 1000000ll + (tv2.tv_usec - tv1.tv_usec);
+
+        memset(a, val, mmap_length); /* write to every byte of the fresh mapping */
+        if (gettimeofday(&tv1, NULL)) {
+            printf("Cannot get time 1: %m\n");
+        }
+        int rv = munmap(a, mmap_length);
+        if (rv) {
+            err(EXIT_FAILURE, "munmap failed for tid =%d for size 0x%zx\n",  mygettid(),
+                 mmap_length);
+        }
+        if (gettimeofday(&tv2, NULL)) {
+            printf("Cannot get time 2: %m\n");
+        }
+
+        munmap_diff += (tv2.tv_sec - tv1.tv_sec) * 1000000ll + (tv2.tv_usec - tv1.tv_usec);
+    }
+
+    int tid = mygettid();
+    assert(tid > 0 && tid <= MANIFEST_SGX_THREAD_CNT); /* tid - 1 must be a valid array index */
+    g_per_mmap_diff[tid - 1] = (double)mmap_diff / 500 / 1000;
+    g_per_munmap_diff[tid - 1] = (double)munmap_diff / 500 / 1000;
+}
+
+/* Thread entry point: runs the mmap/munmap timing loop, then waits once on the shared barrier */
+static void* dowork(void* args) {
+    uint32_t val = *(uint32_t*)args; /* fill value handed over by the creating thread */
+
+    mmap_munmap_memory(val);
+
+    /* child waits on barrier */
+    int ret = pthread_barrier_wait(&barrier);
+    if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
+        errx(EXIT_FAILURE, "Child did not wait on barrier!");
+    }
+
+    return NULL;
+}
+
+static int run(int sgx_thread_cnt) { /* one timing pass; returns 0, exits on any failure */
+    int ret;
+    long numprocs = sysconf(_SC_NPROCESSORS_ONLN);
+    if (numprocs < 0) {
+        err(EXIT_FAILURE, "Failed to retrieve the number of logical processors!");
+    }
+
+    /* Worker count = online CPUs, capped by sgx_thread_cnt minus the main and internal threads.
+     * To run on all cores, increase sgx.thread_num in the manifest.template and also set
+     * MANIFEST_SGX_THREAD_CNT to the same value. */
+    numprocs = min(numprocs, (sgx_thread_cnt - (INTERNAL_THREAD_CNT + MAIN_THREAD_CNT)));
+    printf("NO. of threads created = %ld\n", numprocs);
+
+    pthread_t* threads = (pthread_t*)malloc(numprocs * sizeof(pthread_t));
+    if (!threads) {
+         errx(EXIT_FAILURE, "memory allocation failed");
+    }
+
+    /* per-thread unique values; each worker uses its value as the memset fill byte */
+    int* per_thread_val = (int*)malloc(numprocs * sizeof(int));
+    if (!per_thread_val) {
+         errx(EXIT_FAILURE, "per-thread memory allocation failed");
+    }
+
+    if (pthread_barrier_init(&barrier, NULL, numprocs + 1)) {
+        free(threads);
+        errx(EXIT_FAILURE, "pthread barrier init failed");
+    }
+
+    /* Start the workers; worker i receives a pointer to its own value (i + 1) */
+    for (uint32_t i = 0; i < numprocs; i++) {
+        per_thread_val[i] = i + 1;
+        ret = pthread_create(&threads[i], NULL, dowork, (void*)&per_thread_val[i]);
+        if (ret != 0) {
+            free(threads);
+            errx(EXIT_FAILURE, "pthread_create failed!");
+        }
+    }
+
+    /* parent waits on barrier; workers reach it only after finishing their timing loop */
+    ret = pthread_barrier_wait(&barrier);
+    if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
+        free(threads);
+        errx(EXIT_FAILURE, "Parent did not wait on barrier!");
+    }
+
+    mmap_munmap_memory(0);
+
+    for (int i = 0; i < numprocs; i++) {
+        ret = pthread_join(threads[i], NULL);
+        if (ret != 0) {
+            free(threads);
+            errx(EXIT_FAILURE, "pthread_join failed!");
+        }
+    }
+
+    /* All workers have been joined. Free resources */
+    pthread_barrier_destroy(&barrier);
+    free(per_thread_val);
+    free(threads);
+
+    double total_mmap_diff = 0;
+    double total_munmap_diff = 0;
+    for (int i = 1; i < numprocs+1; i++) { /* slot i <- tid i + 1; slot 0 is not reported */
+        printf("Average mmap_time(ms): %lf, munmap_time(ms): %lf for thread %d\n",
+                g_per_mmap_diff[i], g_per_munmap_diff[i], i);
+        total_mmap_diff += g_per_mmap_diff[i];
+        total_munmap_diff += g_per_munmap_diff[i];
+    }
+    printf("Avg across all threads, mmap_time(ms): %lf, munmap_time(ms): %lf\n",
+           (float)(total_mmap_diff/numprocs), (float)(total_munmap_diff/numprocs) );
+
+    printf("===================================================================================\n");
+    return 0;
+}
+
+#define MAX_THREADS 64
+int main(int argc, const char** argv) {
+    /* Upper bound on the thread budget handed to run(): the smaller of the two limits. */
+    int thread_budget = min(MAX_THREADS, MANIFEST_SGX_THREAD_CNT);
+    /* Pass k asks run() for up to 2^(k-1) workers (1, 2, 4, 8 ...) plus the reserved threads. */
+    for (int step = 1, budget = 4; budget < thread_budget; step++) {
+        run(budget);
+        budget = (1 << step) + INTERNAL_THREAD_CNT + MAIN_THREAD_CNT;
+        sleep(5);
+    }
+
+    printf("TEST OK\n");
+    return 0;
+}
diff --git a/LibOS/shim/test/regression/openmp.manifest.template b/LibOS/shim/test/regression/openmp.manifest.template
@@ -11,7 +11,7 @@ loader.env.LD_LIBRARY_PATH = "/lib:/usrlib"
 # the manifest options below are added only for testing, they have no significance for OpenMP
 libos.check_invalid_pointers = false
 sys.enable_sigterm_injection = true
-sgx.preheat_enclave = true
+sgx.preheat_enclave_sz = "1"
 
 fs.mount.lib.type = "chroot"
 fs.mount.lib.path = "/lib"

diff --git a/LibOS/shim/test/regression/test_libos.py b/LibOS/shim/test/regression/test_libos.py
@@ -451,7 +451,10 @@ def test_043_futex_wake_op(self):
 
         self.assertIn('Test successful!', stdout)
 
-    def test_050_mmap(self):
+    @unittest.skipIf(HAS_SGX,
+        'On SGX, SIGBUS isn\'t always implemented correctly, for lack '
+        'of memory protection. For now, some of these cases won\'t work.')
+    def test_051_mmap(self):
         stdout, _ = self.run_binary(['mmap_file'], timeout=60)
 
         # Private mmap beyond file range
@@ -464,14 +467,6 @@ def test_050_mmap(self):
         self.assertIn('mmap test 3 passed', stdout)
         self.assertIn('mmap test 4 passed', stdout)
 
-        # "test 5" and "test 8" are checked below, in test_051_mmap_sgx
-
-    @unittest.skipIf(HAS_SGX,
-        'On SGX, SIGBUS isn\'t always implemented correctly, for lack '
-        'of memory protection. For now, some of these cases won\'t work.')
-    def test_051_mmap_sgx(self):
-        stdout, _ = self.run_binary(['mmap_file'], timeout=60)
-
         # SIGBUS test
         self.assertIn('mmap test 5 passed', stdout)
         self.assertIn('mmap test 8 passed', stdout)

diff --git a/Pal/include/pal_internal.h b/Pal/include/pal_internal.h
@@ -270,6 +270,8 @@ void free(void* mem);
 #error Unsupported compiler
 #endif
 
+PAL_IDX pal_get_cur_tid(void);
+
 int _DkInitDebugStream(const char* path);
 int _DkDebugLog(const void* buf, size_t size);
 

diff --git a/Pal/src/host/Linux-SGX/Makefile b/Pal/src/host/Linux-SGX/Makefile
@@ -50,6 +50,7 @@ enclave-objs = \
 	db_sockets.o \
 	db_streams.o \
 	db_threading.o \
+	edmm_pages.o \
 	enclave_ecalls.o \
 	enclave_framework.o \
 	enclave_ocalls.o \