Skip to content

Debug Out Of Memory (OOM) Errors in Simulation and Production

Jingyu Zhou edited this page Feb 29, 2024 · 1 revision

Option 1: Use jemalloc

From version 7.0, jemalloc is used by FDB. Thus we can use jemalloc's profiling capability, with some minimal changes:

diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake
index aedb83480..e2836a11f 100644
--- a/cmake/Jemalloc.cmake
+++ b/cmake/Jemalloc.cmake
@@ -15,7 +15,9 @@ ExternalProject_add(Jemalloc_project
   "${JEMALLOC_DIR}/lib/libjemalloc.a"
   "${JEMALLOC_DIR}/lib/libjemalloc_pic.a"
   PATCH_COMMAND patch -p1 < ${CMAKE_SOURCE_DIR}/cmake/jemalloc.patch
-  CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof
+  CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof  --enable-stats --with-malloc-conf=prof:true,prof_prefix:/var/tmp/fdbserver
   BUILD_IN_SOURCE ON
   BUILD_COMMAND make
   INSTALL_DIR "${JEMALLOC_DIR}"

Note that the --with-malloc-conf option is used to configure jemalloc at build time. This is needed because fdbmonitor can't set the environment variable MALLOC_CONF when invoking fdbserver.

Another change needed is to remove SignalSafeUnwind.*, because chain_dl_iterate_phdr in glibc was overwritten in the file, which causes deadlocks during jemalloc initialization. As a result, SlowTaskWorkload.actor.cpp should also be removed. Finally, flow should be linked with jemalloc so that many binaries can link correctly.

diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt
index 31d8c3ed8..bd6d32f9c 100644
--- a/fdbserver/CMakeLists.txt
+++ b/fdbserver/CMakeLists.txt
@@ -280,7 +280,7 @@ set(FDBSERVER_SRCS
   workloads/Sideband.actor.cpp
   workloads/SidebandSingle.actor.cpp
   workloads/SimpleAtomicAdd.actor.cpp
-  workloads/SlowTaskWorkload.actor.cpp
+#  workloads/SlowTaskWorkload.actor.cpp
   workloads/SnapTest.actor.cpp
   workloads/SpecialKeySpaceCorrectness.actor.cpp
   workloads/StatusWorkload.actor.cpp
diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt
index 5cd37810b..801c8e732 100644
--- a/flow/CMakeLists.txt
+++ b/flow/CMakeLists.txt
@@ -55,8 +55,8 @@ set(FLOW_SRCS
   Profiler.actor.cpp
   Profiler.h
   SendBufferIterator.h
-  SignalSafeUnwind.cpp
-  SignalSafeUnwind.h
+#  SignalSafeUnwind.cpp
+#  SignalSafeUnwind.h
   SimpleOpt.h
   StreamCipher.h
   SystemMonitor.cpp
@@ -135,6 +135,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY
 add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})
 target_link_libraries(flow PRIVATE stacktrace)
 target_link_libraries(flow PUBLIC fmt::fmt)
+target_link_libraries(flow PRIVATE jemalloc)

 add_flow_target(STATIC_LIBRARY NAME flow_sampling SRCS ${FLOW_SRCS})
 target_link_libraries(flow_sampling PRIVATE stacktrace)

To debug OOM, we may want to dump the profile when FDB exits with an error. The following change achieves this:

diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp
index 748050b37..8b7f999aa 100644
--- a/flow/Platform.actor.cpp
+++ b/flow/Platform.actor.cpp
@@ -57,6 +57,10 @@

 #include "fdbclient/AnnotateActor.h"

+#ifdef USE_JEMALLOC
+#include <jemalloc/jemalloc.h>
+#endif
+
 #ifdef _WIN32
 #include <windows.h>
 #include <winioctl.h>
@@ -3247,6 +3251,12 @@ extern "C" void flushAndExit(int exitCode) {
        // to the crashAndDie call below.
        TerminateProcess(GetCurrentProcess(), exitCode);
 #else
+#ifdef USE_JEMALLOC
+       // malloc_stats_print(nullptr, nullptr, nullptr);
+       if (exitCode != FDB_EXIT_SUCCESS) {
+               mallctl("prof.dump", nullptr, nullptr, nullptr, 0);
+       }
+#endif
        // Send a signal to allow the Kernel to generate a coredump for this process.
        // See: https://man7.org/linux/man-pages/man5/core.5.html
        // The abort method will send a SIGABRT, which causes the kernel to collect a coredump.

Once we have the heap profile, we can generate the figure with unstripped fdbserver binary:

jeprof --show_bytes --pdf fdbserver jeprof.82759.48.i48.heap > figure.pdf

Option 2: Use gperftools (follow these steps to obtain heap profiles)

  1. Install gperftools if needed (skip this step if using the development docker), e.g., yum install -y gperftools-devel gperftools-libs gperftools ghostscript.x86_64 gv.x86_64
  2. Compile with gperftools: cmake -DUSE_GPERFTOOLS=1 ../foundationdb -G Ninja; ninja (may need to comment out target_compile_definitions(gperftools PUBLIC USE_GPERFTOOLS) in cmake/FindGperftools.cmake).
  3. Run with gperftools enabled: HEAPPROFILE=/tmp/fdbserver fdbserver [args...]
  4. Analyze the heap profile: pprof-symbolize gperf-build/bin/fdbserver /tmp/fdbserver.0065.heap

Note that the profiling runs are at least 10X slower than the runs without profiling.

See a sample profile here.

Option 3: Use Valgrind tool massif

See massif manual.

  1. Compile with Valgrind, e.g., cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D USE_VALGRIND=ON -G Ninja && ninja -C ${HOME}/build_output -j 80 fdbserver
  2. Run with massif tool, e.g., valgrind --tool=massif ./build_output/bin/fdbserver -r simulation --crash --logsize 1024MB -f ./foundationdb/tests/fast/ConfigureLocked.toml -s 93093841 -b on

Trace events of GetMagazineSample and HugeArenaSample

  • GetMagazineSample is logged when the fast allocator adds more magazines; its backtraces reliably point to the problem.
  • HugeArenaSample could point to arenas that eventually get deallocated, so a sample does not necessarily indicate a memory leak.
Clone this wiki locally