Debug Out Of Memory (OOM) Errors in Simulation and Production
From version 7.0, jemalloc is used by FDB. Thus we can use jemalloc's profiling capability, with some minimal changes:
diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake
index aedb83480..e2836a11f 100644
--- a/cmake/Jemalloc.cmake
+++ b/cmake/Jemalloc.cmake
@@ -15,7 +15,9 @@ ExternalProject_add(Jemalloc_project
"${JEMALLOC_DIR}/lib/libjemalloc.a"
"${JEMALLOC_DIR}/lib/libjemalloc_pic.a"
PATCH_COMMAND patch -p1 < ${CMAKE_SOURCE_DIR}/cmake/jemalloc.patch
- CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof
+ CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof --enable-stats --with-malloc-conf=prof:true,prof_prefix:/var/tmp/fdbserver
BUILD_IN_SOURCE ON
BUILD_COMMAND make
INSTALL_DIR "${JEMALLOC_DIR}"
Note that the `--with-malloc-conf` option is used to configure jemalloc. This is needed because `fdbmonitor` can't set the environment variable `MALLOC_CONF` when invoking `fdbserver`.
Another change needed is to remove `SignalSafeUnwind.*`, because `chain_dl_iterate_phdr` in glibc is overwritten in that file, which causes deadlocks during jemalloc initialization. As a result, `SlowTaskWorkload.actor.cpp` should also be removed. Finally, `flow` should be linked with `jemalloc` so that many binaries can link correctly.
diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt
index 31d8c3ed8..bd6d32f9c 100644
--- a/fdbserver/CMakeLists.txt
+++ b/fdbserver/CMakeLists.txt
@@ -280,7 +280,7 @@ set(FDBSERVER_SRCS
workloads/Sideband.actor.cpp
workloads/SidebandSingle.actor.cpp
workloads/SimpleAtomicAdd.actor.cpp
- workloads/SlowTaskWorkload.actor.cpp
+# workloads/SlowTaskWorkload.actor.cpp
workloads/SnapTest.actor.cpp
workloads/SpecialKeySpaceCorrectness.actor.cpp
workloads/StatusWorkload.actor.cpp
diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt
index 5cd37810b..801c8e732 100644
--- a/flow/CMakeLists.txt
+++ b/flow/CMakeLists.txt
@@ -55,8 +55,8 @@ set(FLOW_SRCS
Profiler.actor.cpp
Profiler.h
SendBufferIterator.h
- SignalSafeUnwind.cpp
- SignalSafeUnwind.h
+# SignalSafeUnwind.cpp
+# SignalSafeUnwind.h
SimpleOpt.h
StreamCipher.h
SystemMonitor.cpp
@@ -135,6 +135,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY
add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})
target_link_libraries(flow PRIVATE stacktrace)
target_link_libraries(flow PUBLIC fmt::fmt)
+target_link_libraries(flow PRIVATE jemalloc)
add_flow_target(STATIC_LIBRARY NAME flow_sampling SRCS ${FLOW_SRCS})
target_link_libraries(flow_sampling PRIVATE stacktrace)
To debug OOM, we may want to dump the profile when FDB exits with an error. The following change achieves this:
diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp
index 748050b37..8b7f999aa 100644
--- a/flow/Platform.actor.cpp
+++ b/flow/Platform.actor.cpp
@@ -57,6 +57,10 @@
#include "fdbclient/AnnotateActor.h"
+#ifdef USE_JEMALLOC
+#include <jemalloc/jemalloc.h>
+#endif
+
#ifdef _WIN32
#include <windows.h>
#include <winioctl.h>
@@ -3247,6 +3251,12 @@ extern "C" void flushAndExit(int exitCode) {
// to the crashAndDie call below.
TerminateProcess(GetCurrentProcess(), exitCode);
#else
+#ifdef USE_JEMALLOC
+ // malloc_stats_print(nullptr, nullptr, nullptr);
+ if (exitCode != FDB_EXIT_SUCCESS) {
+ mallctl("prof.dump", nullptr, nullptr, nullptr, 0);
+ }
+#endif
// Send a signal to allow the Kernel to generate a coredump for this process.
// See: https://man7.org/linux/man-pages/man5/core.5.html
// The abort method will send a SIGABRT, which causes the kernel to collect a coredump.
Once we have the heap profile, we can generate a figure using the unstripped `fdbserver` binary:
jeprof --show_bytes --pdf fdbserver jeprof.82759.48.i48.heap > figure.pdf
- Install gperftools if needed (skip this step if using the development docker), e.g.,
yum install -y gperftools-devel gperftools-libs gperftools ghostscript.x86_64 gv.x86_64
- Compile with gperftools:
cmake -DUSE_GPERFTOOLS=1 ../foundationdb -G Ninja; ninja
(You may need to comment out `target_compile_definitions(gperftools PUBLIC USE_GPERFTOOLS)` in `cmake/FindGperftools.cmake`.)
- Run with gperftools enabled:
HEAPPROFILE=/tmp/fdbserver fdbserver [args...]
- Analyze the heap profile:
pprof-symbolize gperf-build/bin/fdbserver /tmp/fdbserver.0065.heap
Note that the profiling runs are at least 10X slower than the runs without profiling.
See a sample profile here.
See massif manual.
- Compile with Valgrind, e.g.,
cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D USE_VALGRIND=ON -G Ninja && ninja -C ${HOME}/build_output -j 80 fdbserver
- Run with massif tool, e.g.,
valgrind --tool=massif ./build_output/bin/fdbserver -r simulation --crash --logsize 1024MB -f ./foundationdb/tests/fast/ConfigureLocked.toml -s 93093841 -b on
- `GetMagazineSample` logs when the fast allocator adds more magazines; the backtraces will reliably point to the problem.
- `HugeArenaSample` could point to arenas that eventually get deallocated, so it might not indicate a memory leak.