Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wal-g delete retain taking time and then timing out #1644

Open
nikhil-postgres opened this issue Feb 15, 2024 · 6 comments
Open

wal-g delete retain taking time and then timing out #1644

nikhil-postgres opened this issue Feb 15, 2024 · 6 comments

Comments

@nikhil-postgres
Copy link

nikhil-postgres commented Feb 15, 2024

PostgreSQL: 13.6
WAL-G: wal-g version v1.1.2-rc

Issue description

For highly transactional databases, we have seen than wal-g delete retain FIND_FULL is taking lot of time and then times out.

Describe your problem

Attaching strace output

Logs

strace: Process 30075 attached
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc0000ad550, FUTEX_WAKE_PRIVATE, 1) = 1
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 16280
sched_yield()                           = 0
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xc000100d50, FUTEX_WAKE_PRIVATE, 1) = 1
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 8
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 8
epoll_pwait(3, [], 128, 0, NULL, 47482952) = 0
futex(0xc000100950, FUTEX_WAKE_PRIVATE, 1) = 1
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 3469
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 54043598986936434
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 30094, SIGURG)            = 0
epoll_pwait(3, [], 128, 0, NULL, 0)     = 0
futex(0xc0000ad550, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x2d49d58, FUTEX_WAIT_PRIVATE, 0, {tv_sec=0, tv_nsec=100000}) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
epoll_pwait(3, [], 128, 0, NULL, 0)     = 0
epoll_pwait(3, [], 128, 4030, NULL, 46203501878253498) = 0
futex(0xc0000ad550, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_pwait(3, [], 128, 0, NULL, 2)     = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
epoll_pwait(3, [], 128, 0, NULL, 0)     = 0
futex(0xc0000ad150, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
tgkill(30075, 30092, SIGURG)            = 0
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 0
getpid()                                = 30075
tgkill(30075, 30095, SIGURG)            = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0xc000100d50, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d28, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x2d61340, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000181150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000101150, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 21615, SIGURG)            = 0
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 6112
epoll_pwait(3, [], 128, 0, NULL, 0)     = 0
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d58, FUTEX_WAIT_PRIVATE, 0, {tv_sec=0, tv_nsec=100000}) = 0
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
getpid()                                = 30075
tgkill(30075, 21615, SIGURG)            = 0
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d28, FUTEX_WAIT_PRIVATE, 0, {tv_sec=0, tv_nsec=100000}) = -1 EAGAIN (Resource temporarily unavailable)
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 0, NULL, 0) = 1
futex(0x2d49d38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xc000680550, FUTEX_WAKE_PRIVATE, 1) = 1
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 45982272
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 824670565728
sched_yield()                           = 0
futex(0x2d61340, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xc000100d50, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 30077, SIGURG)            = 0
futex(0xc000522150, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
tgkill(30075, 30083, SIGURG)            = 0
getpid()                                = 30075
tgkill(30075, 30094, SIGURG)            = 0
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 0
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 0
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d28, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
sched_yield()                           = 0
futex(0x2d49c38, FUTEX_WAIT_PRIVATE, 2, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
write(6, "\27\3\3\2\237\0\0\0\0\0\0\1\376:a\226\230\2\371\\O3?r\235\340\225\205\30\355Zx"..., 676) = 676
futex(0xc000523150, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d49d28, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000522150, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 30090, SIGURG)            = 0
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 824673265152
epoll_pwait(3, [], 128, 0, NULL, 0)     = 0
futex(0xc000680550, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000680550, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000180950, FUTEX_WAKE_PRIVATE, 1) = 1
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 16
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 16
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 16
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 30080, SIGURG)            = 0
epoll_pwait(3, [], 128, 0, NULL, 0)     = 0
futex(0xc000180950, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d58, FUTEX_WAIT_PRIVATE, 0, {tv_sec=0, tv_nsec=100000}) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 8761, NULL, 46204121878253498) = 1
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 0, NULL, 47679320) = 1
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 8761, NULL, 46204121878253498) = 1
epoll_pwait(3, [], 128, 0, NULL, 47679320) = 0
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 8761, NULL, 46204121878253498) = 1
epoll_pwait(3, [], 128, 0, NULL, 47679320) = 0
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 8761, NULL, 46204121878253498) = 1
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 0, NULL, 47679320) = 1
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 8761, NULL, 46204121878253498) = 1
epoll_pwait(3, [], 128, 0, NULL, 47679320) = 0
epoll_pwait(3, [{EPOLLIN|EPOLLOUT, {u32=2618776216, u64=140297725500056}}], 128, 8760, NULL, 46204121878253498) = 1
epoll_pwait(3, [], 128, 0, NULL, 47679320) = 0
epoll_pwait(3, [], 128, 8760, NULL, 46204121878253498) = 0
epoll_pwait(3, [], 128, 0, NULL, 46204121878253498) = 0
futex(0xc000180950, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc000522150, FUTEX_WAKE_PRIVATE, 1) = 1
sched_yield()                           = 0
futex(0x2d49d28, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0xc0000ac950, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
tgkill(30075, 30094, SIGURG)            = 0
getpid()                                = 30075
tgkill(30075, 30084, SIGURG)            = 0
getpid()                                = 30075
tgkill(30075, 30080, SIGURG)            = 0
getpid()                                = 30075
tgkill(30075, 30085, SIGURG)            = 0
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 30085, SIGURG)            = 0
epoll_pwait(3, [], 128, 0, NULL, 140296988246016) = 0
epoll_pwait(3, [], 128, 0, NULL, 140297792717942) = 0
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 30094, SIGURG)            = 0
epoll_pwait(3, [], 128, 0, NULL, 140296988176384) = 0
epoll_pwait(3, [], 128, 0, NULL, 47482952) = 0
epoll_pwait(3, [], 128, 0, NULL, 140297792253086) = 0
epoll_pwait(3, [], 128, 0, NULL, 140297792232617) = 0
epoll_pwait(3, [], 128, 0, NULL, 140297792271725) = 0
epoll_pwait(3, [], 128, 0, NULL, 140297792135233) = 0
epoll_pwait(3, [], 128, 0, NULL, 47482952) = 0
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 21614, SIGURG)            = 0
futex(0xc000680550, FUTEX_WAKE_PRIVATE, 1) = 1
getpid()                                = 30075
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 30075
tgkill(30075, 21615, SIGURG)            = 0
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 0
getpid()                                = 30075
tgkill(30075, 21615, SIGURG)            = 0
--- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=30075, si_uid=26} ---
rt_sigreturn({mask=[]})                 = 0
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49d28, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x2d49d58, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d49c38, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
write(5, "\0", 1)                       = 1
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x2d48710, FUTEX_WAIT_PRIVATE, 0, NULL
@x4m
Copy link
Collaborator

x4m commented Feb 15, 2024

Thanks for the report! Can you plz kill it with SIGABRT during such wait? It will pring gorountines so we can conclude what's going on. I bet it's listing WALs...

@nikhil-postgres
Copy link
Author

Below is the output of SIGABRT

SIGABRT: abort
PC=0x478e61 m=0 sigcode=0

goroutine 0 [idle]:
runtime.futex()
	/usr/local/go/src/runtime/sys_linux_amd64.s:552 +0x21
runtime.futexsleep(0x10000000000?, 0x0?, 0x4?)
	/usr/local/go/src/runtime/os_linux.go:66 +0x36
runtime.notesleep(0x2fada08)
	/usr/local/go/src/runtime/lock_futex.go:159 +0x87
runtime.mPark(...)
	/usr/local/go/src/runtime/proc.go:1449
runtime.stopm()
	/usr/local/go/src/runtime/proc.go:2236 +0x8d
runtime.findrunnable()
	/usr/local/go/src/runtime/proc.go:2812 +0x865
runtime.schedule()
	/usr/local/go/src/runtime/proc.go:3195 +0x239
runtime.park_m(0xc0004a3a00?)
	/usr/local/go/src/runtime/proc.go:3344 +0x14d
runtime.mcall()
	/usr/local/go/src/runtime/asm_amd64.s:425 +0x43

goroutine 1 [select]:
net/http.(*persistConn).roundTrip(0xc000832480, 0xc0004fd200)
	/usr/local/go/src/net/http/transport.go:2620 +0x974
net/http.(*Transport).roundTrip(0x2e68b80, 0xc000e1ef00)
	/usr/local/go/src/net/http/transport.go:594 +0x7c9
net/http.(*Transport).RoundTrip(0x418fe5?, 0x1de9c40?)
	/usr/local/go/src/net/http/roundtrip.go:17 +0x19
net/http.send(0xc000e1ef00, {0x1de9c40, 0x2e68b80}, {0x19f2320?, 0x1?, 0x0?})
	/usr/local/go/src/net/http/client.go:252 +0x5d8
net/http.(*Client).send(0x2fac560, 0xc000e1ef00, {0x2?, 0x1a42737?, 0x0?})
	/usr/local/go/src/net/http/client.go:176 +0x9b
net/http.(*Client).do(0x2fac560, 0xc000e1ef00)
	/usr/local/go/src/net/http/client.go:725 +0x8f5
net/http.(*Client).Do(...)
	/usr/local/go/src/net/http/client.go:593
github.com/aws/aws-sdk-go/aws/corehandlers.sendFollowRedirects(0xc16bd4cfa6701d61?)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/aws/corehandlers/handlers.go:120 +0x27
github.com/aws/aws-sdk-go/aws/corehandlers.glob..func3(0xc000e4cf00)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/aws/corehandlers/handlers.go:112 +0x176
github.com/aws/aws-sdk-go/aws/request.(*HandlerList).Run(0xc000e4d128, 0xc000e4cf00)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/aws/request/handlers.go:267 +0x9a
github.com/aws/aws-sdk-go/aws/request.(*Request).sendRequest(0xc000e4cf00)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/aws/request/request.go:602 +0xbf
github.com/aws/aws-sdk-go/aws/request.(*Request).Send(0xc000e4cf00)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/aws/request/request.go:555 +0x148
github.com/aws/aws-sdk-go/aws/request.(*Pagination).Next(0xc00105b7c0)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/aws/request/request_pagination.go:107 +0xb3
github.com/aws/aws-sdk-go/service/s3.(*S3).ListObjectsV2PagesWithContext(0xc0000101b8, {0x1df6638?, 0xc000140000}, 0xc000e22640, 0xc000e0db00, {0x0, 0x0, 0x0})
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/service/s3/api.go:6874 +0x19d
github.com/aws/aws-sdk-go/service/s3.(*S3).ListObjectsV2Pages(0x419347?, 0x10?, 0x1668820?)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/aws/aws-sdk-go/service/s3/api.go:6849 +0x3a
github.com/wal-g/wal-g/pkg/storages/s3.(*Folder).listObjectsPagesV2(0xc0004e22d0, 0xc000e0dae0, 0xc000e0daf0, 0xc000e007e0)
	/root/go_projects/src/github.com/wal-g/wal-g/pkg/storages/s3/folder.go:294 +0xe6
github.com/wal-g/wal-g/pkg/storages/s3.(*Folder).ListFolder(0xc0004e22d0)
	/root/go_projects/src/github.com/wal-g/wal-g/pkg/storages/s3/folder.go:265 +0x150
github.com/wal-g/wal-g/pkg/storages/storage.ListFolderRecursivelyWithFilter({0x1dfa178?, 0xc0004e22d0}, 0x40?)
	/root/go_projects/src/github.com/wal-g/wal-g/pkg/storages/storage/folder.go:70 +0x172
github.com/wal-g/wal-g/pkg/storages/storage.DeleteObjectsWhere({0x1dfa178, 0xc0004e22d0}, 0x1, 0xc00105bbf0, 0xc000e02d80?)
	/root/go_projects/src/github.com/wal-g/wal-g/pkg/storages/storage/folder.go:35 +0x4d
github.com/wal-g/wal-g/internal.(*DeleteHandler).DeleteBeforeTargetWhere(0xc000e225f0, {0x1dfa648, 0xc000e02d80}, 0x0?, 0x1ad79a8, 0x8?)
	/root/go_projects/src/github.com/wal-g/wal-g/internal/delete_handler.go:363 +0x20b
github.com/wal-g/wal-g/internal.(*DeleteHandler).DeleteBeforeTarget(0xc000e225f0?, {0x1dfa648?, 0xc000e02d80?}, 0x8?)
	/root/go_projects/src/github.com/wal-g/wal-g/internal/delete_handler.go:352 +0x31
github.com/wal-g/wal-g/internal.(*DeleteHandler).HandleDeleteRetain(0x1dfa178?, {0xc0007e09f0?, 0xc0001a1a00?, 0xc000d3fd60?}, 0x0?)
	/root/go_projects/src/github.com/wal-g/wal-g/internal/delete_handler.go:146 +0x2d0
github.com/wal-g/wal-g/cmd/pg.runDeleteRetain(0x2e72da0?, {0xc0007e09f0, 0x2, 0x3})
	/root/go_projects/src/github.com/wal-g/wal-g/cmd/pg/delete.go:84 +0x1aa
github.com/spf13/cobra.(*Command).execute(0x2e72da0, {0xc0007e0990, 0x3, 0x3})
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/spf13/cobra/command.go:860 +0x663
github.com/spf13/cobra.(*Command).ExecuteC(0x2e737a0)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/spf13/cobra/command.go:974 +0x3b4
github.com/spf13/cobra.(*Command).Execute(...)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/github.com/spf13/cobra/command.go:902
github.com/wal-g/wal-g/cmd/pg.Execute()
	/root/go_projects/src/github.com/wal-g/wal-g/cmd/pg/pg.go:45 +0x25
main.main()
	/root/go_projects/src/github.com/wal-g/wal-g/main/pg/main.go:8 +0x17

goroutine 10 [select]:
go.opencensus.io/stats/view.(*worker).start(0xc0001fc400)
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/go.opencensus.io/stats/view/worker.go:276 +0xad
created by go.opencensus.io/stats/view.init.0
	/root/go_projects/src/github.com/wal-g/wal-g/vendor/go.opencensus.io/stats/view/worker.go:34 +0x8d

goroutine 14 [IO wait]:
internal/poll.runtime_pollWait(0x7f82be691a50, 0x72)
	/usr/local/go/src/runtime/netpoll.go:302 +0x89
internal/poll.(*pollDesc).wait(0xc0001fcc00?, 0xc00031b800?, 0x0)
	/usr/local/go/src/internal/poll/fd_poll_runtime.go:83 +0x32
internal/poll.(*pollDesc).waitRead(...)
	/usr/local/go/src/internal/poll/fd_poll_runtime.go:88
internal/poll.(*FD).Read(0xc0001fcc00, {0xc00031b800, 0x17ac, 0x17ac})
	/usr/local/go/src/internal/poll/fd_unix.go:167 +0x25a
net.(*netFD).Read(0xc0001fcc00, {0xc00031b800?, 0xc000d3c100?, 0xc00031b80d?})
	/usr/local/go/src/net/fd_posix.go:55 +0x29
net.(*conn).Read(0xc000010218, {0xc00031b800?, 0x4?, 0x1668820?})
	/usr/local/go/src/net/net.go:183 +0x45
crypto/tls.(*atLeastReader).Read(0xc0004fac60, {0xc00031b800?, 0x0?, 0xc000cbc680?})
	/usr/local/go/src/crypto/tls/conn.go:785 +0x3d
bytes.(*Buffer).ReadFrom(0xc00082ccf8, {0x1dd8520, 0xc0004fac60})
	/usr/local/go/src/bytes/buffer.go:204 +0x98
crypto/tls.(*Conn).readFromUntil(0xc00082ca80, {0x1de9b60?, 0xc000010218}, 0x2?)
	/usr/local/go/src/crypto/tls/conn.go:807 +0xe5
crypto/tls.(*Conn).readRecordOrCCS(0xc00082ca80, 0x0)
	/usr/local/go/src/crypto/tls/conn.go:614 +0x116
crypto/tls.(*Conn).readRecord(...)
	/usr/local/go/src/crypto/tls/conn.go:582
crypto/tls.(*Conn).Read(0xc00082ca80, {0xc000831000, 0x1000, 0x0?})
	/usr/local/go/src/crypto/tls/conn.go:1285 +0x16f
net/http.(*persistConn).Read(0xc000832480, {0xc000831000?, 0xc0001180c0?, 0xc000d8ad30?})
	/usr/local/go/src/net/http/transport.go:1929 +0x4e
bufio.(*Reader).fill(0xc000b13560)
	/usr/local/go/src/bufio/bufio.go:106 +0x103
bufio.(*Reader).Peek(0xc000b13560, 0x1)
	/usr/local/go/src/bufio/bufio.go:144 +0x5d
net/http.(*persistConn).readLoop(0xc000832480)
	/usr/local/go/src/net/http/transport.go:2093 +0x1ac
created by net/http.(*Transport).dialConn
	/usr/local/go/src/net/http/transport.go:1750 +0x173e

goroutine 15 [select]:
net/http.(*persistConn).writeLoop(0xc000832480)
	/usr/local/go/src/net/http/transport.go:2392 +0xf5
created by net/http.(*Transport).dialConn
	/usr/local/go/src/net/http/transport.go:1751 +0x1791

rax    0xca
rbx    0x0
rcx    0x478e63
rdx    0x0
rdi    0x2fada08
rsi    0x80
rbp    0x7fff72322da0
rsp    0x7fff72322d58
r8     0x0
r9     0x0
r10    0x0
r11    0x286
r12    0x0
r13    0xe
r14    0x2fad180
r15    0x5b8
rip    0x478e61
rflags 0x286
cs     0x33
fs     0x0
gs     0x0

I added debug logging and below are the last few lines of the output where it is hung

DEBUG: 2024/02/16 03:46:24.239901 which_declare=declare -f
INFO: 2024/02/16 03:46:24.240262 retrieving permanent objects
DEBUG: 2024/02/16 03:46:26.644771 processing base_0000000400070A20000000D9_backup_stop_sentinel.json
DEBUG: 2024/02/16 03:46:26.644785 processing base_0000000400070187000000D1_backup_stop_sentinel.json
DEBUG: 2024/02/16 03:46:26.644787 processing base_000000040006FEB40000009E_backup_stop_sentinel.json
DEBUG: 2024/02/16 03:46:26.644789 processing base_000000040006FBF000000036_backup_stop_sentinel.json
DEBUG: 2024/02/16 03:46:26.644792 processing base_000000040006F8C700000055_backup_stop_sentinel.json
DEBUG: 2024/02/16 03:46:26.644794 processing base_000000040006F4BC0000008A_backup_stop_sentinel.json
DEBUG: 2024/02/16 03:46:26.644797 processing base_000000040006F21900000007_backup_stop_sentinel.json
INFO: 2024/02/16 03:46:26.644800 Start delete

@nikhil-postgres
Copy link
Author

nikhil-postgres commented Feb 19, 2024

Hi @x4m , Do you see any issues with the goroutines and the debug log?

The issue started after we added postgres majorversion in the bucket path as recommended in various Issues.

Thanks,
Nikhil

@x4m
Copy link
Collaborator

x4m commented Feb 20, 2024 via email

@nikhil-postgres
Copy link
Author

Got it.

What we can do on wal-g side is to delete in smaller chunks, so that several calls to ‘’’wal-g delete’’’ will finally do the job
-- I dont see an option to delete in chunks. Do we have the option to delete in chunks in latest version of wal-g ?

@nikhil-postgres
Copy link
Author

Hi @x4m , Is there any parameter to increase the timeout parameter for listing backup?

I see only WALG_STOP_BACKUP_TIMEOUT which is used during pg_stop_backup

Thanks,
Nikhil

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants