mm: vmpressure: Fix rampant inaccuracies caused by stale data usage
After a period of intense memory pressure is over, it's common for
vmpressure to still hold old reclaim efficiency data accumulated during
that period. When memory pressure starts to rise again, this stale data
factors into vmpressure's calculations and can cause vmpressure to
report erroneously high pressure. The reverse is possible, too:
vmpressure may report erroneously low pressure because of the stale
data it has stored.

Furthermore, since kswapd can still be performing reclaim when there are
no failed memory allocations stuck in the page allocator's slow path,
vmpressure may still report pressures when there aren't any memory
allocations to satisfy. This can cause last-resort memory reclaimers to
kill processes to free memory when it's not needed.

To fix the rampant stale data, keep track of when there are processes
utilizing reclaim in the page allocator's slow path, and reset the
accumulated data in vmpressure when a new period of elevated memory
pressure begins. Extra measures are taken for the kswapd issue mentioned
above: all reclaim efficiency data reported by kswapd is ignored while
there are no failed memory allocations in the page allocator that are
utilizing reclaim.
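
Conceptually, the fix amounts to a reference count of slow-path
allocators that are currently relying on reclaim: reclaim efficiency
data is only accounted while the count is nonzero, and the accumulated
window is cleared when the count goes from zero to one. The following
is only a rough sketch of that idea (clear_window() and
account_window() are hypothetical helpers, not part of this patch); the
actual implementation is in the diff below.

    static atomic_long_t reclaim_users;

    static void reclaim_users_inc(void)
    {
            /* A new period of elevated pressure begins: drop stale totals */
            if (atomic_long_inc_return(&reclaim_users) == 1)
                    clear_window();
    }

    static void reclaim_users_dec(void)
    {
            atomic_long_dec(&reclaim_users);
    }

    static void account_reclaim(unsigned long scanned, unsigned long reclaimed)
    {
            /* kswapd churn with no stuck allocations is not real pressure */
            if (!atomic_long_read(&reclaim_users))
                    return;
            account_window(scanned, reclaimed);
    }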

Note that since sr_lock can now be used from IRQ context, IRQs must be
disabled whenever sr_lock is used to prevent deadlocks.
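
For reference, this is the generic shape of the locking change applied
throughout the diff below; the '...' placeholder stands for whatever
updates the lock protects and is not literal code.

    /* Before: not IRQ-safe, can deadlock if sr_lock is taken from IRQ context */
    spin_lock(&vmpr->sr_lock);
    ...
    spin_unlock(&vmpr->sr_lock);

    /* After: local interrupts stay disabled while sr_lock is held */
    unsigned long flags;

    spin_lock_irqsave(&vmpr->sr_lock, flags);
    ...
    spin_unlock_irqrestore(&vmpr->sr_lock, flags);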

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: UtsavBalar1231 <utsavbalar1231@gmail.com>
kerneltoast authored and UtsavBalar1231 committed Jul 14, 2021
1 parent d220bf4 commit 2795315
Showing 3 changed files with 75 additions and 11 deletions.
5 changes: 5 additions & 0 deletions include/linux/vmpressure.h
@@ -26,6 +26,9 @@ struct vmpressure {
struct mutex events_lock;

struct work_struct work;

atomic_long_t users;
rwlock_t users_lock;
};

struct mem_cgroup;
@@ -37,6 +40,8 @@ extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
int order);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio,
int order);
extern bool vmpressure_inc_users(int order);
extern void vmpressure_dec_users(void);

#ifdef CONFIG_MEMCG
extern void vmpressure_init(struct vmpressure *vmpr);
7 changes: 7 additions & 0 deletions mm/page_alloc.c
@@ -4078,6 +4078,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int cpuset_mems_cookie;
int reserve_flags;
bool woke_kswapd = false;
bool used_vmpressure = false;

/*
* We also sanity check to catch abuse of atomic reserves being used by
@@ -4116,6 +4117,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
atomic_long_inc(&kswapd_waiters);
woke_kswapd = true;
}
if (!used_vmpressure)
used_vmpressure = vmpressure_inc_users(order);
wake_all_kswapds(order, ac);
}

@@ -4209,6 +4212,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;

/* Try direct reclaim and then allocating */
if (!used_vmpressure)
used_vmpressure = vmpressure_inc_users(order);
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
@@ -4320,6 +4325,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
got_pg:
if (woke_kswapd)
atomic_long_dec(&kswapd_waiters);
if (used_vmpressure)
vmpressure_dec_users();
if (!page)
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
74 changes: 63 additions & 11 deletions mm/vmpressure.c
@@ -217,11 +217,12 @@ static void vmpressure_work_fn(struct work_struct *work)
unsigned long scanned;
unsigned long reclaimed;
unsigned long pressure;
unsigned long flags;
enum vmpressure_levels level;
bool ancestor = false;
bool signalled = false;

spin_lock(&vmpr->sr_lock);
spin_lock_irqsave(&vmpr->sr_lock, flags);
/*
* Several contexts might be calling vmpressure(), so it is
* possible that the work was rescheduled again before the old
@@ -232,14 +233,14 @@
*/
scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);
return;
}

reclaimed = vmpr->tree_reclaimed;
vmpr->tree_scanned = 0;
vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);

pressure = vmpressure_calc_pressure(scanned, reclaimed);
level = vmpressure_level(pressure);
@@ -279,6 +280,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
unsigned long flags;

/*
* If we got here with no pages scanned, then that is an indicator
@@ -294,10 +296,10 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
return;

if (tree) {
spin_lock(&vmpr->sr_lock);
spin_lock_irqsave(&vmpr->sr_lock, flags);
scanned = vmpr->tree_scanned += scanned;
vmpr->tree_reclaimed += reclaimed;
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);

if (!critical && scanned < calculate_vmpressure_win())
return;
@@ -310,15 +312,15 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
if (!memcg || memcg == root_mem_cgroup)
return;

spin_lock(&vmpr->sr_lock);
spin_lock_irqsave(&vmpr->sr_lock, flags);
scanned = vmpr->scanned += scanned;
reclaimed = vmpr->reclaimed += reclaimed;
if (!critical && scanned < calculate_vmpressure_win()) {
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);
return;
}
vmpr->scanned = vmpr->reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);

pressure = vmpressure_calc_pressure(scanned, reclaimed);
level = vmpressure_level(pressure);
@@ -342,17 +344,49 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
unsigned long reclaimed) { }
#endif

bool vmpressure_inc_users(int order)
{
struct vmpressure *vmpr = &global_vmpressure;
unsigned long flags;

if (order > PAGE_ALLOC_COSTLY_ORDER)
return false;

write_lock_irqsave(&vmpr->users_lock, flags);
if (atomic_long_inc_return_relaxed(&vmpr->users) == 1) {
/* Clear out stale vmpressure data when reclaim begins */
spin_lock(&vmpr->sr_lock);
vmpr->scanned = 0;
vmpr->reclaimed = 0;
vmpr->stall = 0;
spin_unlock(&vmpr->sr_lock);
}
write_unlock_irqrestore(&vmpr->users_lock, flags);

return true;
}

void vmpressure_dec_users(void)
{
struct vmpressure *vmpr = &global_vmpressure;

/* Decrement the vmpressure user count with release semantics */
smp_mb__before_atomic();
atomic_long_dec(&vmpr->users);
}

static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
unsigned long reclaimed)
{
struct vmpressure *vmpr = &global_vmpressure;
unsigned long pressure;
unsigned long stall;
unsigned long flags;

if (critical)
scanned = calculate_vmpressure_win();

spin_lock(&vmpr->sr_lock);
spin_lock_irqsave(&vmpr->sr_lock, flags);
if (scanned) {
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
@@ -365,14 +399,14 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
reclaimed = vmpr->reclaimed;

if (!critical && scanned < calculate_vmpressure_win()) {
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);
return;
}
}
vmpr->scanned = 0;
vmpr->reclaimed = 0;
vmpr->stall = 0;
spin_unlock(&vmpr->sr_lock);
spin_unlock_irqrestore(&vmpr->sr_lock, flags);

if (scanned) {
pressure = vmpressure_calc_pressure(scanned, reclaimed);
@@ -418,9 +452,25 @@ static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed, int order)
{
struct vmpressure *vmpr = &global_vmpressure;
unsigned long flags;

if (order > PAGE_ALLOC_COSTLY_ORDER)
return;

/*
* It's possible for kswapd to keep doing reclaim even though memory
* pressure isn't high anymore. We should only track vmpressure when
* there are failed memory allocations actively stuck in the page
* allocator's slow path. No failed allocations means pressure is fine.
*/
read_lock_irqsave(&vmpr->users_lock, flags);
if (!atomic_long_read(&vmpr->users)) {
read_unlock_irqrestore(&vmpr->users_lock, flags);
return;
}
read_unlock_irqrestore(&vmpr->users_lock, flags);

__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
}

@@ -589,6 +639,8 @@ void vmpressure_init(struct vmpressure *vmpr)
mutex_init(&vmpr->events_lock);
INIT_LIST_HEAD(&vmpr->events);
INIT_WORK(&vmpr->work, vmpressure_work_fn);
atomic_long_set(&vmpr->users, 0);
rwlock_init(&vmpr->users_lock);
}

/**