+From 8ee8571e47aa75221e5fbd4c9c7802fc4244c346 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Wed, 21 Dec 2022 21:19:04 -0700
+Subject: [PATCH 06/19] BACKPORT: mm: multi-gen LRU: per-node lru_gen_folio
+ lists
+
+For each node, memcgs are divided into two generations: the old and
+the young. For each generation, memcgs are randomly sharded into
+multiple bins to improve scalability. For each bin, an RCU hlist_nulls
+is virtually divided into three segments: the head, the tail and the
+default.
+
+An onlining memcg is added to the tail of a random bin in the old
+generation. The eviction starts at the head of a random bin in the old
+generation. The per-node memcg generation counter, whose remainder (mod
+2) indexes the old generation, is incremented when all its bins become
+empty.
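+
+A minimal sketch of the indexing arithmetic, mirroring the
+get_memcg_gen()/get_memcg_bin() macros this patch adds to mm/vmscan.c
+(the constants are the ones defined in include/linux/mmzone.h below):
+
+    #define MEMCG_NR_GENS  2    /* the old and the young generation */
+    #define MEMCG_NR_BINS  8    /* random shards per generation */
+
+    /* the remainder of the per-node counter indexes the old generation */
+    #define get_memcg_gen(seq)  ((seq) % MEMCG_NR_GENS)
+    /* wrap a randomly chosen starting bin while probing the others */
+    #define get_memcg_bin(bin)  ((bin) % MEMCG_NR_BINS)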
+
+There are four operations (condensed in the sketch after this list):
+1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in
+   its current generation (old or young) and updates its "seg" to
+   "head";
+2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in
+   its current generation (old or young) and updates its "seg" to
+   "tail";
+3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in
+   the old generation, updates its "gen" to "old" and resets its "seg"
+   to "default";
+4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin
+   in the young generation, updates its "gen" to "young" and resets
+   its "seg" to "default".
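+
+Condensed, the core of lru_gen_rotate_memcg() (in the mm/vmscan.c hunk
+below) reduces to the following excerpt; the locking and the actual
+hlist_nulls manipulation are omitted:
+
+    seg = 0;                    /* "default" */
+    new = old = lruvec->lrugen.gen;
+
+    if (op == MEMCG_LRU_HEAD)
+        seg = MEMCG_LRU_HEAD;   /* keep the generation, mark "head" */
+    else if (op == MEMCG_LRU_TAIL)
+        seg = MEMCG_LRU_TAIL;   /* keep the generation, mark "tail" */
+    else if (op == MEMCG_LRU_OLD)
+        new = get_memcg_gen(pgdat->memcg_lru.seq);      /* the old gen */
+    else if (op == MEMCG_LRU_YOUNG)
+        new = get_memcg_gen(pgdat->memcg_lru.seq + 1);  /* the young gen */
+
+    /* MEMCG_LRU_HEAD and MEMCG_LRU_OLD insert at the head of a random
+       bin; the other two insert at the tail */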
+
+The events that trigger the above operations are (events 2-6 are
+condensed in the sketch after this list):
+1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+2. The first attempt to reclaim a memcg below low, which triggers
+   MEMCG_LRU_TAIL;
+3. The first attempt to reclaim a memcg below reclaimable size
+   threshold, which triggers MEMCG_LRU_TAIL;
+4. The second attempt to reclaim a memcg below reclaimable size
+   threshold, which triggers MEMCG_LRU_YOUNG;
+5. Attempting to reclaim a memcg below min, which triggers
+   MEMCG_LRU_YOUNG;
+6. Finishing the aging on the eviction path, which triggers
+   MEMCG_LRU_YOUNG;
+7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
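+
+The reclaim-path half of this mapping (events 2-6) is a condensed view
+of shrink_one() in the mm/vmscan.c hunk below; roughly:
+
+    /* sketch only: the actual shrinking and statistics are omitted */
+    if (!lruvec_is_sizable(lruvec, sc))         /* events 3 and 4 */
+        return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+    if (mem_cgroup_below_min(memcg))            /* event 5 */
+        return MEMCG_LRU_YOUNG;
+    if (mem_cgroup_below_low(memcg) && seg != MEMCG_LRU_TAIL)
+        return MEMCG_LRU_TAIL;                  /* event 2 */
+    /* event 6: the aging finished on the eviction path */
+    return try_to_shrink_lruvec(lruvec, sc) ? MEMCG_LRU_YOUNG : 0;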
+
+Note that the memcg LRU only applies to global reclaim, and the
+round-robin incrementing of their max_seq counters ensures eventual
+fairness to all eligible memcgs. Memcg reclaim still relies on
+mem_cgroup_iter().
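+
+Concretely, the dispatch in shrink_node() and shrink_lruvec() (see the
+mm/vmscan.c hunks below) becomes:
+
+    /* in shrink_node(): global reclaim walks the per-node memcg LRU */
+    if (lru_gen_enabled() && global_reclaim(sc)) {
+        lru_gen_shrink_node(pgdat, sc);
+        return;
+    }
+
+    /* in shrink_lruvec(): memcg reclaim keeps the per-lruvec path */
+    if (lru_gen_enabled() && !global_reclaim(sc)) {
+        lru_gen_shrink_lruvec(lruvec, sc);
+        return;
+    }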
+
+Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Jonathan Corbet <corbet@lwn.net>
+Cc: Michael Larabel <Michael@MichaelLarabel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mike Rapoport <rppt@kernel.org>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Bug: 274865848
+(cherry picked from commit e4dde56cd208674ce899b47589f263499e5b8cdc)
+[TJ: Resolved conflicts with older function signatures for
+mem_cgroup_below_min / mem_cgroup_below_low and includes]
+Change-Id: Idc8a0f635e035d72dd911f807d1224cb47cbd655
+Signed-off-by: T.J. Mercier <tjmercier@google.com>
+---
+ include/linux/memcontrol.h | 10 +
+ include/linux/mm_inline.h | 17 ++
+ include/linux/mmzone.h | 117 +++++++++++-
+ mm/memcontrol.c | 16 ++
+ mm/page_alloc.c | 1 +
+ mm/vmscan.c | 374 +++++++++++++++++++++++++++++++++----
+ 6 files changed, 500 insertions(+), 35 deletions(-)
+
+diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
+index e039763029563..82d28b052a9e5 100644
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -790,6 +790,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+ percpu_ref_put(&objcg->refcnt);
+ }
+
++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
++{
++ return !memcg || css_tryget(&memcg->css);
++}
++
+ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+ {
+ if (memcg)
+@@ -1290,6 +1295,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+ {
+ }
+
++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
++{
++ return true;
++}
++
+ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+ {
+ }
+diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
+index da38e3d962e2f..c1fd3922dc5dd 100644
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void)
+ return current->in_lru_fault;
+ }
+
++#ifdef CONFIG_MEMCG
++static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
++{
++ return READ_ONCE(lruvec->lrugen.seg);
++}
++#else
++static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
++{
++ return 0;
++}
++#endif
++
+ static inline int lru_gen_from_seq(unsigned long seq)
+ {
+ return seq % MAX_NR_GENS;
+@@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void)
+ return false;
+ }
+
++static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
++{
++ return 0;
++}
++
+ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+ {
+ return false;
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 02e4323744715..66e067a635682 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -7,6 +7,7 @@
+
+ #include <linux/spinlock.h>
+ #include <linux/list.h>
++#include <linux/list_nulls.h>
+ #include <linux/wait.h>
+ #include <linux/bitops.h>
+ #include <linux/cache.h>
+@@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
+ #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+ #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
++/* see the comment on MEMCG_NR_GENS */
++enum {
++ MEMCG_LRU_NOP,
++ MEMCG_LRU_HEAD,
++ MEMCG_LRU_TAIL,
++ MEMCG_LRU_OLD,
++ MEMCG_LRU_YOUNG,
++};
++
+ #ifdef CONFIG_LRU_GEN
+
+ enum {
+@@ -426,6 +436,14 @@ struct lru_gen_folio {
+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
++#ifdef CONFIG_MEMCG
++ /* the memcg generation this lru_gen_folio belongs to */
++ u8 gen;
++ /* the list segment this lru_gen_folio belongs to */
++ u8 seg;
++ /* per-node lru_gen_folio list for global reclaim */
++ struct hlist_nulls_node list;
++#endif
+ };
+
+ enum {
+@@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
+ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+
+ #ifdef CONFIG_MEMCG
++
++/*
++ * For each node, memcgs are divided into two generations: the old and the
++ * young. For each generation, memcgs are randomly sharded into multiple bins
++ * to improve scalability. For each bin, the hlist_nulls is virtually divided
++ * into three segments: the head, the tail and the default.
++ *
++ * An onlining memcg is added to the tail of a random bin in the old generation.
++ * The eviction starts at the head of a random bin in the old generation. The
++ * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
++ * the old generation, is incremented when all its bins become empty.
++ *
++ * There are four operations:
++ * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
++ *    current generation (old or young) and updates its "seg" to "head";
++ * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
++ *    current generation (old or young) and updates its "seg" to "tail";
++ * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
++ *    generation, updates its "gen" to "old" and resets its "seg" to "default";
++ * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
++ *    young generation, updates its "gen" to "young" and resets its "seg" to
++ *    "default".
++ *
++ * The events that trigger the above operations are:
++ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
++ * 2. The first attempt to reclaim a memcg below low, which triggers
++ *    MEMCG_LRU_TAIL;
++ * 3. The first attempt to reclaim a memcg below reclaimable size threshold,
++ *    which triggers MEMCG_LRU_TAIL;
++ * 4. The second attempt to reclaim a memcg below reclaimable size threshold,
++ *    which triggers MEMCG_LRU_YOUNG;
++ * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
++ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
++ * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
++ *
++ * Note that the memcg LRU only applies to global reclaim, and the round-robin
++ * incrementing of their max_seq counters ensures eventual fairness to all
++ * eligible memcgs. Memcg reclaim still relies on mem_cgroup_iter().
++ */
++#define MEMCG_NR_GENS 2
++#define MEMCG_NR_BINS 8
++
++struct lru_gen_memcg {
++ /* the per-node memcg generation counter */
++ unsigned long seq;
++ /* each memcg has one lru_gen_folio per node */
++ unsigned long nr_memcgs[MEMCG_NR_GENS];
++ /* per-node lru_gen_folio list for global reclaim */
++ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
++ /* protects the above */
++ spinlock_t lock;
++};
++
++void lru_gen_init_pgdat(struct pglist_data *pgdat);
++
+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
+ void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+-#endif
++void lru_gen_online_memcg(struct mem_cgroup *memcg);
++void lru_gen_offline_memcg(struct mem_cgroup *memcg);
++void lru_gen_release_memcg(struct mem_cgroup *memcg);
++void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
++
++#else /* !CONFIG_MEMCG */
++
++#define MEMCG_NR_GENS 1
++
++struct lru_gen_memcg {
++};
++
++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
++{
++}
++
++#endif /* CONFIG_MEMCG */
+
+ #else /* !CONFIG_LRU_GEN */
+
++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
++{
++}
++
+ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+ {
+ }
+@@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+ }
+
+ #ifdef CONFIG_MEMCG
++
+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+ }
+@@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+ {
+ }
+-#endif
++
++static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
++{
++}
++
++#endif /* CONFIG_MEMCG */
+
+ #endif /* CONFIG_LRU_GEN */
+
+@@ -1219,6 +1330,8 @@ typedef struct pglist_data {
+ #ifdef CONFIG_LRU_GEN
+ /* kswap mm walk data */
+ struct lru_gen_mm_walk mm_walk;
++ /* lru_gen_folio list */
++ struct lru_gen_memcg memcg_lru;
+ #endif
+
+ CACHELINE_PADDING(_pad2_);
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c
+index 3e8f1ad0fe9db..7815d556e38cc 100644
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -477,6 +477,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
+ struct mem_cgroup_per_node *mz;
+ struct mem_cgroup_tree_per_node *mctz;
+
++ if (lru_gen_enabled()) {
++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
++
++ /* see the comment on MEMCG_NR_GENS */
++ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
++
++ return;
++ }
++
+ mctz = soft_limit_tree.rb_tree_per_node[nid];
+ if (!mctz)
+ return;
+@@ -3522,6 +3532,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+ struct mem_cgroup_tree_per_node *mctz;
+ unsigned long excess;
+
++ if (lru_gen_enabled())
++ return 0;
++
+ if (order > 0)
+ return 0;
+
+@@ -5382,6 +5395,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ 2UL*HZ);
++ lru_gen_online_memcg(memcg);
+ return 0;
+ offline_kmem:
+ memcg_offline_kmem(memcg);
+@@ -5413,6 +5427,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
+ memcg_offline_kmem(memcg);
+ reparent_shrinker_deferred(memcg);
+ wb_memcg_offline(memcg);
++ lru_gen_offline_memcg(memcg);
+
+ drain_all_stock(memcg);
+
+@@ -5424,6 +5439,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ invalidate_reclaim_iterators(memcg);
++ lru_gen_release_memcg(memcg);
+ }
+
+ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 69668817fed37..473057b81a9df 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -7957,6 +7957,7 @@ static void __init free_area_init_node(int nid)
+ pgdat_set_deferred_range(pgdat);
+
+ free_area_init_core(pgdat);
++ lru_gen_init_pgdat(pgdat);
+ }
+
+ static void __init free_area_init_memoryless_node(int nid)
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 0c47952714b26..65eb28448f216 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -54,6 +54,8 @@
+ #include <linux/shmem_fs.h>
+ #include <linux/ctype.h>
+ #include <linux/debugfs.h>
++#include <linux/rculist_nulls.h>
++#include <linux/random.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -134,11 +136,6 @@ struct scan_control {
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
+-#ifdef CONFIG_LRU_GEN
+- /* help kswapd make better choices among multiple memcgs */
+- unsigned long last_reclaimed;
+-#endif
+-
+ /* Allocation order */
+ s8 order;
+
+@@ -3160,6 +3157,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
++#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
++#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
++
+ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
+ {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+@@ -4440,8 +4440,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ if (sc->priority <= DEF_PRIORITY - 2)
+ wait_event_killable(lruvec->mm_state.wait,
+ max_seq < READ_ONCE(lrugen->max_seq));
+-
+- return max_seq < READ_ONCE(lrugen->max_seq);
++ return false;
+ }
+
+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+@@ -4514,8 +4513,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+
+ VM_WARN_ON_ONCE(!current_is_kswapd());
+
+- sc->last_reclaimed = sc->nr_reclaimed;
+-
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
+ return;
+@@ -5104,8 +5101,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+ * reclaim.
+ */
+-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+- bool can_swap)
++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+ {
+ unsigned long nr_to_scan;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+@@ -5122,10 +5118,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
+ if (sc->priority == DEF_PRIORITY)
+ return nr_to_scan;
+
+- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
+-
+ /* skip this lruvec as it's low on cold folios */
+- return 0;
++ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+ }
+
+ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+@@ -5134,29 +5128,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+ if (!global_reclaim(sc))
+ return -1;
+
+- /* discount the previous progress for kswapd */
+- if (current_is_kswapd())
+- return sc->nr_to_reclaim + sc->last_reclaimed;
+-
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
+ }
+
+-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+- struct blk_plug plug;
++ long nr_to_scan;
+ unsigned long scanned = 0;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+- lru_add_drain();
+-
+- blk_start_plug(&plug);
+-
+- set_mm_walk(lruvec_pgdat(lruvec));
+-
+ while (true) {
+ int delta;
+ int swappiness;
+- unsigned long nr_to_scan;
+
+ if (sc->may_swap)
+ swappiness = get_swappiness(lruvec, sc);
+@@ -5166,7 +5149,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
+ swappiness = 0;
+
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+- if (!nr_to_scan)
++ if (nr_to_scan <= 0)
+ break;
+
+ delta = evict_folios(lruvec, sc, swappiness);
+@@ -5183,11 +5166,252 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
+ cond_resched();
+ }
+
++ /* whether try_to_inc_max_seq() was successful */
++ return nr_to_scan < 0;
++}
++
++static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
++{
++ bool success;
++ unsigned long scanned = sc->nr_scanned;
++ unsigned long reclaimed = sc->nr_reclaimed;
++ int seg = lru_gen_memcg_seg(lruvec);
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++ /* see the comment on MEMCG_NR_GENS */
++ if (!lruvec_is_sizable(lruvec, sc))
++ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
++
++ mem_cgroup_calculate_protection(NULL, memcg);
++
++ if (mem_cgroup_below_min(memcg))
++ return MEMCG_LRU_YOUNG;
++
++ if (mem_cgroup_below_low(memcg)) {
++ /* see the comment on MEMCG_NR_GENS */
++ if (seg != MEMCG_LRU_TAIL)
++ return MEMCG_LRU_TAIL;
++
++ memcg_memory_event(memcg, MEMCG_LOW);
++ }
++
++ success = try_to_shrink_lruvec(lruvec, sc);
++
++ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
++
++ if (!sc->proactive)
++ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
++ sc->nr_reclaimed - reclaimed);
++
++ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
++ current->reclaim_state->reclaimed_slab = 0;
++
++ return success ? MEMCG_LRU_YOUNG : 0;
++}
++
++#ifdef CONFIG_MEMCG
++
++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
++{
++ int gen;
++ int bin;
++ int first_bin;
++ struct lruvec *lruvec;
++ struct lru_gen_folio *lrugen;
++ const struct hlist_nulls_node *pos;
++ int op = 0;
++ struct mem_cgroup *memcg = NULL;
++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
++
++ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
++restart:
++ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
++
++ rcu_read_lock();
++
++ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
++ if (op)
++ lru_gen_rotate_memcg(lruvec, op);
++
++ mem_cgroup_put(memcg);
++
++ lruvec = container_of(lrugen, struct lruvec, lrugen);
++ memcg = lruvec_memcg(lruvec);
++
++ if (!mem_cgroup_tryget(memcg)) {
++ op = 0;
++ memcg = NULL;
++ continue;
++ }
++
++ rcu_read_unlock();
++
++ op = shrink_one(lruvec, sc);
++
++ if (sc->nr_reclaimed >= nr_to_reclaim)
++ goto success;
++
++ rcu_read_lock();
++ }
++
++ rcu_read_unlock();
++
++ /* restart if raced with lru_gen_rotate_memcg() */
++ if (gen != get_nulls_value(pos))
++ goto restart;
++
++ /* try the rest of the bins of the current generation */
++ bin = get_memcg_bin(bin + 1);
++ if (bin != first_bin)
++ goto restart;
++success:
++ if (op)
++ lru_gen_rotate_memcg(lruvec, op);
++
++ mem_cgroup_put(memcg);
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++ struct blk_plug plug;
++
++ VM_WARN_ON_ONCE(global_reclaim(sc));
++
++ lru_add_drain();
++
++ blk_start_plug(&plug);
++
++ set_mm_walk(lruvec_pgdat(lruvec));
++
++ if (try_to_shrink_lruvec(lruvec, sc))
++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
++
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+ }
+
++#else /* !CONFIG_MEMCG */
++
++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
++{
++ BUILD_BUG();
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++ BUILD_BUG();
++}
++
++#endif
++
++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
++{
++ int priority;
++ unsigned long reclaimable;
++ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
++
++ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
++ return;
++ /*
++ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
++ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
++ * estimated reclaimed_to_scanned_ratio = inactive / total.
++ */
++ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
++ if (get_swappiness(lruvec, sc))
++ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
++
++ reclaimable /= MEMCG_NR_GENS;
++
++ /* round down reclaimable and round up sc->nr_to_reclaim */
++ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
++
++ sc->priority = clamp(priority, 0, DEF_PRIORITY);
++}
++
++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++ struct blk_plug plug;
++ unsigned long reclaimed = sc->nr_reclaimed;
++
++ VM_WARN_ON_ONCE(!global_reclaim(sc));
++
++ lru_add_drain();
++
++ blk_start_plug(&plug);
++
++ set_mm_walk(pgdat);
++
++ set_initial_priority(pgdat, sc);
++
++ if (current_is_kswapd())
++ sc->nr_reclaimed = 0;
++
++ if (mem_cgroup_disabled())
++ shrink_one(&pgdat->__lruvec, sc);
++ else
++ shrink_many(pgdat, sc);
++
++ if (current_is_kswapd())
++ sc->nr_reclaimed += reclaimed;
++
++ clear_mm_walk();
++
++ blk_finish_plug(&plug);
++
++ /* kswapd should never fail */
++ pgdat->kswapd_failures = 0;
++}
++
++#ifdef CONFIG_MEMCG
++void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
++{
++ int seg;
++ int old, new;
++ int bin = get_random_u32_below(MEMCG_NR_BINS);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++ spin_lock(&pgdat->memcg_lru.lock);
++
++ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
++
++ seg = 0;
++ new = old = lruvec->lrugen.gen;
++
++ /* see the comment on MEMCG_NR_GENS */
++ if (op == MEMCG_LRU_HEAD)
++ seg = MEMCG_LRU_HEAD;
++ else if (op == MEMCG_LRU_TAIL)
++ seg = MEMCG_LRU_TAIL;
++ else if (op == MEMCG_LRU_OLD)
++ new = get_memcg_gen(pgdat->memcg_lru.seq);
++ else if (op == MEMCG_LRU_YOUNG)
++ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
++ else
++ VM_WARN_ON_ONCE(true);
++
++ hlist_nulls_del_rcu(&lruvec->lrugen.list);
++
++ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
++ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
++ else
++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
++
++ pgdat->memcg_lru.nr_memcgs[old]--;
++ pgdat->memcg_lru.nr_memcgs[new]++;
++
++ lruvec->lrugen.gen = new;
++ WRITE_ONCE(lruvec->lrugen.seg, seg);
++
++ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
++
++ spin_unlock(&pgdat->memcg_lru.lock);
++}
++#endif
++
+ /******************************************************************************
+ * state change
+ ******************************************************************************/
+@@ -5644,11 +5868,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
++
+ memcg = mem_cgroup_from_id(memcg_id);
+-#ifdef CONFIG_MEMCG
+- if (memcg && !css_tryget(&memcg->css))
++ if (!mem_cgroup_tryget(memcg))
+ memcg = NULL;
+-#endif
++
+ rcu_read_unlock();
+
+ if (!memcg)
+@@ -5796,6 +6020,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
+ }
+
+ #ifdef CONFIG_MEMCG
++
++void lru_gen_init_pgdat(struct pglist_data *pgdat)
++{
++ int i, j;
++
++ spin_lock_init(&pgdat->memcg_lru.lock);
++
++ for (i = 0; i < MEMCG_NR_GENS; i++) {
++ for (j = 0; j < MEMCG_NR_BINS; j++)
++ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
++ }
++}
++
+ void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
+@@ -5819,7 +6056,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+ }
+ }
+ }
+-#endif
++
++void lru_gen_online_memcg(struct mem_cgroup *memcg)
++{
++ int gen;
++ int nid;
++ int bin = get_random_u32_below(MEMCG_NR_BINS);
++
++ for_each_node(nid) {
++ struct pglist_data *pgdat = NODE_DATA(nid);
++ struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++ spin_lock(&pgdat->memcg_lru.lock);
++
++ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
++
++ gen = get_memcg_gen(pgdat->memcg_lru.seq);
++
++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
++ pgdat->memcg_lru.nr_memcgs[gen]++;
++
++ lruvec->lrugen.gen = gen;
++
++ spin_unlock(&pgdat->memcg_lru.lock);
++ }
++}
++
++void lru_gen_offline_memcg(struct mem_cgroup *memcg)
++{
++ int nid;
++
++ for_each_node(nid) {
++ struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
++ }
++}
++
++void lru_gen_release_memcg(struct mem_cgroup *memcg)
++{
++ int gen;
++ int nid;
++
++ for_each_node(nid) {
++ struct pglist_data *pgdat = NODE_DATA(nid);
++ struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++ spin_lock(&pgdat->memcg_lru.lock);
++
++ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
++
++ gen = lruvec->lrugen.gen;
++
++ hlist_nulls_del_rcu(&lruvec->lrugen.list);
++ pgdat->memcg_lru.nr_memcgs[gen]--;
++
++ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
++
++ spin_unlock(&pgdat->memcg_lru.lock);
++ }
++}
++
++#endif /* CONFIG_MEMCG */
+
+ static int __init init_lru_gen(void)
+ {
+@@ -5846,6 +6145,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
+ {
+ }
+
++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++}
++
+ #endif /* CONFIG_LRU_GEN */
+
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+@@ -5859,7 +6162,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ bool proportional_reclaim;
+ struct blk_plug plug;
+
+- if (lru_gen_enabled()) {
++ if (lru_gen_enabled() && !global_reclaim(sc)) {
+ lru_gen_shrink_lruvec(lruvec, sc);
+ return;
+ }
+@@ -6102,6 +6405,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ struct lruvec *target_lruvec;
+ bool reclaimable = false;
+
++ if (lru_gen_enabled() && global_reclaim(sc)) {
++ lru_gen_shrink_node(pgdat, sc);
++ return;
++ }
++
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ again:
+--
+2.40.1
+