kernel: Backport MGLRU patch from 6.4
[openwrt/openwrt.git] target/linux/generic/backport-5.15/021-v6.4-mm-Multi-gen-LRU-remove-wait_event_killable.patch
From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Thu, 13 Apr 2023 14:43:26 -0700
Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable()

Android 14 and later default to MGLRU [1] and field telemetry showed
occasional long tail latency (>100ms) in the reclaim path.

Tracing revealed priority inversion in the reclaim path. In
try_to_inc_max_seq(), when high priority tasks were blocked on
wait_event_killable(), the preemption of the low priority task to call
wake_up_all() caused those high priority tasks to wait longer than
necessary. In general, this problem is not different from others of its
kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU
because it introduced the new wait queue lruvec->mm_state.wait.
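
Concretely, the two sides of the race in the pre-patch try_to_inc_max_seq()
look roughly like this (simplified from the lines removed in the mm/vmscan.c
hunks below, not a literal excerpt):

    /* the reclaimer that wins the race, possibly low priority */
    inc_max_seq(lruvec, can_swap, force_scan);
    /* if the winner is preempted here, the sleepers below keep waiting */
    if (wq_has_sleeper(&lruvec->mm_state.wait))
            wake_up_all(&lruvec->mm_state.wait);

    /* every other direct reclaimer, possibly high priority */
    if (sc->priority <= DEF_PRIORITY - 2)
            wait_event_killable(lruvec->mm_state.wait,
                                max_seq < READ_ONCE(lrugen->max_seq));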

The purpose of this new wait queue is to avoid the thundering herd
problem. If many direct reclaimers rush into try_to_inc_max_seq(), only
one can succeed, i.e., the one to wake up the rest, and the rest who
failed might cause premature OOM kills if they do not wait. So far there
is no evidence supporting this scenario, based on how often the wait has
been hit. And this begs the question how useful the wait queue is in
practice.

Based on Minchan's recommendation, which is in line with his commit
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
rest of the MGLRU code which also uses trylock when possible, remove the
wait queue.
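
With the wait queue gone, a walker that loses the race simply bails out and
the winner advances the generation without waking anyone; condensed from the
lines added in the mm/vmscan.c hunks below (again not a literal excerpt):

    /* in walk_mm(): give up if another thread already called inc_max_seq() */
    DEFINE_MAX_SEQ(lruvec);

    err = -EBUSY;
    if (walk->max_seq != max_seq)
            break;

    /* in try_to_inc_max_seq(): no sleeping, no wake-ups */
    if (success)
            inc_max_seq(lruvec, can_swap, force_scan);

    return success;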

[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf

Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Reported-by: Wei Wang <wvw@google.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |   8 +--
 mm/vmscan.c            | 111 +++++++++++++++--------------------------
 2 files changed, 42 insertions(+), 77 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -452,18 +452,14 @@ enum {
 struct lru_gen_mm_state {
         /* set to max_seq after each iteration */
         unsigned long seq;
-        /* where the current iteration continues (inclusive) */
+        /* where the current iteration continues after */
         struct list_head *head;
-        /* where the last iteration ended (exclusive) */
+        /* where the last iteration ended before */
         struct list_head *tail;
-        /* to wait for the last page table walker to finish */
-        struct wait_queue_head wait;
         /* Bloom filters flip after each iteration */
         unsigned long *filters[NR_BLOOM_FILTERS];
         /* the mm stats for debugging */
         unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
-        /* the number of concurrent page table walkers */
-        int nr_walkers;
 };
 
 struct lru_gen_mm_walk {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm
                 if (!lruvec)
                         continue;
 
-                /* where the last iteration ended (exclusive) */
+                /* where the current iteration continues after */
+                if (lruvec->mm_state.head == &mm->lru_gen.list)
+                        lruvec->mm_state.head = lruvec->mm_state.head->prev;
+
+                /* where the last iteration ended before */
                 if (lruvec->mm_state.tail == &mm->lru_gen.list)
                         lruvec->mm_state.tail = lruvec->mm_state.tail->next;
-
-                /* where the current iteration continues (inclusive) */
-                if (lruvec->mm_state.head != &mm->lru_gen.list)
-                        continue;
-
-                lruvec->mm_state.head = lruvec->mm_state.head->next;
-                /* the deletion ends the current iteration */
-                if (lruvec->mm_state.head == &mm_list->fifo)
-                        WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
         }
 
         list_del_init(&mm->lru_gen.list);
@@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve
                             struct mm_struct **iter)
 {
         bool first = false;
-        bool last = true;
+        bool last = false;
         struct mm_struct *mm = NULL;
         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
         struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 
         /*
-         * There are four interesting cases for this page table walker:
-         * 1. It tries to start a new iteration of mm_list with a stale max_seq;
-         *    there is nothing left to do.
-         * 2. It's the first of the current generation, and it needs to reset
-         *    the Bloom filter for the next generation.
-         * 3. It reaches the end of mm_list, and it needs to increment
-         *    mm_state->seq; the iteration is done.
-         * 4. It's the last of the current generation, and it needs to reset the
-         *    mm stats counters for the next generation.
+         * mm_state->seq is incremented after each iteration of mm_list. There
+         * are three interesting cases for this page table walker:
+         * 1. It tries to start a new iteration with a stale max_seq: there is
+         *    nothing left to do.
+         * 2. It started the next iteration: it needs to reset the Bloom filter
+         *    so that a fresh set of PTE tables can be recorded.
+         * 3. It ended the current iteration: it needs to reset the mm stats
+         *    counters and tell its caller to increment max_seq.
          */
         spin_lock(&mm_list->lock);
 
         VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
-        VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
-        VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
 
-        if (walk->max_seq <= mm_state->seq) {
-                if (!*iter)
-                        last = false;
+        if (walk->max_seq <= mm_state->seq)
                 goto done;
-        }
 
-        if (!mm_state->nr_walkers) {
-                VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+        if (!mm_state->head)
+                mm_state->head = &mm_list->fifo;
 
-                mm_state->head = mm_list->fifo.next;
+        if (mm_state->head == &mm_list->fifo)
                 first = true;
-        }
-
-        while (!mm && mm_state->head != &mm_list->fifo) {
-                mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 
+        do {
                 mm_state->head = mm_state->head->next;
+                if (mm_state->head == &mm_list->fifo) {
+                        WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+                        last = true;
+                        break;
+                }
 
                 /* force scan for those added after the last iteration */
-                if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
-                        mm_state->tail = mm_state->head;
+                if (!mm_state->tail || mm_state->tail == mm_state->head) {
+                        mm_state->tail = mm_state->head->next;
                         walk->force_scan = true;
                 }
 
+                mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
                 if (should_skip_mm(mm, walk))
                         mm = NULL;
-        }
-
-        if (mm_state->head == &mm_list->fifo)
-                WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+        } while (!mm);
 done:
-        if (*iter && !mm)
-                mm_state->nr_walkers--;
-        if (!*iter && mm)
-                mm_state->nr_walkers++;
-
-        if (mm_state->nr_walkers)
-                last = false;
-
         if (*iter || last)
                 reset_mm_stats(lruvec, walk, last);
 
@@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc
 
         VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
 
-        if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
-                VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
-
+        if (max_seq > mm_state->seq) {
+                mm_state->head = NULL;
+                mm_state->tail = NULL;
                 WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
                 reset_mm_stats(lruvec, NULL, true);
                 success = true;
@@ -3894,10 +3875,6 @@ restart:
 
                 walk_pmd_range(&val, addr, next, args);
 
-                /* a racy check to curtail the waiting time */
-                if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
-                        return 1;
-
                 if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
                         end = (addr | ~PUD_MASK) + 1;
                         goto done;
@@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve
         walk->next_addr = FIRST_USER_ADDRESS;
 
         do {
+                DEFINE_MAX_SEQ(lruvec);
+
                 err = -EBUSY;
 
+                /* another thread might have called inc_max_seq() */
+                if (walk->max_seq != max_seq)
+                        break;
+
                 /* page_update_gen() requires stable page_memcg() */
                 if (!mem_cgroup_trylock_pages(memcg))
                         break;
@@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr
                 success = iterate_mm_list(lruvec, walk, &mm);
                 if (mm)
                         walk_mm(lruvec, mm, walk);
-
-                cond_resched();
         } while (mm);
 done:
-        if (!success) {
-                if (sc->priority <= DEF_PRIORITY - 2)
-                        wait_event_killable(lruvec->mm_state.wait,
-                                            max_seq < READ_ONCE(lrugen->max_seq));
-                return false;
-        }
+        if (success)
+                inc_max_seq(lruvec, can_swap, force_scan);
 
-        VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
-
-        inc_max_seq(lruvec, can_swap, force_scan);
-        /* either this sees any waiters or they will see updated max_seq */
-        if (wq_has_sleeper(&lruvec->mm_state.wait))
-                wake_up_all(&lruvec->mm_state.wait);
-
-        return true;
+        return success;
 }
 
 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
@@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec *
                 INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
 
         lruvec->mm_state.seq = MIN_NR_GENS;
-        init_waitqueue_head(&lruvec->mm_state.wait);
 }
 
 #ifdef CONFIG_MEMCG