1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
|
From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Thu, 13 Apr 2023 14:43:26 -0700
Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable()
Android 14 and later default to MGLRU [1] and field telemetry showed
occasional long tail latency (>100ms) in the reclaim path.
Tracing revealed priority inversion in the reclaim path. In
try_to_inc_max_seq(), when high priority tasks were blocked on
wait_event_killable(), the preemption of the low priority task to call
wake_up_all() caused those high priority tasks to wait longer than
necessary. In general, this problem is not different from others of its
kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU
because it introduced the new wait queue lruvec->mm_state.wait.
The purpose of this new wait queue is to avoid the thundering herd
problem. If many direct reclaimers rush into try_to_inc_max_seq(), only
one can succeed, i.e., the one to wake up the rest, and the rest who
failed might cause premature OOM kills if they do not wait. So far there
is no evidence supporting this scenario, based on how often the wait has
been hit. And this begs the question how useful the wait queue is in
practice.
Based on Minchan's recommendation, which is in line with his commit
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
rest of the MGLRU code which also uses trylock when possible, remove the
wait queue.
[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf
Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Reported-by: Wei Wang <wvw@google.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mmzone.h | 8 +--
mm/vmscan.c | 111 +++++++++++++++--------------------------
2 files changed, 42 insertions(+), 77 deletions(-)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -452,18 +452,14 @@ enum {
struct lru_gen_mm_state {
/* set to max_seq after each iteration */
unsigned long seq;
- /* where the current iteration continues (inclusive) */
+ /* where the current iteration continues after */
struct list_head *head;
- /* where the last iteration ended (exclusive) */
+ /* where the last iteration ended before */
struct list_head *tail;
- /* to wait for the last page table walker to finish */
- struct wait_queue_head wait;
/* Bloom filters flip after each iteration */
unsigned long *filters[NR_BLOOM_FILTERS];
/* the mm stats for debugging */
unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
- /* the number of concurrent page table walkers */
- int nr_walkers;
};
struct lru_gen_mm_walk {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm
if (!lruvec)
continue;
- /* where the last iteration ended (exclusive) */
+ /* where the current iteration continues after */
+ if (lruvec->mm_state.head == &mm->lru_gen.list)
+ lruvec->mm_state.head = lruvec->mm_state.head->prev;
+
+ /* where the last iteration ended before */
if (lruvec->mm_state.tail == &mm->lru_gen.list)
lruvec->mm_state.tail = lruvec->mm_state.tail->next;
-
- /* where the current iteration continues (inclusive) */
- if (lruvec->mm_state.head != &mm->lru_gen.list)
- continue;
-
- lruvec->mm_state.head = lruvec->mm_state.head->next;
- /* the deletion ends the current iteration */
- if (lruvec->mm_state.head == &mm_list->fifo)
- WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
}
list_del_init(&mm->lru_gen.list);
@@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve
struct mm_struct **iter)
{
bool first = false;
- bool last = true;
+ bool last = false;
struct mm_struct *mm = NULL;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
/*
- * There are four interesting cases for this page table walker:
- * 1. It tries to start a new iteration of mm_list with a stale max_seq;
- * there is nothing left to do.
- * 2. It's the first of the current generation, and it needs to reset
- * the Bloom filter for the next generation.
- * 3. It reaches the end of mm_list, and it needs to increment
- * mm_state->seq; the iteration is done.
- * 4. It's the last of the current generation, and it needs to reset the
- * mm stats counters for the next generation.
+ * mm_state->seq is incremented after each iteration of mm_list. There
+ * are three interesting cases for this page table walker:
+ * 1. It tries to start a new iteration with a stale max_seq: there is
+ * nothing left to do.
+ * 2. It started the next iteration: it needs to reset the Bloom filter
+ * so that a fresh set of PTE tables can be recorded.
+ * 3. It ended the current iteration: it needs to reset the mm stats
+ * counters and tell its caller to increment max_seq.
*/
spin_lock(&mm_list->lock);
VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
- VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
- VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
- if (walk->max_seq <= mm_state->seq) {
- if (!*iter)
- last = false;
+ if (walk->max_seq <= mm_state->seq)
goto done;
- }
- if (!mm_state->nr_walkers) {
- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+ if (!mm_state->head)
+ mm_state->head = &mm_list->fifo;
- mm_state->head = mm_list->fifo.next;
+ if (mm_state->head == &mm_list->fifo)
first = true;
- }
-
- while (!mm && mm_state->head != &mm_list->fifo) {
- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+ do {
mm_state->head = mm_state->head->next;
+ if (mm_state->head == &mm_list->fifo) {
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ last = true;
+ break;
+ }
/* force scan for those added after the last iteration */
- if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
- mm_state->tail = mm_state->head;
+ if (!mm_state->tail || mm_state->tail == mm_state->head) {
+ mm_state->tail = mm_state->head->next;
walk->force_scan = true;
}
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
if (should_skip_mm(mm, walk))
mm = NULL;
- }
-
- if (mm_state->head == &mm_list->fifo)
- WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ } while (!mm);
done:
- if (*iter && !mm)
- mm_state->nr_walkers--;
- if (!*iter && mm)
- mm_state->nr_walkers++;
-
- if (mm_state->nr_walkers)
- last = false;
-
if (*iter || last)
reset_mm_stats(lruvec, walk, last);
@@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc
VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
- if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
-
+ if (max_seq > mm_state->seq) {
+ mm_state->head = NULL;
+ mm_state->tail = NULL;
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
reset_mm_stats(lruvec, NULL, true);
success = true;
@@ -3894,10 +3875,6 @@ restart:
walk_pmd_range(&val, addr, next, args);
- /* a racy check to curtail the waiting time */
- if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
- return 1;
-
if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
end = (addr | ~PUD_MASK) + 1;
goto done;
@@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve
walk->next_addr = FIRST_USER_ADDRESS;
do {
+ DEFINE_MAX_SEQ(lruvec);
+
err = -EBUSY;
+ /* another thread might have called inc_max_seq() */
+ if (walk->max_seq != max_seq)
+ break;
+
/* page_update_gen() requires stable page_memcg() */
if (!mem_cgroup_trylock_pages(memcg))
break;
@@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr
success = iterate_mm_list(lruvec, walk, &mm);
if (mm)
walk_mm(lruvec, mm, walk);
-
- cond_resched();
} while (mm);
done:
- if (!success) {
- if (sc->priority <= DEF_PRIORITY - 2)
- wait_event_killable(lruvec->mm_state.wait,
- max_seq < READ_ONCE(lrugen->max_seq));
- return false;
- }
+ if (success)
+ inc_max_seq(lruvec, can_swap, force_scan);
- VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
-
- inc_max_seq(lruvec, can_swap, force_scan);
- /* either this sees any waiters or they will see updated max_seq */
- if (wq_has_sleeper(&lruvec->mm_state.wait))
- wake_up_all(&lruvec->mm_state.wait);
-
- return true;
+ return success;
}
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
@@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec *
INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
lruvec->mm_state.seq = MIN_NR_GENS;
- init_waitqueue_head(&lruvec->mm_state.wait);
}
#ifdef CONFIG_MEMCG
|