1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include <linux/cgroup.h>
#include <linux/eventfd.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * The "window" size over which pressure is measured, in pages scanned.
 * vmpressure() only evaluates (or schedules evaluation of) a pressure level
 * once at least this many pages have been scanned since the last report;
 * until then it merely accumulates the counters.  Tied to SWAP_CLUSTER_MAX,
 * the reclaim batch size — presumably so the window is a whole number of
 * reclaim batches (TODO confirm against vmscan.c).
 */
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
42
43
44
45
46
47
48
/*
 * Percentage thresholds used by vmpressure_level() to discretize the
 * computed 0..100 pressure value: >= 60 reports "medium", >= 95 reports
 * "critical", anything below 60 is "low".
 */
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
 * Reclaim-priority threshold used by vmpressure_prio(): once the reclaim
 * priority drops to ilog2(100 / 10) (== 3) or below, pressure is reported
 * as a full window scanned with zero pages reclaimed, which
 * vmpressure_calc_level() maps to "critical".  NOTE(review): the 100/10
 * ratio presumably relates the scan window to the critical percentage —
 * confirm against the reclaim priority semantics in vmscan.c.
 */
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
72
/* Map the work_struct embedded in a struct vmpressure back to its owner. */
static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
	return container_of(work, struct vmpressure, work);
}
77
78static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
79{
80 struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
81 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
82
83 memcg = parent_mem_cgroup(memcg);
84 if (!memcg)
85 return NULL;
86 return memcg_to_vmpressure(memcg);
87}
88
/*
 * Discrete pressure levels reported to userspace, in increasing severity.
 * VMPRESSURE_NUM_LEVELS is a sentinel used for iteration/array sizing.
 */
enum vmpressure_levels {
	VMPRESSURE_LOW = 0,
	VMPRESSURE_MEDIUM,
	VMPRESSURE_CRITICAL,
	VMPRESSURE_NUM_LEVELS,
};
95
/*
 * Event delivery modes (see vmpressure_event() for the exact filtering):
 * NO_PASSTHROUGH ("default") — skipped once a descendant already signalled;
 * HIERARCHY — always eligible; LOCAL — skipped for events propagated up
 * from descendants.  VMPRESSURE_NUM_MODES is an iteration sentinel.
 */
enum vmpressure_modes {
	VMPRESSURE_NO_PASSTHROUGH = 0,
	VMPRESSURE_HIERARCHY,
	VMPRESSURE_LOCAL,
	VMPRESSURE_NUM_MODES,
};
102
/* Userspace-visible spellings of the levels, parsed by str_to_level(). */
static const char * const vmpressure_str_levels[] = {
	[VMPRESSURE_LOW] = "low",
	[VMPRESSURE_MEDIUM] = "medium",
	[VMPRESSURE_CRITICAL] = "critical",
};
108
/* Userspace-visible spellings of the modes, parsed by str_to_mode(). */
static const char * const vmpressure_str_modes[] = {
	[VMPRESSURE_NO_PASSTHROUGH] = "default",
	[VMPRESSURE_HIERARCHY] = "hierarchy",
	[VMPRESSURE_LOCAL] = "local",
};
114
115static enum vmpressure_levels vmpressure_level(unsigned long pressure)
116{
117 if (pressure >= vmpressure_level_critical)
118 return VMPRESSURE_CRITICAL;
119 else if (pressure >= vmpressure_level_med)
120 return VMPRESSURE_MEDIUM;
121 return VMPRESSURE_LOW;
122}
123
124static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
125 unsigned long reclaimed)
126{
127 unsigned long scale = scanned + reclaimed;
128 unsigned long pressure = 0;
129
130
131
132
133
134
135 if (reclaimed >= scanned)
136 goto out;
137
138
139
140
141
142
143
144 pressure = scale - (reclaimed * scale / scanned);
145 pressure = pressure * 100 / scale;
146
147out:
148 pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
149 scanned, reclaimed);
150
151 return vmpressure_level(pressure);
152}
153
/*
 * One registered listener: the eventfd to signal, the minimum level it
 * cares about, and its delivery mode.  Linked into struct vmpressure's
 * ->events list; list membership is protected by ->events_lock.
 */
struct vmpressure_event {
	struct eventfd_ctx *efd;	/* eventfd to signal on pressure */
	enum vmpressure_levels level;	/* minimum level that triggers it */
	enum vmpressure_modes mode;	/* default/hierarchy/local filtering */
	struct list_head node;		/* entry in vmpr->events */
};
160
161static bool vmpressure_event(struct vmpressure *vmpr,
162 const enum vmpressure_levels level,
163 bool ancestor, bool signalled)
164{
165 struct vmpressure_event *ev;
166 bool ret = false;
167
168 mutex_lock(&vmpr->events_lock);
169 list_for_each_entry(ev, &vmpr->events, node) {
170 if (ancestor && ev->mode == VMPRESSURE_LOCAL)
171 continue;
172 if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
173 continue;
174 if (level < ev->level)
175 continue;
176 eventfd_signal(ev->efd, 1);
177 ret = true;
178 }
179 mutex_unlock(&vmpr->events_lock);
180
181 return ret;
182}
183
/*
 * Deferred half of the "tree" accounting path: consume the accumulated
 * tree_scanned/tree_reclaimed window, compute a level, and signal listeners
 * in this group and every ancestor.  Runs from the workqueue because
 * signalling takes a mutex, which the reclaim-context caller cannot.
 */
static void vmpressure_work_fn(struct work_struct *work)
{
	struct vmpressure *vmpr = work_to_vmpressure(work);
	unsigned long scanned;
	unsigned long reclaimed;
	enum vmpressure_levels level;
	bool ancestor = false;
	bool signalled = false;

	spin_lock(&vmpr->sr_lock);
	/*
	 * tree_scanned == 0 means the counters were already consumed — e.g.
	 * by another run of this work scheduled before we got here (the
	 * vmpressure() accounting path never adds a zero scan count, so zero
	 * unambiguously means "nothing pending").
	 */
	scanned = vmpr->tree_scanned;
	if (!scanned) {
		spin_unlock(&vmpr->sr_lock);
		return;
	}

	reclaimed = vmpr->tree_reclaimed;
	/* Consume the window so the next one starts from zero. */
	vmpr->tree_scanned = 0;
	vmpr->tree_reclaimed = 0;
	spin_unlock(&vmpr->sr_lock);

	level = vmpressure_calc_level(scanned, reclaimed);

	/*
	 * Walk up the hierarchy.  The first iteration is the local group
	 * (ancestor == false); every parent sees ancestor == true, and
	 * "signalled" records whether any descendant already woke a listener
	 * so "default"-mode listeners in ancestors can be suppressed.
	 */
	do {
		if (vmpressure_event(vmpr, level, ancestor, signalled))
			signalled = true;
		ancestor = true;
	} while ((vmpr = vmpressure_parent(vmpr)));
}
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
/**
 * vmpressure() - Account memory pressure through scanned/reclaimed ratio.
 * @gfp:	reclaimer's gfp mask
 * @memcg:	cgroup the reclaim is happening in (may be NULL)
 * @tree:	true: accumulate hierarchically and notify eventfd listeners
 *		via the work item; false: compute a one-off local level and
 *		mark socket pressure on @memcg only
 * @scanned:	number of pages scanned by this reclaim pass
 * @reclaimed:	number of pages reclaimed by this reclaim pass
 *
 * Called from the reclaim path.  A pressure level is only evaluated once a
 * full window (vmpressure_win) of pages has been scanned; until then the
 * counts are just accumulated under sr_lock.
 */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
		unsigned long scanned, unsigned long reclaimed)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);

	/*
	 * Only account reclaim triggered by allocations that can be helped
	 * by reclaiming (highmem/movable pages, or allocations allowed to
	 * do IO/FS work).  NOTE(review): restricting to this flag set is
	 * inherited behavior — confirm against allocator semantics before
	 * extending it.
	 */
	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	/*
	 * A zero scan count carries no pressure information and would later
	 * divide by zero in vmpressure_calc_level(); it also doubles as the
	 * "counters already consumed" marker in vmpressure_work_fn(), so it
	 * must never be added to the accumulators.
	 */
	if (!scanned)
		return;

	if (tree) {
		/* Accumulate into the hierarchical window... */
		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->tree_scanned += scanned;
		vmpr->tree_reclaimed += reclaimed;
		spin_unlock(&vmpr->sr_lock);

		/*
		 * ...and defer listener signalling (which sleeps on a mutex)
		 * to the work item once a full window has been scanned.
		 */
		if (scanned < vmpressure_win)
			return;
		schedule_work(&vmpr->work);
	} else {
		enum vmpressure_levels level;

		/* The local path only matters for non-root memcgs. */
		if (!memcg || memcg == root_mem_cgroup)
			return;

		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->scanned += scanned;
		reclaimed = vmpr->reclaimed += reclaimed;
		if (scanned < vmpressure_win) {
			spin_unlock(&vmpr->sr_lock);
			return;
		}
		/* Window complete: consume the local counters. */
		vmpr->scanned = vmpr->reclaimed = 0;
		spin_unlock(&vmpr->sr_lock);

		level = vmpressure_calc_level(scanned, reclaimed);

		if (level > VMPRESSURE_LOW) {
			/*
			 * Medium/critical pressure observed: throttle the
			 * memcg's socket memory accounting for one second
			 * from now.
			 */
			memcg->socket_pressure = jiffies + HZ;
		}
	}
}
314
315
316
317
318
319
320
321
322
323
324
325
326void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
327{
328
329
330
331
332 if (prio > vmpressure_level_critical_prio)
333 return;
334
335
336
337
338
339
340
341
342 vmpressure(gfp, memcg, true, vmpressure_win, 0);
343}
344
345static enum vmpressure_levels str_to_level(const char *arg)
346{
347 enum vmpressure_levels level;
348
349 for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
350 if (!strcmp(vmpressure_str_levels[level], arg))
351 return level;
352 return -1;
353}
354
355static enum vmpressure_modes str_to_mode(const char *arg)
356{
357 enum vmpressure_modes mode;
358
359 for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
360 if (!strcmp(vmpressure_str_modes[mode], arg))
361 return mode;
362 return -1;
363}
364
/*
 * Longest valid "<level>,<mode>" spec: "critical,hierarchy" — the two
 * longest token spellings plus the comma and terminating NUL.
 */
#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382int vmpressure_register_event(struct mem_cgroup *memcg,
383 struct eventfd_ctx *eventfd, const char *args)
384{
385 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
386 struct vmpressure_event *ev;
387 enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
388 enum vmpressure_levels level = -1;
389 char *spec, *spec_orig;
390 char *token;
391 int ret = 0;
392
393 spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
394 if (!spec) {
395 ret = -ENOMEM;
396 goto out;
397 }
398 strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
399
400
401 token = strsep(&spec, ",");
402 level = str_to_level(token);
403 if (level == -1) {
404 ret = -EINVAL;
405 goto out;
406 }
407
408
409 token = strsep(&spec, ",");
410 if (token) {
411 mode = str_to_mode(token);
412 if (mode == -1) {
413 ret = -EINVAL;
414 goto out;
415 }
416 }
417
418 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
419 if (!ev) {
420 ret = -ENOMEM;
421 goto out;
422 }
423
424 ev->efd = eventfd;
425 ev->level = level;
426 ev->mode = mode;
427
428 mutex_lock(&vmpr->events_lock);
429 list_add(&ev->node, &vmpr->events);
430 mutex_unlock(&vmpr->events_lock);
431out:
432 kfree(spec_orig);
433 return ret;
434}
435
436
437
438
439
440
441
442
443
444
445
446
/**
 * vmpressure_unregister_event() - Remove a previously registered listener.
 * @memcg:	cgroup the event was registered on
 * @eventfd:	eventfd context that identifies the registration
 *
 * Removes and frees the first (and, per register-side usage, only) event
 * whose eventfd matches.  Silently does nothing if no match is found.
 */
void vmpressure_unregister_event(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ev->efd != eventfd)
			continue;
		/* Safe: we break right after, so no iteration past the del. */
		list_del(&ev->node);
		kfree(ev);
		break;
	}
	mutex_unlock(&vmpr->events_lock);
}
463
464
465
466
467
468
469
470
471void vmpressure_init(struct vmpressure *vmpr)
472{
473 spin_lock_init(&vmpr->sr_lock);
474 mutex_init(&vmpr->events_lock);
475 INIT_LIST_HEAD(&vmpr->events);
476 INIT_WORK(&vmpr->work, vmpressure_work_fn);
477}
478
479
480
481
482
483
484
485
/*
 * Tear-down counterpart of vmpressure_init(): wait for any in-flight work
 * item to finish before the structure (and whatever embeds it) goes away,
 * since the work dereferences vmpr and walks its ancestors.
 */
void vmpressure_cleanup(struct vmpressure *vmpr)
{
	flush_work(&vmpr->work);
}
494